This work has been replicated from https://www.kaggle.com/code/priyankdl/sentiment-analysis-imdb-torchtext-gru

Installing a few dependencies

In [None]:
!pip install torch==2.0.1 torchtext==0.15.2
!pip install 'portalocker>=2.0.0'


Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchtext==0.15.2
  Downloading torchtext-0.15.2-cp310-cp310-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.1)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Co

Importing all the dependencies

In [None]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader

from torchtext import datasets
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import numpy as np
from functools import partial

Getting to know the training and testing datasets

In [None]:
# Request both IMDB splits in a single call; a tuple of split names yields
# one iterator per split, in the same order.
train_dataset_iterator,test_dataset_iterator=datasets.IMDB(split=('train','test'))

In [None]:
# Pull both splits into memory as [review, label] pairs (the iterators yield
# (label, review), so the order is swapped here).
train_data=[[review,label] for label,review in train_dataset_iterator]
test_data=[[review,label] for label,review in test_dataset_iterator]

# Raw training texts only; these feed the vocabulary builder further down.
train_reviews=[pair[0] for pair in train_data]

print("Training_data_length:",len(train_data))
print("Testing_data_length:",len(test_data))

Training_data_length: 25000
Testing_data_length: 25000


In [None]:
# torchtext's built-in "basic_english" tokenizer.
tokenizer=get_tokenizer("basic_english",language="en")

# Vocabulary is built from the training reviews only; tokens seen fewer than
# 5 times are dropped, and the special tokens occupy the first indices.
vocab=build_vocab_from_iterator(
    (tokenizer(review) for review in train_reviews),
    min_freq=5,
    specials=["<unk>","<pad>","<eos>"],
    special_first=True,
)

# Any out-of-vocabulary token maps to <unk> instead of raising.
unk_index=vocab["<unk>"]
vocab.set_default_index(unk_index)
print("The Vocab Size is :",len(vocab))

def text_pipeline(review):
  """Tokenize a raw review string and map each token to its vocabulary index."""
  tokens=tokenizer(review)
  return vocab.lookup_indices(tokens)

The Vocab Size is : 30124


In [None]:
# Dump the vocabulary as "<token>\t<index>" lines.
vocab_file_path="vocab.txt"
# Explicit UTF-8: tokens come from free-form review text and may be non-ASCII,
# which would otherwise depend on the platform's default encoding.
with open(vocab_file_path,"w",encoding="utf-8") as f:
  # Sort by index so the file is deterministic and line order mirrors the vocab.
  for token, index in sorted(vocab.get_stoi().items(),key=lambda item:item[1]):
    f.write(f"{token}\t{index}\n")

In [None]:
def colate_fn(batch,text_pipeline,max_length=256,pad_index=1):
  """Collate ``[review, label]`` pairs into fixed-length tensors.

  Args:
    batch: sequence of items where ``item[0]`` is the raw review text and
      ``item[1]`` is its label. Labels are shifted down by one
      (presumably the dataset encodes them as 1/2 -- confirm against the data).
    text_pipeline: callable mapping a review string to a list of token ids.
    max_length: every review is truncated / padded to exactly this many ids.
    pad_index: id used for padding. Defaults to 1, the index of ``<pad>``
      given the specials order ``["<unk>","<pad>","<eos>"]`` with
      ``special_first=True``. Padding with 0 would reuse the ``<unk>`` id.

  Returns:
    ``(inputs, targets)``: a ``(len(batch), max_length)`` long tensor of token
    ids and a ``(len(batch),)`` long tensor of zero-based labels.
  """
  token_rows=[]
  targets=[]
  for item in batch:
    review=item[0]
    label=item[1]-1

    # Copy before padding so the pipeline's own list is never mutated.
    token_ids=list(text_pipeline(review))[:max_length]
    token_ids.extend([pad_index]*(max_length-len(token_ids)))

    token_rows.append(token_ids)
    targets.append(label)

  inputs=torch.tensor(token_rows,dtype=torch.long)
  targets=torch.tensor(targets,dtype=torch.long)

  return inputs,targets

Wrapping the train and test data in DataLoaders

In [None]:
# Single source of truth for the batch size used by both loaders.
BATCH_SIZE=16
collate_reviews=partial(colate_fn,text_pipeline=text_pipeline)

# Training batches are reshuffled every epoch.
train_dataloader=DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_reviews
)

# Evaluation does not need shuffling; a fixed order keeps runs repeatable.
test_dataloader=DataLoader(
    test_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_reviews
)

Now for the model

In [None]:
class SAnalysis(nn.Module):
  """Sentiment classifier: embedding -> dropout -> single-layer GRU -> linear.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding (default 128).
    hidden_dim: size of the GRU hidden state (default 256).
    num_classes: number of output logits (default 2).
    dropout: dropout probability applied to the embeddings (default 0.2).

  The defaults reproduce the original fixed architecture, and the attribute
  names are unchanged so existing state dicts still load.
  """
  def __init__(self,vocab_size,embedding_dim=128,hidden_dim=256,num_classes=2,dropout=0.2):
    super().__init__()
    self.em=nn.Embedding(vocab_size,embedding_dim)
    self.drop=nn.Dropout(dropout)
    self.gru=nn.GRU(embedding_dim,hidden_dim,batch_first=True)
    self.classifier=nn.Linear(hidden_dim,num_classes)

  def forward(self,x):
    """Map a batch of token ids to class logits.

    Args:
      x: long tensor of token ids, batch first.

    Returns:
      Logits computed from the GRU's final hidden state.
    """
    embedded=self.drop(self.em(x))
    _,hidden=self.gru(embedded)
    # The GRU has one layer, so indexing the last (only) layer's state is
    # equivalent to dropping the leading dimension, without an in-place op.
    return self.classifier(hidden[-1])

Setting up the model, optimiser and loss function, then training and evaluating one epoch at a time

In [None]:
# Seed PyTorch's global RNG so weight initialisation and dropout (and, by
# default, DataLoader shuffling) are repeatable across Restart & Run All.
torch.manual_seed(42)

model=SAnalysis(len(vocab))

optimiser=optim.Adam(model.parameters(),lr=0.001)
loss_function=nn.CrossEntropyLoss()


In [None]:
def train_one_epoch(model,dataloader,optim,loss_function):
  """Run one optimisation pass over ``dataloader`` and report epoch metrics.

  Args:
    model: module called as ``model(reviews)`` to produce per-class scores.
    dataloader: iterable yielding ``(reviews, label)`` tensor pairs.
    optim: optimiser bound to ``model``'s parameters.
    loss_function: callable ``(output, label) -> scalar loss``; assumed to
      return the batch mean (the default reduction of the PyTorch losses).

  Returns:
    ``(loss, accuracy)``: the sample-weighted mean loss and the accuracy in
    percent, both rounded to 4 decimals. Both are NaN for an empty loader.
  """
  model.train()

  total_loss=0.0
  num_correct=0
  num_samples=0
  for reviews,label in dataloader:
    batch_size=reviews.shape[0]
    output=model(reviews)
    loss=loss_function(output,label)

    optim.zero_grad()
    loss.backward()
    optim.step()

    # Weight each batch mean by its size so a smaller final batch is not
    # over-counted and no batch size has to be hard-coded.
    total_loss+=loss.item()*batch_size
    num_correct+=(torch.argmax(output,dim=1)==label).sum().item()
    num_samples+=batch_size

  if num_samples:
    running_loss=round(total_loss/num_samples,4)
    running_acc=round((num_correct/num_samples)*100,4)
  else:
    running_loss=running_acc=float("nan")

  print("TRAIN LOOP:LOSS ",running_loss)
  print("TRAIN LOOP:Accuracy ",running_acc)

  return running_loss,running_acc


In [None]:
def eval_one_epoch(model,dataloader,loss_function):
  """Evaluate ``model`` over ``dataloader`` without updating any weights.

  Args:
    model: module called as ``model(reviews)`` to produce per-class scores.
    dataloader: iterable yielding ``(reviews, label)`` tensor pairs.
    loss_function: callable ``(output, label) -> scalar loss``; assumed to
      return the batch mean (the default reduction of the PyTorch losses).

  Returns:
    ``(loss, accuracy)``: the sample-weighted mean loss and the accuracy in
    percent, both rounded to 4 decimals. Both are NaN for an empty loader.
  """
  model.eval()

  total_loss=0.0
  num_correct=0
  num_samples=0

  # No gradients are needed for evaluation; skipping autograd bookkeeping
  # saves memory and time.
  with torch.no_grad():
    for reviews,label in dataloader:
      batch_size=reviews.shape[0]
      output=model(reviews)
      loss=loss_function(output,label)

      # Weight each batch mean by its size so a smaller final batch is not
      # over-counted and no batch size has to be hard-coded.
      total_loss+=loss.item()*batch_size
      num_correct+=(torch.argmax(output,dim=1)==label).sum().item()
      num_samples+=batch_size

  if num_samples:
    running_loss=round(total_loss/num_samples,4)
    running_acc=round((num_correct/num_samples)*100,4)
  else:
    running_loss=running_acc=float("nan")

  print("EVAL LOOP:LOSS ",running_loss)
  print("EVAL LOOP:Accuracy ",running_acc)

  return running_loss,running_acc

In [None]:
epochs=2
for epoch in range(1,epochs+1):
  # Label each epoch so the metric lines printed below can be told apart.
  print(f"Epoch {epoch}/{epochs}")
  train_one_epoch(model,train_dataloader,optimiser,loss_function)
  eval_one_epoch(model,test_dataloader,loss_function)

EVAL LOOP:LOSS  0.6791
EVAL LOOP:Accuracy  54.544
EVAL LOOP:LOSS  0.5219
EVAL LOOP:Accuracy  75.492
EVAL LOOP:LOSS  0.3735
EVAL LOOP:Accuracy  83.612
EVAL LOOP:LOSS  0.3137
EVAL LOOP:Accuracy  86.288


In [None]:
# Serialise the whole model object (architecture and weights together).
torch.save(obj=model,f="model.pth")

In [None]:
# Serialise only the learned parameters (the model's state dict).
weights=model.state_dict()
torch.save(obj=weights,f="model_state_dict.pth")