In [4]:
import torch
import torch.nn as nn
import kagglehub
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split

from io import open
import unicodedata
import re
import pandas as pd

In [5]:
if torch.cuda.is_available():
    device=torch.device(type='cuda', index=0)
else:
    device=torch.device(type='cpu', index=0)

In [6]:
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
data=pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv", header='infer')
data=data[:5000]
data.head()

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
def shorten(s):
    temp=""
    cnt=0
    for word in s.split(" "):
        temp+=word + " "
        cnt+=1
        if cnt==99:
            break
    return temp.strip()

In [8]:
def tonum(s):
    if s == 'positive':
        return 1
    else:
        return 0

In [9]:
data['sentiment']=data['sentiment'].apply(lambda x:tonum(x))

data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [10]:
review_texts=list(data['review'])
review_sentiments=torch.tensor(data['sentiment'])

In [11]:
print("Number of reviews and labels:",len(review_texts))
print("Number of positive reviews:", torch.sum(review_sentiments==1))
print("Number of negative reviews:", torch.sum(review_sentiments==0))

Number of reviews and labels: 5000
Number of positive reviews: tensor(2468)
Number of negative reviews: tensor(2532)


In [12]:
def normalizeString(s):
    sres=""
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) != 'Mn':
            sres+=ch
    sres = re.sub(r"[^a-zA-Z!?']+", r" ", sres)
    return sres.strip()

def createNormalizedData(review_texts):
    reviews=[]
    for review in review_texts:
        nor_review=normalizeString(review.lower().strip())
        reviews.append(nor_review)
    return reviews

In [13]:
reviews=createNormalizedData(review_texts)

print("Check the Length/number of reviews:", len(reviews))

reviews=list(map(shorten,reviews))
print("Check the Length/number of reviews:", len(reviews))

Check the Length/number of reviews: 5000
Check the Length/number of reviews: 5000


In [14]:
max_review_length=float('-inf')

for review in reviews:
    words=review.split(" ")
    l=len(words)
    if l>max_review_length:
        max_review_length=l
        that_review=review
        that_words=words

print(max_review_length)
print(that_review)
print(that_words)

99
one of the other reviewers has mentioned that after watching just oz episode you'll be hooked they are right as this is exactly what happened with me br br the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word br br it is called oz as that is the nickname
['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', 'oz', 'episode', "you'll", 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me', 'br', 'br', 'the', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust', 'me', 'this', 'is', 'not', 'a', 'show', '

In [15]:
class Vocab:
    def __init__(self):
        self.word2index={'PAD':0, 'EOS':1}
        self.index2word={0:'PAD', 1:'EOS'}
        self.word2count={}
        self.nwords=2

    def buildVocab(self,s):
        for word in s.split(" "):
            if word not in self.word2index:
                self.word2index[word]=self.nwords
                self.index2word[self.nwords]=word
                self.word2count[word]=1
                self.nwords+=1
            else:
                self.word2count[word]+=1

In [16]:
vocab=Vocab()

for review in reviews:
    vocab.buildVocab(review)

print("Vocab Length:",vocab.nwords)

Vocab Length: 28216


In [17]:
class CustomDataset(Dataset):
    def __init__(self, reviews, review_sentiments, vocab, max_length):
        super().__init__()
        self.reviews=reviews
        self.review_sentiments=review_sentiments
        self.max_length=max_length
        self.vocab=vocab

    def __len__(self):
        return len(self.reviews)

    def get_input_ids(self,review,vocab):
        input_ids=[]
        input_ids_tensor=torch.zeros(self.max_length+1, dtype=torch.int64)
        for word in review.split(" "):
            input_ids.append(self.vocab.word2index[word])

        input_ids.append(self.vocab.word2index['EOS'])
        input_ids_tensor[:len(input_ids)]=torch.tensor(input_ids)
        return input_ids_tensor

    def __getitem__(self,idx):
        review=self.reviews[idx]
        review_sentiment=self.review_sentiments[idx]

        return self.get_input_ids(review,self.vocab), review_sentiment

In [18]:
dataset=CustomDataset(reviews,review_sentiments,vocab,max_review_length)

train_dataset,test_dataset=random_split(dataset,[0.9,0.1])

batch_size=64

train_dataloader=DataLoader(dataset=train_dataset,batch_size=batch_size, shuffle=True)
test_dataloader=DataLoader(dataset=test_dataset,batch_size=32, shuffle=False)

In [19]:
class SentiNN(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size):
        super().__init__()
        self.e=nn.Embedding(input_size, embed_size)
        self.dropout=nn.Dropout(0.2)
        self.rnn=nn.GRU(embed_size,hidden_size, batch_first=True)
        self.out=nn.Linear(in_features=hidden_size,out_features=2)

    def forward(self,x): #64 x 100
        x=self.e(x) #64 x 100 x 128
        x=self.dropout(x)
        outputs, hidden=self.rnn(x) # hidden is 1 x batch_size x hidden_size (1 x 64 x 256)
        #output is 64 x 100 x 256
        hidden.squeeze_(0) #now, batch_size x hidden_size (64 x 256)
        logits=self.out(hidden)
        return logits

In [20]:
embed_size=128
hidden_size=256

#create instance of a neural network
sentinn=SentiNN(vocab.nwords,embed_size,hidden_size).to(device) #translation-direction sensitive

#specify loss, learning rate, and an optimizer
loss_fn=nn.CrossEntropyLoss().to(device)
lr=0.001
opt=optim.Adam(params=sentinn.parameters(), lr=lr)

In [21]:
def train_one_epoch():
    sentinn.train()
    track_loss=0
    num_correct=0
    batch_size=64

    for i, (reviews_ids,sentiments) in enumerate(train_dataloader):

        reviews_ids=reviews_ids.to(device) #64 x 100
        sentiments=sentiments.to(device) #64
        logits=sentinn(reviews_ids)
        loss=loss_fn(logits,sentiments)


        track_loss+=loss.item()
        num_correct+=(torch.argmax(logits,dim=1)==sentiments).type(torch.float).sum().item()

        running_loss=round(track_loss/(i+(reviews_ids.shape[0]/batch_size)),4)
        running_acc=round((num_correct/((i*batch_size+reviews_ids.shape[0])))*100,4)

        opt.zero_grad()
        loss.backward()
        opt.step()


    epoch_loss=running_loss
    epoch_acc=running_acc
    return epoch_loss, epoch_acc

In [22]:
def eval_one_epoch():
    sentinn.eval()
    track_loss=0
    num_correct=0
    batch_size=32

    for i, (reviews_ids,sentiments) in enumerate(test_dataloader):

        reviews_ids=reviews_ids.to(device)
        sentiments=sentiments.to(device)
        logits=sentinn(reviews_ids)

        loss=loss_fn(logits,sentiments)


        track_loss+=loss.item()
        num_correct+=(torch.argmax(logits,dim=1)==sentiments).type(torch.float).sum().item()

        running_loss=round(track_loss/(i+(reviews_ids.shape[0]/batch_size)),4)
        running_acc=round((num_correct/((i*batch_size+reviews_ids.shape[0])))*100,4)

    epoch_loss=running_loss
    epoch_acc=running_acc
    return epoch_loss, epoch_acc

In [23]:
n_epochs=30

for e in range(n_epochs):
    print("Epoch=",e+1, sep="", end=", ")
    epoch_loss,epoch_acc=train_one_epoch()
    print("Train Loss=", epoch_loss, "Train Acc", epoch_acc)
    epoch_loss,epoch_acc=eval_one_epoch()
    print("Eval Loss=", epoch_loss, "Eval Acc", epoch_acc)

Epoch=1, Train Loss= 0.7048 Train Acc 52.2667
Eval Loss= 0.7081 Eval Acc 54.0
Epoch=2, Train Loss= 0.6878 Train Acc 56.6889
Eval Loss= 0.711 Eval Acc 54.4
Epoch=3, Train Loss= 0.6539 Train Acc 62.2222
Eval Loss= 0.6904 Eval Acc 58.6
Epoch=4, Train Loss= 0.5863 Train Acc 68.3556
Eval Loss= 0.6292 Eval Acc 67.2
Epoch=5, Train Loss= 0.4853 Train Acc 77.2889
Eval Loss= 0.6677 Eval Acc 65.6
Epoch=6, Train Loss= 0.3877 Train Acc 82.6667
Eval Loss= 0.6252 Eval Acc 72.0
Epoch=7, Train Loss= 0.2841 Train Acc 88.8222
Eval Loss= 0.6608 Eval Acc 72.6
Epoch=8, Train Loss= 0.2019 Train Acc 91.9333
Eval Loss= 0.7456 Eval Acc 75.8
Epoch=9, Train Loss= 0.1313 Train Acc 95.2444
Eval Loss= 0.9441 Eval Acc 75.6
Epoch=10, Train Loss= 0.1085 Train Acc 95.9778
Eval Loss= 0.9222 Eval Acc 78.2
Epoch=11, Train Loss= 0.0936 Train Acc 96.6889
Eval Loss= 0.8867 Eval Acc 76.6
Epoch=12, Train Loss= 0.0566 Train Acc 98.0667
Eval Loss= 1.0189 Eval Acc 76.6
Epoch=13, Train Loss= 0.0481 Train Acc 98.4
Eval Loss= 1.0922 