In [21]:
import torch
import torch.nn as nn
import kagglehub
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split

from io import open
import unicodedata
import re
import pandas as pd

In [22]:
if torch.cuda.is_available():
    device=torch.device(type='cuda', index=0)
else:
    device=torch.device(type='cpu', index=0)

In [23]:
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
data=pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv", header='infer')
data=data[:5000]
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [24]:
def shorten(s):
    temp=""
    cnt=0
    for word in s.split(" "):
        temp+=word + " "
        cnt+=1
        if cnt==99:
            break
    return temp.strip()

In [25]:
def tonum(s):
    if s == 'positive':
        return 1
    else:
        return 0

In [26]:
data['sentiment']=data['sentiment'].apply(lambda x:tonum(x))

data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [27]:
review_texts=list(data['review'])
review_sentiments=torch.tensor(data['sentiment'])

In [28]:
print("Number of reviews and labels:",len(review_texts))
print("Number of positive reviews:", torch.sum(review_sentiments==1))
print("Number of negative reviews:", torch.sum(review_sentiments==0))

Number of reviews and labels: 5000
Number of positive reviews: tensor(2468)
Number of negative reviews: tensor(2532)


In [29]:
def normalizeString(s):
    sres=""
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) != 'Mn':
            sres+=ch
    sres = re.sub(r"[^a-zA-Z!?']+", r" ", sres)
    return sres.strip()

def createNormalizedData(review_texts):
    reviews=[]
    for review in review_texts:
        nor_review=normalizeString(review.lower().strip())
        reviews.append(nor_review)
    return reviews

In [30]:
reviews=createNormalizedData(review_texts)

print("Check the Length/number of reviews:", len(reviews))

reviews=list(map(shorten,reviews))
print("Check the Length/number of reviews:", len(reviews))

Check the Length/number of reviews: 5000
Check the Length/number of reviews: 5000


In [31]:
max_review_length=float('-inf')

for review in reviews:
    words=review.split(" ")
    l=len(words)
    if l>max_review_length:
        max_review_length=l
        that_review=review
        that_words=words

print(max_review_length)
print(that_review)
print(that_words)

99
one of the other reviewers has mentioned that after watching just oz episode you'll be hooked they are right as this is exactly what happened with me br br the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word br br it is called oz as that is the nickname
['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', 'oz', 'episode', "you'll", 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me', 'br', 'br', 'the', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust', 'me', 'this', 'is', 'not', 'a', 'show', '

In [32]:
class Vocab:
    def __init__(self):
        self.word2index={'PAD':0, 'EOS':1}
        self.index2word={0:'PAD', 1:'EOS'}
        self.word2count={}
        self.nwords=2

    def buildVocab(self,s):
        for word in s.split(" "):
            if word not in self.word2index:
                self.word2index[word]=self.nwords
                self.index2word[self.nwords]=word
                self.word2count[word]=1
                self.nwords+=1
            else:
                self.word2count[word]+=1

In [33]:
vocab=Vocab()

for review in reviews:
    vocab.buildVocab(review)

print("Vocab Length:",vocab.nwords)

Vocab Length: 28216


In [34]:
class CustomDataset(Dataset):
    def __init__(self, reviews, review_sentiments, vocab, max_length):
        super().__init__()
        self.reviews=reviews
        self.review_sentiments=review_sentiments
        self.max_length=max_length
        self.vocab=vocab

    def __len__(self):
        return len(self.reviews)

    def get_input_ids(self,review,vocab):
        input_ids=[]
        input_ids_tensor=torch.zeros(self.max_length+1, dtype=torch.int64)
        for word in review.split(" "):
            input_ids.append(self.vocab.word2index[word])

        input_ids.append(self.vocab.word2index['EOS'])
        input_ids_tensor[:len(input_ids)]=torch.tensor(input_ids)
        return input_ids_tensor

    def __getitem__(self,idx):
        review=self.reviews[idx]
        review_sentiment=self.review_sentiments[idx]

        return self.get_input_ids(review,self.vocab), review_sentiment

In [35]:
dataset=CustomDataset(reviews,review_sentiments,vocab,max_review_length)

train_dataset,test_dataset=random_split(dataset,[0.9,0.1])

batch_size=64

train_dataloader=DataLoader(dataset=train_dataset,batch_size=batch_size, shuffle=True)
test_dataloader=DataLoader(dataset=test_dataset,batch_size=32, shuffle=False)

In [36]:
class SentiNN(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size):
        super().__init__()
        self.e = nn.Embedding(input_size, embed_size)
        self.dropout = nn.Dropout(0.2)
        self.rnn = nn.LSTM(embed_size, hidden_size, batch_first=True)  # <-- LSTM instead of GRU
        self.out = nn.Linear(hidden_size, 2)

    def forward(self, x):
        x = self.e(x)
        x = self.dropout(x)
        outputs, (hidden, cell) = self.rnn(x)  # LSTM returns (output, (hidden, cell))
        hidden = hidden[-1]  # Take last layer hidden state
        logits = self.out(hidden)
        return logits


In [37]:
embed_size=128
hidden_size=256

#create instance of a neural network
sentinn=SentiNN(vocab.nwords,embed_size,hidden_size).to(device) #translation-direction sensitive

#specify loss, learning rate, and an optimizer
loss_fn=nn.CrossEntropyLoss().to(device)
lr=0.001
opt=optim.Adam(params=sentinn.parameters(), lr=lr)

In [38]:
def train_one_epoch():
    sentinn.train()
    track_loss=0
    num_correct=0
    batch_size=64

    for i, (reviews_ids,sentiments) in enumerate(train_dataloader):

        reviews_ids=reviews_ids.to(device) #64 x 100
        sentiments=sentiments.to(device) #64
        logits=sentinn(reviews_ids)
        loss=loss_fn(logits,sentiments)


        track_loss+=loss.item()
        num_correct+=(torch.argmax(logits,dim=1)==sentiments).type(torch.float).sum().item()

        running_loss=round(track_loss/(i+(reviews_ids.shape[0]/batch_size)),4)
        running_acc=round((num_correct/((i*batch_size+reviews_ids.shape[0])))*100,4)

        opt.zero_grad()
        loss.backward()
        opt.step()


    epoch_loss=running_loss
    epoch_acc=running_acc
    return epoch_loss, epoch_acc

In [39]:
def eval_one_epoch():
    sentinn.eval()
    track_loss=0
    num_correct=0
    batch_size=32

    for i, (reviews_ids,sentiments) in enumerate(test_dataloader):

        reviews_ids=reviews_ids.to(device)
        sentiments=sentiments.to(device)
        logits=sentinn(reviews_ids)

        loss=loss_fn(logits,sentiments)


        track_loss+=loss.item()
        num_correct+=(torch.argmax(logits,dim=1)==sentiments).type(torch.float).sum().item()

        running_loss=round(track_loss/(i+(reviews_ids.shape[0]/batch_size)),4)
        running_acc=round((num_correct/((i*batch_size+reviews_ids.shape[0])))*100,4)

    epoch_loss=running_loss
    epoch_acc=running_acc
    return epoch_loss, epoch_acc

In [40]:
n_epochs=10

for e in range(n_epochs):
    print("Epoch=",e+1, sep="", end=", ")
    epoch_loss,epoch_acc=train_one_epoch()
    print("Train Loss=", epoch_loss, "Train Acc", epoch_acc)
    epoch_loss,epoch_acc=eval_one_epoch()
    print("Eval Loss=", epoch_loss, "Eval Acc", epoch_acc)

Epoch=1, Train Loss= 0.7021 Train Acc 51.2
Eval Loss= 0.7097 Eval Acc 51.4
Epoch=2, Train Loss= 0.6848 Train Acc 57.2889
Eval Loss= 0.7124 Eval Acc 55.8
Epoch=3, Train Loss= 0.6447 Train Acc 63.2444
Eval Loss= 0.705 Eval Acc 59.8
Epoch=4, Train Loss= 0.5994 Train Acc 68.3556
Eval Loss= 0.727 Eval Acc 58.8
Epoch=5, Train Loss= 0.5207 Train Acc 74.8889
Eval Loss= 0.7435 Eval Acc 64.2
Epoch=6, Train Loss= 0.4455 Train Acc 78.7556
Eval Loss= 0.8309 Eval Acc 60.2
Epoch=7, Train Loss= 0.3698 Train Acc 83.4222
Eval Loss= 0.8964 Eval Acc 64.4
Epoch=8, Train Loss= 0.2743 Train Acc 88.7556
Eval Loss= 0.8752 Eval Acc 67.4
Epoch=9, Train Loss= 0.2092 Train Acc 91.4889
Eval Loss= 1.0655 Eval Acc 64.2
Epoch=10, Train Loss= 0.1758 Train Acc 93.1111
Eval Loss= 1.1792 Eval Acc 68.6
