In [24]:
import torch
import torch.nn as nn
import kagglehub
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split

from io import open
import unicodedata
import re
import pandas as pd

In [25]:
if torch.cuda.is_available():
    device=torch.device(type='cuda', index=0)
else:
    device=torch.device(type='cpu', index=0)

In [26]:
import kagglehub
import shutil
import os


path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")


print("Dataset downloaded to:", path)

download_folder = os.path.expanduser("~/Downloads")
os.makedirs(download_folder, exist_ok=True)

src_file = os.path.join(path, "IMDB Dataset.csv")
dest_file = os.path.join(download_folder, "IMDB Dataset.csv")

shutil.copy(src_file, dest_file)



Dataset downloaded to: C:\Users\Lenovo\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1


'C:\\Users\\Lenovo/Downloads\\IMDB Dataset.csv'

In [27]:
import pandas as pd

data = pd.read_csv("~/Downloads/IMDB Dataset.csv")
data = data[:5000]  
data.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [28]:
def shorten(s):
    temp=""
    cnt=0
    for word in s.split(" "):
        temp+=word + " "
        cnt+=1
        if cnt==99:
            break
    return temp.strip()

In [29]:
def tonum(s):
    if s == 'positive':
        return 1
    else:
        return 0

In [30]:
data['sentiment']=data['sentiment'].apply(lambda x:tonum(x))

data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [31]:
review_texts=list(data['review'])
review_sentiments=torch.tensor(data['sentiment'])

In [32]:
print("Number of reviews and labels:",len(review_texts))
print("Number of positive reviews:", torch.sum(review_sentiments==1))
print("Number of negative reviews:", torch.sum(review_sentiments==0))

Number of reviews and labels: 5000
Number of positive reviews: tensor(2468)
Number of negative reviews: tensor(2532)


In [33]:
def normalizeString(s):
    sres=""
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) != 'Mn':
            sres+=ch
    sres = re.sub(r"[^a-zA-Z!?']+", r" ", sres)
    return sres.strip()

def createNormalizedData(review_texts):
    reviews=[]
    for review in review_texts:
        nor_review=normalizeString(review.lower().strip())
        reviews.append(nor_review)
    return reviews

In [34]:
reviews = createNormalizedData(review_texts)

print("Check the Length/number of reviews:", len(reviews))

reviews = list(map(shorten, reviews))
print("Check the Length/number of reviews:", len(reviews))


Check the Length/number of reviews: 5000
Check the Length/number of reviews: 5000


In [35]:
max_review_length=float('-inf')

for review in reviews:
    words=review.split(" ")
    l=len(words)
    if l>max_review_length:
        max_review_length=l
        that_review=review
        that_words=words

print(max_review_length)
print(that_review)
print(that_words)

99
one of the other reviewers has mentioned that after watching just oz episode you'll be hooked they are right as this is exactly what happened with me br br the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word br br it is called oz as that is the nickname
['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', 'oz', 'episode', "you'll", 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me', 'br', 'br', 'the', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust', 'me', 'this', 'is', 'not', 'a', 'show', '

In [36]:
class Vocab:
    def __init__(self):
        self.word2index={'PAD':0, 'EOS':1}
        self.index2word={0:'PAD', 1:'EOS'}
        self.word2count={}
        self.nwords=2

    def buildVocab(self,s):
        for word in s.split(" "):
            if word not in self.word2index:
                self.word2index[word]=self.nwords
                self.index2word[self.nwords]=word
                self.word2count[word]=1
                self.nwords+=1
            else:
                self.word2count[word]+=1

In [37]:
vocab=Vocab()

for review in reviews:
    vocab.buildVocab(review)

print("Vocab Length:",vocab.nwords)

Vocab Length: 28216


In [38]:
class CustomDataset(Dataset):
    def __init__(self, reviews, review_sentiments, vocab, max_length):
        super().__init__()
        self.reviews=reviews
        self.review_sentiments=review_sentiments
        self.max_length=max_length
        self.vocab=vocab

    def __len__(self):
        return len(self.reviews)

    def get_input_ids(self,review,vocab):
        input_ids=[]
        input_ids_tensor=torch.zeros(self.max_length+1, dtype=torch.int64)
        for word in review.split(" "):
            input_ids.append(self.vocab.word2index[word])

        input_ids.append(self.vocab.word2index['EOS'])
        input_ids_tensor[:len(input_ids)]=torch.tensor(input_ids)
        return input_ids_tensor

    def __getitem__(self,idx):
        review=self.reviews[idx]
        review_sentiment=self.review_sentiments[idx]

        return self.get_input_ids(review,self.vocab), review_sentiment

In [39]:
dataset=CustomDataset(reviews,review_sentiments,vocab,max_review_length)

train_dataset,test_dataset=random_split(dataset,[0.9,0.1])

batch_size=64

train_dataloader=DataLoader(dataset=train_dataset,batch_size=batch_size, shuffle=True)
test_dataloader=DataLoader(dataset=test_dataset,batch_size=32, shuffle=False)

In [40]:

class SentimentGRU(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers=2, dropout=0.5):
        super(SentimentGRU, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=n_layers,
                          dropout=dropout if n_layers > 1 else 0, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        out, hidden = self.gru(embedded)
        hidden = hidden[-1]
        output = self.fc(self.dropout(hidden))
        return output  



In [41]:
embed_size = 128
hidden_size = 256
output_dim = 2       
n_layers = 2
dropout = 0.5
lr = 0.001


In [42]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sentinn = SentimentGRU(
    vocab_size=vocab.nwords,     
    embed_dim=embed_size,
    hidden_dim=hidden_size,
    output_dim=output_dim,
    n_layers=n_layers,
    dropout=dropout
).to(device)


In [43]:
loss_fn = nn.CrossEntropyLoss().to(device)
opt = optim.Adam(params=sentinn.parameters(), lr=lr)


In [44]:
def train_one_epoch():
    sentinn.train()
    track_loss=0
    num_correct=0
    batch_size=64

    for i, (reviews_ids,sentiments) in enumerate(train_dataloader):

        reviews_ids=reviews_ids.to(device) 
        sentiments=sentiments.to(device) 
        logits=sentinn(reviews_ids)
        loss=loss_fn(logits,sentiments)


        track_loss+=loss.item()
        num_correct+=(torch.argmax(logits,dim=1)==sentiments).type(torch.float).sum().item()

        running_loss=round(track_loss/(i+(reviews_ids.shape[0]/batch_size)),4)
        running_acc=round((num_correct/((i*batch_size+reviews_ids.shape[0])))*100,4)

        opt.zero_grad()
        loss.backward()
        opt.step()


    epoch_loss=running_loss
    epoch_acc=running_acc
    return epoch_loss, epoch_acc

In [45]:
def eval_one_epoch():
    sentinn.eval()
    track_loss = 0
    num_correct = 0
    batch_size = 32

    with torch.no_grad():  
        for i, (reviews_ids, sentiments) in enumerate(test_dataloader):
            reviews_ids = reviews_ids.to(device)
            sentiments = sentiments.to(device)
            logits = sentinn(reviews_ids)

            loss = loss_fn(logits, sentiments)

            track_loss += loss.item()
            num_correct += (torch.argmax(logits, dim=1) == sentiments).type(torch.float).sum().item()

            running_loss = round(track_loss / (i + (reviews_ids.shape[0] / batch_size)), 4)
            running_acc = round((num_correct / ((i * batch_size + reviews_ids.shape[0]))) * 100, 4)

    epoch_loss = running_loss
    epoch_acc = running_acc
    return epoch_loss, epoch_acc


In [46]:
n_epochs = 20

for e in range(n_epochs):
    print(f"\nEpoch {e+1}/{n_epochs}")
    
    train_loss, train_acc = train_one_epoch()
    print(f" Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    
    val_loss, val_acc = eval_one_epoch()
    print(f" Eval  Loss: {val_loss:.4f} | Eval  Acc: {val_acc:.2f}%")



Epoch 1/20
 Train Loss: 0.7152 | Train Acc: 49.31%
 Eval  Loss: 0.7134 | Eval  Acc: 51.60%

Epoch 2/20
 Train Loss: 0.7034 | Train Acc: 52.20%
 Eval  Loss: 0.7096 | Eval  Acc: 52.40%

Epoch 3/20
 Train Loss: 0.6930 | Train Acc: 54.60%
 Eval  Loss: 0.7093 | Eval  Acc: 53.60%

Epoch 4/20
 Train Loss: 0.6723 | Train Acc: 59.87%
 Eval  Loss: 0.6965 | Eval  Acc: 54.80%

Epoch 5/20
 Train Loss: 0.6612 | Train Acc: 62.36%
 Eval  Loss: 0.6716 | Eval  Acc: 63.00%

Epoch 6/20
 Train Loss: 0.6193 | Train Acc: 66.78%
 Eval  Loss: 0.7915 | Eval  Acc: 58.00%

Epoch 7/20
 Train Loss: 0.5675 | Train Acc: 71.44%
 Eval  Loss: 0.5769 | Eval  Acc: 70.80%

Epoch 8/20
 Train Loss: 0.4954 | Train Acc: 76.78%
 Eval  Loss: 0.5890 | Eval  Acc: 73.00%

Epoch 9/20
 Train Loss: 0.4321 | Train Acc: 81.36%
 Eval  Loss: 0.5723 | Eval  Acc: 74.40%

Epoch 10/20
 Train Loss: 0.3788 | Train Acc: 83.53%
 Eval  Loss: 0.5513 | Eval  Acc: 77.40%

Epoch 11/20
 Train Loss: 0.3243 | Train Acc: 86.58%
 Eval  Loss: 0.6451 | Eval