In [1]:
!pip uninstall -y gensim numpy
!pip install numpy --upgrade
!pip install gensim --upgrade

[0mFound existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy
  Downloading numpy-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl (16.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cupy-cuda12x 13.3.0 requires numpy<2.3,>=1.22, but you have numpy 2.3.0 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.0 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import kagglehub
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
df = pd.read_csv(path + "/IMDB Dataset.csv")
Df = df.sample(frac=0.2,random_state=42)
print(len(Df))
print(Df.head())

10000
                                                  review sentiment
33553  I really liked this Summerslam due to the look...  positive
9427   Not many television shows appeal to quite as m...  positive
199    The film quickly gets to a major chase scene w...  negative
12447  Jane Austen would definitely approve of this o...  positive
39489  Expectations were somewhat high for me when I ...  negative


In [None]:
print(df['sentiment'].value_counts())

sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [3]:
Df.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

  Df.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [4]:
train_data, test_data = train_test_split(Df,test_size=0.2,random_state=42)
print(train_data.shape)

(8000, 2)


### Preprocessing data

In [5]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
from nltk.stem import PorterStemmer

In [7]:
stemmer = PorterStemmer()

def preprocess(text):
  text = text.lower()
  tokens = word_tokenize(text)
  tokens = [stemmer.stem(token) for token in tokens]
  return tokens

In [8]:
vocab = {'<UNK>':0}

def build_vocab(reviews):
  for review in reviews:
    tokens = preprocess(review)
    for token in tokens:
      if token not in vocab:
        vocab[token] = len(vocab)

build_vocab(train_data['review'])

In [9]:
def text_to_indices(text,vocab):
  indexed_text = []
  for token in preprocess(text):
    if token in vocab:
      indexed_text.append(vocab[token])
    else:
      indexed_text.append(vocab['<UNK>'])
  return indexed_text

### LSTM - Long Short Term Memory

In [10]:
class IMDBDataset(Dataset):

  def __init__(self,data,vocab):
    self.data = data
    self.vocab = vocab

  def __len__(self):
    return len(self.data)

  def __getitem__(self,idx):
    row = self.data.iloc[idx]
    review = text_to_indices(row['review'],self.vocab)
    label = row['sentiment']
    return torch.tensor(review), torch.tensor(label)

In [11]:
train_dataset = IMDBDataset(train_data,vocab)
test_dataset = IMDBDataset(test_data,vocab)

In [30]:
print(type(train_dataset[0]))
print(type(train_dataset[1]))

<class 'tuple'>
<class 'tuple'>


In [12]:
def collate_fn(batch):
    reviews, labels = zip(*batch)
    lengths = torch.tensor([len(rev) for rev in reviews])
    padded_reviews = pad_sequence(reviews, batch_first=True)
    labels = torch.stack(labels)
    return padded_reviews, labels, lengths

In [13]:
train_loader = DataLoader(train_dataset,batch_size=32,shuffle=True,collate_fn=collate_fn)
test_loader = DataLoader(test_dataset,batch_size=32,shuffle=False,collate_fn=collate_fn)

In [14]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

In [15]:
class IMDBnn(nn.Module):

  def __init__(self,vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size,embedding_dim=100)
    self.lstm = nn.LSTM(100,64,batch_first=True)
    self.linear = nn.Linear(64,1)

  def forward(self,review,lengths):
    embedded = self.embedding(review)
    lengths_sorted, idx_sort = torch.sort(lengths, descending=True)
    embedded_sorted = embedded[idx_sort]

    packed = pack_padded_sequence(
        embedded_sorted,
        lengths_sorted.cpu(),
        batch_first=True,
        enforce_sorted=True
    )

    output, (h_n, _) = self.lstm(packed)

    _, idx_unsort = torch.sort(idx_sort)
    h_n = h_n[:, idx_unsort, :]

    out = self.linear(h_n.squeeze(0))
    return out

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = IMDBnn(len(vocab)).to(device)

In [17]:
lr = 0.001
epochs = 25

In [18]:
criterion = nn.BCEWithLogitsLoss()
optimiser = torch.optim.Adam(model.parameters(),lr=lr)

In [19]:
# training loop

print_interval = 100
for epoch in range(epochs):
  total_loss = 0;
  for batch_idx, (reviews, labels, lengths) in enumerate(train_loader):
    reviews, labels = reviews.to(device), labels.to(device).float()

    # Forward pass
    outputs = model(reviews, lengths)

    # Loss calculation
    loss = criterion(outputs.squeeze(1), labels)
    total_loss += loss.item()
    if (batch_idx + 1) % print_interval == 0:
      avg_loss = total_loss / print_interval
      print(f"Epoch: {epoch+1}, Batch: {batch_idx+1}, Loss: {avg_loss:.4f}")
      total_loss = 0

    # Backward pass and update
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()

Epoch: 1, Batch: 100, Loss: 0.6904
Epoch: 1, Batch: 200, Loss: 0.6611
Epoch: 2, Batch: 100, Loss: 0.5925
Epoch: 2, Batch: 200, Loss: 0.5480
Epoch: 3, Batch: 100, Loss: 0.4935
Epoch: 3, Batch: 200, Loss: 0.4342
Epoch: 4, Batch: 100, Loss: 0.3760
Epoch: 4, Batch: 200, Loss: 0.3553
Epoch: 5, Batch: 100, Loss: 0.3446
Epoch: 5, Batch: 200, Loss: 0.3028
Epoch: 6, Batch: 100, Loss: 0.2219
Epoch: 6, Batch: 200, Loss: 0.2579
Epoch: 7, Batch: 100, Loss: 0.1824
Epoch: 7, Batch: 200, Loss: 0.1617
Epoch: 8, Batch: 100, Loss: 0.2077
Epoch: 8, Batch: 200, Loss: 0.2185
Epoch: 9, Batch: 100, Loss: 0.1353
Epoch: 9, Batch: 200, Loss: 0.1265
Epoch: 10, Batch: 100, Loss: 0.2125
Epoch: 10, Batch: 200, Loss: 0.1105
Epoch: 11, Batch: 100, Loss: 0.0957
Epoch: 11, Batch: 200, Loss: 0.0666
Epoch: 12, Batch: 100, Loss: 0.0434
Epoch: 12, Batch: 200, Loss: 0.0487
Epoch: 13, Batch: 100, Loss: 0.0381
Epoch: 13, Batch: 200, Loss: 0.0453
Epoch: 14, Batch: 100, Loss: 0.0186
Epoch: 14, Batch: 200, Loss: 0.0542
Epoch: 15,

In [25]:
# model evaluation
model.eval()

correct = 0
total = 0

with torch.no_grad():
  for reviews, labels, lengths in test_loader:
    reviews, labels = reviews.to(device), labels.to(device).float()
    pred_val = model(reviews, lengths)
    pred_val = pred_val.squeeze(1)
    y_cap = np.where(pred_val.cpu().detach().numpy() >= 0.5, 1, 0)
    labels_np = labels.cpu().detach().numpy()
    correct += np.sum(y_cap == labels_np)
    total += len(labels)

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8030
