# Sentiment Analysis on IMDB Reviews

### importing libraries

In [None]:
import pandas as pd
import numpy as np
import re
import contractions
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### loading the data

In [137]:
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [77]:
df['sentiment'] = df['sentiment'].apply(lambda x : 1 if x == 'positive' else 0)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


### pre-processing the reviews

In [78]:
stop_words = set(stopwords.words('english'))
preserve_words = {"not", "no", "nor"}
stop_words = stop_words - preserve_words
lemmatizer = WordNetLemmatizer()

def clean_text(text) :
    text = text.lower()
    text = re.sub(r'<.+?>','' , text)
    text = contractions.fix(text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    tokens = [lemmatizer.lemmatize(t) for t in text.split() if t not in stop_words]
    return tokens

In [79]:
review_tokens = [clean_text(text) for text in df['review']]
len(review_tokens)

50000

### building the vocab

In [None]:
vocab = {"<unk>" : 0}
token_counts = Counter(token for review in review_tokens for token in review)

most_common_tokens = token_counts.most_common(20000)

for token, _ in most_common_tokens :
    vocab[token] = len(vocab)

print(len(vocab))
print(vocab)

### coverting the reviews into list of word_indices

In [81]:
def tokens_to_indices(tokens) :
    indices = []
    for token in tokens :
        if token in vocab :
            indices.append(vocab[token])
        else :
            indices.append(vocab['<unk>'])
    return indices

In [82]:
review_indices = []
max_len = 0
for review in review_tokens :
    indices = tokens_to_indices(review)
    review_indices.append(indices)
    max_len = max(max_len, len(indices))
print(len(review_indices))
print(max_len)

50000
1430


### padding the sequence of word_indices

In [None]:
def pad_sequences(sequences, max_len, pad_value) :
    padded = []
    for seq in sequences :
        if len(seq) >= max_len :
            seq = seq[:max_len]
        else :
            seq = seq + [pad_value]*(max_len - len(seq))
        padded.append(seq)
    return padded

padded_reviews = pad_sequences(review_indices, max_len=200, pad_value=vocab['<unk>'])
padded_reviews = torch.tensor(padded_reviews, dtype = torch.long)
print(padded_reviews.shape)
print(padded_reviews)

### labels

In [84]:
sentiment = df['sentiment'].values
sentiment = torch.tensor(sentiment, dtype= torch.float)
print(sentiment.shape)

torch.Size([50000])


### dataset class

In [87]:
class IMDBDataset(Dataset) :
    def __init__(self, x, y) :
        self.x = x
        self.y = y

    def __len__(self) :
        return self.x.shape[0]

    def __getitem__(self, index) :
        return self.x[index], self.y[index]

### splitting the data into train and test

In [88]:
x_train , x_test , y_train, y_test = train_test_split(padded_reviews, sentiment, test_size=0.2,stratify = sentiment, random_state=42)

In [89]:
train_dataset = IMDBDataset(x_train, y_train)
test_dataset = IMDBDataset(x_test, y_test)

### dataloader

In [90]:
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

### lstm architecture

In [None]:
class LSTM_model(nn.Module) :
    def __init__(self, vocab_size) :
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 32, padding_idx=vocab['<unk>'])
        self.lstm1 = nn.LSTM(32, 64, batch_first=True)
        self.dropout1 = nn.Dropout(0.5)
        self.lstm2 = nn.LSTM(64, 32, batch_first=True)
        self.dropout2 = nn.Dropout(0.5)
        self.fc = nn.Linear(32, 1)

    def forward(self, x) :
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x = self.dropout1(x)
        x, _ = self.lstm2(x)
        x = self.dropout2(x)
        x = x[:, -1, :]  # get last output from sequence
        x = self.fc(x)
        return torch.sigmoid(x)


### model and hyper parameters

In [132]:
model = LSTM_model(len(vocab))
learning_rate = 0.001
epochs = 60
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

### training loop

In [133]:
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

for epoch in range(epochs) :
    total_loss = 0.0
    model.train()
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
    for batch_x, batch_y in loop :
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        output = model(batch_x)
        optimizer.zero_grad()
        loss = criterion(output, batch_y.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())
    if (epoch+1) % 10 == 0 :
      print(f"Epoch {epoch+1}: Total Loss = {total_loss:.4f}")




Epoch 10: Total Loss = 216.1132




Epoch 20: Total Loss = 198.1806




Epoch 30: Total Loss = 153.0342




Epoch 40: Total Loss = 60.6877




Epoch 50: Total Loss = 31.2993


                                                                            

Epoch 60: Total Loss = 18.3835




### model evaluation

In [None]:
model.eval()
correct = 0
total = 0
with torch.no_grad() :
  for batch_features, batch_labels in test_loader :
    batch_features = batch_features.to(device)
    batch_labels = batch_labels.to(device)
    outputs = model(batch_features)
    predicted = torch.round(outputs)
    total += batch_labels.size(0)
    correct += (predicted == batch_labels.unsqueeze(1)).sum().item()

  print(f'Accuracy: {correct/total*100}')

Accuracy: 86.75


### prediction function

In [135]:
def predict(model, text) :
  model.eval()
  text_processed = clean_text(text)
  text_indices = tokens_to_indices(text_processed)
  if (len(text_indices) > 200) :
    text_indices = text_indices[:200]
  else :
    text_indices = text_indices + [vocab['<unk>']]*(200 - len(text_indices))
  text_tensor = torch.tensor(text_indices, dtype=torch.long).unsqueeze(0)
  # Move the tensor to the same device as the model
  text_tensor = text_tensor.to(device)
  output = model(text_tensor)
  predicted = torch.round(output)
  return 'POSITIVE' if (predicted.item() == 1) else 'NEGATIVE'

### few examples

In [136]:
print(predict(model, "I loved this movie"))
print(predict(model, "I hated this movie"))
print(predict(model, "the movie is ok but not good"))
print(predict(model, "the movie was great"))
print(predict(model, "the movie was bad"))

POSITIVE
NEGATIVE
NEGATIVE
POSITIVE
NEGATIVE


### saving the model

In [138]:
torch.save(model.state_dict(), 'lstm_model_state.pth')