# Sentiment Analysis using LSTM MODEL and Transformer embeddings

### importing libraries

In [41]:
import pandas as pd
import numpy as np
import re
import contractions
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### load the dataset

In [92]:
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [93]:
df['sentiment'] = df['sentiment'].apply(lambda x : 1 if x == 'positive' else 0)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


### review processing

In [94]:
stop_words = set(stopwords.words('english'))
preserve_words = {"not", "no", "nor"}
stop_words = stop_words - preserve_words
lemmatizer = WordNetLemmatizer()

# def clean_text(text) :
#     text = text.lower()
#     text = re.sub(r'<.+?>','' , text)
#     text = contractions.fix(text)
#     text = re.sub(r'[^\w\s]', '', text)
#     text = re.sub(r'\s+', ' ', text)
#     tokens = [lemmatizer.lemmatize(t) for t in text.split() if t not in stop_words]
#     return tokens


def clean_text_new(text) :
    text = text.lower()
    text = re.sub(r'<.+?>','' , text)
    text = contractions.fix(text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    tokens = ' '.join(lemmatizer.lemmatize(t) for t in text.split() if t not in stop_words)
    return tokens


In [None]:
cleaned_reviews = df['review'].apply(clean_text_new)

### loading the bert model

In [None]:
from transformers import BertTokenizer, BertModel

bert_tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-2_H-128_A-2')
bert_model = BertModel.from_pretrained('google/bert_uncased_L-2_H-128_A-2')

### sample

In [101]:
sample = ['i enjoyed watching the movie', 'i hated the movie']
tokens = bert_tokenizer(sample, truncation = True, padding = 'max_length', max_length = 200, return_tensors = 'pt')

In [102]:
print(tokens.input_ids.shape)

torch.Size([2, 200])


In [103]:
outputs = bert_model(**tokens)
# print(outputs)
print(outputs[0].shape)
print(outputs[1].shape)
embedding_dim = outputs[0].shape[2]
print(embedding_dim)

torch.Size([2, 200, 128])
torch.Size([2, 128])
128


### tokenizing the reviews

In [104]:
review_tokens = bert_tokenizer(cleaned_reviews.tolist(), truncation = True, padding = 'max_length', max_length = 200, return_tensors = 'pt')

### sentiment labels

In [105]:
sentiment = df['sentiment'].values
sentiment = torch.tensor(sentiment, dtype= torch.float)
print(sentiment.shape)

torch.Size([50000])


### dataset class

In [None]:
class IMDBDataset(Dataset) :
    def __init__(self, tokens, sentiment) :
        self.x = tokens
        self.y = sentiment

    def __len__(self) :
        return self.y.shape[0]

    def __getitem__(self, index) :
        x_dict = {key : val[index] for key, val in self.x.items()}
        return x_dict , self.y[index]

dataset = IMDBDataset(review_tokens, sentiment)

In [107]:
print(review_tokens['input_ids'].shape)
print(review_tokens['attention_mask'].shape)

torch.Size([50000, 200])
torch.Size([50000, 200])


### splitting the data

In [108]:
input_ids = review_tokens['input_ids']
attention_mask = review_tokens['attention_mask']

input_ids_train, input_ids_test, attention_mask_train, attention_mask_test, y_train, y_test = train_test_split(
    input_ids, attention_mask, sentiment, test_size=0.2, stratify=sentiment, random_state=42
)

x_train = {'input_ids': input_ids_train, 'attention_mask': attention_mask_train}
x_test = {'input_ids': input_ids_test, 'attention_mask': attention_mask_test}

### dataset and dataloader

In [109]:
train_dataset = IMDBDataset(x_train, y_train)
test_dataset = IMDBDataset(x_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

### lstm model class with bert embeddings

In [None]:
class LSTM_model(nn.Module) :
    def __init__(self) :
        super().__init__()
        self.bert = bert_model
        for parameters in self.bert.parameters() :
            parameters.requires_grad = False
        self.lstm1 = nn.LSTM(embedding_dim, 64, batch_first=True)
        self.dropout1 = nn.Dropout(0.5)
        self.lstm2 = nn.LSTM(64, 32, batch_first=True)
        self.dropout2 = nn.Dropout(0.5)
        self.fc = nn.Linear(32, 1)

    def forward(self, x) :
        x = self.bert(**x)
        x, _ = self.lstm1(x.last_hidden_state)
        x = self.dropout1(x)
        x, _ = self.lstm2(x)
        x = self.dropout2(x)
        x = x[:, -1, :]  
        x = self.fc(x)
        return torch.sigmoid(x)

### model and hyper parameters

In [116]:
model = LSTM_model()
learning_rate = 0.001
epochs = 40
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

### training loop

In [None]:

from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

for epoch in range(epochs) :
    total_loss = 0.0
    model.train()
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)
    for batch_x, batch_y in loop :
        batch_x = {key: val.to(device) for key, val in batch_x.items()}
        batch_y = batch_y.to(device)
        output = model(batch_x)
        optimizer.zero_grad()
        loss = criterion(output, batch_y.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1}: Total Loss = {total_loss:.4f}")



Epoch 1: Total Loss = 188.9548




Epoch 2: Total Loss = 179.0637




Epoch 3: Total Loss = 171.3876




Epoch 4: Total Loss = 167.4747




Epoch 5: Total Loss = 163.7597




Epoch 6: Total Loss = 159.8441




Epoch 7: Total Loss = 154.9783




Epoch 8: Total Loss = 149.2756




Epoch 9: Total Loss = 145.4905




Epoch 10: Total Loss = 140.9678




Epoch 11: Total Loss = 138.5117




Epoch 12: Total Loss = 135.7388




Epoch 13: Total Loss = 134.5268




Epoch 14: Total Loss = 130.4364




Epoch 15: Total Loss = 128.3234




Epoch 16: Total Loss = 126.8948




Epoch 17: Total Loss = 124.3426




Epoch 18: Total Loss = 121.9088




Epoch 19: Total Loss = 122.1963




Epoch 20: Total Loss = 119.8779




Epoch 21: Total Loss = 117.8364




Epoch 22: Total Loss = 116.7157




Epoch 23: Total Loss = 113.6429




Epoch 24: Total Loss = 113.9535




Epoch 25: Total Loss = 112.5394




Epoch 26: Total Loss = 111.5073




Epoch 27: Total Loss = 109.5812




Epoch 28: Total Loss = 108.4486




Epoch 29: Total Loss = 108.0203




Epoch 30: Total Loss = 107.7805




Epoch 31: Total Loss = 106.0365




Epoch 32: Total Loss = 104.6472




Epoch 33: Total Loss = 103.7934




Epoch 34: Total Loss = 101.8529




Epoch 35: Total Loss = 103.0390




Epoch 36: Total Loss = 100.8927




Epoch 37: Total Loss = 100.2280




Epoch 38: Total Loss = 100.1784




Epoch 39: Total Loss = 98.6725


                                                                         

Epoch 40: Total Loss = 97.5107




### evaluation and accuracy

In [None]:
model.eval()
correct = 0
total = 0
with torch.no_grad() :
  for batch_features, batch_labels in test_loader :

    batch_features = {key: val.to(device) for key, val in batch_features.items()}
    batch_labels = batch_labels.to(device)
    outputs = model(batch_features)
    predicted = torch.round(outputs)
    total += batch_labels.size(0)
    correct += (predicted == batch_labels.unsqueeze(1)).sum().item()

  print(f'Accuracy: {correct/total*100:.4f}')

Accuracy: 83.6600


### predict function

In [None]:
def predict(model, text) :
  model.eval()
  text_processed = clean_text_new(text)
  bert_tokens = bert_tokenizer(text_processed, truncation = True, padding = 'max_length', max_length = 200, return_tensors = 'pt')
  bert_tokens = {key: val.to(device) for key, val in bert_tokens.items()}
  output = model(bert_tokens)
  predicted = torch.round(output)
  return 'POSITIVE' if (predicted.item() == 1) else 'NEGATIVE'

### examples

In [122]:
print(predict(model, "I loved this movie"))
print(predict(model, "the movie is ok but not good"))
print(predict(model, "the movie was great"))
print(predict(model, "the movie was bad"))

POSITIVE
NEGATIVE
POSITIVE
NEGATIVE
