In [None]:
import pandas as pd

# Raw CSV of the IMDB review dataset, hosted on GitHub.
url = "https://raw.githubusercontent.com/Ataullha/CSE476-Machine-Learning-Lab/main/IMDB%20Dataset.csv"
# read_csv fetches the file directly from the URL, so this cell needs network access.
df = pd.read_csv(url)

# Quick check
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# (rows, columns) of the full dataset before it is split.
df.shape

(50000, 2)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# (rows, columns) of the training split.
df_train.shape

(35000, 2)

In [None]:
# (rows, columns) of the test split.
df_test.shape

(15000, 2)

In [None]:
# Class balance check: number of rows per sentiment value, training split first, then test split.
print(df_train['sentiment'].value_counts())
print(df_test['sentiment'].value_counts())

sentiment
negative    17589
positive    17411
Name: count, dtype: int64
sentiment
positive    7589
negative    7411
Name: count, dtype: int64


In [None]:
# Sample text used to try out the cleaning helpers defined in the following cells.
a_string= 'i am the master<br /><br /> !this looks like it was very  special <br /><br />.So i will& recommend it very# much.'

def remove_br(a_string):
  """Replace every literal ``<br />`` tag in ``a_string`` with a single space.

  A space (not an empty string) is substituted so that the words on either
  side of a tag stay separate tokens: ``'good<br />bad'`` becomes
  ``'good bad'`` rather than ``'goodbad'``.

  Args:
    a_string: text that may contain ``<br />`` tags.

  Returns:
    A new string with each tag replaced by one space.
  """
  return a_string.replace('<br />', ' ')

In [None]:
import string
punctuation_set = set(string.punctuation)

def remove_punctuation(a_string):
  """Return ``a_string`` with every character from ``string.punctuation`` dropped.

  Nothing is inserted where a character is removed, so ``"couldn't"``
  becomes ``"couldnt"``.
  """
  kept_chars = (ch for ch in a_string if ch not in punctuation_set)
  return ''.join(kept_chars)



In [None]:
def lower_string(a_string):
  """Return a lower-cased copy of ``a_string``."""
  lowered = a_string.lower()
  return lowered



In [None]:
def split_words(a_string):
  """Split ``a_string`` into a list of words.

  ``str.split()`` with no argument splits on any run of whitespace (spaces,
  tabs, newlines) and never yields empty strings. Splitting on the literal
  ``" "`` alone would leave tabs and newlines glued inside tokens.

  Args:
    a_string: text to tokenise.

  Returns:
    List of non-empty words; an empty list for empty or all-whitespace input.
  """
  return a_string.split()

In [None]:
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Store the word list under its own name. Rebinding ``stopwords`` would shadow
# the imported NLTK corpus reader, and re-running this cell would then fail
# with AttributeError ('list' object has no attribute 'words').
# A frozenset makes each membership test constant-time instead of a list scan.
english_stopwords = frozenset(stopwords.words('english'))

def remove_stopwords(a_string_list):
  """Return the words of ``a_string_list`` that are not English stop words.

  Args:
    a_string_list: iterable of word strings.

  Returns:
    A new list holding, in the original order, every word that is not in
    NLTK's English stop-word list. The comparison is exact (case-sensitive).
  """
  return [word for word in a_string_list if word not in english_stopwords]



In [None]:
def clean_string(a_string):
  """Turn one raw review string into a list of cleaned tokens.

  The helpers are applied in this order: remove_br, remove_punctuation,
  lower_string, split_words, remove_stopwords.
  """
  without_tags = remove_br(a_string)
  without_punctuation = remove_punctuation(without_tags)
  lowered = lower_string(without_punctuation)
  tokens = split_words(lowered)
  return remove_stopwords(tokens)

# Try the pipeline on the sample text. This rebinds a_string from a str to a list of tokens.
a_string = clean_string(a_string)

In [None]:
# Display the token list produced from the sample text.
a_string

['master', 'looks', 'like', 'special', 'recommend', 'much']

In [None]:
# Tokenise every review in both splits and store the token lists in a new 'clean_review' column.
df_train['clean_review'] = df_train['review'].apply(clean_string)
df_test['clean_review'] = df_test['review'].apply(clean_string)

In [None]:
# Preview the first training rows, now including the 'clean_review' column.
df_train.head()

Unnamed: 0,review,sentiment,clean_review
38094,"As much as I love trains, I couldn't stomach t...",negative,"[much, love, trains, couldnt, stomach, movie, ..."
40624,"This was a very good PPV, but like Wrestlemani...",positive,"[good, ppv, like, wrestlemania, xx, 14, years,..."
49425,Not finding the right words is everybody's pro...,negative,"[finding, right, words, everybodys, problem, v..."
35734,I'm really suprised this movie didn't get a hi...,positive,"[im, really, suprised, movie, didnt, get, high..."
41708,I'll start by confessing that I tend to really...,negative,"[ill, start, confessing, tend, really, enjoy, ..."


In [None]:
# Encode the target as an integer: 1 where sentiment is 'positive', 0 otherwise.
df_train['label'] = df_train['sentiment'].eq('positive').astype(int)
df_test['label'] = df_test['sentiment'].eq('positive').astype(int)

df_train.head()

Unnamed: 0,review,sentiment,clean_review,label
38094,"As much as I love trains, I couldn't stomach t...",negative,"[much, love, trains, couldnt, stomach, movie, ...",0
40624,"This was a very good PPV, but like Wrestlemani...",positive,"[good, ppv, like, wrestlemania, xx, 14, years,...",1
49425,Not finding the right words is everybody's pro...,negative,"[finding, right, words, everybodys, problem, v...",0
35734,I'm really suprised this movie didn't get a hi...,positive,"[im, really, suprised, movie, didnt, get, high...",1
41708,I'll start by confessing that I tend to really...,negative,"[ill, start, confessing, tend, really, enjoy, ...",0


In [None]:
from gensim.models import Word2Vec

# Fit the embeddings on the training reviews only. Including the test reviews
# would let held-out text shape the features the classifier is evaluated on.
# Test words never seen in training are looked up as out-of-vocabulary later.
train_reviews = df_train['clean_review'].tolist()

# seed fixes the random initialisation. Per the gensim documentation, a fully
# deterministic run additionally needs workers=1 and a fixed PYTHONHASHSEED.
w2v_model = Word2Vec(sentences=train_reviews, vector_size=50, seed=42)

In [None]:
# Sanity check of the learned embeddings: words the model ranks as most similar to 'good'.
w2v_model.wv.most_similar('good')

[('decent', 0.8402342796325684),
 ('great', 0.7954424023628235),
 ('bad', 0.7883915305137634),
 ('cool', 0.7175356149673462),
 ('alright', 0.7136542201042175),
 ('nice', 0.7136452198028564),
 ('ok', 0.7070584297180176),
 ('okay', 0.6807668209075928),
 ('fine', 0.6789599657058716),
 ('awesome', 0.6714643836021423)]

In [None]:
import torch
from torch.utils.data import dataloader, TensorDataset

# Maximum number of tokens kept per review; longer reviews are truncated.
Sequence_Length = 500

def convert_sequence_to_tensor(sequences, num_tokens_in_sequence, embedding_size, model=None):
  """Embed tokenised reviews into one zero-padded tensor.

  Args:
    sequences: iterable of token lists, one list per review.
    num_tokens_in_sequence: number of token slots per review. Longer reviews
      are truncated; shorter ones are right-padded with zero vectors.
    embedding_size: length of each word vector. It must match the vectors
      stored in the model.
    model: object exposing a ``wv`` mapping that supports ``word in wv`` and
      ``wv[word]``. Defaults to the notebook-level ``w2v_model``.

  Returns:
    Tensor of shape (len(sequences), num_tokens_in_sequence, embedding_size)
    in torch's default dtype. Out-of-vocabulary words and padding slots are
    all-zero rows.
  """
  import numpy as np  # local import: the notebook does not import numpy at the top

  word_vectors = (w2v_model if model is None else model).wv
  zero_vector = np.zeros(embedding_size, dtype=np.float32)

  sequences = list(sequences)
  data_tensor = torch.zeros((len(sequences), num_tokens_in_sequence, embedding_size))
  for index, review in enumerate(sequences):
    truncated_clean_review = review[:num_tokens_in_sequence]
    review_length = len(truncated_clean_review)
    if review_length == 0:
      # A review with no tokens left stays all zeros. Building a tensor from
      # an empty list gives shape (0,), which cannot be assigned to the slice.
      continue
    # Stack into one ndarray first: creating a tensor from a Python list of
    # ndarrays is very slow and triggers a torch UserWarning.
    embeddings = np.stack([
      np.asarray(word_vectors[word], dtype=np.float32) if word in word_vectors else zero_vector
      for word in truncated_clean_review
    ])
    data_tensor[index, :review_length, :] = torch.from_numpy(embeddings)
  return data_tensor

In [None]:
# Embed the tokenised reviews (50 = Word2Vec vector size) and turn the integer labels into float tensors.
train_data_X = convert_sequence_to_tensor(df_train['clean_review'].to_numpy(), Sequence_Length, 50)
train_data_Y = torch.from_numpy(df_train['label'].to_numpy()).float()

test_data_X = convert_sequence_to_tensor(df_test['clean_review'].to_numpy(), Sequence_Length, 50)
test_data_Y = torch.from_numpy(df_test['label'].to_numpy()).float()

  sequence_tensor = torch.FloatTensor(list_of_word_embeddings)


In [None]:
# Inspect the first training example: its embedded sequence, then its label.
print('Example Sequence Tensor')
print(train_data_X[0])

print('Example Sequence Label')
print(train_data_Y[0])

Example Sequence Tensor
tensor([[-0.6623, -2.6400, -0.8477,  ..., -1.8172, -2.3835, -2.5042],
        [-2.8494,  0.9294,  1.8706,  ...,  0.1550,  0.1357, -0.4054],
        [ 0.1820, -0.0857, -0.1359,  ...,  0.5319, -0.1523,  0.3601],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])
Example Sequence Label
tensor(0.)


In [None]:
# Pair each feature tensor with its label tensor so a DataLoader can yield (inputs, labels) batches.
train_data=TensorDataset(train_data_X, train_data_Y)
test_data=TensorDataset(test_data_X, test_data_Y)

In [None]:
# Number of reviews per mini-batch.
Batch_size = 64
# Shuffle only the training data. Evaluation metrics do not depend on sample
# order, and a fixed order keeps evaluation passes comparable between runs.
train_loader = dataloader.DataLoader(train_data, shuffle=True, batch_size=Batch_size)
test_loader = dataloader.DataLoader(test_data, shuffle=False, batch_size=Batch_size)

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [None]:
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
import torch.nn as nn

class SentimentLSTM(nn.Module):
    """LSTM encoder followed by a two-layer classification head.

    Args:
        input_size: size of each input vector (the word-embedding dimension).
        hidden_size: number of features in the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        output_size: number of output logits per sequence.
        dropout_prob: dropout probability applied inside the head.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
        )

        self.fc1 = nn.Linear(hidden_size, 256)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(256, output_size)

    def forward(self, x):
        """Return raw logits of shape (batch, output_size) for a batch of sequences.

        No sigmoid is applied here.
        """
        # Only the final hidden state is needed; per-step outputs and the cell state are discarded.
        _, (hidden, _) = self.lstm(x)

        # hidden is (num_layers, batch, hidden_size); take the top layer.
        last_hidden = hidden[-1]

        # Non-linearity between the two linear layers. Without it, fc1 and fc2
        # compose into a single affine map at eval time and fc1 adds no capacity.
        out = nn.functional.relu(self.fc1(last_hidden))
        out = self.dropout(out)
        return self.fc2(out)


In [None]:
input_size = 50  # Embedding size from Word2Vec (must match vector_size used to train it)
hidden_size = 128  # Number of features in hidden state
num_layers = 2    # Two stacked LSTM layers
output_size = 1   # Binary classification output (a single logit)

# Build the network and move its parameters to the selected device.
model = SentimentLSTM(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.BCEWithLogitsLoss()  # Combines sigmoid and binary cross-entropy, so the model outputs raw logits
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)  # Adam optimizer with weight decay

In [None]:
# Number of full passes over the training data.
num_epochs = 50
# Per-epoch metric histories; the training loop appends one value to each list per epoch.
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []

In [None]:
# Train for num_epochs epochs. After each epoch, evaluate on the test loader
# and record the sample-weighted mean loss and the accuracy for both splits.
for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    train_correct = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        # squeeze(-1) removes only the trailing logit dimension. A bare
        # squeeze() would also drop the batch dimension for a batch of one,
        # leaving a 0-d tensor whose shape no longer matches labels.
        logits = outputs.squeeze(-1)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        # loss is the batch mean; weight it by batch size to get a per-sample average below.
        train_loss += loss.item() * inputs.size(0)
        # logit > 0 is equivalent to sigmoid(logit) > 0.5.
        predictions = (logits > 0).float()
        train_correct += (predictions == labels).sum().item()
    train_loss /= len(train_loader.dataset)
    train_accuracy = train_correct / len(train_loader.dataset)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    # Testing phase
    model.eval()
    test_loss = 0.0
    test_correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            logits = outputs.squeeze(-1)
            loss = criterion(logits, labels)
            test_loss += loss.item() * inputs.size(0)
            predictions = (logits > 0).float()
            test_correct += (predictions == labels).sum().item()
    test_loss /= len(test_loader.dataset)
    test_accuracy = test_correct / len(test_loader.dataset)
    test_losses.append(test_loss)
    test_accuracies.append(test_accuracy)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Test Loss: {test_loss:.4f}, Test Acc: {test_accuracy:.4f}')

In [None]:
# Two panels side by side: loss curves on the left, accuracy curves on the right.
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Loss per epoch
loss_ax.plot(train_losses, label='Train Loss')
loss_ax.plot(test_losses, label='Test Loss')
loss_ax.legend()
loss_ax.set_title('Loss')

# Accuracy per epoch
accuracy_ax.plot(train_accuracies, label='Train Accuracy')
accuracy_ax.plot(test_accuracies, label='Test Accuracy')
accuracy_ax.legend()
accuracy_ax.set_title('Accuracy')

plt.show()

In [None]:
# Collect predictions for the whole test set and summarise them as a confusion matrix.
model.eval()
all_predictions = []
all_labels = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        # squeeze(-1) keeps the batch dimension even for a batch of one, so
        # the array passed to extend() below is always 1-D and iterable.
        predictions = (outputs.squeeze(-1) > 0).float()
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(all_labels, all_predictions)
print('Confusion Matrix:')
print(cm)

label_names = ['Negative', 'Positive']
# Visualize Confusion Matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()