In [1]:
import pandas as pd

df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.info()
df['sentiment'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [4]:
df['sentiment'] = df['sentiment'].map({'positive':1,'negative':0})

In [5]:
df['sentiment'].head()

Unnamed: 0,sentiment
0,1
1,1
2,1
3,0
4,1


In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [13]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [23]:
stop_words = set(stopwords.words('english'))

negations = {"not",'no','nor','never'}

stop_words = stop_words-negations

In [24]:
def clean_text(text):
  text = text.lower()
  text = re.sub(r'<.*?>',' ',text)
  text = re.sub(r'[^a-zA-Z\s]'," ",text)
  text = re.sub(r'\s+', ' ', text).strip()
  tokens = word_tokenize(text)
  tokens = [word for word in tokens if word not in stop_words]
  return tokens

In [25]:
print(df['review'][0])
print(clean_text(df['review'][0]))

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [26]:
length = clean_text(df['review'][0])
len(length)

166

In [27]:
print(clean_text("This movie is not good"))

['movie', 'not', 'good']


In [28]:
df["cleaned"] = df["review"].apply(clean_text)

In [29]:
df.head()

Unnamed: 0,review,sentiment,cleaned
0,One of the other reviewers has mentioned that ...,1,"[one, reviewers, mentioned, watching, oz, epis..."
1,A wonderful little production. <br /><br />The...,1,"[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,1,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,0,"[basically, family, little, boy, jake, thinks,..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"[petter, mattei, love, time, money, visually, ..."


In [30]:
from collections import Counter

all_words = []

for tokens in df['cleaned']:
  all_words.extend(tokens)

word_freq = Counter(all_words)

print('Total unique words :', len(word_freq))

Total unique words : 99265


In [35]:
print(word_freq.most_common(10))

[('movie', 87972), ('film', 79708), ('not', 62851), ('one', 53603), ('like', 40172), ('good', 29753), ('no', 25301), ('time', 25109), ('even', 24872), ('would', 24602)]


In [36]:
min_freq = 5

filtered_vocab = {word: freq for word, freq in word_freq.items() if freq >= min_freq}

print("Original vocab size:", len(word_freq))
print("Filtered vocab size:", len(filtered_vocab))

Original vocab size: 99265
Filtered vocab size: 39129


In [37]:
word2idx = {word: idx for idx, word in enumerate(filtered_vocab.keys())}

In [38]:
print(list(word2idx.items())[:10])

[('one', 0), ('reviewers', 1), ('mentioned', 2), ('watching', 3), ('oz', 4), ('episode', 5), ('hooked', 6), ('right', 7), ('exactly', 8), ('happened', 9)]


In [39]:
max_vocab = 10000

most_common_words = word_freq.most_common(max_vocab)

word2idx = {word: idx for idx, (word, freq) in enumerate(most_common_words)}

print("Final vocab size:", len(word2idx))

Final vocab size: 10000


In [40]:
import numpy as np

def text_to_bow(tokens,word2idx):
  vector = np.zeros(len(word2idx))

  for word in tokens:
    if word in word2idx:
      index = word2idx[word]
      vector[index]+=1
  return vector

In [42]:
sample_vector = text_to_bow(df["cleaned"][0], word2idx)
print("Non-zero elements:", np.count_nonzero(sample_vector))
print(sample_vector.shape)

Non-zero elements: 120
(10000,)


In [43]:
from collections import defaultdict

doc_freq = defaultdict(int)

for tokens in df["cleaned"]:
    unique_words = set(tokens)
    for word in unique_words:
        if word in word2idx:
            doc_freq[word] += 1

In [44]:
import math

N = len(df)

idf = {}

for word in word2idx:
    df_count = doc_freq[word]
    idf[word] = math.log(N / (df_count + 1))

In [45]:
print("IDF of 'movie':", idf['movie'])
print("IDF of 'good':", idf['good'])

IDF of 'movie': 0.49210200976788815
IDF of 'good': 0.9673209029869341


In [46]:
def text_to_tfidf(tokens, word2idx, idf):
    vector = np.zeros(len(word2idx))

    # Count term frequency
    for word in tokens:
        if word in word2idx:
            index = word2idx[word]
            vector[index] += 1

    # Convert TF to TF-IDF
    for word, index in word2idx.items():
        if vector[index] > 0:
            vector[index] = vector[index] * idf[word]

    return vector

In [47]:
sample_tfidf = text_to_tfidf(df["cleaned"][0], word2idx, idf)

print("Non-zero TF-IDF values:", np.count_nonzero(sample_tfidf))

Non-zero TF-IDF values: 120


In [48]:
from sklearn.model_selection import train_test_split

X = df["review"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 40000
Test size: 10000


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=10000,
    min_df=5
)

X_tfidf = vectorizer.fit_transform(df["review"])
y = df["sentiment"]

print(X_tfidf.shape)

(50000, 10000)


In [55]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = LogisticRegression(max_iter=1000)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

y_pred = cross_val_predict(model, X_tfidf, y, cv=skf)

In [56]:
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)
print("F1 Score :", f1)

Accuracy : 0.8945
Precision: 0.887022721029706
Recall   : 0.90416
F1 Score : 0.8955093793950438


In [57]:
from sklearn.metrics import classification_report

print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89     25000
           1       0.89      0.90      0.90     25000

    accuracy                           0.89     50000
   macro avg       0.89      0.89      0.89     50000
weighted avg       0.89      0.89      0.89     50000



In [58]:
cm = confusion_matrix(y, y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[22121  2879]
 [ 2396 22604]]


In [59]:
sentences = df["cleaned"].tolist()

In [61]:
pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [62]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,      # embedding dimension
    window=5,             # context window
    min_count=5,          # ignore rare words
    workers=4,
    sg=1                  # 1 = Skip-gram, 0 = CBOW
)

In [63]:
print("Word2Vec vocab size:", len(w2v_model.wv))

Word2Vec vocab size: 39129


In [64]:
w2v_model.wv.most_similar("good")

[('decent', 0.7941888570785522),
 ('great', 0.7907998561859131),
 ('bad', 0.7770896553993225),
 ('godawful', 0.7368122339248657),
 ('fine', 0.7350819706916809),
 ('allright', 0.7332845330238342),
 ('workable', 0.7313437461853027),
 ('atleast', 0.7288007140159607),
 ('storywise', 0.7285873889923096),
 ('iffy', 0.7272350192070007)]

In [65]:
w2v_model.wv.most_similar("bad")

[('terrible', 0.822011411190033),
 ('awful', 0.8103813529014587),
 ('horrible', 0.8009147644042969),
 ('lousy', 0.7824139595031738),
 ('good', 0.7770897746086121),
 ('stank', 0.7531719207763672),
 ('laughingly', 0.7432406544685364),
 ('atleast', 0.7373307347297668),
 ('woeful', 0.7371237874031067),
 ('redline', 0.7338756918907166)]

In [66]:
word2idx = {word: i+1 for i, word in enumerate(w2v_model.wv.index_to_key)}

In [67]:
def text_to_sequence(tokens, word2idx):
    sequence = []
    for word in tokens:
        if word in word2idx:
            sequence.append(word2idx[word])
    return sequence

df["sequence"] = df["cleaned"].apply(lambda x: text_to_sequence(x, word2idx))

In [68]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 200

X_padded = pad_sequences(
    df["sequence"],
    maxlen=max_len,
    padding='post',
    truncating='post'
)

In [69]:
print(X_padded.shape)

(50000, 200)


In [70]:
vocab_size = len(word2idx) + 1
embedding_dim = 100

print("Vocab size:", vocab_size)

Vocab size: 39130


In [71]:
import numpy as np

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, index in word2idx.items():
    embedding_vector = w2v_model.wv[word]
    embedding_matrix[index] = embedding_vector

print("Embedding matrix shape:", embedding_matrix.shape)

Embedding matrix shape: (39130, 100)


In [74]:
X_padded   # shape (50000, 200)
df["sentiment"]
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.03351673,  0.26101252,  0.06947498, ..., -0.21094473,
         0.06973183,  0.12839556],
       [ 0.00097843,  0.13637477,  0.05603603, ..., -0.29537037,
         0.08241118,  0.20245129],
       ...,
       [ 0.00869185,  0.25823331,  0.06747241, ..., -0.15912123,
         0.08995294, -0.00718982],
       [-0.06593593,  0.1590403 ,  0.08861187, ..., -0.25138953,
         0.03154143,  0.04226951],
       [-0.09229092,  0.22073098,  0.06480042, ..., -0.16219428,
         0.03338078,  0.01184584]])

In [75]:
import torch
print(torch.cuda.is_available())

True


In [79]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SentimentLSTM(vocab_size, 100, embedding_matrix)
model = model.to(device)

In [83]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Create word2idx from Word2Vec
word2idx = {word: i+1 for i, word in enumerate(w2v_model.wv.index_to_key)}

vocab_size = len(word2idx) + 1
embedding_dim = 100
max_len = 300  # slightly longer than 200

def text_to_sequence(tokens):
    return [word2idx[word] for word in tokens if word in word2idx]

df["sequence"] = df["cleaned"].apply(text_to_sequence)

X = pad_sequences(df["sequence"], maxlen=max_len, padding='post', truncating='post')
y = df["sentiment"].values

print("Data shape:", X.shape)

Data shape: (50000, 300)


In [84]:
from sklearn.model_selection import train_test_split

# First split train + temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Split temp into validation and test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)

Train: (40000, 300)
Val: (5000, 300)
Test: (5000, 300)


In [85]:
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, index in word2idx.items():
    embedding_matrix[index] = w2v_model.wv[word]

print("Embedding matrix shape:", embedding_matrix.shape)

Embedding matrix shape: (39130, 100)


In [95]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Create word2idx
word2idx = {word: i+1 for i, word in enumerate(w2v_model.wv.index_to_key)}

vocab_size = len(word2idx) + 1
embedding_dim = 100
max_len = 300

def text_to_sequence(tokens):
    return [word2idx[word] for word in tokens if word in word2idx]

df["sequence"] = df["cleaned"].apply(text_to_sequence)

X = pad_sequences(df["sequence"], maxlen=max_len, padding='post', truncating='post')
y = df["sentiment"].values

# Train / Val / Test split (80 / 10 / 10)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)

Train: (40000, 300)
Val: (5000, 300)
Test: (5000, 300)


In [96]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Create word2idx
word2idx = {word: i+1 for i, word in enumerate(w2v_model.wv.index_to_key)}

vocab_size = len(word2idx) + 1
embedding_dim = 100
max_len = 300

def text_to_sequence(tokens):
    return [word2idx[word] for word in tokens if word in word2idx]

df["sequence"] = df["cleaned"].apply(text_to_sequence)

X = pad_sequences(df["sequence"], maxlen=max_len, padding='post', truncating='post')
y = df["sentiment"].values

# Train / Val / Test split (80 / 10 / 10)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)

Train: (40000, 300)
Val: (5000, 300)
Test: (5000, 300)


In [97]:
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, index in word2idx.items():
    embedding_matrix[index] = w2v_model.wv[word]

print("Embedding matrix shape:", embedding_matrix.shape)

Embedding matrix shape: (39130, 100)


In [98]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

lstm_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=True
    ),

    LSTM(128, return_sequences=True),
    Dropout(0.5),

    LSTM(64, return_sequences=False),
    Dropout(0.5),

    Dense(1, activation='sigmoid')
])

lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

lstm_model.summary()



In [99]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

history_lstm = lstm_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=[early_stop]
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 35ms/step - accuracy: 0.5123 - loss: 0.6916 - val_accuracy: 0.5142 - val_loss: 0.6826
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 29ms/step - accuracy: 0.5366 - loss: 0.6757 - val_accuracy: 0.5240 - val_loss: 0.6839
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 29ms/step - accuracy: 0.7140 - loss: 0.5447 - val_accuracy: 0.8490 - val_loss: 0.3951
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 30ms/step - accuracy: 0.8802 - loss: 0.3336 - val_accuracy: 0.8866 - val_loss: 0.2898
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 30ms/step - accuracy: 0.9153 - loss: 0.2396 - val_accuracy: 0.8944 - val_loss: 0.2798
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 30ms/step - accuracy: 0.9378 - loss: 0.1907 - val_accuracy: 0.8962 - val_loss: 0.2817
Epoch 7/10
[1m6

In [100]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

y_pred_probs = lstm_model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int).reshape(-1)

print("=== 2-Layer LSTM Results ===")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step
=== 2-Layer LSTM Results ===
Accuracy : 0.8974
Precision: 0.8965922444183314
Recall   : 0.9018912529550828
F1 Score : 0.8992339422510313
Confusion Matrix:
 [[2198  264]
 [ 249 2289]]


In [101]:
from tensorflow.keras.layers import Bidirectional

bilstm_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=True
    ),

    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),

    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),

    Dense(1, activation='sigmoid')
])

bilstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

bilstm_model.summary()



In [102]:
history_bilstm = bilstm_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=[early_stop]
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 68ms/step - accuracy: 0.7751 - loss: 0.4741 - val_accuracy: 0.8830 - val_loss: 0.2889
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 64ms/step - accuracy: 0.9029 - loss: 0.2537 - val_accuracy: 0.8982 - val_loss: 0.2556
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 63ms/step - accuracy: 0.9414 - loss: 0.1695 - val_accuracy: 0.8902 - val_loss: 0.3026
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 65ms/step - accuracy: 0.9593 - loss: 0.1195 - val_accuracy: 0.8944 - val_loss: 0.2878


In [103]:
y_pred_probs = bilstm_model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int).reshape(-1)

print("=== 2-Layer BiLSTM Results ===")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step
=== 2-Layer BiLSTM Results ===
Accuracy : 0.906
Precision: 0.9074074074074074
Recall   : 0.9074074074074074
F1 Score : 0.9074074074074074
Confusion Matrix:
 [[2227  235]
 [ 235 2303]]
