In [55]:
import pandas as pd
import numpy as np
import re
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, GRU, SimpleRNN

In [56]:
# Load the IMDB reviews CSV; the relative path resolves against the
# notebook's working directory.
reviews_csv = "IMDB Dataset.csv"
df = pd.read_csv(reviews_csv)


In [57]:
# Show the column labels of the loaded frame before any preprocessing.
column_labels = df.columns
print(column_labels)


Index(['review', 'sentiment'], dtype='object')


In [58]:
def preprocess_text(text):
    """Normalize a raw review string before tokenization.

    Steps, in order:
      1. lowercase the text;
      2. replace every ``<...>`` HTML tag with a single space;
      3. drop every character that is not ``a-z`` or whitespace;
      4. collapse whitespace runs to one space and strip both ends.

    Tags are replaced with a space (not removed outright) so that markup
    used as a separator, e.g. ``"good.<br /><br />The"``, does not glue the
    neighbouring words together into a single bogus token ("goodthe").

    Args:
        text: Raw review text (str).

    Returns:
        The cleaned, lowercase, space-separated string.
    """
    text = text.lower()
    # Substitute a space so words on either side of a tag stay separate.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    # Tag replacement and character stripping can leave runs of whitespace.
    return re.sub(r'\s+', ' ', text).strip()


In [59]:
# Run preprocess_text over every review and overwrite the column with the
# cleaned text.
cleaned_reviews = df["review"].apply(preprocess_text)
df["review"] = cleaned_reviews


In [60]:
# Encode the sentiment labels as integers: "positive" -> 1, "negative" -> 0.
#
# The numeric-dtype guard makes this cell safe to re-run: once the column
# holds 0/1 it is left alone instead of being re-encoded (a second pass of
# `1 if x == "positive" else 0` would turn every label into 0).
# Any label outside the two expected strings (including missing values)
# raises instead of being silently counted as negative.
if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    sentiment_codes = df["sentiment"].map({"positive": 1, "negative": 0})
    unmapped = sentiment_codes.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, "sentiment"].unique()
        raise ValueError(f"Unexpected sentiment labels: {unexpected!r}")
    df["sentiment"] = sentiment_codes.astype("int64")


In [61]:
# Model inputs are the review texts; targets are the sentiment column.
X, y = df["review"], df["sentiment"]

In [62]:
# String-typed copy of the reviews plus a second handle on the labels.
# NOTE(review): neither name is read by any later cell visible in this
# notebook — confirm whether this cell is still needed.
labels = df["sentiment"]
texts = df["review"].astype(str)

In [63]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [64]:
# Vocabulary cap: passed to the Tokenizer as num_words and reused as the
# embedding layers' input_dim.
max_words = 20_000
# Length passed to pad_sequences as maxlen and to the embedding layers as
# input_length.
max_len = 200

In [65]:
# Fit the vocabulary on the training reviews only, so the test split never
# influences the word index.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=X_train)

In [66]:
# Convert both splits into integer sequences using the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

In [67]:
# Bring every sequence to exactly max_len entries, using pad_sequences'
# default padding/truncation settings.
X_train_pad = pad_sequences(sequences=X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(sequences=X_test_seq, maxlen=max_len)

GloVe

In [68]:
# Location of the GloVe vectors file. Set the GLOVE_PATH environment variable
# to run the notebook on another machine; when it is unset the original
# hard-coded location is used, so existing runs behave as before.
glove_path = os.environ.get(
    "GLOVE_PATH",
    "C:/Users/mwtok/OneDrive/Desktop/glove.6B/glove.6B.100d.txt",
)

# Build a token -> vector lookup. Each line is read as a token followed by
# whitespace-separated float components.
embedding_index = {}
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no token; skip it rather than fail on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

In [69]:
# Width of each embedding row; matches the 100d GloVe file loaded into
# embedding_index.
embedding_dim = 100
word_index = tokenizer.word_index

# One row per vocabulary index below max_words. Rows for words that have no
# GloVe vector keep their initial zeros.
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, row in word_index.items():
    if row >= max_words:
        # Index falls outside the matrix; nothing to fill in.
        continue
    glove_vector = embedding_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

LSTM

In [70]:
# LSTM classifier: frozen embedding initialised from embedding_matrix,
# a 128-unit LSTM, and a single sigmoid output unit.
lstm_embedding = Embedding(
    input_dim=max_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=False,
)
lstm_model = Sequential()
lstm_model.add(lstm_embedding)
lstm_model.add(LSTM(128))
lstm_model.add(Dense(1, activation="sigmoid"))

lstm_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# 20% of the training data is held back by Keras for per-epoch validation.
lstm_model.fit(
    X_train_pad,
    y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/20




[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 156ms/step - accuracy: 0.6334 - loss: 0.6389 - val_accuracy: 0.7936 - val_loss: 0.4550
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 144ms/step - accuracy: 0.7920 - loss: 0.4598 - val_accuracy: 0.8389 - val_loss: 0.3738
Epoch 3/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 140ms/step - accuracy: 0.8437 - loss: 0.3569 - val_accuracy: 0.8599 - val_loss: 0.3254
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 142ms/step - accuracy: 0.8625 - loss: 0.3211 - val_accuracy: 0.8636 - val_loss: 0.3133
Epoch 5/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 144ms/step - accuracy: 0.8768 - loss: 0.2895 - val_accuracy: 0.8685 - val_loss: 0.3064
Epoch 6/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 140ms/step - accuracy: 0.8873 - loss: 0.2725 - val_accuracy: 0.8746 - val_loss: 0.2976
Epoch 7/20
[1m500/50

<keras.src.callbacks.history.History at 0x2a76b42d070>

In [71]:
# Threshold the model's sigmoid outputs at 0.5 to get hard 0/1 labels, then
# report per-class metrics on the held-out test split.
lstm_scores = lstm_model.predict(X_test_pad)
lstm_preds = (lstm_scores > 0.5).astype("int32")
print("🔹 LSTM Classification Report:")
lstm_report = classification_report(y_test, lstm_preds, digits=4)
print(lstm_report)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 37ms/step
🔹 LSTM Classification Report:
              precision    recall  f1-score   support

           0     0.8596    0.8809    0.8701      4961
           1     0.8798    0.8583    0.8689      5039

    accuracy                         0.8695     10000
   macro avg     0.8697    0.8696    0.8695     10000
weighted avg     0.8697    0.8695    0.8695     10000



GRU

In [72]:
# GRU classifier: frozen embedding initialised from embedding_matrix,
# a 128-unit GRU, and a single sigmoid output unit.
gru_embedding = Embedding(
    input_dim=max_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=False,
)
gru_model = Sequential()
gru_model.add(gru_embedding)
gru_model.add(GRU(128))
gru_model.add(Dense(1, activation="sigmoid"))

gru_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# 20% of the training data is held back by Keras for per-epoch validation.
gru_model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5




[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 135ms/step - accuracy: 0.6933 - loss: 0.5627 - val_accuracy: 0.8505 - val_loss: 0.3425
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 133ms/step - accuracy: 0.8502 - loss: 0.3475 - val_accuracy: 0.8691 - val_loss: 0.3075
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 131ms/step - accuracy: 0.8701 - loss: 0.3049 - val_accuracy: 0.8781 - val_loss: 0.2870
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 135ms/step - accuracy: 0.8865 - loss: 0.2711 - val_accuracy: 0.8813 - val_loss: 0.2834
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 134ms/step - accuracy: 0.8979 - loss: 0.2502 - val_accuracy: 0.8852 - val_loss: 0.2763


<keras.src.callbacks.history.History at 0x2a753da0a40>

In [73]:
# Threshold the model's sigmoid outputs at 0.5 to get hard 0/1 labels, then
# report per-class metrics on the held-out test split.
gru_scores = gru_model.predict(X_test_pad)
gru_preds = (gru_scores > 0.5).astype("int32")
print("🔹 GRU Classification Report:")
gru_report = classification_report(y_test, gru_preds, digits=4)
print(gru_report)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step
🔹 GRU Classification Report:
              precision    recall  f1-score   support

           0     0.8970    0.8690    0.8828      4961
           1     0.8749    0.9018    0.8881      5039

    accuracy                         0.8855     10000
   macro avg     0.8859    0.8854    0.8854     10000
weighted avg     0.8858    0.8855    0.8855     10000



Simple RNN

In [74]:
# SimpleRNN classifier: frozen embedding initialised from embedding_matrix,
# a 128-unit SimpleRNN, and a single sigmoid output unit.
rnn_embedding = Embedding(
    input_dim=max_words,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=False,
)
rnn_model = Sequential()
rnn_model.add(rnn_embedding)
rnn_model.add(SimpleRNN(128))
rnn_model.add(Dense(1, activation="sigmoid"))

rnn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# 20% of the training data is held back by Keras for per-epoch validation.
rnn_model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5




[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 34ms/step - accuracy: 0.5844 - loss: 0.6744 - val_accuracy: 0.6160 - val_loss: 0.6506
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 33ms/step - accuracy: 0.6365 - loss: 0.6361 - val_accuracy: 0.7055 - val_loss: 0.5867
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 33ms/step - accuracy: 0.6421 - loss: 0.6346 - val_accuracy: 0.7013 - val_loss: 0.5848
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 33ms/step - accuracy: 0.6387 - loss: 0.6355 - val_accuracy: 0.5601 - val_loss: 0.6710
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 33ms/step - accuracy: 0.6176 - loss: 0.6456 - val_accuracy: 0.5450 - val_loss: 0.7020


<keras.src.callbacks.history.History at 0x2a74d388980>

In [76]:
# Threshold the model's sigmoid outputs at 0.5 to get hard 0/1 labels, then
# report per-class metrics on the held-out test split.
rnn_scores = rnn_model.predict(X_test_pad)
rnn_preds = (rnn_scores > 0.5).astype("int32")
print("Simple RNN Classification Report:")
rnn_report = classification_report(y_test, rnn_preds, digits=4)
print(rnn_report)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step
Simple RNN Classification Report:
              precision    recall  f1-score   support

           0     0.5220    0.9546    0.6749      4961
           1     0.7573    0.1393    0.2353      5039

    accuracy                         0.5438     10000
   macro avg     0.6396    0.5470    0.4551     10000
weighted avg     0.6406    0.5438    0.4534     10000



In [77]:
# Write the trained LSTM model to lstm_model.h5 in the working directory.
lstm_model_file = "lstm_model.h5"
lstm_model.save(lstm_model_file)




In [78]:
import pickle

# Serialize the fitted tokenizer to tokenizer.pkl in the working directory.
with open("tokenizer.pkl", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
