In [4]:
import pandas as pd

# Load the IMDB dataset
df = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [5]:
import gensim.downloader as api

In [6]:
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

nltk.download('punkt')
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    return tokens

df['tokens'] = df['review'].apply(preprocess_text)

# Encode labels
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['tokens'], df['sentiment'], test_size=0.2, random_state=42
)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
glove = api.load("glove-wiki-gigaword-100")



In [8]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index
num_words = len(word_index) + 1
embedding_dim = 100  # GloVe embedding dimension

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if word in glove:
        embedding_matrix[i] = glove[word]


In [9]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=100, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=100, padding='post')


In [10]:
import tensorflow as tf

model_rnn = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=num_words, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False),
    tf.keras.layers.SimpleRNN(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_rnn.fit(X_train_padded, y_train, epochs=10, batch_size=64, validation_data=(X_test_padded, y_test))


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.5588 - loss: 0.6791 - val_accuracy: 0.5632 - val_loss: 0.6704
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.6383 - loss: 0.6300 - val_accuracy: 0.6260 - val_loss: 0.6415
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.6147 - loss: 0.6475 - val_accuracy: 0.6654 - val_loss: 0.6087
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.6396 - loss: 0.6274 - val_accuracy: 0.6806 - val_loss: 0.5965
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.6472 - loss: 0.6247 - val_accuracy: 0.6871 - val_loss: 0.5996
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.6079 - loss: 0.6519 - val_accuracy: 0.6064 - val_loss: 0.6503
Epoch 7/10
[1m625/6

<keras.src.callbacks.history.History at 0x7e101a8a2f20>

In [11]:
model_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=num_words, output_dim=embedding_dim, weights=[embedding_matrix], trainable=False),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_lstm.fit(X_train_padded, y_train, epochs=10, batch_size=64, validation_data=(X_test_padded, y_test))


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.6447 - loss: 0.6095 - val_accuracy: 0.7874 - val_loss: 0.4581
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.7897 - loss: 0.4447 - val_accuracy: 0.8196 - val_loss: 0.3954
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8169 - loss: 0.3978 - val_accuracy: 0.8324 - val_loss: 0.3615
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - accuracy: 0.8367 - loss: 0.3581 - val_accuracy: 0.8476 - val_loss: 0.3409
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8499 - loss: 0.3373 - val_accuracy: 0.8464 - val_loss: 0.3448
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.8596 - loss: 0.3214 - val_accuracy: 0.8572 - val_loss: 0.3242
Epoch 7/10
[1m625/625[0m

<keras.src.callbacks.history.History at 0x7e0f82eed270>

In [12]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# Convert text to indices
vocab_size = len(word_index) + 1
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_test_indices = tokenizer.texts_to_sequences(X_test)

# Padding sequences
X_train_padded = pad_sequences(X_train_indices, maxlen=100, padding='post')
X_test_padded = pad_sequences(X_test_indices, maxlen=100, padding='post')

train_data = TensorDataset(torch.tensor(X_train_padded), torch.tensor(y_train.values))
test_data = TensorDataset(torch.tensor(X_test_padded), torch.tensor(y_test.values))

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)


In [13]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.fc(x[:, -1, :])
        return x

rnn_model = RNNModel(vocab_size=vocab_size, embedding_dim=100, hidden_dim=64, output_dim=1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(rnn_model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    rnn_model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = rnn_model(inputs)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()


In [14]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])
        return x

lstm_model = LSTMModel(vocab_size=vocab_size, embedding_dim=100, hidden_dim=64, output_dim=1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    lstm_model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = lstm_model(inputs)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()


In [15]:
# Evaluate on the test set
rnn_test_loss, rnn_test_acc = model_rnn.evaluate(X_test_padded, y_test)
print(f"Vanilla RNN with GloVe Embeddings - Test Loss: {rnn_test_loss}, Test Accuracy: {rnn_test_acc}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6584 - loss: 0.6023
Vanilla RNN with GloVe Embeddings - Test Loss: 0.6055726408958435, Test Accuracy: 0.6581000089645386


In [16]:
# Evaluate on the test set
lstm_test_loss, lstm_test_acc = model_lstm.evaluate(X_test_padded, y_test)
print(f"LSTM with GloVe Embeddings - Test Loss: {lstm_test_loss}, Test Accuracy: {lstm_test_acc}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8577 - loss: 0.3272
LSTM with GloVe Embeddings - Test Loss: 0.33133894205093384, Test Accuracy: 0.857200026512146


In [17]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            predicted = torch.round(torch.sigmoid(outputs.squeeze()))
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return accuracy

rnn_test_acc = evaluate_model(rnn_model, test_loader)
print(f"Vanilla RNN with On-the-Fly Embeddings - Test Accuracy: {rnn_test_acc}")


Vanilla RNN with On-the-Fly Embeddings - Test Accuracy: 0.7788


In [18]:
lstm_test_acc = evaluate_model(lstm_model, test_loader)
print(f"LSTM with On-the-Fly Embeddings - Test Accuracy: {lstm_test_acc}")


LSTM with On-the-Fly Embeddings - Test Accuracy: 0.8466


In [19]:
print("Comparison of Models")
print("---------------------")
print(f"Vanilla RNN with GloVe Embeddings - Test Accuracy: {rnn_test_acc}, Test Loss: {rnn_test_loss}")
print(f"LSTM with GloVe Embeddings - Test Accuracy: {lstm_test_acc}, Test Loss: {lstm_test_loss}")
print(f"Vanilla RNN with On-the-Fly Embeddings - Test Accuracy: {rnn_test_acc}")
print(f"LSTM with On-the-Fly Embeddings - Test Accuracy: {lstm_test_acc}")


Comparison of Models
---------------------
Vanilla RNN with GloVe Embeddings - Test Accuracy: 0.7788, Test Loss: 0.6055726408958435
LSTM with GloVe Embeddings - Test Accuracy: 0.8466, Test Loss: 0.33133894205093384
Vanilla RNN with On-the-Fly Embeddings - Test Accuracy: 0.7788
LSTM with On-the-Fly Embeddings - Test Accuracy: 0.8466
