In [1]:
!pip install kaggle



In [2]:
from google.colab import drive
drive.mount('/content/drive')
!mkdir -p ~/.kaggle
!cp '//content/drive/MyDrive/kaggle.json' ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

Mounted at /content/drive


In [3]:
# Download the IMDB 50k movie-review archive with the Kaggle CLI (uses the credentials copied to ~/.kaggle).
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 19% 5.00M/25.7M [00:00<00:00, 39.5MB/s]
100% 25.7M/25.7M [00:00<00:00, 133MB/s] 


In [4]:
!unzip '/content/imdb-dataset-of-50k-movie-reviews.zip'

Archive:  /content/imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [5]:
import pandas as pd

In [6]:
# Read the extracted CSV with the Python parser engine; rows that fail to parse
# are skipped (on_bad_lines='skip') instead of raising.
data = pd.read_csv(
    '/content/IMDB Dataset.csv',
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip',
)

In [7]:
# Work on a copy: later cells add columns to `df`, while `data` keeps the frame exactly as loaded.
df = data.copy()

In [8]:
# Preview the first rows (columns: review, sentiment).
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
import nltk
# Fetch the NLTK corpora used in this cell: the English stop-word list and WordNet (for the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def clean_text(text):
  """Normalize a raw review into lowercase, letters-only, single-spaced text.

  Steps, in order: replace HTML line-break tags with a space, delete every
  character that is not an ASCII letter or whitespace, lowercase the result,
  then collapse whitespace runs and trim both ends.

  Args:
    text: Raw review string; may contain ``<br>`` tags in any common spelling.

  Returns:
    The cleaned string (possibly empty).
  """
  # Accept <br>, <br/>, <br /> in any letter case. The previous pattern only
  # matched the self-closing form, so a bare "<br>" lost its angle brackets in
  # the next step and fused "br" with the words on either side.
  text = re.sub(r'<\s*br\s*/?\s*>', ' ', text, flags=re.IGNORECASE)
  # Punctuation and digits are deleted outright (not replaced by a space).
  text = re.sub(r'[^a-zA-Z\s]', '', text)
  text = text.lower()
  text = re.sub(r'\s+', ' ', text).strip()
  return text

# Shared resources for preprocess_text: a set gives O(1) stop-word membership checks.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Store the normalized review text alongside the raw review.
df['cleaned_text'] = df['review'].apply(clean_text)

def preprocess_text(text):
  """Drop stop words from whitespace-split text and lemmatize what remains.

  Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.

  Args:
    text: Already-cleaned, lowercase review text.

  Returns:
    The surviving lemmatized tokens joined by single spaces.
  """
  kept = []
  for word in text.split():
    # The stop-word check runs on the raw token, before lemmatization.
    if word in stop_words:
      continue
    kept.append(lemmatizer.lemmatize(word))
  return ' '.join(kept)

# Remove stop words and lemmatize the cleaned reviews.
df['processed_text'] = df['cleaned_text'].apply(preprocess_text)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    df[['processed_text','sentiment']],
    test_size=0.2,
    random_state=42,
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [10]:
# Preview the training split (the index keeps the original row numbers).
train_data.head()

Unnamed: 0,processed_text,sentiment
39087,thats kept asking many fight screaming match s...,negative
30893,watch entire movie could watch entire movie st...,negative
45278,touching love story reminiscent mood love draw...,positive
16398,latterday fulci schlocker totally abysmal conc...,negative
13653,first firmly believe norwegian movie continual...,negative


In [11]:
# Preview the test split (the index keeps the original row numbers).
test_data.head()

Unnamed: 0,processed_text,sentiment
33553,really liked summerslam due look arena curtain...,positive
9427,many television show appeal quite many differe...,positive
199,film quickly get major chase scene ever increa...,negative
12447,jane austen would definitely approve one gwyne...,positive
39489,expectation somewhat high went see movie thoug...,negative


In [12]:
# Separate the model input (processed review text) from the target (sentiment label) per split.
X_train, Y_train = train_data['processed_text'], train_data['sentiment']
X_test, Y_test = test_data['processed_text'], test_data['sentiment']


In [13]:
# Convert the review texts to plain Python lists for the tokenizer cell.
# They are derived from the split frames rather than from X_train/X_test
# themselves, so this cell is idempotent: re-running it no longer fails with
# "'list' object has no attribute 'tolist'".
X_train = train_data['processed_text'].tolist()
X_test = test_data['processed_text'].tolist()

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Build the vocabulary from the training texts only; words unseen at fit time
# are mapped to the OOV token when the test texts are converted.
# NOTE: `tf` is the Tokenizer instance here (not the TensorFlow module); the
# model cells read tf.word_index, so the name is kept.
# NOTE(review): the OOV literal '<oov' lacks a closing '>' — harmless, left as is.
tf = Tokenizer(oov_token = '<oov')
tf.fit_on_texts(X_train)
X_train_seq = tf.texts_to_sequences(X_train)
X_test_seq = tf.texts_to_sequences(X_test)

# Bring every review to exactly 100 token ids; shorter ones are zero-padded at the end.
X_train_padded = pad_sequences(X_train_seq, maxlen = 100, padding = 'post')
X_test_padded = pad_sequences(X_test_seq, maxlen = 100, padding = 'post')

# Fit the encoder on the training labels only, then reuse that fitted mapping
# for the test labels. Re-fitting on the test split (the previous
# fit_transform) could silently change the column order or count if the two
# splits did not contain the same set of labels.
ohe = OneHotEncoder()
Y_train_encoded = ohe.fit_transform(np.array(Y_train).reshape(-1,1))
Y_test_encoded = ohe.transform(np.array(Y_test).reshape(-1,1))

print(f"X_Train shape: {X_train_padded.shape}, Y_Train shape: {Y_train_encoded.shape}")
print(f"X_Test shape: {X_test_padded.shape}, Y_Test shape: {Y_test_encoded.shape}")

X_Train shape: (40000, 100), Y_Train shape: (40000, 2)
X_Test shape: (10000, 100), Y_Test shape: (10000, 2)


In [15]:
# Convert the encoder's sparse output to dense NumPy arrays for training.
# The hasattr guard makes the cell idempotent: once converted, an ndarray has
# no .toarray(), so an unguarded re-run would raise AttributeError.
if hasattr(Y_train_encoded, 'toarray'):
  Y_train_encoded = Y_train_encoded.toarray()
if hasattr(Y_test_encoded, 'toarray'):
  Y_test_encoded = Y_test_encoded.toarray()

In [16]:
# Sanity check: one row per sample, one column per sentiment class.
print(Y_train_encoded.shape, Y_test_encoded.shape, sep='\n')

(40000, 2)
(10000, 2)


In [17]:
# Display the one-hot encoded training labels (NumPy abbreviates the middle rows).
Y_train_encoded

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Dropout


In [19]:
# Baseline: two stacked SimpleRNN layers over a learned 128-dimensional
# embedding, ending in a 2-way softmax that matches the one-hot labels.
# `tf` is the fitted Tokenizer; the +1 accounts for index 0, which is used for padding.
vocab_size = len(tf.word_index) + 1

# NOTE(review): the inputs are zero-padded at the end (padding='post') and no
# masking is configured, so the recurrent layers also step through the padding.
# Consider Embedding(..., mask_zero=True) — TODO evaluate.
RNN_model = Sequential([
    # input_length is no longer passed: Keras 3 deprecates the argument and
    # the sequence length is taken from the data on the first call.
    Embedding(input_dim=vocab_size, output_dim=128),
    # return_sequences=True hands the full sequence to the next recurrent layer.
    SimpleRNN(units=128, return_sequences=True),
    Dropout(0.2),
    SimpleRNN(units=64),
    Dropout(0.2),
    Dense(2, activation = 'softmax'),
])

RNN_model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])



In [20]:
# Same topology as the SimpleRNN baseline, with LSTM layers as the recurrent
# units, so the two models differ only in the recurrent cell type.
# `tf` is the fitted Tokenizer; the +1 accounts for index 0, which is used for padding.
vocab_size = len(tf.word_index) + 1

# NOTE(review): the inputs are zero-padded at the end (padding='post') and no
# masking is configured, so the recurrent layers also step through the padding.
# Consider Embedding(..., mask_zero=True) — TODO evaluate.
LSTM_model = Sequential([
    # input_length is no longer passed: Keras 3 deprecates the argument and
    # the sequence length is taken from the data on the first call.
    Embedding(input_dim=vocab_size, output_dim=128),
    # return_sequences=True hands the full sequence to the next recurrent layer.
    LSTM(units=128, return_sequences=True),
    Dropout(0.2),
    LSTM(units=64),
    Dropout(0.2),
    Dense(2, activation = 'softmax'),
])

LSTM_model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [21]:
# Train the SimpleRNN model for 10 epochs in batches of 32.
# NOTE(review): the test split doubles as the validation set here.
RNN_model.fit(
    X_train_padded,
    Y_train_encoded,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_padded, Y_test_encoded),
)

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 32ms/step - accuracy: 0.4965 - loss: 0.7493 - val_accuracy: 0.5141 - val_loss: 0.6965
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 30ms/step - accuracy: 0.5033 - loss: 0.6980 - val_accuracy: 0.4983 - val_loss: 0.6947
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 31ms/step - accuracy: 0.5105 - loss: 0.6947 - val_accuracy: 0.4841 - val_loss: 0.7000
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 30ms/step - accuracy: 0.5123 - loss: 0.6946 - val_accuracy: 0.5113 - val_loss: 0.6961
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 29ms/step - accuracy: 0.5128 - loss: 0.6948 - val_accuracy: 0.4925 - val_loss: 0.6934
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 29ms/step - accuracy: 0.5081 - loss: 0.6951 - val_accuracy: 0.5116 - val_loss: 0.6975
Epoc

<keras.src.callbacks.history.History at 0x79233004c460>

In [22]:
# Train the LSTM model for 10 epochs in batches of 32.
# NOTE(review): the test split doubles as the validation set here.
LSTM_model.fit(
    X_train_padded,
    Y_train_encoded,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_padded, Y_test_encoded),
)

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 23ms/step - accuracy: 0.6619 - loss: 0.5970 - val_accuracy: 0.8321 - val_loss: 0.4201
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 22ms/step - accuracy: 0.8667 - loss: 0.3430 - val_accuracy: 0.8733 - val_loss: 0.3100
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.9413 - loss: 0.1657 - val_accuracy: 0.8789 - val_loss: 0.3008
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.9758 - loss: 0.0821 - val_accuracy: 0.8701 - val_loss: 0.3683
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.9910 - loss: 0.0336 - val_accuracy: 0.8619 - val_loss: 0.4373
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 23ms/step - accuracy: 0.9935 - loss: 0.0233 - val_accuracy: 0.8627 - val_loss: 0.5680
Epoc

<keras.src.callbacks.history.History at 0x79230f738910>

In [23]:
# evaluate() returns [loss, accuracy] for a model compiled with a single
# 'accuracy' metric. Unpack both values so that *_accuracy really holds the
# accuracy (a float) rather than the whole [loss, accuracy] pair.
RNN_loss, RNN_accuracy = RNN_model.evaluate(X_test_padded, Y_test_encoded)
LSTM_loss, LSTM_accuracy = LSTM_model.evaluate(X_test_padded, Y_test_encoded)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.5955 - loss: 0.6648
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8596 - loss: 0.7520


In [28]:
import numpy as np

# Compare both models on one review from the test split.
single_instance_index = 9
# Slicing (rather than indexing) keeps the leading batch axis, giving shape (1, sequence_length).
single_instance = X_test_padded[single_instance_index:single_instance_index + 1]

# Reduce the one-hot label and each model's softmax output to a class index.
true_label = np.argmax(Y_test_encoded[single_instance_index])
rnn_prediction = np.argmax(RNN_model.predict(single_instance))
lstm_prediction = np.argmax(LSTM_model.predict(single_instance))

print("True Label:", true_label)
print("RNN Model Prediction:", rnn_prediction)
print("LSTM Model Prediction:", lstm_prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
True Label: 0
RNN Model Prediction: 1
LSTM Model Prediction: 0


## Which model performed better, and why?
The LSTM model performed better on the test set **(85.96% accuracy)** than the SimpleRNN model **(59.55% accuracy)** because:

* LSTMs handle long-term dependencies better by using gates (input, forget, output) to control what is kept in memory, which is crucial for the long text sequences in the IMDB dataset (each review is fed to the models as a sequence of up to 100 tokens).
* Simple (vanilla) RNNs suffer from the vanishing gradient problem, which makes them less effective at learning relationships between distant words in long sequences.
* The LSTM’s gated cell state allows it to capture contextual patterns more effectively, resulting in significantly higher accuracy.

