<a href="https://colab.research.google.com/github/HaywhyCoder/imdb-lstm/blob/main/imdb-lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Import Libraries

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# NLTK resources fetched up front, before any text preprocessing runs
NLTK_PACKAGES = ('stopwords', 'punkt', 'wordnet')
download_results = [nltk.download(package) for package in NLTK_PACKAGES]
download_results[-1]  # display the status returned by the last download

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Preprocess Text Data

In [None]:
# NOTE(review): the path points into Google Drive; this assumes Drive is already
# mounted at /content/drive -- no mount call is visible in this notebook.
DATASET_PATH = "/content/drive/MyDrive/IMDB Dataset - IMDB Dataset.csv"

dataset = pd.read_csv(DATASET_PATH)
# Work on a copy so the frame as loaded stays available unchanged
data = dataset.copy()
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Number of reviews carrying each sentiment label
sentiment_counts = data['sentiment'].value_counts()
sentiment_counts

In [None]:
# Cache for the objects that are expensive to build; filled on first use so the
# lemmatizer and stopword list are created once instead of once per row.
_CLEANING_RESOURCES = {}
_NON_LETTERS = re.compile(r'[^A-Za-z\s]+')


def _cleaning_resources():
  """Return the shared (lemmatizer, stopword set) pair, building it on first use."""
  if not _CLEANING_RESOURCES:
    _CLEANING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    _CLEANING_RESOURCES['stops'] = frozenset(stopwords.words('english'))
  return _CLEANING_RESOURCES['lemmatizer'], _CLEANING_RESOURCES['stops']


def clean_data(row):
  """Return a copy of ``row`` whose first field (the review text) is normalised.

  The text is stripped of HTML markup, lower-cased, reduced to letters and
  whitespace, filtered of English stopwords and lemmatized; the remaining
  tokens are joined back together with single spaces.

  Args:
    row: a ``pd.Series`` whose first element is the raw review string, as
      passed by ``DataFrame.apply(clean_data, axis=1)``.

  Returns:
    A new Series identical to ``row`` except for the cleaned first element.
    The input row itself is left unmodified.
  """
  lemmatizer, stops = _cleaning_resources()

  # .iloc makes the positional access explicit; plain row[0] on a
  # label-indexed Series relies on a deprecated positional fallback.
  sentence = row.iloc[0]

  # An explicit parser avoids bs4 guessing one (and warning about it). The
  # separator keeps words on either side of a tag apart, so that
  # "end.<br /><br />Next" does not collapse into the single token "endnext".
  review = BeautifulSoup(sentence, 'html.parser').get_text(separator=' ')

  clean_sentence = _NON_LETTERS.sub('', review.lower())
  tokens = [
      lemmatizer.lemmatize(word)
      for word in clean_sentence.split()
      if word not in stops
  ]

  # Write into a copy: mutating the object handed over by DataFrame.apply is
  # discouraged by pandas and can leak changes back into the source frame.
  cleaned = row.copy()
  cleaned.iloc[0] = ' '.join(tokens)
  return cleaned

In [None]:
# Row-wise pass: every row of `data` goes through clean_data
cleaned_data = data.apply(clean_data, axis='columns')

  sentence = row[0]
  row[0] = ' '.join(tokens)


In [None]:
# Preview the first five cleaned rows
cleaned_data.head(n=5)

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode you...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive


In [None]:
# Encode the sentiment labels as integers (positive -> 1, negative -> 0).
# DataFrame.map expects a callable and returns a new frame, so the mapping has to
# be applied to the 'sentiment' column and assigned back.
# The identity entries keep this cell idempotent: re-running it on an already
# encoded column leaves the 0/1 values alone instead of turning them into NaN.
SENTIMENT_TO_LABEL = {'positive': 1, 'negative': 0, 1: 1, 0: 0}

encoded_sentiment = cleaned_data['sentiment'].map(SENTIMENT_TO_LABEL)
# Any value missing from the mapping comes back as NaN -- fail loudly here
# rather than feeding NaN labels to the model.
assert encoded_sentiment.notna().all(), "unexpected value in 'sentiment' column"

cleaned_data['sentiment'] = encoded_sentiment.astype(int)
cleaned_data.head()

  cleaned_data.replace({'sentiment' : {'positive': 1, 'negative': 0}}, inplace=True)


Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode you...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically there family little boy jake think t...,0
4,petter matteis love time money visually stunni...,1


In [None]:
# Hold out 20% of the cleaned reviews for testing; the fixed seed makes the split repeatable
train_data, test_data = train_test_split(
    cleaned_data,
    test_size=0.2,
    random_state=42,
)
for split in (train_data, test_data):
    print(split.shape)

(40000, 2)
(10000, 2)


#### Tokenize and Pad Sequences

In [None]:
# Fit the vocabulary on the training reviews only, then encode both splits
# with that same tokenizer and pad every sequence to a fixed length.
VOCAB_SIZE = 5000
MAX_LEN = 200

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_data['review'])

train_sequences = tokenizer.texts_to_sequences(train_data['review'])
test_sequences = tokenizer.texts_to_sequences(test_data['review'])

X_train = pad_sequences(train_sequences, maxlen=MAX_LEN)
X_test = pad_sequences(test_sequences, maxlen=MAX_LEN)

In [None]:
# Print the padded training matrix produced by pad_sequences
print(X_train)

[[   0    0    0 ...  121  129 3450]
 [   0    0    0 ...  118   24   33]
 [   0    0    0 ...  212  614    8]
 ...
 [   0    0    0 ...  999  364  521]
 [   0    0    0 ... 1141  153   33]
 [   0    0    0 ...  316   16 1780]]


In [None]:
# Targets: the sentiment column of each split
Y_train, Y_test = train_data['sentiment'], test_data['sentiment']

#### Build the LSTM Model

In [None]:
# Binary sentiment classifier: embedding -> single LSTM layer -> sigmoid output.
model = Sequential()
# Declare the input shape explicitly (200 token ids per review, matching the
# maxlen used when padding). This replaces Embedding's deprecated `input_length`
# argument and builds the model immediately, so model.summary() can report
# output shapes and parameter counts before training.
model.add(keras.Input(shape=(200,)))
# input_dim matches the tokenizer's num_words (5000)
model.add(Embedding(input_dim=5000, output_dim=128))
# NOTE(review): per the Keras LSTM docs, recurrent_dropout > 0 rules out the
# cuDNN kernel, which may explain the slow steps on GPU -- confirm before changing.
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))



In [None]:
# Print the layer-by-layer summary of the model
model.summary()

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit; track accuracy while training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 5 epochs, holding back 20% of the training data for validation
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 411ms/step - accuracy: 0.7657 - loss: 0.4773 - val_accuracy: 0.8634 - val_loss: 0.3282
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 399ms/step - accuracy: 0.8796 - loss: 0.3014 - val_accuracy: 0.8673 - val_loss: 0.3175
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 411ms/step - accuracy: 0.9083 - loss: 0.2412 - val_accuracy: 0.8570 - val_loss: 0.3370
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 395ms/step - accuracy: 0.9188 - loss: 0.2075 - val_accuracy: 0.8699 - val_loss: 0.3315
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 406ms/step - accuracy: 0.9340 - loss: 0.1754 - val_accuracy: 0.8659 - val_loss: 0.3484


<keras.src.callbacks.history.History at 0x787ff49cf110>

In [None]:
# Score the trained model on the held-out test split
loss, accuracy = model.evaluate(x=X_test, y=Y_test)
print(f"Test Loss: {loss}", f'Test Accuracy: {accuracy}', sep="\n")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 132ms/step - accuracy: 0.8677 - loss: 0.3392
Test Loss: 0.3384304940700531
Test Accuracy: 0.8708000183105469


In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard class labels,
# then tabulate them against the true labels
predicted_probabilities = model.predict(X_test)
Y_pred = (predicted_probabilities > 0.5).astype(int)

cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print("Confusion Matrix:\n", cm)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 143ms/step
Confusion Matrix:
 [[4331  630]
 [ 662 4377]]


In [None]:
# Performance metrics
report = classification_report(Y_test, Y_pred, target_names=['Negative', 'Positive'])
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

    Negative       0.87      0.87      0.87      4961
    Positive       0.87      0.87      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

