In [None]:
# Disable tensorflow warnings:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
# from keras.preprocessing.text import Tokenizer
# from keras.utils import pad_sequences
# from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.20.0


In [None]:
data = pd.read_csv("data/IMDB_Dataset.csv")
print(data.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [13]:
# Data preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub('<.*?>', '', text) # removes HTML tags
    text = re.sub('[^a-zA-Z]', ' ', text) # removes special characters
    text = re.sub(r'\s+', ' ', text) # removes extra whitespace
    return text.strip()

data['clean_review'] = data['review'].apply(clean_text)
print('\n', data['clean_review'].head())


 0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically there s a family where a little boy ...
4    petter mattei s love in the time of money is a...
Name: clean_review, dtype: object


In [14]:
#   Prepare data for LSTM
X = data['clean_review'] # input features
y = data['sentiment'].map({'positive': 1, 'negative': 0}) # target labels

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#   Tokenisation
max_words = 10000
max_len = 200

tokeniser = TextVectorization(max_tokens=max_words, output_sequence_length=max_len)

# Train-Test Split (on raw text first)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tokeniser.adapt(X_train_raw)

# Vectorize AFTER splitting
X_train = tokeniser(X_train_raw)
X_test = tokeniser(X_test_raw)

In [15]:
# Build Model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128)) # Turns words into dense vectors
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) # Learns sequential patterns in text
model.add(Dense(1, activation='sigmoid')) # Outputs final sentiment prediction: 0 or 1

# Compile Model
# Model uses binary cross entropy loss, an ADAM optimiser and tracks accuracy by using metrics
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# Train Model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

Epoch 1/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 132ms/step - accuracy: 0.5766 - loss: 0.6771 - val_accuracy: 0.5395 - val_loss: 0.6860
Epoch 2/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 131ms/step - accuracy: 0.6416 - loss: 0.6316 - val_accuracy: 0.6745 - val_loss: 0.6171
Epoch 3/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 131ms/step - accuracy: 0.7485 - loss: 0.5141 - val_accuracy: 0.8410 - val_loss: 0.3933
Epoch 4/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 131ms/step - accuracy: 0.8690 - loss: 0.3336 - val_accuracy: 0.8432 - val_loss: 0.4160
Epoch 5/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 131ms/step - accuracy: 0.9018 - loss: 0.2685 - val_accuracy: 0.8717 - val_loss: 0.3303


<keras.src.callbacks.history.History at 0x248e57b2960>

In [17]:
# Evaluate Model

y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step
Accuracy: 0.8675
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      4961
           1       0.86      0.88      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [None]:
# Making Predictions

# while True:
#     user_input = input("Enter a movie review (or type 'exit' to quit): \n")
#     if user_input.lower() == 'exit':
#         break

#     # Preprocess text
#     cleaned_input = clean_text(user_input)

#     # Vectorise text (already tokenised and padded to max_len)
#     input_seq = tokeniser([cleaned_input])

#     # Predict
#     prediction = model.predict(input_seq)[0][0]
#     sentiment = "Positive" if prediction > 0.5 else "Negative"

#     print(f"\nPredicted Sentiment: {sentiment}")
#     print(f"Confidence: {prediction:.2f}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step

Predicted Sentiment: Positive
Confidence: 0.63

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step

Predicted Sentiment: Negative
Confidence: 0.06

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step

Predicted Sentiment: Positive
Confidence: 0.65

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step

Predicted Sentiment: Positive
Confidence: 0.65

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step

Predicted Sentiment: Positive
Confidence: 0.93

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step

Predicted Sentiment: Positive
Confidence: 0.91

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step

Predicted Sentiment: Negative
Confidence: 0.20

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step

Predicted Sentiment: Negative
Confidence: 0.11

[1m1/1[0m [32m━━━━━━━━━━━━━━━