In [None]:
# Disable tensorflow warnings:
import os # imports OS library which can perform OS commands
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # sets environment variable

In [None]:
import pandas as pd # Needed to store data as a dataframe for data preprocessing
import numpy as np
import re # Used to generate regex expressions
import tensorflow as tf # Import ML functionality
from tensorflow.keras.layers import TextVectorization # Creates a vectoriser -> converts tokens into model embeddings
# from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential # Used for creating model based on its layers
from tensorflow.keras.layers import Embedding, LSTM, Dense # Functionality for creating specific layers used in model
from sklearn.model_selection import train_test_split # Used for creating a train-test split
from sklearn.metrics import classification_report, accuracy_score # Used for performance metrics

In [None]:
# Prints version of TensorFlow
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.20.0


In [None]:
data = pd.read_csv("data/IMDB_Dataset.csv") # Reads data in the IMDB dataset and converts it into a dataframe
print(data.head()) # Prints the first 5 lines of the dataset

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
# Data preprocessing

def clean_text(text):

    # Cleans the text
    text = text.lower() # Converts text to lower case

    # re is a regex library
    text = re.sub('<.*?>', '', text) # Removes HTML tags from the text (replaces them with empty string)
                                     # <> is a typical structure for HTML tags
                                     # . means that any character but new line would be removed
                                     # * means 0 or more characters so .* means to remove all characters within <> (i.e.: the full HTML tag)
                                     # ? specifies that you want to remove the HTML tags in a non-greedy way -> removes smallest possible pattern (i.e.: does not remove text inbetween HTML tags)

    text = re.sub('[^a-zA-Z]', ' ', text) # Removes special characters from the text and replaces them with a space
                                          # [^a-zA-Z] Removes any characters that are not (^) lower (a-z) or uppercase (A-Z) letters


    text = re.sub(r'\s+', ' ', text) # Removes extra whitespace from the text and replaces them with a space
                                     # r'' means raw string
                                     # \s means any whitespace characters
                                     # + means one or more occurences of the expression, i.e.: replace more than one instance of whitespace


    return text.strip() # Removes any whitespace from the beginning and ends of text

data['clean_review'] = data['review'].apply(clean_text) # Creates a new column in the dataframe called 'clean_review', which applies the clean_text function to data['review']
print(data['clean_review'].head()) # Prints first 5 lines of data[clean_review]

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically there s a family where a little boy ...
4    petter mattei s love in the time of money is a...
Name: clean_review, dtype: object


In [None]:
# Prepare data for LSTM

X = data['clean_review'] # Input features, i.e. the reviews that have been preprocessed, so data[clean_review]
y = data['sentiment'].map({'positive': 1, 'negative': 0}) # Target labels, i.e.: the outcomes so data[sentiment]
                                                          # Maps the sentiment labels in the dataframe (positive and negative) to 0 and 1.

# Vectoriser
max_words = 10000 # Maximum length of vocabulary
max_len = 200 # Maximum length of output
# If the input that is greater than max_len, the text is truncated (cut down to max_len)
# If the input is less than max_len, the text is padded (use filler characters to increase length to max_len)

# Implement a vectoriser using the TextVectorization Library
# max_tokens represents the maximum size of the vocabulary, i.e.: max_words so 10000 words
# output_sequence_length sets the length of the output sequence to a specific length, i.e.: max_len so 200 words
vectoriser = TextVectorization(max_tokens=max_words, output_sequence_length=max_len)

# Train-Test Split
# Train set is the set used to fit the model.
# Test set is the set used to evaluate a final model accurately (not part of the training set).
# 20% of data is the test set so test_size is 0.2.
# random_state is set to a number so that the splits generated are reproducible; otherwise the split would be generated randomly (a seed value)
# Raw data means the data that has been preprocessed but has not be 
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectoriser.adapt(X_train_raw) # Creates a vocabulary (collection of frequent terms) of string terms from tokens in the X_train_raw data

# Vectorize AFTER splitting -> 
X_train = vectoriser(X_train_raw) # Applies vectoriser to X_train_raw
X_test = vectoriser(X_test_raw) # Applies vectoriser to X_test_raw

In [49]:
# Build Model

model = Sequential() # Model should be sequential so that it is created layer by layer

# Embedding layer added
# This layer converts the high-dimensional tokenised text into a low-dimensional vector
# The input dimension in this case is the maximum tokens in our input, so 10000.
# The output dimension is the dimension the vector should be
model.add(Embedding(input_dim=max_words, output_dim=128))

# LSTM layer added
# 128 units -> this represents the dimensionality of the output space.
# dropout = 0.2 means that 20% of units would be dropped for the linear transformation of the inputs.
# recurrent_dropout = 0.2 implies that 20% of units will be dropped for linear transformations in the recurrent states.
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# Dense layer added to feed all outputs from the previous layer to all neurons
# In this case, this layer has 1 neuron and uses a sigmoid activation function.
model.add(Dense(1, activation='sigmoid'))

# Compile Model for training
# Model uses binary cross entropy loss, an ADAM optimiser and tracks accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [50]:
# Train Model

# Trains the model for 5 epochs
# batch_size represents the number of samples (rows of data) per gradient update (i.e.: updating model parameters)
# validation_split represents the fraction of training data that will be used as validation_data (which is not trained but used to evaluate the loss of a model as well as any metrics like accuracy)
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

Epoch 1/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 128ms/step - accuracy: 0.5694 - loss: 0.6760 - val_accuracy: 0.6133 - val_loss: 0.6700
Epoch 2/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 142ms/step - accuracy: 0.6332 - loss: 0.6436 - val_accuracy: 0.6463 - val_loss: 0.6390
Epoch 3/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 124ms/step - accuracy: 0.7308 - loss: 0.5461 - val_accuracy: 0.8468 - val_loss: 0.3734
Epoch 4/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 124ms/step - accuracy: 0.8712 - loss: 0.3172 - val_accuracy: 0.8730 - val_loss: 0.3071
Epoch 5/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 123ms/step - accuracy: 0.9081 - loss: 0.2366 - val_accuracy: 0.8773 - val_loss: 0.3167


<keras.src.callbacks.history.History at 0x22bb84ac4a0>

In [51]:
# Evaluate Model

y_pred_prob = model.predict(X_test) # Creates a probability based on model predictions from X_test
y_pred = (y_pred_prob > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred)) # Generates accuracy by comparing y_test (target labels) from y_pred (model prediction of the target labels)
print(classification_report(y_test, y_pred)) # Generates classification report that uses different performance metrics such as precision, recall, f1-score and support

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 26ms/step
Accuracy: 0.8749
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      4961
           1       0.88      0.87      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [52]:
# Making Predictions

# while True:
#     user_input = input("Enter a movie review (or type 'exit' to quit): \n")
#     if user_input.lower() == 'exit':
#         break

#     # Preprocess text
#     cleaned_input = clean_text(user_input)

#     # Vectorise text (already tokenised and padded to max_len)
#     input_seq = vectoriser([cleaned_input])

#     # Predict
#     prediction = model.predict(input_seq)[0][0]
#     sentiment = "Positive" if prediction > 0.5 else "Negative"

#     print(f"\nPredicted Sentiment: {sentiment}")
#     print(f"Confidence: {prediction:.2f}\n")