In [53]:
# Disable tensorflow warnings:
import os # imports OS library which can perform OS commands
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # sets environment variable

In [None]:
import pandas as pd # Needed to store data as a dataframe for data preprocessing
import numpy as np
import re # Used to generate regex expressions
import tensorflow as tf # Import ML functionality
from tensorflow.keras.layers import TextVectorization # Creates a vectoriser -> converts tokens into model embeddings
from tensorflow.keras.models import Sequential # Used for creating model based on its layers
from tensorflow.keras.layers import Embedding, LSTM, Dense # Functionality for creating specific layers used in model
from sklearn.model_selection import train_test_split # Used for creating a train-test split
from sklearn.metrics import classification_report, accuracy_score # Used for performance metrics

In [55]:
# Prints version of TensorFlow
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.20.0


In [56]:
data = pd.read_csv("data/IMDB_Dataset.csv") # Reads data in the IMDB dataset and converts it into a dataframe
print(data.head()) # Prints the first 5 lines of the dataset

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [57]:
# Data preprocessing

def clean_text(text):

    # Cleans the text
    text = text.lower() # Converts text to lower case

    # re is a regex library
    text = re.sub('<.*?>', '', text) # Removes HTML tags from the text (replaces them with empty string)
                                     # <> is a typical structure for HTML tags
                                     # . means that any character but new line would be removed
                                     # * means 0 or more characters so .* means to remove all characters within <> (i.e.: the full HTML tag)
                                     # ? specifies that you want to remove the HTML tags in a non-greedy way -> removes smallest possible pattern (i.e.: does not remove text inbetween HTML tags)

    text = re.sub('[^a-zA-Z]', ' ', text) # Removes special characters from the text and replaces them with a space
                                          # [^a-zA-Z] Removes any characters that are not (^) lower (a-z) or uppercase (A-Z) letters


    text = re.sub(r'\s+', ' ', text) # Removes extra whitespace from the text and replaces them with a space
                                     # r'' means raw string
                                     # \s means any whitespace characters
                                     # + means one or more occurences of the expression, i.e.: replace more than one instance of whitespace


    return text.strip() # Removes any whitespace from the beginning and ends of text

data['clean_review'] = data['review'].apply(clean_text) # Creates a new column in the dataframe called 'clean_review', which applies the clean_text function to data['review']
print(data['clean_review'].head()) # Prints first 5 lines of data[clean_review]

0    one of the other reviewers has mentioned that ...
1    a wonderful little production the filming tech...
2    i thought this was a wonderful way to spend ti...
3    basically there s a family where a little boy ...
4    petter mattei s love in the time of money is a...
Name: clean_review, dtype: object


In [None]:
# Prepare data for LSTM

X = data['clean_review'] # Input data, i.e. the reviews that have been preprocessed, so data[clean_review]
y = data['sentiment'].map({'positive': 1, 'negative': 0}) # Target labels, i.e.: the outcomes so data[sentiment]
                                                          # Returns a copy of the sentiment labels in the dataframe by mapping positive and negative to 0 and 1.
# Vectoriser
max_words = 10000 # Maximum length of vocabulary, to save computational resources it only selects 10000 most useful words
max_len = 200 # Maximum length of input, so dimension of embedding vectors is the same as input layer
# If the input that is greater than max_len, the text is truncated (cut down to max_len)
# If the input is less than max_len, the text is padded (use filler characters to increase length to max_len)

# Implement a vectoriser using the TextVectorization Library
# max_tokens represents the maximum size of the vocabulary, i.e.: max_words so 10000 words
# sequence length sets the length of the input sequence to a specific length, i.e.: max_len so 200 words
# Generates tokens and converts them to word vectors (not embeddings yet)
vectoriser = TextVectorization(max_tokens=max_words, output_sequence_length=max_len)

# Train-Test Split
# Train set is the set used to fit the model.
# Test set is the set used to evaluate a final model accurately (not part of the training set).
# 20% of data is the test set so test_size is 0.2.
# random_state is set to a number so that the splits generated are reproducible; otherwise the split would be generated randomly (a seed value)
# Raw data means the data that has been preprocessed but not converted to embeddings
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectoriser.adapt(X_train_raw) # Creates a vocabulary (collection of frequent terms) of string terms from tokens in the X_train_raw data

# Vectorize AFTER splitting -> this prevents data leakage so the model cannot learn patterns from the test data vocabulary and artificially boost the performance of the model. 
X_train = vectoriser(X_train_raw) # Applies vectoriser to X_train_raw
X_test = vectoriser(X_test_raw) # Applies vectoriser to X_test_raw
# The results are word vectors (words replaced with indices of where they appear in vocabulary).

In [None]:
# Build Model

model = Sequential() # Model should be sequential so that it is created layer by layer

# Embedding layer added
# This layer converts each word/token index (from the vocabulary) into a low-dimensional embedding vectors
# The input dimension is the maximum possible word index, in other words, the size of the vocabulary: max_words = 10000
# The output dimension is the dimension of the embedding vector for a word/token, 128
model.add(Embedding(input_dim=max_words, output_dim=128))

# LSTM layer added
# 256 is the number of LSTM neurons
# dropout = 20% of the neuron connections are deactivated in this layer to prevent overfitting
# recurrent_dropout = 20% of the recurrent connections (weighted activations from last word/timestep) are deactivated in this layer to prevent overfitting
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))

# Output layer is a dense layer (fully connected to previous layer).
# In this case, this layer has 1 neuron and uses a sigmoid activation function.
model.add(Dense(1, activation='sigmoid'))

# Compile Model for training
# Model uses binary cross entropy loss, an ADAM optimiser and tracks accuracy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [66]:
# Train Model

# Trains the model for 5 epochs
# batch_size represents the number of samples (rows of data) per gradient update (i.e.: updating model parameters)
# validation_split represents the fraction of training data that will be used as validation_data (which is not trained but used to evaluate the loss of a model as well as any metrics like accuracy)
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

Epoch 1/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 329ms/step - accuracy: 0.5218 - loss: 0.6927 - val_accuracy: 0.5433 - val_loss: 0.6802
Epoch 2/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 320ms/step - accuracy: 0.5755 - loss: 0.6720 - val_accuracy: 0.5608 - val_loss: 0.6758
Epoch 3/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 323ms/step - accuracy: 0.7244 - loss: 0.5420 - val_accuracy: 0.8545 - val_loss: 0.3456
Epoch 4/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 332ms/step - accuracy: 0.8798 - loss: 0.2941 - val_accuracy: 0.8725 - val_loss: 0.2995
Epoch 5/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 346ms/step - accuracy: 0.9170 - loss: 0.2177 - val_accuracy: 0.8752 - val_loss: 0.3450


<keras.src.callbacks.history.History at 0x22c5e21e060>

In [None]:
# Evaluate Model

y_pred_prob = model.predict(X_test) # Creates a probability matrix based on model predictions from X_test
y_pred = (y_pred_prob > 0.5).astype(int) # Creates a boolean matrix based on the labels from y_pred_prob.
                                         # Probabilities greater than 0.5 in the matrix correspond to 1 and the rest correspond to 0

print("Accuracy:", accuracy_score(y_test, y_pred)) # Generates accuracy by comparing y_test (target labels) from y_pred (model prediction of the target labels)
print(classification_report(y_test, y_pred)) # Generates classification report that uses different performance metrics such as precision, recall, f1-score and support

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step
Accuracy: 0.8698
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      4961
           1       0.90      0.83      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [None]:
# Summary

# Import Libraries: pandas, numpy, re, tensorflow and sk-learn

# 1.  Read data from csv file

# 2.  Preprocess data and create a new data frame column to store the preprocessed data
        # Main steps include converting to lowercase and removing HTML tags, non-letter characters and whitespace.

# 3.  Set your input values and target labels (remember to map target labels into numbers).

# 4.  Create variables for your vectoriser: max_words, max_len.

# 5.  Create vectoriser with the parameters: max_tokens, output_sequence_length.

# 6.  Create train-test split: remember test_size and random_state and that X for this is raw data.

# 7.  Adapt vectoriser so it creates a vocabulary.

# 8.  Set X_train and X_test by using the vectoriser.

# 9.  Create a model by doing the following:
        # 1. Make model sequential.
        # 2. Add embedding layer (with parameters input_dim and output_dim).
        # 3. Create LSTM layer (with parameters: number of neurons, dropout and recurrent dropout).
        # 4. Add output/dense layer (with parameters number of neurons and activation).
        # 5. Compile model by using loss, optimiser and metrics as parameters.

# 10. Train model with parameters: epochs, batch size and validation split

# 11. Create two variables: y_pred_prob and y_pred.

# 12. Use y_pred_prob and y_pred to compare with y_test in terms of accuracy score and classification report.
