<a href="https://colab.research.google.com/github/KemasRaihan/Sentiment-Analysis-Of-Social-Media-Posts-Of-Phones-Using-Hybrid-Neural-Networks/blob/main/Customer_Reviews_Sentiment_Analysis_draft7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Prerequisites

## Libraries

In [1]:
# Standard libraries
import pandas as pd
import numpy as np
from numpy import asarray
from numpy import zeros
import matplotlib.pyplot as plt
# % matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
#For Qualitative Analysis
from itertools import groupby
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from textblob import TextBlob

In [3]:
# For Text Preprocessing
import nltk
from sklearn.feature_extraction.text import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [4]:
# For dataset partitioning
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve,auc

from tqdm.notebook import tqdm

import csv
import matplotlib.pyplot as plt
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [5]:
# For building the neural networks
import tensorflow as tf
from tensorflow import keras
from keras import regularizers
from keras import models, layers
#from keras.layers import Embedding, Flatten, Conv1D, MaxPooling1D, GlobalMaxPooling1D, SpatialDropout1D, LSTM, Bidirectional, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model
from sklearn.svm import SVC

In [6]:
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer

In [9]:
# For testing and evaluation
import random
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import KFold, cross_val_score
#from keras.wrappers import KerasClassifier
import seaborn as sns
import time

## Other

In [10]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Directory path
dirpath = '/content/drive/MyDrive/Sheffield_Hallam_University/Final Year/Development Project/development_project_shared/Source_Code/'

In [12]:
# Define classes
# There are 3 classes: negative (0), neutral (1) and positive (2)
sentiment_classes = [0,1,2]

# Load Files

**Models**

In [None]:
# Load model
def load_model_from_path(name):
  """Load a previously saved sentiment model from the project's models folder.

  Args:
    name: Model prefix (e.g. 'CNN'). The file that is read is
      ``<name>_sentiment_predictor.h5`` under ``dirpath + '/models/'``.

  Returns:
    The model returned by ``tf.keras.models.load_model`` (loaded with
    ``compile=True``).
  """
  # Full location of the HDF5 file for this model
  model_path = dirpath + '/models/' + name + '_sentiment_predictor.h5'

  return tf.keras.models.load_model(model_path, custom_objects=None, compile=True)

In [None]:
#CNN_model = load_model_from_path('CNN')

In [None]:
#LSTM_model = load_model_from_path('LSTM')

In [None]:
#BiLSTM_model = load_model_from_path('BiLSTM')

In [None]:
#HybridCL_model = load_model_from_path('CNN_LSTM')

In [None]:
#HybridCB_model = load_model_from_path('CNN_BiLSTM')

**Training Times**

In [None]:
#open and read the file after the appending:
# f = open("training_times.txt", "r")
# print(f.read())

# Amazon Customer Reviews Dataset

## Import Raw Dataset

**Raw Dataset**

PromptCloud extracted 400 thousand reviews of unlocked mobile phones sold on Amazon.com to find out insights with respect to reviews, ratings, price and their relationships (https://www.kaggle.com/datasets/PromptCloudHQ/amazon-reviews-unlocked-mobile-phones)

In [None]:
# Import Raw Dataset
filepath = dirpath + 'Amazon_Unlocked_Mobile.csv'

# Control number of rows to read from csv file
nrows = 4000

df = pd.read_csv(filepath,nrows=nrows, encoding = 'latin')

## Examination

In [None]:
df.shape

In [None]:
df.head(10)

## Drop NA Rows

In [None]:
# Find which columns contain any null values
df.isnull().any()

In [None]:
# Drop all null rows from dataframe
df = df.dropna()

## View Frequency Of Ratings

In [None]:
# Convert rating to list for further processing
ratings = df['Rating'].values.tolist()

In [None]:
ratings[:5]

In [None]:
# Number of reviews for each star rating 1..5, in that order
frequency = [ratings.count(star) for star in range(1, 6)]

In [None]:
print(frequency)

In [None]:
labels = [1,2,3,4,5]

fig, ax = plt.subplots()

ax.set_title('Percentage Of Reviews By Ratings')

# Wedge labels are passed to pie() directly; a pie chart has no x ticks to label
pps = ax.pie(frequency, labels=labels, autopct='%1.1f%%')

plt.show()

## Reduce Classes

In [None]:
ratings = df['Rating']

# Collapse the 5-star scale into three sentiment classes:
#   below 3 stars -> 0 (negative), exactly 3 -> 1 (neutral), anything else -> 2 (positive)
sentiments = [0 if rating < 3 else (1 if rating == 3 else 2) for rating in ratings]


In [None]:
# Number of reviews in each sentiment class (0, 1, 2), in that order
frequency = [sentiments.count(sentiment_class) for sentiment_class in range(3)]

In [None]:
labels = ['Negative', 'Neutral', 'Positive']
colors = ['red', 'orange', 'green']
fig, ax = plt.subplots()

# This chart shows the reduced sentiment classes, not the raw star ratings
ax.set_title('Percentage Of Reviews By Sentiment')

# Wedge labels are passed to pie() directly; a pie chart has no x ticks to label
pps = ax.pie(frequency, labels=labels, autopct='%1.1f%%', colors=colors)

plt.show()

In [None]:
# Append new column to original dataframe
df = df.assign(Sentiment=sentiments)

In [None]:
df.tail()

In [None]:
df.to_csv(dirpath + 'Amazon_Unlocked_Mobile_Updated.csv', sep=',', index=False, encoding='utf-8')

# Preprocessing Text

In [None]:
reviews = df['Reviews']

In [None]:
# Reviews before preprocessing
reviews.head()

In [None]:
len(reviews[0])

In [None]:
# Build the stop-word set from NLTK's English list.
# NOTE: this rebinds the STOPWORDS name that was imported from wordcloud in the imports section.
STOPWORDS = set(stopwords.words('english'))

# Also treat the word 'phone' as a stop word so that it is filtered out of the reviews
STOPWORDS.update(["phone", "Phone"])

In [None]:
def preprocess_text(review):
    """Normalise a single review string before tokenisation.

    The text is lower-cased, every run of non-ASCII characters is replaced by
    a single space, and whitespace-separated tokens found in STOPWORDS are
    dropped.

    Args:
        review: Raw review text.

    Returns:
        The cleaned review, with the remaining tokens joined by single spaces.
    """
    # Lower-case, then swap each run of non-ASCII characters for a space
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', review.lower())

    # Keep only the tokens that are not stop words
    return " ".join(token for token in ascii_only.split() if token not in STOPWORDS)

In [None]:
reviews = reviews.apply(preprocess_text)

In [None]:
# Reviews after preprocessing
reviews.head()

In [None]:
len(reviews[0])

# Dataset Analysis

## Quantitative Analysis

In [None]:
# Group by sentiment
df_sentiment = df.groupby('Sentiment')

# One array of prices per sentiment class, in class order; each array becomes one box
data = [np.array(df_sentiment.get_group(sentiment)['Price']) for sentiment in sentiment_classes]

In [None]:
# A regular subplot keeps the axes inside the figure area
fig, ax = plt.subplots(figsize=(9, 7))

ax.set_title('Boxplot of Sentiment by Price')
ax.set_ylabel('Price')
ax.set_xlabel('Sentiment')

labels = ['Negative', 'Neutral', 'Positive']

# Creating plot
bp = ax.boxplot(data,
           patch_artist = True,
           boxprops = dict(facecolor = "lightblue"),
           showfliers=False)

# boxplot() installs its own tick labels (1, 2, 3, ...), so the boxes have to be
# named after the plot is drawn; boxes sit at x = 1..len(labels)
ax.set_xticks(range(1, len(labels) + 1))
ax.set_xticklabels(labels)

# show plot
plt.show()

## Qualitative Analysis

In [None]:
# Creating an object using groupby
df_sentiments = df.groupby('Sentiment')

In [None]:
def get_nouns(text):
    """Return the words of `text` that TextBlob tags as "NN".

    Args:
        text: String to analyse.

    Returns:
        List of the words whose part-of-speech tag is exactly "NN", in order
        of appearance.
    """
    nouns = []
    for word, tag in TextBlob(text).tags:
        if tag == "NN":
            nouns.append(word)
    return nouns

In [None]:
# Do not rebind `reviews` here: it holds the preprocessed text produced in the
# "Preprocessing Text" section and is what the tokenizer and the train/validation/test
# split consume further down. The raw text gets its own name instead.
raw_reviews = df['Reviews']

In [None]:
print(reviews.head())

In [None]:
def generate_wordcloud(sentiment, ax, title, cm):
    """Draw a circular word cloud of all reviews belonging to one sentiment.

    Args:
        sentiment: Key of the group to fetch from `df_sentiments`.
        ax: Matplotlib axes to draw on.
        title: Title placed above the cloud.
        cm: Colormap passed to WordCloud.
    """
    # All review text of this sentiment group, joined into one string
    group = df_sentiments.get_group(sentiment)
    corpus = " ".join(group['Reviews'])

    # 300x300 mask: 255 outside a radius-130 circle centred at (150, 150), 0 inside
    rows, cols = np.ogrid[:300, :300]
    outside_circle = (rows - 150) ** 2 + (cols - 150) ** 2 > 130 ** 2
    circle_mask = 255 * outside_circle.astype(int)

    cloud = WordCloud(
        collocations=False,
        background_color='white',
        colormap=cm,
        mask=circle_mask,
        stopwords=STOPWORDS,
    ).generate(corpus)

    # Render the cloud with the axis frame switched off
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(title)

In [None]:
fig, axes = plt.subplots(1,3, figsize=(18,6))
fig.suptitle('Word Cloud For Each Sentiment', fontsize=20)

# One panel per sentiment class: (class id, panel title, colormap)
wordcloud_panels = [(0, 'negative', 'Reds'), (1, 'neutral', 'Purples'), (2, 'positive', 'Greens')]
for panel_ax, (panel_class, panel_title, panel_cmap) in zip(axes, wordcloud_panels):
    generate_wordcloud(panel_class, panel_ax, panel_title, panel_cmap)

plt.show()

# Define Training, Validation and Testing Dataset

## Preparing Embedding Layer

In [None]:
# Target length used when the review sequences are padded further down
maxlen = 100

# Build the word -> integer index from the review text
word_tokenizer = Tokenizer()

word_tokenizer.fit_on_texts(reviews)

# One more than the number of indexed words.
# NOTE(review): presumably because Tokenizer indices start at 1 and 0 is the padding value — confirm.
vocab_size = len(word_tokenizer.word_index) + 1

In [None]:
embeddings_dictionary = dict()

# Each line holds a word followed by the components of its vector.
# The context manager closes the file even if parsing fails part-way through.
with open(dirpath + 'a2_glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# Row i of the matrix holds the 100-dimensional GloVe vector of the word whose
# tokenizer index is i; the matrix starts out as all zeros.
embedding_matrix = np.zeros((vocab_size, 100))

for token, row in word_tokenizer.word_index.items():
    glove_vector = embeddings_dictionary.get(token)
    # Words without a GloVe entry keep their all-zero row
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

## Define X and Y

In [None]:
one_hot_encoded_Y = pd.get_dummies(sentiments).values

In [None]:
one_hot_encoded_Y.shape

In [None]:
reviews.shape

In [None]:
# Fixed seed so the train/validation/test partition is identical on every run
SPLIT_SEED = 42

# First split off 80% for training; the remaining 20% is shared by validation and testing
X_train, X_rem, y_train, y_rem = train_test_split(reviews, one_hot_encoded_Y, train_size=0.8, random_state=SPLIT_SEED)

# Split the remaining data equally between validation and testing
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, random_state=SPLIT_SEED)

In [None]:
print(X_train[:5])

In [None]:
# Convert each review into its sequence of tokenizer word indices
X_train = word_tokenizer.texts_to_sequences(X_train)
X_valid = word_tokenizer.texts_to_sequences(X_valid)

# Post-pad the sequences, using maxlen as the target length
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_valid = pad_sequences(X_valid, padding='post', maxlen=maxlen)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_valid.shape

# Build and Train The Models

In [None]:
# Number of Epochs for each model to train
EPOCHS = 20

In [None]:
training_times = {}

In [None]:
# function to add value labels
def addlabels(x,y):
    """Write each value of `y` as a text label at the top of its bar.

    Labels are placed at horizontal positions 0, 1, 2, ... (one per entry of
    `x`) and at a height equal to the value itself.

    Args:
        x: Sequence whose length gives the number of bars.
        y: Bar heights; may be a list, an array or a pandas Series.
    """
    # Iterate over the values instead of indexing y[i]: on a Series whose index
    # holds labels (e.g. model names) an integer key is not a position.
    for i, value in zip(range(len(x)), y):
        plt.text(i, value, value, ha='center')

In [None]:
def display_graphs(history):
  """Plot training and validation accuracy and loss against the epoch number.

  Args:
    history: Object whose ``history`` dict contains the keys 'acc',
      'val_acc', 'loss' and 'val_loss' (as produced by ``model.fit`` with
      validation data and the 'acc' metric).
  """
  fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(17, 5))

  acc_ax.plot(history.history['acc'])
  acc_ax.plot(history.history['val_acc'])
  acc_ax.set_title('Model Accuracy')
  acc_ax.set_ylabel('Accuracy')
  acc_ax.set_xlabel('Epochs')
  # The second curve comes from the validation data, not from the held-out test set
  acc_ax.legend(['train', 'validation'], loc='upper left')

  loss_ax.plot(history.history['loss'])
  loss_ax.plot(history.history['val_loss'])
  loss_ax.set_title('Model Loss')
  loss_ax.set_ylabel('Loss')
  loss_ax.set_xlabel('Epochs')
  loss_ax.legend(['train', 'validation'], loc='upper left')

  plt.show()

## Convolutional Neural Network (CNN)

In [None]:
# Building convolutional neural network model
def create_CNN_model(vocab_size, max_len, embedding_matrix):
  """Build and compile the CNN sentiment classifier.

  Layers: frozen embedding initialised from `embedding_matrix` ->
  Conv1D(128 filters, kernel size 5, ReLU) -> Dropout(0.2) -> Flatten ->
  Dense(3, softmax).

  Args:
    vocab_size: Number of rows of the embedding layer.
    max_len: Length of every input sequence.
    embedding_matrix: Pre-trained embedding weights with 100 columns.

  Returns:
    The compiled model (RMSprop with learning rate 2e-4, categorical
    cross-entropy loss, accuracy reported under the name 'acc').
  """
  # Reach Sequential through `keras.models`: the bare name `models` is rebound
  # to a list of trained models later in the notebook.
  model = keras.models.Sequential()
  # Input - Layer (sized by the max_len argument, not by a global)
  model.add(layers.Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False))
  # Hidden - Layers
  model.add(layers.Conv1D(128, kernel_size=5, activation='relu'))
  model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
  model.add(layers.Flatten())
  # Output - Layer: one probability per sentiment class
  model.add(layers.Dense(3, activation='softmax'))

  # Define optimiser for CNN model
  optimiser = keras.optimizers.RMSprop(learning_rate=2e-4)

  # Compile model
  model.compile(optimizer=optimiser, loss='categorical_crossentropy', metrics=['acc'])
  return model

In [None]:
# The sequence length is stored in the global `maxlen`; `max_len` is only the
# builder's parameter name and is not defined at notebook level.
CNN_model = create_CNN_model(vocab_size, maxlen, embedding_matrix)

In [None]:
plot_model(
  CNN_model,
  show_shapes = True,
  show_dtype = True,
  show_layer_activations = True
)

In [None]:
t0 = time.time()
CNN_history = CNN_model.fit(X_train, y_train, epochs=EPOCHS, validation_data=(X_valid, y_valid))
CNN_ts = time.time() - t0
training_times['CNN'] = CNN_ts

In [None]:
display_graphs(CNN_model.history)

## Long-Short Term Memory (LSTM)

In [None]:
# Build an LSTM model
def create_LSTM_model(vocab_size, max_len, embedding_matrix):
  """Build and compile the LSTM sentiment classifier.

  Layers: frozen embedding initialised from `embedding_matrix` ->
  LSTM(128, returning the full sequence) -> Dropout(0.2) -> Flatten ->
  Dense(3, softmax).

  Args:
    vocab_size: Number of rows of the embedding layer.
    max_len: Length of every input sequence.
    embedding_matrix: Pre-trained embedding weights with 100 columns.

  Returns:
    The compiled model (RMSprop with learning rate 2e-4, categorical
    cross-entropy loss, accuracy reported under the name 'acc').
  """
  # Reach Sequential through `keras.models`: the bare name `models` is rebound
  # to a list of trained models later in the notebook.
  model = keras.models.Sequential()

  # Input - Layer (sized by the max_len argument, not by a global)
  model.add(layers.Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False))
  # Hidden - Layers
  model.add(layers.LSTM(128, return_sequences=True))
  model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
  model.add(layers.Flatten())
  # Output - Layer: one probability per sentiment class
  model.add(layers.Dense(3, activation='softmax'))

  # Define optimiser for LSTM model
  optimiser = keras.optimizers.RMSprop(learning_rate=2e-4)

  # Compile model
  model.compile(optimizer=optimiser, loss='categorical_crossentropy', metrics=['acc'])
  return model

In [None]:
# The sequence length is stored in the global `maxlen`; `max_len` is only the
# builder's parameter name and is not defined at notebook level.
LSTM_model = create_LSTM_model(vocab_size, maxlen, embedding_matrix)

In [None]:
plot_model(
    LSTM_model,
    show_shapes = True,
    show_dtype = True,
    show_layer_activations = True
)

In [None]:
# Train the model
t0 = time.time()
LSTM_history = LSTM_model.fit(X_train, y_train, epochs=EPOCHS, validation_data=(X_valid, y_valid))
LSTM_ts = time.time()-t0
training_times['LSTM'] = LSTM_ts

In [None]:
display_graphs(LSTM_model.history)

## Bidirectional Long-Short Term Memory (Bi-LSTM)

In [None]:
# Build a bidirectional LSTM model
def create_BiLSTM_model(vocab_size, max_len, embedding_matrix):
  """Build and compile the bidirectional-LSTM sentiment classifier.

  Layers: frozen embedding initialised from `embedding_matrix` ->
  Bidirectional(LSTM(128, returning the full sequence)) -> Dropout(0.2) ->
  Flatten -> Dense(3, softmax).

  Args:
    vocab_size: Number of rows of the embedding layer.
    max_len: Length of every input sequence.
    embedding_matrix: Pre-trained embedding weights with 100 columns.

  Returns:
    The compiled model (RMSprop with learning rate 2e-4, categorical
    cross-entropy loss, accuracy reported under the name 'acc').
  """
  # Reach Sequential through `keras.models`: the bare name `models` is rebound
  # to a list of trained models later in the notebook.
  model = keras.models.Sequential()

  # Input - Layer (sized by the max_len argument, not by a global)
  model.add(layers.Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False))
  # Hidden - Layers
  model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True)))
  model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
  model.add(layers.Flatten())
  # Output - Layer: one probability per sentiment class
  model.add(layers.Dense(3, activation='softmax'))

  # Define optimiser for BiLSTM model
  optimiser = keras.optimizers.RMSprop(learning_rate=2e-4)

  # Compile model
  model.compile(optimizer=optimiser, loss='categorical_crossentropy', metrics=['acc'])
  return model

In [None]:
# The builder is named create_BiLSTM_model (lower-case "model"), and the
# sequence length lives in the global `maxlen`.
BiLSTM_model = create_BiLSTM_model(vocab_size, maxlen, embedding_matrix)

In [None]:
plot_model(
    BiLSTM_model,
    show_shapes = True,
    show_dtype = True,
    show_layer_activations = True
)

In [None]:
# Train the model
t0 = time.time()
BiLSTM_history = BiLSTM_model.fit(X_train, y_train, epochs=EPOCHS, validation_data=(X_valid, y_valid))
BiLSTM_ts = time.time()-t0
training_times['BiLSTM'] = BiLSTM_ts

In [None]:
display_graphs(BiLSTM_model.history)

## CNN + LSTM

In [None]:
# Build an CNN + LSTM model
def create_HybridCL_model(vocab_size, max_len, embedding_matrix):
  """Build and compile the hybrid CNN + LSTM sentiment classifier.

  Layers: frozen embedding initialised from `embedding_matrix` ->
  Conv1D(128 filters, kernel size 5, ReLU) -> Dropout(0.2) ->
  LSTM(128, returning the full sequence) -> Dropout(0.2) -> Flatten ->
  Dense(3, softmax).

  Args:
    vocab_size: Number of rows of the embedding layer.
    max_len: Length of every input sequence.
    embedding_matrix: Pre-trained embedding weights with 100 columns.

  Returns:
    The compiled model (RMSprop with learning rate 2e-4, categorical
    cross-entropy loss, accuracy reported under the name 'acc').
  """
  # Reach Sequential through `keras.models`: the bare name `models` is rebound
  # to a list of trained models later in the notebook.
  model = keras.models.Sequential()

  # Input - Layer (sized by the max_len argument, not by a global)
  model.add(layers.Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False))
  # Hidden - Layers
  # CNN layers
  model.add(layers.Conv1D(128, kernel_size=5, activation='relu'))
  model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
  # LSTM layers
  model.add(layers.LSTM(128, return_sequences=True))
  model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
  model.add(layers.Flatten())
  # Output - Layer: one probability per sentiment class
  model.add(layers.Dense(3, activation='softmax'))

  # Define optimiser for CNN + LSTM model
  optimiser = keras.optimizers.RMSprop(learning_rate=2e-4)

  # Compile model
  model.compile(optimizer=optimiser, loss='categorical_crossentropy', metrics=['acc'])
  return model

In [None]:
# The sequence length is stored in the global `maxlen`; `max_len` is only the
# builder's parameter name and is not defined at notebook level.
HybridCL_model = create_HybridCL_model(vocab_size, maxlen, embedding_matrix)

In [None]:
plot_model(
    HybridCL_model,
    show_shapes = True,
    show_dtype = True,
    show_layer_activations = True
)

In [None]:
# Train the CNN + LSTM model
t0 = time.time()
HybridCL_history = HybridCL_model.fit(X_train, y_train, epochs=EPOCHS, validation_data=(X_valid, y_valid))
HybridCL_ts = time.time() - t0
training_times['CNN + LSTM'] = HybridCL_ts

In [None]:
display_graphs(HybridCL_model.history)

## CNN + Bi-LSTM

In [None]:
# Build a CNN + BiLSTM model
def create_HybridCB_model(vocab_size, max_len, embedding_matrix):
  """Build and compile the hybrid CNN + bidirectional-LSTM sentiment classifier.

  Layers: frozen embedding initialised from `embedding_matrix` ->
  Conv1D(128 filters, kernel size 5, ReLU) -> Dropout(0.2) ->
  Bidirectional(LSTM(128, returning the full sequence)) -> Dropout(0.2) ->
  Flatten -> Dense(3, softmax).

  Args:
    vocab_size: Number of rows of the embedding layer.
    max_len: Length of every input sequence.
    embedding_matrix: Pre-trained embedding weights with 100 columns.

  Returns:
    The compiled model (RMSprop with learning rate 2e-4, categorical
    cross-entropy loss, accuracy reported under the name 'acc').
  """
  # Reach Sequential through `keras.models`: the bare name `models` is rebound
  # to a list of trained models later in the notebook.
  model = keras.models.Sequential()

  # Input - Layer (sized by the max_len argument, not by a global)
  model.add(layers.Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False))
  # Hidden - Layers
  # CNN layers
  model.add(layers.Conv1D(128, kernel_size=5, activation='relu'))
  model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
  # BiLSTM layers
  model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True)))
  model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
  model.add(layers.Flatten())
  # Output - Layer: one probability per sentiment class
  model.add(layers.Dense(3, activation='softmax'))

  # Define optimiser for CNN + BiLSTM model
  optimiser = keras.optimizers.RMSprop(learning_rate=2e-4)

  # Compile model
  model.compile(optimizer=optimiser, loss='categorical_crossentropy', metrics=['acc'])
  return model

In [None]:
# The sequence length is stored in the global `maxlen`; `max_len` is only the
# builder's parameter name and is not defined at notebook level.
HybridCB_model = create_HybridCB_model(vocab_size, maxlen, embedding_matrix)

In [None]:
plot_model(
    HybridCB_model,
    show_shapes = True,
    show_dtype = True,
    show_layer_activations = True
)

In [None]:
# Train the CNN + LSTM model
t0 = time.time()
HybridCB_history = HybridCB_model.fit(X_train, y_train, epochs=EPOCHS, validation_data=(X_valid, y_valid))
HybridCB_ts = time.time() - t0
training_times['CNN + BiLSTM'] = HybridCB_ts

In [None]:
display_graphs(HybridCB_model.history)

# Testing and Evaluation

In [None]:
# Trained networks and their display names, in matching order.
# NOTE(review): this rebinds `models`, which up to this point referred to the
# module brought in by `from keras import models, layers`.
models = [CNN_model, LSTM_model, BiLSTM_model, HybridCL_model, HybridCB_model]
names = ['CNN', 'LSTM', 'BiLSTM', 'CNN + LSTM', 'CNN + BiLSTM']

In [None]:
def test_model(model, review):
  pred = model.predict(review)
  pred = pred.flatten()
  return pred

In [None]:
def preprocess_review(review):
  """Turn one raw review string into a padded index sequence for the models.

  Args:
    review: Raw review text.

  Returns:
    The result of ``pad_sequences`` applied to the single tokenised review
    (post-padding, target length ``maxlen``).
  """
  # Same cleaning as the training text, then word -> index lookup
  cleaned = preprocess_text(review)
  sequences = word_tokenizer.texts_to_sequences([cleaned])
  return pad_sequences(sequences, padding='post', maxlen=maxlen)

In [None]:
def draw_piecharts(review):
  """Show, as one pie chart per model, the predicted sentiment distribution of a review.

  The review is printed, preprocessed with `preprocess_review`, and passed to
  every model in the global `models` list; each panel is titled with the
  corresponding entry of `names`.

  Args:
    review: Raw review text.
  """
  print("Review: ", review)

  review = preprocess_review(review)

  labels = ['Negative', 'Neutral', 'Positive']
  colors = ['red', 'orange', 'green']

  # One panel per model; squeeze=False keeps `axes` two-dimensional even for a single model
  fig, axes = plt.subplots(nrows=1, ncols=len(models), figsize=(18, 10), squeeze=False)
  # A figure-level title: plt.title() would only title the last panel and then
  # be overwritten by that panel's model name
  fig.suptitle('Neural Network Predictions', fontsize=14)

  for panel, model, name in zip(axes[0], models, names):
    pred = test_model(model, review)
    panel.pie(pred, colors=colors, labels=labels, autopct='%1.1f%%')
    panel.set_title(name)

  plt.show()


In [None]:
# function to add value labels
# NOTE(review): this repeats the addlabels definition from the
# "Build and Train The Models" section; one of the two can be dropped.
def addlabels(x,y):
    """Write each value of `y` as a text label at the top of its bar.

    Labels are placed at horizontal positions 0, 1, 2, ... (one per entry of
    `x`) and at a height equal to the value itself.

    Args:
        x: Sequence whose length gives the number of bars.
        y: Bar heights; may be a list, an array or a pandas Series.
    """
    # Iterate over the values instead of indexing y[i]: on a Series whose index
    # holds labels (e.g. model names) an integer key is not a position.
    for i, value in zip(range(len(x)), y):
        plt.text(i, value, value, ha='center')

## User Input

The expected output should be one vector that represent the probability distribution of each sentiment prediction: negative, neutral and positive.

In [None]:
#input_review = input("Enter review: ")
input_review = 'I like this phone'
draw_piecharts(input_review)

## Random Sample Review

In [None]:
X_test = X_test.tolist()

In [None]:
X_test[:5]

In [None]:
random_num = random.randint(0, len(X_test)-1)

random_review = X_test[random_num]
actual = y_test[random_num]
print(random_review)

In [None]:
print('Actual: ', actual)
draw_piecharts(random_review)

## Evaluation

### Test Scores and Accuracies

In [None]:
X_test_processed = word_tokenizer.texts_to_sequences(X_test)

X_test_processed = pad_sequences(X_test_processed, padding='post', maxlen=maxlen)

In [None]:
def calc_accuracy(model):
  """Evaluate `model` on the processed test set.

  Args:
    model: Object exposing ``evaluate``.

  Returns:
    Whatever ``model.evaluate`` returns for (X_test_processed, y_test).
  """
  return model.evaluate(X_test_processed, y_test, verbose=1)

In [None]:
def plot_scores_bar():
  """Draw a grouped bar chart of test loss and test accuracy for every model.

  Each model in the global `models` list is evaluated with `calc_accuracy`;
  the first returned value is plotted as the loss and the second as the
  accuracy, labelled with the entries of `names`.
  """
  # One evaluation result per model
  evaluations = [calc_accuracy(model) for model in models]
  test_losses = [result[0] for result in evaluations]
  test_accuracies = [result[1] for result in evaluations]

  X_axis = np.arange(len(names))

  # The first evaluation value is the loss, so it is labelled as such
  plt.bar(X_axis - 0.2, test_losses, 0.4, label = 'Test Loss')
  plt.bar(X_axis + 0.2, test_accuracies, 0.4, label = 'Test Accuracy')

  plt.xticks(X_axis, names, rotation=45, ha='right')
  plt.xlabel("Model")
  plt.title("Test Loss and Accuracy")
  plt.legend()
  plt.show()


In [None]:
plot_scores_bar()

### Accuracy, precision, recall and f1 scores

In [None]:
# Convert multidimensional y array to a one-dimensional array
def convert_y_to_1D(y):
  """Collapse one-hot or probability rows to class indices.

  Args:
    y: 2-D array-like with one row per sample.

  Returns:
    1-D array holding, for each row, the column index of its largest value.
  """
  return np.asarray(y).argmax(axis=1)

In [None]:
# Calculates accuracy, precision, recall and F1 from predicted and true results
def calc_scores(model, X_fold, y_true):
  """Score a model's predictions on one dataset.

  Args:
    model: Object exposing ``predict``.
    X_fold: Inputs passed to ``model.predict``.
    y_true: True labels, one row per sample; reduced to class indices with
      `convert_y_to_1D`, as are the predictions.

  Returns:
    ``[accuracy, precision, recall, f1]``; precision, recall and F1 use
    weighted averaging over the classes.
  """
  # Calculate predicted values
  y_pred = model.predict(X_fold)

  # Convert y_true and y_pred to one-dimensional arrays of class indices
  y_true, y_pred = convert_y_to_1D(y_true), convert_y_to_1D(y_pred)

  return [
      accuracy_score(y_true, y_pred),
      precision_score(y_true, y_pred, average='weighted'),
      recall_score(y_true, y_pred, average='weighted'),
      f1_score(y_true, y_pred, average='weighted'),
  ]

In [None]:
def create_scores_dict(X_fold, y_fold):
  """Collect the four scores of every model on one dataset.

  Args:
    X_fold: Inputs handed to `calc_scores` for each model.
    y_fold: True labels handed to `calc_scores` for each model.

  Returns:
    Dict mapping each entry of `names` to a dict with the keys 'Accuracy',
    'Precision', 'Recall' and 'F1' for the model at the same position in
    `models`.
  """
  metric_names = ['Accuracy', 'Precision', 'Recall', 'F1']
  scores_dict = {}

  for i, model in enumerate(models):
    # calc_scores returns the values in the same order as metric_names
    scores_dict[names[i]] = dict(zip(metric_names, calc_scores(model, X_fold, y_fold)))

  return scores_dict

In [None]:
# Define the model names and their scores for the training set
scores_train = create_scores_dict(X_train, y_train)

# Define the model names and their scores for the validation set
scores_valid = create_scores_dict(X_valid, y_valid)

In [None]:
#Create a dataframe table of scores for the training set
scores_train_df = pd.DataFrame(scores_train).T
scores_train_df

In [None]:
#Create a dataframe table of scores for the validation set
scores_valid_df = pd.DataFrame(scores_valid).T
scores_valid_df

**From AL and ML 1 Assignment:** Based on the table, it is evident that the accuracy scores of the KNN and SVM models on the training dataset are slightly higher than their respective accuracy scores on the validation dataset. However, the LR and DT models have much higher training accuracy scores than validation accuracy scores. The results for precision, recall, F1 and specificity show a similar pattern, with the DT model having the highest scores on the training set but SVM having the highest scores on the validation set.

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# Row labels of the score tables, i.e. the model names. They get their own name
# so the global `models` list of trained networks is not overwritten.
model_names = scores_train_df.index

X_axis = np.arange(len(model_names))

# One panel per metric, with train and validation bars side by side
for i, metric in enumerate(['Accuracy', 'Precision', 'Recall', 'F1']):
    ax = axes[i // 2, i % 2]
    ax.bar(X_axis - 0.2, scores_train_df[metric], width=0.4, color='blue', label='Train')
    ax.bar(X_axis + 0.2, scores_valid_df[metric], width=0.4, color='orange', label='Valid')
    ax.set_title(metric)
    ax.set_xticks(X_axis)
    ax.set_xticklabels(model_names, rotation=45, ha='right')

# Figure-level title and legend: plt.title()/plt.legend() act on the last panel
# only and would replace its 'F1' title
fig.suptitle('Scores For The Models')
handles, legend_labels = axes[0, 0].get_legend_handles_labels()
fig.legend(handles, legend_labels, loc='upper right', fontsize=12)

# Adjust layout, leaving room at the top for the figure title
fig.tight_layout(rect=[0, 0, 1, 0.96])

plt.show()

### Cross-Validation (CV) Scheme

One way we could address overfitting is to use a cross-validation (CV) scheme, one of the more common choices being KFolds (Loukas 2023). Using a KFolds scheme, we will train and test each model k times on different subsets of the training data and estimate a performance metric on each held-out test subset.

In [None]:
# NOTE(review): KerasClassifier is not imported anywhere in the visible notebook (the
# `from keras.wrappers import KerasClassifier` line in the imports is commented out), and the
# build functions below require (vocab_size, max_len, embedding_matrix) arguments that are not
# supplied here — confirm how these wrappers are meant to be constructed.

# Wrap the CNN model in a KerasClassifier
CNN_KC = KerasClassifier(build_fn=create_CNN_model, epochs=10, batch_size=32, verbose=0)

# Wrap the LSTM model in a KerasClassifier
LSTM_KC = KerasClassifier(build_fn=create_LSTM_model, epochs=10, batch_size=32, verbose=0)

# Wrap the BiLSTM model in a KerasClassifier
BiLSTM_KC = KerasClassifier(build_fn=create_BiLSTM_model, epochs=10, batch_size=32, verbose=0)

# Wrap the Hybrid CNN + LSTM model in a KerasClassifier
HybridCL_KC = KerasClassifier(build_fn=create_HybridCL_model, epochs=10, batch_size=32, verbose=0)

# Wrap the Hybrid CNN + BiLSTM model in a KerasClassifier
HybridCB_KC = KerasClassifier(build_fn=create_HybridCB_model, epochs=10, batch_size=32, verbose=0)

In [None]:
def calc_cross_valid_scores(model, X_train, y_train):
  """Run 5-fold cross-validation and print the per-fold and summary scores.

  Args:
    model: Estimator handed to ``cross_val_score``.
    X_train: Inputs to split into folds.
    y_train: Targets to split into folds.
  """
  fold_scores = cross_val_score(model, X=X_train, y=y_train, cv=KFold(n_splits = 5))
  print('Cross Validation accuracy scores: %s' % fold_scores)

  # Mean and standard deviation across the folds
  mean_score, score_spread = np.mean(fold_scores), np.std(fold_scores)
  print('Cross Validation accuracy: %.3f +/- %.3f' % (mean_score, score_spread))

In [None]:
calc_cross_valid_scores(CNN_KC, X_train, y_train)

In [None]:
calc_cross_valid_scores(LSTM_KC, X_train, y_train)

In [None]:
calc_cross_valid_scores(BiLSTM_KC, X_train, y_train)

In [None]:
calc_cross_valid_scores(HybridCL_KC, X_train, y_train)

In [None]:
calc_cross_valid_scores(HybridCB_KC, X_train, y_train)

**From AL and ML 1 assignment >>**

The mean cross-validation score for the DT model is the lowest among all the machine learning models. These results suggest that the KNN and SVM models may have been slightly overfit, with SVM being the least overfit since it has the highest cross-validation mean. The DT model scored the highest accuracy on the training dataset, but it has been significantly overfit compared to the other models, having the largest difference between its accuracy scores on the training and validation datasets. Decision trees, being both a non-parametric and non-linear machine learning algorithm, are known to be highly flexible and have a high potential for overfitting the training data. This is expected since overfitting is more likely with non-parametric and non-linear models (Brownlee 2019). LR is both linear and parametric, but due to our dataset containing too many features (1215 to be exact), it has the second highest difference between its training and validation accuracy scores.

### Training Time

In [None]:
# One row per model, with its training time in a single named column
training_times_df = pd.DataFrame.from_dict(
    training_times, orient='index', columns=['Training Time (s)']
)

training_times_df

In [None]:
def plot_training_times_bar():
  """Draw a bar chart of each model's training time, with the value written on each bar.

  Reads the 'Training Time (s)' column of the global `training_times_df` and
  labels the bars with the entries of `names`.
  """
  X_axis = np.arange(len(names))

  # Training times rounded to 2 decimal places for display
  y = training_times_df['Training Time (s)'].round(2)

  plt.bar(X_axis, y, 0.4)

  plt.xticks(X_axis, names, rotation=45, ha='right')

  # calling the function to add value labels
  addlabels(X_axis, y)

  plt.xlabel("Model")
  plt.ylabel("Training Time (s)")
  plt.title("Training Time For The Models")
  # No legend: there is a single unlabelled bar series, so plt.legend() would
  # only produce a "no artists with labels" warning
  plt.show()

In [None]:
plot_training_times_bar()

# Save and Store

**Models**

In [None]:
def save_model(model, name):
  """Save `model` as ``<name>_sentiment_predictor.h5`` in the project's models folder.

  Args:
    model: Model handed to ``keras.saving.save_model``.
    name: Prefix of the output file name (e.g. 'CNN').
  """
  destination = dirpath + '/models/' + name + '_sentiment_predictor.h5'
  keras.saving.save_model(model, destination)

In [None]:
# Define file path to save CNN model
save_model(CNN_model, 'CNN')

In [None]:
# Define file path to save LSTM model
save_model(LSTM_model, 'LSTM')

In [None]:
# Save the BiLSTM model
save_model(BiLSTM_model, 'BiLSTM')

In [None]:
# Define file path to save CNN + LSTM model
save_model(HybridCL_model, 'CNN_LSTM')

In [None]:
# Save the CNN + BiLSTM model
save_model(HybridCB_model, 'CNN_BiLSTM')

**Training Time**

In [None]:
# Write one training time per line. The loop variable is deliberately not called
# `time`: that would rebind the imported time module used to time the training runs.
with open("training_time.txt", "w") as f:
  for elapsed in training_times.values():
    f.write("%s\n" % elapsed)
    print(elapsed)

In [None]:
# Read the file back to check what was written; the context manager closes it afterwards
with open("training_time.txt", "r") as f:
  print(f.read())