In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import libs.utils as utl
import tensorflow_hub as hub
import matplotlib.pyplot as plt

from tqdm import tqdm
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

## Instantiating the embedding generator

In [None]:
# TF Hub handle of the pre-trained Universal Sentence Encoder (USE) model, "large" variant, version 4
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-large/4'

# Wrap the hub module in a Keras layer named 'USE_embedding'.
# trainable=False: the encoder is used as a fixed feature extractor here.
USE_embed = hub.KerasLayer(
    module_url,
    name='USE_embedding',
    trainable=False,
)

In [None]:
# Loading the dataset from CSV files for training, testing, and validation.
#
# The "Sentiment" column is encoded as a pandas categorical. The category set is
# derived from the training split ONLY and then reused for the other splits, so
# that a given integer code (see `.cat.codes` further down) always refers to the
# same label in every split -- casting each split independently only gives
# matching codes when every split happens to contain exactly the same labels.
def _load_split(csv_path, sentiment_dtype=None):
    """Read one dataset split and encode its ``Sentiment`` column as categorical.

    Args:
        csv_path: Path of the CSV file to read (decoded as ISO-8859-1).
        sentiment_dtype: Categorical dtype to apply to ``Sentiment``. When
            ``None`` the categories are inferred from this file itself.

    Returns:
        The loaded ``pandas.DataFrame`` with a categorical ``Sentiment`` column.

    Raises:
        ValueError: If ``sentiment_dtype`` is given and the file contains a
            label that is not one of its categories (such rows would otherwise
            silently become NaN / code -1).
    """
    frame = pd.read_csv(csv_path, encoding="ISO-8859-1")
    if sentiment_dtype is None:
        frame["Sentiment"] = frame["Sentiment"].astype('category')
        return frame

    unseen_labels = set(frame["Sentiment"].dropna().unique()) - set(sentiment_dtype.categories)
    if unseen_labels:
        raise ValueError(
            f"{csv_path} contains Sentiment labels missing from the training split: "
            f"{sorted(unseen_labels)}"
        )
    frame["Sentiment"] = frame["Sentiment"].astype(sentiment_dtype)
    return frame


train_dataframe = _load_split("data/Corona_NLP_train.csv")

# Categorical dtype learned from the training labels; shared by the other splits
sentiment_dtype = train_dataframe["Sentiment"].dtype

test_dataframe = _load_split("data/Corona_NLP_test.csv", sentiment_dtype)
valid_dataframe = _load_split("data/Corona_NLP_valid.csv", sentiment_dtype)

# Create a dictionary mapping category codes to their original string labels
label_dict = dict(enumerate(sentiment_dtype.categories))

# Displaying the label distribution of each split (train, validation, test) using Seaborn
for split_dataframe in (train_dataframe, valid_dataframe, test_dataframe):
    sns.catplot(x="Sentiment", kind="count", data=split_dataframe, aspect=2).set_xticklabels(rotation=90)

## Encoding tweet text for embeddings

In [None]:
# Extracting the tweet text data from the dataframes
train_x = train_dataframe['OriginalTweet'].to_numpy()
valid_x = valid_dataframe['OriginalTweet'].to_numpy()
test_x = test_dataframe['OriginalTweet'].to_numpy()

# Encoding the categorical labels for sentiment as integer codes
train_y = train_dataframe["Sentiment"].cat.codes.to_numpy()
valid_y = valid_dataframe["Sentiment"].cat.codes.to_numpy()
test_y = test_dataframe["Sentiment"].cat.codes.to_numpy()

# Generating random batches of data for training, validation, and testing.
# Each result is used below as an indexable, mutable sequence of (x, y) pairs.
# NOTE(review): the exact shuffling / last-batch behaviour lives in
# libs.utils.gen_random_batches -- confirm there.
train_batches = utl.gen_random_batches(train_x, train_y, batch_size = 100)
valid_batches = utl.gen_random_batches(valid_x, valid_y, batch_size = 100)
test_batches = utl.gen_random_batches(test_x, test_y, batch_size = 100)

print("Number of training batches:", len(train_batches))
print("Number of validation batches:", len(valid_batches))


def _encode_batches(batches):
    """Replace the raw text of every ``(x, y)`` batch with its USE embedding, in place.

    Args:
        batches: Mutable sequence of ``(batch_x, batch_y)`` tuples where
            ``batch_x`` is a NumPy array of tweets.

    Returns:
        The same ``batches`` object; each element is now
        ``(embeddings, batch_y)`` with ``embeddings`` a NumPy array taken from
        the ``'outputs'`` entry returned by ``USE_embed``.
    """
    for index, (batch_x, batch_y) in enumerate(batches):
        # Cast every element to str before handing the list to the encoder
        embeddings = USE_embed(batch_x.astype('str').tolist())
        batches[index] = (embeddings['outputs'].numpy(), batch_y)
    return batches


# Encoding the input tweet text data using the Universal Sentence Encoder (USE) model.
# Each split reports the shape of ITS OWN first batch before and after encoding.
for split_name, split_batches in (
    ("training", train_batches),
    ("validation", valid_batches),
    ("test", test_batches),
):
    print(f"Shape of X in the {split_name} batch before encoding:", split_batches[0][0].shape)
    _encode_batches(split_batches)
    print(f"Shape of X in the {split_name} batch after encoding:", split_batches[0][0].shape)

## Instantiating the model

In [None]:
# Definition of the neural network model using the Sequential API in Keras:
# two ReLU hidden layers followed by a linear layer with one unit per sentiment
# label. The last layer has no activation, so the model outputs raw logits.
# NOTE(review): 1014 units looks like a typo for 1024 -- confirm before changing
# it, since a different width is incompatible with weights saved from this model.
model = Sequential([
  layers.Dense(1014, activation='relu'),
  layers.Dense(512, activation='relu'),
  layers.Dense(len(label_dict))
])

# Building the model with an input shape of (None, 512): any batch size,
# 512 features per example (presumably the USE embedding width -- verify
# against the "after encoding" shapes printed earlier).
model.build((None,512))

# Printing a summary of the model architecture.
# summary() prints the table itself and returns None, so wrapping it in print()
# would only append a stray "None" line to the output.
model.summary()

## Defining the loss function and evaluation metrics

In [None]:
# Training objective: sparse categorical cross-entropy (integer class labels).
# from_logits=True -> the loss expects un-normalised scores, not probabilities.
train_loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
)

# Adam optimizer, learning rate fixed at 0.01
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.01,
)

# Running accumulators updated batch by batch during training:
#   - train_loss_acc: mean of the values fed to it (used for the training loss)
#   - train_acu_acc:  accuracy for integer labels
train_loss_acc = tf.keras.metrics.Mean(name='train_loss')
train_acu_acc = tf.keras.metrics.SparseCategoricalAccuracy()

# Same accuracy metric, kept separately for the validation set
valid_acu_acc = tf.keras.metrics.SparseCategoricalAccuracy()

# Project-defined metrics helper; its results are read under the keys
# "F1", "Recall" and "Precision" later in the notebook.
#   - To add a new metric, refer to the CustomMetrics class in the utils.py file
custom_metrics = utl.CustomMetrics()

In [None]:
# Per-epoch history, filled during training and plotted afterwards.
# Training-side curves: loss and accuracy
list_train_loss_results, list_train_acc_results = [], []
# Validation-side curves: accuracy, recall, precision and F1
list_valid_acc_results, list_valid_rec_results = [], []
list_valid_pre_results, list_valid_f1_results = [], []

# Best validation scores seen so far (F1, Recall, Precision), all starting at 0
best_score = dict.fromkeys(("F1", "Recall", "Precision"), 0)

# TensorFlow function for training a single batch
@tf.function
def train_step(batch_x, batch_y):
    """Run one optimisation step on a single batch and update the training metrics.

    Uses the notebook-level ``model``, ``train_loss_obj``, ``optimizer``,
    ``train_loss_acc`` and ``train_acu_acc`` objects.

    Args:
        batch_x: Model inputs for the batch.
        batch_y: Integer class labels for the batch.

    Returns:
        None. The model weights and the two training metrics are updated as
        side effects.
    """
    with tf.GradientTape() as tape:
        # training=True makes the mode explicit, so layers that behave
        # differently at train time (dropout, batch norm) would run correctly
        # if they are ever added to the model.
        predictions = model(batch_x, training=True)
        loss = train_loss_obj(batch_y, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    # Accumulate the batch loss and accuracy for the per-epoch report
    train_loss_acc(loss)
    train_acu_acc(batch_y, predictions)

# Maximum number of training epochs
MAX_EPOCHS = 40
for epoch in tqdm(range(MAX_EPOCHS)):

    # Training with the training batches
    for batch_x, batch_y in train_batches:
        train_step(batch_x, batch_y)

    # Evaluation with the validation batches
    for batch_x, batch_y in valid_batches:
        # Inference-mode forward pass
        logits = model(batch_x, training=False)

        # Updating TensorFlow accuracy metric
        valid_acu_acc(batch_y, logits)

        # Predicted class = index of the highest score. Softmax is monotonic,
        # so argmax over the raw model outputs selects the same class without
        # the extra softmax computation.
        predictions = tf.argmax(logits, axis=1)
        custom_metrics.feed(batch_y, predictions)

    # Saving results in lists for later visualization
    list_train_loss_results.append(train_loss_acc.result().numpy())
    list_train_acc_results.append(train_acu_acc.result().numpy())
    list_valid_acc_results.append(valid_acu_acc.result().numpy())

    # Extracting custom metric results
    custom_results = custom_metrics.results()

    # Saving custom metric results in lists for later visualization
    list_valid_rec_results.append(custom_results["Recall"])
    list_valid_pre_results.append(custom_results["Precision"])
    list_valid_f1_results.append(custom_results["F1"])

    # Checkpoint whenever the validation F1 strictly improves on the best so far.
    # NOTE(review): if no epoch ever scores F1 > 0, nothing is saved and the
    # later load_weights calls will fail.
    if custom_results["F1"] > best_score["F1"]:
        # NOTE(review): this keeps a reference to the object returned by
        # results(); if CustomMetrics reuses/mutates that object, best_score
        # would change with it -- confirm in utils.py.
        best_score = custom_results
        # NOTE(review): absolute path at the filesystem root; it must stay in
        # sync with the load_weights("/model_save/model") calls further down,
        # so change all of them together if this is made relative.
        model.save_weights("/model_save/model", save_format='tf')

    # Resetting the metrics so the next epoch starts from a clean state
    train_loss_acc.reset_states()
    train_acu_acc.reset_states()
    valid_acu_acc.reset_states()
    custom_metrics.reset_states()


print("best result:", best_score)

### Displaying experiment graphs

In [None]:
# One line chart per tracked quantity, drawn in this order:
# training loss, training accuracy, then validation accuracy / recall /
# precision / F1. Each entry pairs the chart title with its per-epoch values.
metric_curves = [
    ('Loss training', list_train_loss_results),
    ('Accuracy training', list_train_acc_results),
    ('Accuracy validation', list_valid_acc_results),
    ('Recall validation', list_valid_rec_results),
    ('Precision validation', list_valid_pre_results),
    ('F1 validation', list_valid_f1_results),
]

for plot_title, epoch_values in metric_curves:
    sns.lineplot(data=epoch_values).set_title(plot_title)
    # Short pause after every chart (presumably so the figure is rendered
    # before the next curve is drawn -- verify with the backend in use)
    plt.pause(0.1)

In [None]:
# Restore the weights checkpointed during training (saved on the epochs where
# the validation F1 improved)
model.load_weights("/model_save/model")

# Confusion matrix of the restored model, computed over the validation batches
cf_matrix = utl.calcule_confusion_matrix(model, valid_batches)

# Large square figure to hold the heatmap
fig_dims = (20, 20)
fig, ax = plt.subplots(figsize=fig_dims)

# Annotated heatmap of the matrix; both axes are labelled with the sentiment names
class_names = label_dict.values()
sns.heatmap(
    cf_matrix,
    annot=True,
    fmt="d",
    linewidths=.5,
    xticklabels=class_names,
    yticklabels=class_names,
    cmap='Blues',
)

## Test

In [None]:
# Loading the model with the best F1 score
model.load_weights("/model_save/model")

# Resetting the custom metrics states before evaluation
custom_metrics.reset_states()

# Evaluation with the batches from the test dataset
for batch_x, batch_y in test_batches:
    # Inference-mode forward pass
    logits = model(batch_x, training=False)

    # Predicted class = index of the highest score. Softmax is monotonic, so
    # argmax over the raw model outputs selects the same class without the
    # extra softmax computation.
    predictions = tf.argmax(logits, axis=1)
    custom_metrics.feed(batch_y, predictions)

# Printing the results of custom metrics (F1, Recall, Precision) for the test dataset
print(custom_metrics.results())