**Nina Dobša, 28.7.2025.**

# Imports

In [None]:
#%pip install gensim

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertForMaskedLM
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# GoEmotions dataset

In [None]:
# Parquet files of the GoEmotions dataset on the Hugging Face Hub, one per split
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet',
}
goemotions_hub_root = "hf://datasets/google-research-datasets/go_emotions/"

# Read every split into its own DataFrame
goemotions_train, goemotions_validation, goemotions_test = (
    pd.read_parquet(goemotions_hub_root + splits[split_name])
    for split_name in ('train', 'validation', 'test')
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Merge the three splits into a single frame with a fresh 0..n-1 index
# (the data is re-split further below by stratified cross-validation)
goemotions_splits = [goemotions_train, goemotions_validation, goemotions_test]
goemotions_data = pd.concat(goemotions_splits, ignore_index=True)
goemotions_data.shape

(54263, 3)

Cleaning and labeling emotions with 0 and 1

In [None]:
# Emotion names ordered by their numeric label: the position in the list is the label id
emotion_names = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]

# Mapping from label id to emotion name
emotion_dictionary = dict(enumerate(emotion_names))

# The emotions modelled in this notebook, with their label ids
target_emotions = {'anger': 2, 'sadness': 25, 'joy': 17, 'disgust' : 11, 'surprise' : 26}

In [None]:
# Add one binary indicator column per target emotion (anger, sadness, joy, disgust, surprise):
# 1 when the emotion's label id is among the sample's labels, otherwise 0
sample_labels = goemotions_data['labels']
for emotion, label in target_emotions.items():
    goemotions_data[emotion] = [1 if label in labels else 0 for labels in sample_labels]

# The id and labels columns are not needed any more
goemotions_data = goemotions_data.drop(columns = ["id", "labels"])
goemotions_data.head()

Unnamed: 0,text,anger,sadness,joy,disgust,surprise
0,My favourite food is anything I didn't have to...,0,0,0,0,0
1,"Now if he does off himself, everyone will thin...",0,0,0,0,0
2,WHY THE FUCK IS BAYLESS ISOING,1,0,0,0,0
3,To make her feel threatened,0,0,0,0,0
4,Dirty Southern Wankers,0,0,0,0,0


In [None]:
# Checking the class distribution of every target column.
# The columns are selected by name (the keys of target_emotions) rather than by
# hard-coded positions, so the check keeps working if the column layout changes.
for emotion in target_emotions:
    class_distribution = goemotions_data[emotion].value_counts()
    print("\nNumber of samples per class:")
    print(class_distribution)


Number of samples per class:
anger
0    52303
1     1960
Name: count, dtype: int64

Number of samples per class:
sadness
0    52638
1     1625
Name: count, dtype: int64

Number of samples per class:
joy
0    52478
1     1785
Name: count, dtype: int64

Number of samples per class:
disgust
0    53250
1     1013
Name: count, dtype: int64

Number of samples per class:
surprise
0    52933
1     1330
Name: count, dtype: int64


The classes are highly imbalanced: every target emotion is present in fewer than 4% of the samples.

In [None]:
# Number of embedding vectors kept per sentence (shorter sentences are padded, longer ones are cut)
max_sequence_length = 30

# Each emotion is a binary problem: the sample either has the target emotion (1) or not (0)
num_classes = 2

# Word2vec embeddings

In [None]:
# Mount Google Drive, where the fine-tuned models are stored
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Load the fine-tuned Word2Vec models (SG and CBOW variants) from Google Drive

# SG model and the length of its word vectors
model_path_SG = "/content/drive/My Drive/modeli/SG_15/SG_15.model"
word2vec_model_SG = Word2Vec.load(model_path_SG)
embedding_dim_SG = word2vec_model_SG.vector_size

# CBOW model and the length of its word vectors
model_path_CBOW = "/content/drive/My Drive/modeli/CBOW_10/CBOW_10.model"
word2vec_model_CBOW = Word2Vec.load(model_path_CBOW)
embedding_dim_CBOW = word2vec_model_CBOW.vector_size

In [None]:
# Function for getting word2vec embeddings
def get_word2vec_embeddings(sentence, model):
    """Return the Word2Vec vectors of the in-vocabulary words of a sentence.

    The sentence is lower-cased and split on whitespace; words that are not
    present in ``model.wv`` are skipped.

    Args:
        sentence: Text to embed.
        model: Object whose ``wv`` attribute supports ``word in model.wv`` and
            ``model.wv[word]`` (the notebook passes the loaded Word2Vec models).

    Returns:
        A list with one vector per known word, in sentence order. The list is
        empty (it is NOT a zero vector) when no word is in the vocabulary.
    """
    # Tokenization: lower-case, then split on whitespace
    words = sentence.lower().split()

    # Look the vector store up once instead of on every word
    vectors = model.wv
    return [vectors[word] for word in words if word in vectors]

In [None]:
# Embed every text with both Word2Vec models.
# Each entry is a list with one vector per in-vocabulary word of the text
# (get_word2vec_embeddings only skips unknown words; no stop-word filtering is applied).
goemotions_texts = goemotions_data['text']

list_of_embeddings_CBOW = [get_word2vec_embeddings(text, word2vec_model_CBOW) for text in goemotions_texts]

list_of_embeddings_SG = [get_word2vec_embeddings(text, word2vec_model_SG) for text in goemotions_texts]

In [None]:
# Bring every sentence to exactly max_sequence_length vectors: sentences with fewer vectors
# are padded at the end, sentences with more vectors are cut at the end.
# pad_sequences returns the result as one NumPy array.
padding_options = dict(maxlen=max_sequence_length, dtype='float16', padding='post', truncating='post')
embeddings_CBOW = pad_sequences(list_of_embeddings_CBOW, **padding_options)

print(embeddings_CBOW)

[[[ 3.9001e-02  3.6621e-01  1.5027e-01 ...  1.4844e-01  8.5388e-02
   -3.7183e-01]
  [ 5.8777e-02  1.4099e-01 -2.4506e-02 ...  1.6016e-01  1.4539e-01
   -2.3120e-01]
  [ 1.3867e-01  6.0516e-02 -2.9492e-01 ...  4.0863e-02 -1.1774e-01
   -1.3013e-01]
  ...
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]]

 [[-7.1960e-02  3.5034e-01  1.0547e-01 ...  9.9335e-03  5.4169e-02
   -3.6914e-01]
  [ 3.0823e-02  1.7139e-01  1.3000e-01 ... -1.9348e-01  1.0748e-01
   -3.3325e-01]
  [-3.1677e-02  2.4951e-01  5.9998e-02 ... -9.0332e-02  2.2083e-01
   -2.9492e-01]
  ...
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.

In [None]:
# Bring every sentence to exactly max_sequence_length vectors: sentences with fewer vectors
# are padded at the end, sentences with more vectors are cut at the end.
# pad_sequences returns the result as one NumPy array.
padding_options = dict(maxlen=max_sequence_length, dtype='float16', padding='post', truncating='post')
embeddings_SG = pad_sequences(list_of_embeddings_SG, **padding_options)

print(embeddings_SG)

[[[ 0.07043   0.1515   -0.12054  ...  0.06464   0.1066    0.0746  ]
  [-0.08136   0.02637  -0.546    ...  0.0169    0.2227    0.1559  ]
  [ 0.3926    0.3855   -0.2922   ... -0.1248   -0.518     0.1681  ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[-0.2399    0.528     0.011635 ...  0.0336   -0.137     0.01942 ]
  [-0.01888   0.3691   -0.0424   ... -0.2284    0.2747   -0.08514 ]
  [-0.05377   0.07684   0.316    ... -0.0638    0.4543   -0.2002  ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[ 0.02072   0.436     0.2274   ... -0.0771    0.2123   -0.32    ]
  [ 0.0618    0.169    -0.2253   ...  0.06683   0.04663  -0.028   ]
  [ 0.02646   0.187    -0.145   

In [None]:
# Report the shape of both padded arrays: (number_of_examples, sequence_length, embedding_dim)
for array_name, padded_array in (("embeddings_CBOW", embeddings_CBOW), ("embeddings_SG", embeddings_SG)):
    print(f"Dimensions of {array_name}: {padded_array.shape}")

Dimensions of embeddings_CBOW: (54263, 30, 300)
Dimensions of embeddings_SG: (54263, 30, 300)


# RNN predictions for Word2Vec embeddings

### Predictions for SG embeddings

In [None]:
# 5-fold stratified cross-validation of a Bidirectional LSTM on the SG embeddings.
# A separate binary classifier is trained for every target emotion.
# Model parameters
n_splits = 5 # 5-fold cross-validation
epochs = 50
batch_size = 16
lstm_units = 128
dropout_rate = 0.3

emotion_columns = ['anger', 'joy', 'sadness', 'disgust', 'surprise'] # column names
all_emotions_results_SG = {}

for emotion_column_name in emotion_columns:
    print(f"\n--- Training a model for emotion: {emotion_column_name} ---")

    y_binary = goemotions_data[emotion_column_name].values

    # Initialize list to store results for individual folds of this emotion
    emotion_fold_results = []

    # StratifiedKFold initialization (the same shuffled folds are produced on every run)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Inside loop: 5-fold cross-validation
    for fold, (train_index, val_index) in enumerate(skf.split(embeddings_SG, y_binary)):
        print(f"  --- Fold {fold + 1}/{n_splits} ---")

        # Splitting data to train and validation sets for the current fold
        X_train_fold, X_val_fold = embeddings_SG[train_index], embeddings_SG[val_index]
        y_train_fold, y_val_fold = y_binary[train_index], y_binary[val_index]

        clear_session() # In each fold we train model from the beginning

        # Building an LSTM Bidirectional model
        model = Sequential()
        model.add(Input(shape=(max_sequence_length, embedding_dim_SG)))
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid')) # 1 neuron for binary classification (sigmoid activation)

        # Model compilation for binary classification
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Early Stopping callback.
        # The validation loss is monitored instead of the validation accuracy: the positive
        # class is so rare that accuracy stays at the majority-class baseline while the model
        # is still learning, which would stop training on an "always negative" model.
        # NOTE(review): the validation fold is used both for early stopping and for the
        # reported metrics, so the scores are optimistic; consider an inner validation split.
        early_stopping_callback = EarlyStopping(
            monitor='val_loss',
            mode='min',
            patience=5,
            restore_best_weights=True,
            verbose=0
        )

        # Training the model
        history = model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stopping_callback],
            verbose=0
        )

        # Evaluating the model
        loss, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # Getting predictions; the (n_samples, 1) output is flattened to the 1-D shape of y_val_fold
        y_pred_proba = model.predict(X_val_fold, verbose=0).ravel()
        y_pred = (y_pred_proba > 0.5).astype(int) # Binary prediction (0 or 1)

        # Calculating metrics; zero_division=0 yields 0 when no positive sample is predicted
        precision = precision_score(y_val_fold, y_pred, zero_division=0)
        recall = recall_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        if len(np.unique(y_val_fold)) < 2: # Cannot calculate if validation set doesn't have both classes
            roc_auc = np.nan
        else:
            roc_auc = roc_auc_score(y_val_fold, y_pred_proba) # Use probabilities for ROC AUC

        # Store results for the current fold
        emotion_fold_results.append({
            'fold': fold + 1,
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        })

    # Store all fold results for the current emotion
    all_emotions_results_SG[emotion_column_name] = emotion_fold_results


--- Training a model for emotion: anger ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: joy ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: sadness ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: disgust ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: surprise ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---


In [None]:
# Summarise the SG cross-validation: mean and standard deviation over the folds, per emotion
for emotion, fold_results_list in all_emotions_results_SG.items():
    print(f"\nResults for emotion: {emotion}")

    # Per-fold values of every reported metric, keyed as in the fold result dictionaries
    per_fold_values = {
        metric: [fold_result[metric] for fold_result in fold_results_list]
        for metric in ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')
    }

    # ROC AUC is stored as NaN for folds where it was not computed, hence the NaN-aware statistics
    avg_roc_auc = np.nanmean(per_fold_values['roc_auc'])
    std_roc_auc = np.nanstd(per_fold_values['roc_auc'])

    report_lines = (
        ("  Accuracy:  ", 'accuracy'),
        ("  Precision: ", 'precision'),
        ("  Recall:    ", 'recall'),
        ("  F1-Score:  ", 'f1_score'),
    )
    for line_prefix, metric in report_lines:
        metric_mean = np.mean(per_fold_values[metric])
        metric_std = np.std(per_fold_values[metric])
        print(f"{line_prefix}{metric_mean:.4f} +/- {metric_std:.4f}")

    if np.isnan(avg_roc_auc):
        print("  ROC AUC:   N/A (not enough classes in validation folds)")
    else:
        print(f"  ROC AUC:   {avg_roc_auc:.4f} +/- {std_roc_auc:.4f}")

    print("-" * 30)


Results for emotion: anger
  Accuracy:  0.9668 +/- 0.0012
  Precision: 0.6273 +/- 0.0623
  Recall:    0.2214 +/- 0.0444
  F1-Score:  0.3226 +/- 0.0479
  ROC AUC:   0.8647 +/- 0.0065
------------------------------

Results for emotion: joy
  Accuracy:  0.9721 +/- 0.0007
  Precision: 0.6215 +/- 0.0195
  Recall:    0.3899 +/- 0.0450
  F1-Score:  0.4774 +/- 0.0314
  ROC AUC:   0.8863 +/- 0.0090
------------------------------

Results for emotion: sadness
  Accuracy:  0.9737 +/- 0.0009
  Precision: 0.7199 +/- 0.0477
  Recall:    0.2000 +/- 0.0420
  F1-Score:  0.3104 +/- 0.0507
  ROC AUC:   0.8500 +/- 0.0143
------------------------------

Results for emotion: disgust
  Accuracy:  0.9824 +/- 0.0002
  Precision: 0.7892 +/- 0.1297
  Recall:    0.0898 +/- 0.0318
  F1-Score:  0.1575 +/- 0.0459
  ROC AUC:   0.8211 +/- 0.0216
------------------------------

Results for emotion: surprise
  Accuracy:  0.9787 +/- 0.0003
  Precision: 0.6630 +/- 0.0329
  Recall:    0.2729 +/- 0.0446
  F1-Score:  0.383

### Predictions for CBOW embeddings

In [None]:
# 5-fold stratified cross-validation of a Bidirectional LSTM on the CBOW embeddings.
# A separate binary classifier is trained for every target emotion.
# Model parameters
n_splits = 5 # 5-fold cross-validation
epochs = 50
batch_size = 16
lstm_units = 128
dropout_rate = 0.3

emotion_columns = ['anger', 'joy', 'sadness', 'disgust', 'surprise'] # column names
all_emotions_results_CBOW = {}

for emotion_column_name in emotion_columns:
    print(f"\n--- Training a model for emotion: {emotion_column_name} ---")

    y_binary = goemotions_data[emotion_column_name].values

    # Initialize list to store results for individual folds of this emotion
    emotion_fold_results = []

    # StratifiedKFold initialization (the same shuffled folds are produced on every run)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Inside loop: 5-fold cross-validation
    for fold, (train_index, val_index) in enumerate(skf.split(embeddings_CBOW, y_binary)):
        print(f"  --- Fold {fold + 1}/{n_splits} ---")

        # Splitting data to train and validation sets for the current fold
        X_train_fold, X_val_fold = embeddings_CBOW[train_index], embeddings_CBOW[val_index]
        y_train_fold, y_val_fold = y_binary[train_index], y_binary[val_index]

        clear_session()  # In each fold we train model from the beginning

        # Building an LSTM Bidirectional model
        model = Sequential()
        model.add(Input(shape=(max_sequence_length, embedding_dim_CBOW)))
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid')) # 1 neuron for binary classification (sigmoid activation)

        # Model compilation for binary classification
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Early Stopping callback.
        # The validation loss is monitored instead of the validation accuracy: the positive
        # class is so rare that accuracy stays at the majority-class baseline while the model
        # is still learning, which would stop training on an "always negative" model.
        # NOTE(review): the validation fold is used both for early stopping and for the
        # reported metrics, so the scores are optimistic; consider an inner validation split.
        early_stopping_callback = EarlyStopping(
            monitor='val_loss',
            mode='min',
            patience=5,
            restore_best_weights=True,
            verbose=0
        )

        # Training the model
        history = model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stopping_callback],
            verbose=0
        )

        # Evaluating the model
        loss, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # Getting predictions; the (n_samples, 1) output is flattened to the 1-D shape of y_val_fold
        y_pred_proba = model.predict(X_val_fold, verbose=0).ravel()
        y_pred = (y_pred_proba > 0.5).astype(int) # Binary prediction (0 or 1)

        # Calculating metrics, with zero_division=0 to handle division by zero cases
        precision = precision_score(y_val_fold, y_pred, zero_division=0)
        recall = recall_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        if len(np.unique(y_val_fold)) < 2: # Cannot calculate if validation set doesn't have both classes
            roc_auc = np.nan
        else:
            roc_auc = roc_auc_score(y_val_fold, y_pred_proba) # Use probabilities for ROC AUC

        # Store results for the current fold
        emotion_fold_results.append({
            'fold': fold + 1,
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        })

    # Store all fold results for the current emotion
    all_emotions_results_CBOW[emotion_column_name] = emotion_fold_results


--- Training a model for emotion: anger ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: joy ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: sadness ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: disgust ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: surprise ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---


In [None]:
# Summarise the CBOW cross-validation: mean and standard deviation over the folds, per emotion
for emotion, fold_results_list in all_emotions_results_CBOW.items():
    print(f"\nResults for emotion: {emotion}")

    # Per-fold values of every reported metric, keyed as in the fold result dictionaries
    per_fold_values = {
        metric: [fold_result[metric] for fold_result in fold_results_list]
        for metric in ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')
    }

    # ROC AUC is stored as NaN for folds where it was not computed, hence the NaN-aware statistics
    avg_roc_auc = np.nanmean(per_fold_values['roc_auc'])
    std_roc_auc = np.nanstd(per_fold_values['roc_auc'])

    report_lines = (
        ("  Accuracy:  ", 'accuracy'),
        ("  Precision: ", 'precision'),
        ("  Recall:    ", 'recall'),
        ("  F1-Score:  ", 'f1_score'),
    )
    for line_prefix, metric in report_lines:
        metric_mean = np.mean(per_fold_values[metric])
        metric_std = np.std(per_fold_values[metric])
        print(f"{line_prefix}{metric_mean:.4f} +/- {metric_std:.4f}")

    if np.isnan(avg_roc_auc):
        print("  ROC AUC:   N/A (not enough classes in validation folds)")
    else:
        print(f"  ROC AUC:   {avg_roc_auc:.4f} +/- {std_roc_auc:.4f}")

    print("-" * 30)


Results for emotion: anger
  Accuracy:  0.9669 +/- 0.0011
  Precision: 0.6622 +/- 0.0760
  Recall:    0.1949 +/- 0.0595
  F1-Score:  0.2926 +/- 0.0695
  ROC AUC:   0.8482 +/- 0.0101
------------------------------

Results for emotion: joy
  Accuracy:  0.9715 +/- 0.0011
  Precision: 0.5969 +/- 0.0240
  Recall:    0.4179 +/- 0.0560
  F1-Score:  0.4895 +/- 0.0399
  ROC AUC:   0.8616 +/- 0.0104
------------------------------

Results for emotion: sadness
  Accuracy:  0.9735 +/- 0.0007
  Precision: 0.7509 +/- 0.0434
  Recall:    0.1778 +/- 0.0403
  F1-Score:  0.2844 +/- 0.0535
  ROC AUC:   0.8162 +/- 0.0189
------------------------------

Results for emotion: disgust
  Accuracy:  0.9813 +/- 0.0000
  Precision: 0.0000 +/- 0.0000
  Recall:    0.0000 +/- 0.0000
  F1-Score:  0.0000 +/- 0.0000
  ROC AUC:   0.5929 +/- 0.0211
------------------------------

Results for emotion: surprise
  Accuracy:  0.9782 +/- 0.0006
  Precision: 0.6482 +/- 0.0371
  Recall:    0.2451 +/- 0.0395
  F1-Score:  0.353

# BERT embeddings

In [None]:
# Mount Google Drive, where the fine-tuned models are stored
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the fine-tuned BERT model and its tokenizer from the same Google Drive folder
model_path = "/content/drive/My Drive/fine_tuned_bert"
bert_model = BertForMaskedLM.from_pretrained(model_path)
bert_tokenizer = BertTokenizer.from_pretrained(model_path)

# Hidden size from the model configuration, used as the embedding dimension for BERT
embedding_dim_bert = bert_model.config.hidden_size

In [None]:
# Function for getting bert embeddings
def get_bert_embeddings(sentence, bert_model, bert_tokenizer, max_sequence_length_for_lstm):
    """Return the last-layer BERT embedding of every non-special token of a sentence.

    Args:
        sentence: Text to embed.
        bert_model: Model that is called with ``output_hidden_states=True`` and whose
            output exposes ``hidden_states``.
        bert_tokenizer: Tokenizer matching ``bert_model``; it must provide ``decode`` and
            ``all_special_tokens``.
        max_sequence_length_for_lstm: Not used by this function (kept for backward
            compatibility); padding/truncation to a fixed length is left to the caller.

    Returns:
        A float32 NumPy array of shape (number_of_kept_tokens, hidden_size). When every
        token is filtered out the array has zero rows.
    """
    # Run on the GPU when one is available; moving an already placed model is a no-op
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    bert_model.to(device)

    inputs = bert_tokenizer(
        sentence,
        return_tensors='pt',
        truncation=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only, so no gradients are tracked
    with torch.no_grad():
        outputs = bert_model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs.get('attention_mask', None),
            token_type_ids=inputs.get('token_type_ids', None),
            output_hidden_states=True
        )

    # Last hidden state without the batch axis: (seq_len, embedding_dim)
    token_embeddings = outputs.hidden_states[-1].squeeze(0).cpu().numpy()
    input_ids = inputs['input_ids'].squeeze(0).cpu().numpy()

    # Filter out special tokens and tokens that decode to whitespace only.
    # The special-token collection is fetched once instead of once per token.
    special_tokens = set(bert_tokenizer.all_special_tokens)
    kept_positions = []
    for position, token_id in enumerate(input_ids):
        token_str = bert_tokenizer.decode([token_id])
        if token_str in special_tokens or not token_str.strip():
            continue
        kept_positions.append(position)

    # Row selection keeps the (n, embedding_dim) layout even when no token is kept
    kept_positions = np.asarray(kept_positions, dtype=np.intp)
    return token_embeddings[kept_positions].astype(np.float32)

In [None]:
# Embed every text with the fine-tuned BERT model: one array of token embeddings per text
list_of_embeddings_BERT = []
for text in goemotions_data['text']:
    text_embeddings = get_bert_embeddings(text, bert_model, bert_tokenizer, max_sequence_length)
    list_of_embeddings_BERT.append(text_embeddings)

In [None]:
# Bring every sentence to exactly max_sequence_length vectors: sentences with fewer vectors
# are padded at the end, sentences with more vectors are cut at the end.
# pad_sequences returns the result as one NumPy array.
padding_options = dict(maxlen=max_sequence_length, dtype='float16', padding='post', truncating='post')
embeddings_BERT = pad_sequences(list_of_embeddings_BERT, **padding_options)

print (embeddings_BERT)

[[[-0.536    -0.2008    0.8955   ...  0.2314   -0.2686    0.7583  ]
  [-0.04874   0.363     1.516    ...  0.048    -0.6094   -0.01744 ]
  [-0.1783   -0.1528   -0.235    ...  0.27     -0.03848  -0.336   ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[-0.02016  -0.002483  1.163    ... -0.2085   -0.10297   0.1288  ]
  [ 0.2874    0.006336  0.2288   ... -0.943    -0.2607    0.4963  ]
  [-0.6045   -0.5537    0.437    ... -0.2007    0.509    -0.648   ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[ 0.348     0.1493    0.07184  ... -0.1663   -0.1428    0.4436  ]
  [-0.576    -0.363     0.461    ...  0.2825    0.589    -0.4058  ]
  [ 0.7134    0.782     1.447   

In [None]:
# Report the shape of the padded BERT array: (number_of_examples, sequence_length, embedding_dim)
embeddings_BERT_shape = embeddings_BERT.shape
print(f"\nDimensions of embeddings_BERT: {embeddings_BERT_shape}")


Dimensions of embeddings_BERT: (54263, 30, 768)


# RNN predictions for BERT Embeddings

In [None]:
import tensorflow as tf
from keras import mixed_precision

# Select the global Keras dtype policy used by the models built after this cell.
print("Configuring Keras Mixed Precision Policy...")
try:
    if tf.config.list_physical_devices('GPU'):
        # On GPU, 'mixed_float16' is recommended as it's typically faster and saves memory
        # while maintaining numerical stability by keeping some variables in float32.
        policy = mixed_precision.Policy('mixed_float16')
        print("  GPU detected. Setting policy to 'mixed_float16' for optimal performance and memory.")
    else:
        # A pure 'float16' policy would also keep the variables in float16, which the Keras
        # mixed precision guide advises against as numerically unstable, so the CPU run
        # stays on the default full-precision policy.
        policy = mixed_precision.Policy('float32')
        print("  CPU detected. Keeping the default 'float32' policy (pure 'float16' is numerically unstable).")

    mixed_precision.set_global_policy(policy)
    print(f"  Global Keras policy set to: {mixed_precision.global_policy().name}")

except Exception as e:
    # Best-effort configuration: training can still run with the policy that is already active
    print(f"  Could not set mixed precision policy: {e}")
    print("  Proceeding with default (likely float32) policy as a fallback.")

Configuring Keras Mixed Precision Policy...
  GPU detected. Setting policy to 'mixed_float16' for optimal performance and memory.
  Global Keras policy set to: mixed_float16


In [None]:
# 5-fold stratified cross-validation of a Bidirectional LSTM on the BERT embeddings.
# A separate binary classifier is trained for every target emotion.
# Model parameters
n_splits = 5 # 5-fold cross-validation
epochs = 50
batch_size = 16
lstm_units = 128
dropout_rate = 0.3

emotion_columns = ['anger', 'joy', 'sadness', 'disgust', 'surprise'] # column names
all_emotions_results_BERT = {}

for emotion_column_name in emotion_columns:
    print(f"\n--- Training a model for emotion: {emotion_column_name} ---")

    y_binary = goemotions_data[emotion_column_name].values

    # Initialize list to store results for individual folds of this emotion
    emotion_fold_results = []

    # StratifiedKFold initialization (the same shuffled folds are produced on every run)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Inside loop: 5-fold cross-validation
    for fold, (train_index, val_index) in enumerate(skf.split(embeddings_BERT, y_binary)):
        print(f"  --- Fold {fold + 1}/{n_splits} ---")

        # Splitting data to train and validation sets for the current fold
        X_train_fold, X_val_fold = embeddings_BERT[train_index], embeddings_BERT[val_index]
        y_train_fold, y_val_fold = y_binary[train_index], y_binary[val_index]

        clear_session()  # In each fold we train model from the beginning

        # Building an LSTM Bidirectional model
        model = Sequential()
        model.add(Input(shape=(max_sequence_length, embedding_dim_bert)))
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(dropout_rate))
        # 1 neuron for binary classification (sigmoid activation).
        # The output layer is pinned to float32: under a 'mixed_float16' global policy the
        # probabilities would otherwise be produced in float16, which is too coarse for the
        # loss and for ranking-based metrics such as ROC AUC.
        model.add(Dense(1, activation='sigmoid', dtype='float32'))

        # Model compilation for binary classification
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Early Stopping callback.
        # The validation loss is monitored instead of the validation accuracy: the positive
        # class is so rare that accuracy stays at the majority-class baseline while the model
        # is still learning, which would stop training on an "always negative" model.
        # NOTE(review): the validation fold is used both for early stopping and for the
        # reported metrics, so the scores are optimistic; consider an inner validation split.
        early_stopping_callback = EarlyStopping(
            monitor='val_loss',
            mode='min',
            patience=5,
            restore_best_weights=True,
            verbose=0
        )

        # Training the model
        history = model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stopping_callback],
            verbose=0
        )

        # Evaluating the model
        loss, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # Getting predictions; the (n_samples, 1) output is flattened to the 1-D shape of y_val_fold
        y_pred_proba = model.predict(X_val_fold, verbose=0).ravel()
        y_pred = (y_pred_proba > 0.5).astype(int) # Binary prediction (0 or 1)

        # Calculating metrics, with zero_division=0 to handle division by zero cases
        precision = precision_score(y_val_fold, y_pred, zero_division=0)
        recall = recall_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        if len(np.unique(y_val_fold)) < 2: # Cannot calculate if validation set doesn't have both classes
            roc_auc = np.nan
        else:
            roc_auc = roc_auc_score(y_val_fold, y_pred_proba) # Use probabilities for ROC AUC

        # Store results for the current fold
        emotion_fold_results.append({
            'fold': fold + 1,
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        })

    # Store all fold results for the current emotion
    all_emotions_results_BERT[emotion_column_name] = emotion_fold_results


--- Training a model for emotion: anger ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: joy ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: sadness ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: disgust ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---

--- Training a model for emotion: surprise ---
  --- Fold 1/5 ---
  --- Fold 2/5 ---
  --- Fold 3/5 ---
  --- Fold 4/5 ---
  --- Fold 5/5 ---


In [None]:
# Summarise the BERT cross-validation: mean and standard deviation over the folds, per emotion
for emotion, fold_results_list in all_emotions_results_BERT.items():
    print(f"\nResults for emotion: {emotion}")

    # Per-fold values of every reported metric, keyed as in the fold result dictionaries
    per_fold_values = {
        metric: [fold_result[metric] for fold_result in fold_results_list]
        for metric in ('accuracy', 'precision', 'recall', 'f1_score', 'roc_auc')
    }

    # ROC AUC is stored as NaN for folds where it was not computed, hence the NaN-aware statistics
    avg_roc_auc = np.nanmean(per_fold_values['roc_auc'])
    std_roc_auc = np.nanstd(per_fold_values['roc_auc'])

    report_lines = (
        ("  Accuracy:  ", 'accuracy'),
        ("  Precision: ", 'precision'),
        ("  Recall:    ", 'recall'),
        ("  F1-Score:  ", 'f1_score'),
    )
    for line_prefix, metric in report_lines:
        metric_mean = np.mean(per_fold_values[metric])
        metric_std = np.std(per_fold_values[metric])
        print(f"{line_prefix}{metric_mean:.4f} +/- {metric_std:.4f}")

    if np.isnan(avg_roc_auc):
        print("  ROC AUC:   N/A (not enough classes in validation folds)")
    else:
        print(f"  ROC AUC:   {avg_roc_auc:.4f} +/- {std_roc_auc:.4f}")

    print("-" * 30)


Results for emotion: anger
  Accuracy:  0.9673 +/- 0.0011
  Precision: 0.6480 +/- 0.0535
  Recall:    0.2265 +/- 0.0695
  F1-Score:  0.3270 +/- 0.0740
  ROC AUC:   0.9005 +/- 0.0137
------------------------------

Results for emotion: joy
  Accuracy:  0.9728 +/- 0.0008
  Precision: 0.6463 +/- 0.0273
  Recall:    0.3888 +/- 0.0278
  F1-Score:  0.4844 +/- 0.0204
  ROC AUC:   0.9225 +/- 0.0090
------------------------------

Results for emotion: sadness
  Accuracy:  0.9756 +/- 0.0012
  Precision: 0.7273 +/- 0.0715
  Recall:    0.3071 +/- 0.0324
  F1-Score:  0.4289 +/- 0.0291
  ROC AUC:   0.9266 +/- 0.0044
------------------------------

Results for emotion: disgust
  Accuracy:  0.9834 +/- 0.0004
  Precision: 0.6836 +/- 0.0747
  Recall:    0.2301 +/- 0.0806
  F1-Score:  0.3314 +/- 0.0849
  ROC AUC:   0.9166 +/- 0.0152
------------------------------

Results for emotion: surprise
  Accuracy:  0.9791 +/- 0.0009
  Precision: 0.6699 +/- 0.0454
  Recall:    0.3068 +/- 0.0994
  F1-Score:  0.407