**Nina Dobša, 28.7.2025.**

# Imports

In [None]:
%pip install gensim

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertForMaskedLM
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [1]:
# Checking how much RAM memory do we have
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [2]:
# Checking if we're using GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Jul 27 21:30:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# GoEmotions dataset

In [2]:
splits = {'train': 'simplified/train-00000-of-00001.parquet',
          'validation': 'simplified/validation-00000-of-00001.parquet',
          'test': 'simplified/test-00000-of-00001.parquet'}

goemotions_train = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])
goemotions_validation = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["validation"])
goemotions_test = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["test"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
goemotions_data = pd.concat([goemotions_train, goemotions_validation, goemotions_test], ignore_index=True)
goemotions_data.shape

(54263, 3)

Cleaning and labeling emotions with 0 and 1

In [4]:
# Mapping numbers from labels with emotions
emotion_dictionary = { 0 : 'admiration', 1 : 'amusement', 2 : 'anger', 3 : 'annoyance', 4 : 'approval', 5 : 'caring', 6 : 'confusion', 7 : 'curiosity', 8 : 'desire',
                      9 : 'disappointment', 10 : 'disapproval', 11 : 'disgust', 12 : 'embarrassment', 13 : 'excitement', 14 : 'fear', 15 : 'gratitude', 16 : 'grief',
                      17 : 'joy', 18 : 'love', 19 : 'nervousness', 20 : 'optimism', 21 : 'pride', 22 : 'realization', 23 : 'relief', 24 : 'remorse', 25 : 'sadness',
                      26 : 'surprise', 27 : 'neutral' }


target_emotions = {'anger': 2, 'sadness': 25, 'joy': 17, 'disgust' : 11, 'surprise' : 26}

In [5]:
# Creating new columns for each emotion (anger, joy, sadness, disgust, surprise)
# in train, validation and test goemotion datasets
for emotion, label in target_emotions.items():
    goemotions_data[emotion] = goemotions_data['labels'].apply(lambda x: 1 if label in x else 0)

# Dropping id and labels columns
goemotions_data = goemotions_data.drop(columns = ["id", "labels"])
goemotions_data.head()

Unnamed: 0,text,anger,sadness,joy,disgust,surprise
0,My favourite food is anything I didn't have to...,0,0,0,0,0
1,"Now if he does off himself, everyone will thin...",0,0,0,0,0
2,WHY THE FUCK IS BAYLESS ISOING,1,0,0,0,0
3,To make her feel threatened,0,0,0,0,0
4,Dirty Southern Wankers,0,0,0,0,0


In [6]:
# Checking class distribution of target valuess
for i in range (1, 6):
    class_distribution = goemotions_data.iloc[:, i].value_counts()
    print("\nNumber of samples per class:")
    print(class_distribution)


Number of samples per class:
anger
0    52303
1     1960
Name: count, dtype: int64

Number of samples per class:
sadness
0    52638
1     1625
Name: count, dtype: int64

Number of samples per class:
joy
0    52478
1     1785
Name: count, dtype: int64

Number of samples per class:
disgust
0    53250
1     1013
Name: count, dtype: int64

Number of samples per class:
surprise
0    52933
1     1330
Name: count, dtype: int64


Classes are disbalanced...

In [7]:
# Defining number of classes and length of the sequence (if the sentence is shorter padding will be added)
num_classes = 2 # sample has the target emotion or it does not (0 or 1)
max_sequence_length = 30

# Word2vec embeddings

In [None]:
# Connecting with google drive where fine tuned models are stored
from google.colab import drive
drive.mount('/content/drive')

In [18]:
# Import of fine tuned word2vec model
model_path_SG = "/content/drive/My Drive/fine_tuned_word2vec_sg/fine_tuned_word2vec_sg.model"
model_path_CBOW = "/content/drive/My Drive/fine_tuned_word2vec_cbow/fine_tuned_word2vec_cbow.model"
word2vec_model_SG = Word2Vec.load(model_path_SG)
word2vec_model_CBOW = Word2Vec.load(model_path_CBOW)

embedding_dim_SG = word2vec_model_SG.vector_size
embedding_dim_CBOW = word2vec_model_CBOW.vector_size

In [19]:
# Function for getting word2vec embeddings
def get_word2vec_embeddings(sentence, model):
    # Tokenization
    sentence = sentence.lower()
    words = sentence.split()
    word_embeddings = [model.wv[word] for word in words if word in model.wv]

    if not word_embeddings:  # If no words are in the vocabulary, return a zero vector
        return []

    return word_embeddings

In [20]:
# Getting word2vec embeddings for each word in the sentence (except stowords)
list_of_embeddings_CBOW = [
   get_word2vec_embeddings(text, word2vec_model_CBOW)
    for text in goemotions_data['text']
]

list_of_embeddings_SG = [
   get_word2vec_embeddings(text, word2vec_model_SG)
    for text in goemotions_data['text']
]

In [21]:
# Padding sequences to the same length -> adding zeroes to the sequences shorter than embedding_dim and cutting sequences longer than embedding_dim
# Converting to NumPy array
embeddings_CBOW = pad_sequences(
    list_of_embeddings_CBOW,
    maxlen = max_sequence_length,
    dtype = 'float16',
    padding = 'post',
    truncating = 'post'
)

print(embeddings_CBOW)

[[[-0.4407    1.928     0.327    ...  0.015305 -1.564     0.739   ]
  [-0.1677    0.0437   -0.1992   ... -0.2135    0.579     0.1512  ]
  [ 0.1854    0.0452   -1.002    ... -1.923    -0.3738   -0.5596  ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[-0.448    -0.2402    2.35     ...  1.032    -1.137     0.7905  ]
  [-0.554     0.1405    0.4563   ... -0.6997   -0.00902   0.4053  ]
  [-0.3103    0.6895    1.6875   ... -1.312     1.014    -1.18    ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[-0.6646   -0.169     1.161    ...  1.207     0.37      0.3103  ]
  [ 0.1564    0.6187    0.5796   ...  0.0896    0.925     0.6357  ]
  [-0.1267    0.393    -0.5347  

In [22]:
# Padding sequences to the same length -> adding zeroes to the sequences shorter than embedding_dim and cutting sequences longer than embedding_dim
# Converting to NumPy array
embeddings_SG = pad_sequences(
    list_of_embeddings_SG,
    maxlen = max_sequence_length,
    dtype = 'float16',
    padding = 'post',
    truncating = 'post'
)

print(embeddings_SG)

[[[-0.1522    0.2324   -0.2382   ...  0.0225    0.04294   0.05774 ]
  [ 0.1143   -0.03464  -0.1171   ...  0.1421    0.1758   -0.3223  ]
  [ 0.04355   0.3364    0.0258   ... -0.1736    0.1401   -0.2083  ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[-0.0746    0.4734    0.0384   ...  0.0987   -0.0737   -0.06726 ]
  [-0.0376    0.162    -0.0973   ... -0.1958    0.2776   -0.03525 ]
  [-0.1488    0.4314   -0.00872  ... -0.2686    0.4297   -0.1893  ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[-0.08185   0.4246   -0.03217  ... -0.09064  -0.0999    0.000683]
  [ 0.01599  -0.09045  -0.2096   ... -0.07776  -0.01978   0.0628  ]
  [-0.007088  0.4177   -0.3674  

In [23]:
# Checking the dimensions of the embeddings
print(f"Dimensions of embeddings_CBOW: {embeddings_CBOW.shape}") # (number_of_examples, sequence_length, embedding_dim)
print(f"Dimensions of embeddings_SG: {embeddings_SG.shape}") # (number_of_examples, sequence_length, embedding_dim)

Dimensions of embeddings_CBOW: (54263, 30, 300)
Dimensions of embeddings_SG: (54263, 30, 300)


# RNN predictions for Word2Vec embeddings

### Predictions for SG embeddings

In [24]:
# 5-fold cross validation using Bidirectional LSTM RNN
# Model parameters
n_splits = 5 # 5-fold cross-validation
epochs = 50
batch_size = 16
lstm_units = 128
dropout_rate = 0.3

emotion_columns = ['anger', 'joy'] # Column names
all_emotions_results_SG = {}

for emotion_column_name in emotion_columns:
    print(f"\n--- Training a model for emotion: {emotion_column_name} ---")

    y_binary = goemotions_data[emotion_column_name].values

    # Initialize list to store results for individual folds of this emotion
    emotion_fold_results = []

    # StratifiedKFold initialization
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # nside loop: 5-fold cross-validation
    for fold, (train_index, val_index) in enumerate(skf.split(embeddings_SG, y_binary)):
        print(f"  --- Fold {fold + 1}/{n_splits} ---")

        # Splitting data to train and validation sets for the current fold
        X_train_fold, X_val_fold = embeddings_SG[train_index], embeddings_SG[val_index]
        y_train_fold, y_val_fold = y_binary[train_index], y_binary[val_index]

        clear_session() # In each fold we train model from the beginning

        # Building an LSTM Bidirectional model
        model = Sequential()
        model.add(Input(shape=(max_sequence_length, embedding_dim_SG)))
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid')) # 1 neuoron for binary classification (sigmoid activation)

        # Model compilation for binary classification
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Early Stopping callback
        early_stopping_callback = EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True,
            verbose=0
        )

        # Training the model
        history = model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stopping_callback],
            verbose=0
        )

        # Evaluating the model
        loss, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # Getting predictions
        y_pred_proba = model.predict(X_val_fold, verbose=0)
        y_pred = (y_pred_proba > 0.5).astype(int) # Binary prediction (0 ili 1)

        # Calculating metrics
        precision = precision_score(y_val_fold, y_pred, zero_division=0)
        recall = recall_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        if len(np.unique(y_val_fold)) < 2: # Cannot calculate if validation set doesn't have both classes
            roc_auc = np.nan
        else:
            roc_auc = roc_auc_score(y_val_fold, y_pred_proba)

        print(f"    Fold {fold + 1} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")

        # Store results for the current fold
        emotion_fold_results.append({
            'fold': fold + 1,
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        })

    # Store all fold results for the current emotion
    all_emotions_results_SG[emotion_column_name] = emotion_fold_results


--- Training a model for emotion: anger ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.1110, Accuracy: 0.9666, Precision: 0.5960, Recall: 0.2296, F1-Score: 0.3315, ROC AUC: 0.8645
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.1194, Accuracy: 0.9669, Precision: 0.7538, Recall: 0.1250, F1-Score: 0.2144, ROC AUC: 0.8514
  --- Fold 3/5 ---
    Fold 3 - Loss: 0.1179, Accuracy: 0.9665, Precision: 0.6707, Recall: 0.1403, F1-Score: 0.2321, ROC AUC: 0.8625
  --- Fold 4/5 ---
    Fold 4 - Loss: 0.1145, Accuracy: 0.9682, Precision: 0.6643, Recall: 0.2423, F1-Score: 0.3551, ROC AUC: 0.8587
  --- Fold 5/5 ---
    Fold 5 - Loss: 0.1189, Accuracy: 0.9668, Precision: 0.6404, Recall: 0.1862, F1-Score: 0.2885, ROC AUC: 0.8437

--- Training a model for emotion: joy ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.0971, Accuracy: 0.9715, Precision: 0.6558, Recall: 0.2829, F1-Score: 0.3953, ROC AUC: 0.8781
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.0965, Accuracy: 0.9721, Precision: 0.6311, Recall: 0.3641, F1-Score: 0.461

In [25]:
# Calculating and printing the average results for each emotion across all folds
for emotion, fold_results_list in all_emotions_results_SG.items():
    print(f"\nResults for emotion: {emotion}")

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    roc_aucs = []

    for result_dict in fold_results_list:
        accuracies.append(result_dict['accuracy'])
        precisions.append(result_dict['precision'])
        recalls.append(result_dict['recall'])
        f1_scores.append(result_dict['f1_score'])
        roc_aucs.append(result_dict['roc_auc'])

    # Calculate mean and standard deviation for each metric
    avg_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    avg_precision = np.mean(precisions)
    std_precision = np.std(precisions)

    avg_recall = np.mean(recalls)
    std_recall = np.std(recalls)

    avg_f1_score = np.mean(f1_scores)
    std_f1_score = np.std(f1_scores)

    avg_roc_auc = np.nanmean(roc_aucs)
    std_roc_auc = np.nanstd(roc_aucs)

    print(f"  Accuracy:  {avg_accuracy:.4f} +/- {std_accuracy:.4f}")
    print(f"  Precision: {avg_precision:.4f} +/- {std_precision:.4f}")
    print(f"  Recall:    {avg_recall:.4f} +/- {std_recall:.4f}")
    print(f"  F1-Score:  {avg_f1_score:.4f} +/- {std_f1_score:.4f}")

    if not np.isnan(avg_roc_auc):
        print(f"  ROC AUC:   {avg_roc_auc:.4f} +/- {std_roc_auc:.4f}")
    else:
        print("  ROC AUC:   N/A (not enough classes in validation folds)")

    print("-" * 30)


Results for emotion: anger
  Accuracy:  0.9670 +/- 0.0006
  Precision: 0.6651 +/- 0.0516
  Recall:    0.1847 +/- 0.0466
  F1-Score:  0.2843 +/- 0.0545
  ROC AUC:   0.8562 +/- 0.0077
------------------------------

Results for emotion: joy
  Accuracy:  0.9724 +/- 0.0007
  Precision: 0.6340 +/- 0.0132
  Recall:    0.3866 +/- 0.0626
  F1-Score:  0.4767 +/- 0.0485
  ROC AUC:   0.8841 +/- 0.0101
------------------------------


### Predictions for CBOW embeddings

In [26]:
# 5-fold cross validation using Bidirectional LSTM RNN
# Model parameters
n_splits = 5 # 5-fold cross-validation
epochs = 50
batch_size = 16
lstm_units = 128
dropout_rate = 0.3

emotion_columns = ['anger', 'joy'] # column names
all_emotions_results_CBOW = {}

for emotion_column_name in emotion_columns:
    print(f"\n--- Training a model for emotion: {emotion_column_name} ---")

    y_binary = goemotions_data[emotion_column_name].values

    # Initialize list to store results for individual folds of this emotion
    emotion_fold_results = []

    # StratifiedKFold initialization
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # nside loop: 5-fold cross-validation
    for fold, (train_index, val_index) in enumerate(skf.split(embeddings_CBOW, y_binary)):
        print(f"  --- Fold {fold + 1}/{n_splits} ---")

        # Splitting data to train and validation sets for the current fold
        X_train_fold, X_val_fold = embeddings_CBOW[train_index], embeddings_CBOW[val_index]
        y_train_fold, y_val_fold = y_binary[train_index], y_binary[val_index]

        clear_session()  # In each fold we train model from the beginning

        # Building an LSTM Bidirectional model
        model = Sequential()
        model.add(Input(shape=(max_sequence_length, embedding_dim_CBOW)))
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid')) # 1 neuoron for binary classification (sigmoid activation)

        # Model compilation for binary classification
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Early Stopping callback
        early_stopping_callback = EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True,
            verbose=0
        )

        # Training the model
        history = model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stopping_callback],
            verbose=0
        )

        # Evaluating the model
        loss, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # Getting predictions
        y_pred_proba = model.predict(X_val_fold, verbose=0)
        y_pred = (y_pred_proba > 0.5).astype(int) # Binary prediction (0 ili 1)

        # Calculating metrics, with zero_division=0 to handle division by zero cases
        precision = precision_score(y_val_fold, y_pred, zero_division=0)
        recall = recall_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        if len(np.unique(y_val_fold)) < 2: # Cannot calculate if validation set doesn't have both classes
            roc_auc = np.nan
        else:
            roc_auc = roc_auc_score(y_val_fold, y_pred_proba) # Use probabilities for ROC AUC

        print(f"    Fold {fold + 1} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")

        # Store results for the current fold
        emotion_fold_results.append({
            'fold': fold + 1,
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        })

    # Store all fold results for the current emotion
    all_emotions_results_CBOW[emotion_column_name] = emotion_fold_results


--- Training a model for emotion: anger ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.1162, Accuracy: 0.9667, Precision: 0.6962, Recall: 0.1403, F1-Score: 0.2335, ROC AUC: 0.8531
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.1200, Accuracy: 0.9665, Precision: 0.6129, Recall: 0.1939, F1-Score: 0.2946, ROC AUC: 0.8414
  --- Fold 3/5 ---
    Fold 3 - Loss: 0.1179, Accuracy: 0.9664, Precision: 0.6337, Recall: 0.1633, F1-Score: 0.2596, ROC AUC: 0.8414
  --- Fold 4/5 ---
    Fold 4 - Loss: 0.1183, Accuracy: 0.9684, Precision: 0.7094, Recall: 0.2117, F1-Score: 0.3261, ROC AUC: 0.8451
  --- Fold 5/5 ---
    Fold 5 - Loss: 0.1630, Accuracy: 0.9662, Precision: 0.5850, Recall: 0.2194, F1-Score: 0.3191, ROC AUC: 0.7982

--- Training a model for emotion: joy ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.1007, Accuracy: 0.9704, Precision: 0.6184, Recall: 0.2633, F1-Score: 0.3694, ROC AUC: 0.8533
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.0988, Accuracy: 0.9713, Precision: 0.6009, Recall: 0.3754, F1-Score: 0.462

In [27]:
# Calculating and printing the average results for each emotion across all folds
for emotion, fold_results_list in all_emotions_results_CBOW.items():
    print(f"\nResults for emotion: {emotion}")

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    roc_aucs = []

    for result_dict in fold_results_list:
        accuracies.append(result_dict['accuracy'])
        precisions.append(result_dict['precision'])
        recalls.append(result_dict['recall'])
        f1_scores.append(result_dict['f1_score'])
        roc_aucs.append(result_dict['roc_auc'])

    # Calculate mean and standard deviation for each metric
    avg_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    avg_precision = np.mean(precisions)
    std_precision = np.std(precisions)

    avg_recall = np.mean(recalls)
    std_recall = np.std(recalls)

    avg_f1_score = np.mean(f1_scores)
    std_f1_score = np.std(f1_scores)

    avg_roc_auc = np.nanmean(roc_aucs)
    std_roc_auc = np.nanstd(roc_aucs)

    print(f"  Accuracy:  {avg_accuracy:.4f} +/- {std_accuracy:.4f}")
    print(f"  Precision: {avg_precision:.4f} +/- {std_precision:.4f}")
    print(f"  Recall:    {avg_recall:.4f} +/- {std_recall:.4f}")
    print(f"  F1-Score:  {avg_f1_score:.4f} +/- {std_f1_score:.4f}")

    if not np.isnan(avg_roc_auc):
        print(f"  ROC AUC:   {avg_roc_auc:.4f} +/- {std_roc_auc:.4f}")
    else:
        print("  ROC AUC:   N/A (not enough classes in validation folds)")

    print("-" * 30)


Results for emotion: anger
  Accuracy:  0.9668 +/- 0.0008
  Precision: 0.6474 +/- 0.0479
  Recall:    0.1857 +/- 0.0298
  F1-Score:  0.2866 +/- 0.0353
  ROC AUC:   0.8358 +/- 0.0193
------------------------------

Results for emotion: joy
  Accuracy:  0.9716 +/- 0.0008
  Precision: 0.6303 +/- 0.0241
  Recall:    0.3378 +/- 0.0536
  F1-Score:  0.4368 +/- 0.0439
  ROC AUC:   0.8632 +/- 0.0093
------------------------------


# BERT embeddings

In [9]:
# Connecting with google drive where fine tuned models are stored
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Import of fine tuned bert model
model_path = "/content/drive/My Drive/fine_tuned_bert"
bert_tokenizer = BertTokenizer.from_pretrained(model_path)
bert_model = BertForMaskedLM.from_pretrained(model_path)
embedding_dim_bert = bert_model.config.hidden_size

In [11]:
# Function for getting bert embeddings
def get_bert_embeddings(sentence, bert_model, bert_tokenizer, max_sequence_length_for_lstm):

    padding_zeroes = np.zeros(embedding_dim_bert, dtype=np.float16) # used to pad sequences shorter than sequence_length (50)

    inputs = bert_tokenizer(
        sentence,
        return_tensors='pt',
        truncation=True
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    bert_model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = bert_model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs.get('attention_mask', None),
            token_type_ids=inputs.get('token_type_ids', None),
            output_hidden_states=True
        )

    # Get the last hidden state (token embeddings)
    token_embeddings_np = outputs.hidden_states[-1].squeeze(0).to(torch.float16).cpu().numpy()
    #token_embeddings_np = outputs.hidden_states[-1].squeeze(0).cpu().numpy()
    input_ids = inputs['input_ids'].squeeze(0).cpu().numpy()

    # Filter out special tokens and empty strings
    filtered_embeddings_list = []
    for i, token_id in enumerate(input_ids):
        token_str = bert_tokenizer.decode(token_id)

        if token_str in bert_tokenizer.all_special_tokens or not token_str.strip():
            continue

        filtered_embeddings_list.append(token_embeddings_np[i])

    current_len = len(filtered_embeddings_list)

    if current_len == 0:
        return np.full((max_sequence_length_for_lstm, embedding_dim_bert), padding_zeroes, dtype=np.float16)

    # Initialize the final embeddings array with padding zeroes
    final_embeddings_array = np.full((max_sequence_length_for_lstm, embedding_dim_bert), padding_zeroes, dtype=np.float16)

    # If the current length is greater than or equal to the max sequence length (50), truncate the list
    # else, fill the array with the available embeddings
    if current_len >= max_sequence_length_for_lstm:
        final_embeddings_array = np.array(filtered_embeddings_list[:max_sequence_length_for_lstm], dtype=np.float16)
    else:
        final_embeddings_array[:current_len] = np.array(filtered_embeddings_list, dtype=np.float16)

    return final_embeddings_array

In [12]:
# Getting bert embeddings for each word in the sentence
list_of_embeddings_BERT = [
   get_bert_embeddings(text, bert_model, bert_tokenizer, max_sequence_length)
    for text in goemotions_data['text']
]

In [13]:
# Padding sequences to the same length -> adding zeroes to the sequences shorter than embedding_dim and cutting sequences longer than embedding_dim
# Converting to NumPy array
embeddings_BERT = pad_sequences(
    list_of_embeddings_BERT,
    maxlen = max_sequence_length,
    dtype = 'float16',
    padding = 'post',
    truncating = 'post'
)

print (embeddings_BERT)

[[[-0.536    -0.2008    0.8955   ...  0.2314   -0.2686    0.7583  ]
  [-0.04874   0.363     1.516    ...  0.048    -0.6094   -0.01744 ]
  [-0.1783   -0.1528   -0.235    ...  0.27     -0.03848  -0.336   ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[-0.02016  -0.002483  1.163    ... -0.2085   -0.10297   0.1288  ]
  [ 0.2874    0.006336  0.2288   ... -0.943    -0.2607    0.4963  ]
  [-0.6045   -0.5537    0.437    ... -0.2007    0.509    -0.648   ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[ 0.348     0.1493    0.07184  ... -0.1663   -0.1428    0.4436  ]
  [-0.576    -0.363     0.461    ...  0.2825    0.589    -0.4058  ]
  [ 0.7134    0.782     1.447   

In [14]:
# Checking the dimensions of the embeddings
print(f"\nDimensions of embeddings_BERT: {embeddings_BERT.shape}") # (number_of_examples, sequence_length, embedding_dim)


Dimensions of embeddings_BERT: (54263, 30, 768)


# RNN predictions for BERT Embeddings

In [15]:
import tensorflow as tf
from keras import mixed_precision
print("Configuring Keras Mixed Precision Policy...")
try:
    if tf.config.list_physical_devices('GPU'):
        # On GPU, 'mixed_float16' is recommended as it's typically faster and saves memory
        # while maintaining numerical stability by keeping some variables in float32.
        policy = mixed_precision.Policy('mixed_float16')
        print("  GPU detected. Setting policy to 'mixed_float16' for optimal performance and memory.")
    else:
        policy = mixed_precision.Policy('float16')
        print("  CPU detected. Setting policy to 'float16' for maximum memory optimization.")

    mixed_precision.set_global_policy(policy)
    print(f"  Global Keras policy set to: {mixed_precision.global_policy().name}")

except Exception as e:
    print(f"  Could not set mixed precision policy: {e}")
    print("  Proceeding with default (likely float32) policy as a fallback.")

Configuring Keras Mixed Precision Policy...
  GPU detected. Setting policy to 'mixed_float16' for optimal performance and memory.
  Global Keras policy set to: mixed_float16


In [16]:
# 5-fold cross validation using Bidirectional LSTM RNN
# Model parameters
n_splits = 5 # 5-fold cross-validation
epochs = 50
batch_size = 16
lstm_units = 128
dropout_rate = 0.3

emotion_columns = ['anger', 'joy'] # column names
all_emotions_results_BERT = {}

for emotion_column_name in emotion_columns:
    print(f"\n--- Training a model for emotion: {emotion_column_name} ---")

    y_binary = goemotions_data[emotion_column_name].values

    # Initialize list to store results for individual folds of this emotion
    emotion_fold_results = []

    # StratifiedKFold initialization
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # nside loop: 5-fold cross-validation
    for fold, (train_index, val_index) in enumerate(skf.split(embeddings_BERT, y_binary)):
        print(f"  --- Fold {fold + 1}/{n_splits} ---")

        # Splitting data to train and validation sets for the current fold
        X_train_fold, X_val_fold = embeddings_BERT[train_index], embeddings_BERT[val_index]
        y_train_fold, y_val_fold = y_binary[train_index], y_binary[val_index]

        clear_session()  # In each fold we train model from the beginning

        # Building an LSTM Bidirectional model
        model = Sequential()
        model.add(Input(shape=(max_sequence_length, embedding_dim_bert)))
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid')) # 1 neuoron for binary classification (sigmoid activation)

        # Model compilation for binary classification
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Early Stopping callback
        early_stopping_callback = EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True,
            verbose=0
        )

        # Training the model
        history = model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stopping_callback],
            verbose=0
        )

        # Evaluating the model
        loss, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # Getting predictions
        y_pred_proba = model.predict(X_val_fold, verbose=0)
        y_pred = (y_pred_proba > 0.5).astype(int) # Binary prediction (0 ili 1)

        # Calculating metrics, with zero_division=0 to handle division by zero cases
        precision = precision_score(y_val_fold, y_pred, zero_division=0)
        recall = recall_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        if len(np.unique(y_val_fold)) < 2: # Cannot calculate if validation set doesn't have both classes
            roc_auc = np.nan
        else:
            roc_auc = roc_auc_score(y_val_fold, y_pred_proba)

        print(f"    Fold {fold + 1} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")

        # Store results for the current fold
        emotion_fold_results.append({
            'fold': fold + 1,
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        })

    # Store all fold results for the current emotion
    all_emotions_results_BERT[emotion_column_name] = emotion_fold_results


--- Training a model for emotion: anger ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.1025, Accuracy: 0.9658, Precision: 0.5436, Recall: 0.3342, F1-Score: 0.4139, ROC AUC: 0.9222
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.1022, Accuracy: 0.9681, Precision: 0.6533, Recall: 0.2500, F1-Score: 0.3616, ROC AUC: 0.9101
  --- Fold 3/5 ---
    Fold 3 - Loss: 0.1005, Accuracy: 0.9674, Precision: 0.6338, Recall: 0.2296, F1-Score: 0.3371, ROC AUC: 0.9192
  --- Fold 4/5 ---
    Fold 4 - Loss: 0.0987, Accuracy: 0.9692, Precision: 0.7589, Recall: 0.2168, F1-Score: 0.3373, ROC AUC: 0.9221
  --- Fold 5/5 ---
    Fold 5 - Loss: 0.1090, Accuracy: 0.9667, Precision: 0.5987, Recall: 0.2398, F1-Score: 0.3424, ROC AUC: 0.9088

--- Training a model for emotion: joy ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.0867, Accuracy: 0.9724, Precision: 0.6748, Recall: 0.3081, F1-Score: 0.4231, ROC AUC: 0.9219
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.0905, Accuracy: 0.9722, Precision: 0.6455, Recall: 0.3417, F1-Score: 0.446

In [17]:
# Calculating and printing the average results for each emotion across all folds
for emotion, fold_results_list in all_emotions_results_BERT.items():
    print(f"\nResults for emotion: {emotion}")

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    roc_aucs = []

    for result_dict in fold_results_list:
        accuracies.append(result_dict['accuracy'])
        precisions.append(result_dict['precision'])
        recalls.append(result_dict['recall'])
        f1_scores.append(result_dict['f1_score'])
        roc_aucs.append(result_dict['roc_auc'])

    # Calculate mean and standard deviation for each metric
    avg_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    avg_precision = np.mean(precisions)
    std_precision = np.std(precisions)

    avg_recall = np.mean(recalls)
    std_recall = np.std(recalls)

    avg_f1_score = np.mean(f1_scores)
    std_f1_score = np.std(f1_scores)

    avg_roc_auc = np.nanmean(roc_aucs)
    std_roc_auc = np.nanstd(roc_aucs)

    print(f"  Accuracy:  {avg_accuracy:.4f} +/- {std_accuracy:.4f}")
    print(f"  Precision: {avg_precision:.4f} +/- {std_precision:.4f}")
    print(f"  Recall:    {avg_recall:.4f} +/- {std_recall:.4f}")
    print(f"  F1-Score:  {avg_f1_score:.4f} +/- {std_f1_score:.4f}")

    if not np.isnan(avg_roc_auc):
        print(f"  ROC AUC:   {avg_roc_auc:.4f} +/- {std_roc_auc:.4f}")
    else:
        print("  ROC AUC:   N/A (not enough classes in validation folds)")

    print("-" * 30)


Results for emotion: anger
  Accuracy:  0.9675 +/- 0.0012
  Precision: 0.6377 +/- 0.0712
  Recall:    0.2541 +/- 0.0415
  F1-Score:  0.3585 +/- 0.0291
  ROC AUC:   0.9165 +/- 0.0059
------------------------------

Results for emotion: joy
  Accuracy:  0.9726 +/- 0.0004
  Precision: 0.6547 +/- 0.0138
  Recall:    0.3574 +/- 0.0438
  F1-Score:  0.4603 +/- 0.0331
  ROC AUC:   0.9246 +/- 0.0041
------------------------------
