**Nina Dobša, 28.7.2025.**

# Imports

In [1]:
%pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m104.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.backend import clear_session
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertForMaskedLM
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Data import - NRC Emotion Lexicon

In [2]:
# Connecting with google drive where fine tuned models are stored
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Import of data from local file
emotion_lexicon = pd.read_excel("/content/drive/My Drive/data/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx", sheet_name="NRC-Lex-v0.92-word-translations")
# Printing number of rows and columns in emotion lexicon
emotion_lexicon.shape

(14182, 115)

In [4]:
# Printing columns from the dataset
emotion_lexicon.columns

Index(['English (en)', 'Afrikaans (af)', 'Albanian (sq)', 'Amharic (am)',
       'Arabic (ar)', 'Armenian (hy)', 'Azeerbaijani (az)', 'Basque (eu)',
       'Belarusian (be)', 'Bengali (bn)',
       ...
       'Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust', 'Fear',
       'Joy', 'Sadness', 'Surprise', 'Trust'],
      dtype='object', length=115)

# Word2Vec embeddings

In [5]:
# Import of fine tuned word2vec model
model_path_SG = "/content/drive/My Drive/fine_tuned_word2vec_sg/fine_tuned_word2vec_sg.model"
model_path_CBOW = "/content/drive/My Drive/fine_tuned_word2vec_cbow/fine_tuned_word2vec_cbow.model"
word2vec_model_SG = Word2Vec.load(model_path_SG)
word2vec_model_CBOW = Word2Vec.load(model_path_CBOW)

embedding_dim_SG = word2vec_model_SG.vector_size
embedding_dim_CBOW = word2vec_model_CBOW.vector_size

Checking the coverage of words from emotion lexicon by word2vec model

In [6]:
# Coverage of emotion lexicon words in word2vec vocabulary for SG model
lexicon_words = set(emotion_lexicon["English (en)"])
word2vec_vocab_SG = set(word2vec_model_SG.wv.key_to_index.keys())

covered_words = lexicon_words & word2vec_vocab_SG # Intersection of lexicon words and word2vec vocabulary
coverage_percentage = len(covered_words) / len(lexicon_words) * 100

print(f"Lexicon coverage: {coverage_percentage:.2f}%")
print(f"Covered words: {len(covered_words)}")

Lexicon coverage: 87.17%
Covered words: 12362


In [7]:
# Coverage of emotion lexicon words in word2vec vocabulary for CBOW model
word2vec_vocab_CBOW = set(word2vec_model_CBOW.wv.key_to_index.keys())

covered_words = lexicon_words & word2vec_vocab_CBOW # Intersection of lexicon words and word2vec vocabulary
coverage_percentage = len(covered_words) / len(lexicon_words) * 100

print(f"Lexicon coverage: {coverage_percentage:.2f}%")
print(f"Covered words: {len(covered_words)}")

Lexicon coverage: 87.17%
Covered words: 12362


In [8]:
# Intersection of lexicon words and word2vec vocabulary
intersection = word2vec_vocab_SG & word2vec_vocab_CBOW & lexicon_words
print(f"Covered words: {len(intersection)}")

Covered words: 12362


Filtering only covered words and columns; word, anger, joy, sadness, disgust, surprise

In [9]:
filtered_lexicon = emotion_lexicon[emotion_lexicon["English (en)"].isin(word2vec_vocab_SG)].reset_index(drop=True)
filtered_lexicon = filtered_lexicon[["English (en)", "Anger", "Joy", "Sadness", "Disgust", "Surprise"]]
filtered_lexicon.head()

Unnamed: 0,English (en),Anger,Joy,Sadness,Disgust,Surprise
0,aback,0,0,0,0,0
1,abandon,0,0,1,0,0
2,abandoned,1,0,1,0,0
3,abandonment,1,0,1,0,1
4,abate,0,0,0,0,0


In [10]:
# Checking class distribution of target valuess
for i in range (1, 6):
    class_distribution = filtered_lexicon.iloc[:, i].value_counts()
    print("\nNumber of samples per class:")
    print(class_distribution)


Number of samples per class:
Anger
0    11220
1     1142
Name: count, dtype: int64

Number of samples per class:
Joy
0    11698
1      664
Name: count, dtype: int64

Number of samples per class:
Sadness
0    11264
1     1098
Name: count, dtype: int64

Number of samples per class:
Disgust
0    11387
1      975
Name: count, dtype: int64

Number of samples per class:
Surprise
0    11853
1      509
Name: count, dtype: int64


Classes are disbalanced...

In [11]:
# Function for getting word2vec embedding for an input word from lexicon
def get_word2vec_embeddings(word, model):
    try:
        return model.wv[word] # Return the embedding for the word
    except KeyError:
        return np.zeros(model.vector_size) # Return a zero-vector if the word is not in the vocabulary (this should never be the case since we filtered lexicon already)

In [12]:
# Getting word2vec embeddings for each word in the sentence
embeddings_CBOW_list = [
    get_word2vec_embeddings(text, word2vec_model_CBOW)
    for text in filtered_lexicon['English (en)']
]

embeddings_SG_list = [
    get_word2vec_embeddings(text, word2vec_model_SG)
    for text in filtered_lexicon['English (en)']
]

# Convert lists of 1D vectors to 2D NumPy arrays
embeddings_CBOW = np.array(embeddings_CBOW_list)
embeddings_SG = np.array(embeddings_SG_list)
embeddings_CBOW = embeddings_CBOW[:, np.newaxis, :] # Adds a new axis for timesteps
embeddings_SG = embeddings_SG[:, np.newaxis, :] # Adds a new axis for timesteps

In [13]:
# Checking the dimensions of the embeddings
print(f"Shape of embeddings_CBOW_reshaped: {embeddings_CBOW.shape}")
print(f"Shape of embeddings_SG_reshaped: {embeddings_SG.shape}")
print(f"Data type of embeddings_CBOW_reshaped: {embeddings_CBOW.dtype}")

Shape of embeddings_CBOW_reshaped: (12362, 1, 300)
Shape of embeddings_SG_reshaped: (12362, 1, 300)
Data type of embeddings_CBOW_reshaped: float32


# RNN Predictions for Word2Vec Embeddings

### Predictions for SG embeddings

In [14]:
# 5-fold cross validation using Bidirectional LSTM RNN
# Model parameters
n_splits = 5 # 5-fold cross-validation
epochs = 50
batch_size = 16
lstm_units = 128
dropout_rate = 0.3

emotion_columns = ['Anger', 'Joy'] # column names
all_emotions_results_SG = {}

for emotion_column_name in emotion_columns:
    print(f"\n--- Training a model for emotion: {emotion_column_name} ---")

    y_binary = filtered_lexicon[emotion_column_name].values

    # Initialize list to store results for individual folds of this emotion
    emotion_fold_results = []

    # StratifiedKFold initialization
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # nside loop: 5-fold cross-validation
    for fold, (train_index, val_index) in enumerate(skf.split(embeddings_SG, y_binary)):
        print(f"  --- Fold {fold + 1}/{n_splits} ---")

        # Splitting data to train and validation sets for the current fold
        X_train_fold, X_val_fold = embeddings_SG[train_index], embeddings_SG[val_index]
        y_train_fold, y_val_fold = y_binary[train_index], y_binary[val_index]

        clear_session() # Reseting Keras session to build new model

        # Building an LSTM Bidirectional model
        model = Sequential()
        model.add(Input(shape=(1, embedding_dim_SG)))
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid')) # 1 neuoron for binary classification (sigmoid activation)

        # Model compilation for binary classification
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Early Stopping callback
        early_stopping_callback = EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True,
            verbose=0
        )

        # Training the model
        history = model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stopping_callback],
            verbose=0
        )

        # Evaluating the model
        loss, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # Getting predictions
        y_pred_proba = model.predict(X_val_fold, verbose=0)
        y_pred = (y_pred_proba > 0.5).astype(int) # Binary prediction (0 ili 1)

        # Calculating metrics, with zero_division=0 to handle division by zero cases
        precision = precision_score(y_val_fold, y_pred, zero_division=0)
        recall = recall_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        if len(np.unique(y_val_fold)) < 2: # Cannot calculate if validation set doesn't have both classes
            roc_auc = np.nan
        else:
            roc_auc = roc_auc_score(y_val_fold, y_pred_proba) # Use probabilities for ROC AUC

        print(f"    Fold {fold + 1} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")

        # Store results for the current fold
        emotion_fold_results.append({
            'fold': fold + 1,
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        })

    # Store all fold results for the current emotion
    all_emotions_results_SG[emotion_column_name] = emotion_fold_results


--- Training a model for emotion: Anger ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.2589, Accuracy: 0.9131, Precision: 0.6842, Recall: 0.1135, F1-Score: 0.1948, ROC AUC: 0.7845
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.2684, Accuracy: 0.9139, Precision: 0.6212, Recall: 0.1790, F1-Score: 0.2780, ROC AUC: 0.7730
  --- Fold 3/5 ---
    Fold 3 - Loss: 0.2733, Accuracy: 0.9134, Precision: 0.6842, Recall: 0.1140, F1-Score: 0.1955, ROC AUC: 0.7483
  --- Fold 4/5 ---
    Fold 4 - Loss: 0.2800, Accuracy: 0.9114, Precision: 0.8462, Recall: 0.0482, F1-Score: 0.0913, ROC AUC: 0.7375
  --- Fold 5/5 ---
    Fold 5 - Loss: 0.2689, Accuracy: 0.9126, Precision: 0.7500, Recall: 0.0789, F1-Score: 0.1429, ROC AUC: 0.7455

--- Training a model for emotion: Joy ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.1697, Accuracy: 0.9507, Precision: 0.7895, Recall: 0.1128, F1-Score: 0.1974, ROC AUC: 0.8001
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.1972, Accuracy: 0.9503, Precision: 0.7500, Recall: 0.1128, F1-Score: 0.196

In [15]:
# Calculating and printing the average results for each emotion across all folds
for emotion, fold_results_list in all_emotions_results_SG.items():
    print(f"\nResults for emotion: {emotion}")

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    roc_aucs = []

    for result_dict in fold_results_list:
        accuracies.append(result_dict['accuracy'])
        precisions.append(result_dict['precision'])
        recalls.append(result_dict['recall'])
        f1_scores.append(result_dict['f1_score'])
        roc_aucs.append(result_dict['roc_auc'])

    # Calculate mean and standard deviation for each metric
    avg_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    avg_precision = np.mean(precisions)
    std_precision = np.std(precisions)

    avg_recall = np.mean(recalls)
    std_recall = np.std(recalls)

    avg_f1_score = np.mean(f1_scores)
    std_f1_score = np.std(f1_scores)

    avg_roc_auc = np.nanmean(roc_aucs)
    std_roc_auc = np.nanstd(roc_aucs)

    print(f"  Accuracy:  {avg_accuracy:.4f} +/- {std_accuracy:.4f}")
    print(f"  Precision: {avg_precision:.4f} +/- {std_precision:.4f}")
    print(f"  Recall:    {avg_recall:.4f} +/- {std_recall:.4f}")
    print(f"  F1-Score:  {avg_f1_score:.4f} +/- {std_f1_score:.4f}")

    if not np.isnan(avg_roc_auc):
        print(f"  ROC AUC:   {avg_roc_auc:.4f} +/- {std_roc_auc:.4f}")
    else:
        print("  ROC AUC:   N/A (not enough classes in validation folds)")

    print("-" * 30)


Results for emotion: Anger
  Accuracy:  0.9129 +/- 0.0008
  Precision: 0.7172 +/- 0.0763
  Recall:    0.1068 +/- 0.0436
  F1-Score:  0.1805 +/- 0.0621
  ROC AUC:   0.7578 +/- 0.0179
------------------------------

Results for emotion: Joy
  Accuracy:  0.9508 +/- 0.0016
  Precision: 0.7591 +/- 0.0749
  Recall:    0.1235 +/- 0.0231
  F1-Score:  0.2119 +/- 0.0360
  ROC AUC:   0.7957 +/- 0.0191
------------------------------


### Predictions for CBOW embeddings

In [16]:
# 5-fold cross validation using Bidirectional LSTM RNN
# Model parameters
n_splits = 5 # 5-fold cross-validation
epochs = 50
batch_size = 16
lstm_units = 128
dropout_rate = 0.3

emotion_columns = ['Anger', 'Joy'] # column names
all_emotions_results_CBOW = {}

for emotion_column_name in emotion_columns:
    print(f"\n--- Training a model for emotion: {emotion_column_name} ---")

    y_binary = filtered_lexicon[emotion_column_name].values

    # Initialize list to store results for individual folds of this emotion
    emotion_fold_results = []

    # StratifiedKFold initialization
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # nside loop: 5-fold cross-validation
    for fold, (train_index, val_index) in enumerate(skf.split(embeddings_CBOW, y_binary)):
        print(f"  --- Fold {fold + 1}/{n_splits} ---")

        # Splitting data to train and validation sets for the current fold
        X_train_fold, X_val_fold = embeddings_CBOW[train_index], embeddings_CBOW[val_index]
        y_train_fold, y_val_fold = y_binary[train_index], y_binary[val_index]

        clear_session() # Reseting Keras session to build new model

        # Building an LSTM Bidirectional model
        model = Sequential()
        model.add(Input(shape=(1, embedding_dim_CBOW)))
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid')) # 1 neuoron for binary classification (sigmoid activation)

        # Model compilation for binary classification
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Early Stopping callback
        early_stopping_callback = EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True,
            verbose=0
        )

        # Training the model
        history = model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stopping_callback],
            verbose=0
        )

        # Evaluating the model
        loss, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # Getting predictions
        y_pred_proba = model.predict(X_val_fold, verbose=0)
        y_pred = (y_pred_proba > 0.5).astype(int) # Binary prediction (0 ili 1)

        # Calculating metrics, with zero_division=0 to handle division by zero cases
        precision = precision_score(y_val_fold, y_pred, zero_division=0)
        recall = recall_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        if len(np.unique(y_val_fold)) < 2: # Cannot calculate if validation set doesn't have both classes
            roc_auc = np.nan
        else:
            roc_auc = roc_auc_score(y_val_fold, y_pred_proba) # Use probabilities for ROC AUC

        print(f"    Fold {fold + 1} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")

        # Store results for the current fold
        emotion_fold_results.append({
            'fold': fold + 1,
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        })

    # Store all fold results for the current emotion
    all_emotions_results_CBOW[emotion_column_name] = emotion_fold_results


--- Training a model for emotion: Anger ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.2837, Accuracy: 0.9106, Precision: 0.6000, Recall: 0.1048, F1-Score: 0.1784, ROC AUC: 0.7354
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.2773, Accuracy: 0.9114, Precision: 0.6087, Recall: 0.1223, F1-Score: 0.2036, ROC AUC: 0.7455
  --- Fold 3/5 ---
    Fold 3 - Loss: 0.2881, Accuracy: 0.9118, Precision: 0.8125, Recall: 0.0570, F1-Score: 0.1066, ROC AUC: 0.7346
  --- Fold 4/5 ---
    Fold 4 - Loss: 0.2815, Accuracy: 0.9086, Precision: 0.5556, Recall: 0.0439, F1-Score: 0.0813, ROC AUC: 0.7398
  --- Fold 5/5 ---
    Fold 5 - Loss: 0.2846, Accuracy: 0.9094, Precision: 0.5588, Recall: 0.0833, F1-Score: 0.1450, ROC AUC: 0.7374

--- Training a model for emotion: Joy ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.1760, Accuracy: 0.9486, Precision: 0.6154, Recall: 0.1203, F1-Score: 0.2013, ROC AUC: 0.7794
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.2300, Accuracy: 0.9486, Precision: 0.6071, Recall: 0.1278, F1-Score: 0.211

In [17]:
# Calculating and printing the average results for each emotion across all folds
for emotion, fold_results_list in all_emotions_results_CBOW.items():
    print(f"\nResults for emotion: {emotion}")

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    roc_aucs = []

    for result_dict in fold_results_list:
        accuracies.append(result_dict['accuracy'])
        precisions.append(result_dict['precision'])
        recalls.append(result_dict['recall'])
        f1_scores.append(result_dict['f1_score'])
        roc_aucs.append(result_dict['roc_auc'])

    # Calculate mean and standard deviation for each metric
    avg_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    avg_precision = np.mean(precisions)
    std_precision = np.std(precisions)

    avg_recall = np.mean(recalls)
    std_recall = np.std(recalls)

    avg_f1_score = np.mean(f1_scores)
    std_f1_score = np.std(f1_scores)

    avg_roc_auc = np.nanmean(roc_aucs)
    std_roc_auc = np.nanstd(roc_aucs)

    print(f"  Accuracy:  {avg_accuracy:.4f} +/- {std_accuracy:.4f}")
    print(f"  Precision: {avg_precision:.4f} +/- {std_precision:.4f}")
    print(f"  Recall:    {avg_recall:.4f} +/- {std_recall:.4f}")
    print(f"  F1-Score:  {avg_f1_score:.4f} +/- {std_f1_score:.4f}")

    if not np.isnan(avg_roc_auc):
        print(f"  ROC AUC:   {avg_roc_auc:.4f} +/- {std_roc_auc:.4f}")
    else:
        print("  ROC AUC:   N/A (not enough classes in validation folds)")

    print("-" * 30)


Results for emotion: Anger
  Accuracy:  0.9104 +/- 0.0012
  Precision: 0.6271 +/- 0.0951
  Recall:    0.0823 +/- 0.0291
  F1-Score:  0.1430 +/- 0.0449
  ROC AUC:   0.7385 +/- 0.0039
------------------------------

Results for emotion: Joy
  Accuracy:  0.9495 +/- 0.0017
  Precision: 0.6188 +/- 0.0435
  Recall:    0.1536 +/- 0.0411
  F1-Score:  0.2441 +/- 0.0524
  ROC AUC:   0.7723 +/- 0.0180
------------------------------


# BERT embeddings

In [18]:
# Import of fine tuned bert model
model_path = "/content/drive/My Drive/fine_tuned_bert"
bert_tokenizer = BertTokenizer.from_pretrained(model_path)
bert_model = BertForMaskedLM.from_pretrained(model_path)
embedding_dim_bert = bert_model.config.hidden_size

In [19]:
# Function for getting a BERT embedding for an input word from lexicon
def get_bert_embeddings(word, model, tokenizer):

    # Tokenize word and convert to tensor
    input_ids = tokenizer.encode(word, add_special_tokens=False, return_tensors="pt")

    with torch.no_grad():  # Disable gradient computation for efficiency
        outputs = model.bert(input_ids)  # Extract only transformer layers, ignoring MLM head
        last_hidden_state = outputs.last_hidden_state  # Get hidden state of the last layer

    # Calculate the average embedding across tokens
    word_embedding = last_hidden_state.mean(dim=1).squeeze().numpy()

    return word_embedding

In [20]:
# Getting word2vec embeddings for each word in the sentence (except stowords)
embeddings_BERT_list = [
    get_bert_embeddings(word, bert_model, bert_tokenizer)
    for word in filtered_lexicon['English (en)']
]


# Convert lists of 1D vectors to 2D NumPy arrays suitable for RNN
embeddings_BERT= np.array(embeddings_BERT_list)
embeddings_BERT = embeddings_BERT[:, np.newaxis, :] # Adds a new axis for timesteps

In [21]:
print(f"Embedding dimension of BERT: {embedding_dim_bert}")
print(f"Shape of embeddings_BERT: {embeddings_BERT.shape}")
print(f"Data type of embeddings_BERT: {embeddings_BERT.dtype}")

Embedding dimension of BERT: 768
Shape of embeddings_BERT: (12362, 1, 768)
Data type of embeddings_BERT: float32


# RNN Predictions for Bert Embeddings

In [22]:
# 5-fold cross validation using Bidirectional LSTM
# Model parameters
n_splits = 5 # 5-fold cross-validation
epochs = 50
batch_size = 16
lstm_units = 128
dropout_rate = 0.3

emotion_columns = ['Anger', 'Joy'] # column names
all_emotions_results_BERT = {}

for emotion_column_name in emotion_columns:
    print(f"\n--- Training a model for emotion: {emotion_column_name} ---")

    y_binary = filtered_lexicon[emotion_column_name].values

    # Initialize list to store results for individual folds of this emotion
    emotion_fold_results = []

    # StratifiedKFold initialization
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # nside loop: 5-fold cross-validation
    for fold, (train_index, val_index) in enumerate(skf.split(embeddings_BERT, y_binary)):
        print(f"  --- Fold {fold + 1}/{n_splits} ---")

        # Splitting data to train and validation sets for the current fold
        X_train_fold, X_val_fold = embeddings_BERT[train_index], embeddings_BERT[val_index]
        y_train_fold, y_val_fold = y_binary[train_index], y_binary[val_index]

        clear_session() # Reseting Keras session to build new model

        # Building an LSTM Bidirectional model
        model = Sequential()
        model.add(Input(shape=(1, embedding_dim_bert)))
        model.add(Bidirectional(LSTM(lstm_units, return_sequences=False)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid')) # 1 neuoron for binary classification (sigmoid activation)

        # Model compilation for binary classification
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Early Stopping callback
        early_stopping_callback = EarlyStopping(
            monitor='val_accuracy',
            patience=5,
            restore_best_weights=True,
            verbose=0
        )

        # Training the model
        history = model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stopping_callback],
            verbose=0
        )

        # Evaluating the model
        loss, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # Getting predictions
        y_pred_proba = model.predict(X_val_fold, verbose=0)
        y_pred = (y_pred_proba > 0.5).astype(int) # Binary prediction (0 ili 1)

        # Calculating metrics, with zero_division=0 to handle division by zero cases
        precision = precision_score(y_val_fold, y_pred, zero_division=0)
        recall = recall_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        if len(np.unique(y_val_fold)) < 2: # Cannot calculate if validation set doesn't have both classes
            roc_auc = np.nan
        else:
            roc_auc = roc_auc_score(y_val_fold, y_pred_proba) # Use probabilities for ROC AUC

        print(f"    Fold {fold + 1} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, ROC AUC: {roc_auc:.4f}")

        # Store results for the current fold
        emotion_fold_results.append({
            'fold': fold + 1,
            'loss': loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc
        })

    # Store all fold results for the current emotion
    all_emotions_results_BERT[emotion_column_name] = emotion_fold_results


--- Training a model for emotion: Anger ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.2785, Accuracy: 0.9106, Precision: 1.0000, Recall: 0.0349, F1-Score: 0.0675, ROC AUC: 0.7182
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.2830, Accuracy: 0.9102, Precision: 0.6400, Recall: 0.0699, F1-Score: 0.1260, ROC AUC: 0.7198
  --- Fold 3/5 ---
    Fold 3 - Loss: 0.2861, Accuracy: 0.9142, Precision: 0.8077, Recall: 0.0921, F1-Score: 0.1654, ROC AUC: 0.7155
  --- Fold 4/5 ---
    Fold 4 - Loss: 0.2873, Accuracy: 0.9114, Precision: 0.7368, Recall: 0.0614, F1-Score: 0.1134, ROC AUC: 0.7275
  --- Fold 5/5 ---
    Fold 5 - Loss: 0.2838, Accuracy: 0.9082, Precision: 0.5714, Recall: 0.0175, F1-Score: 0.0340, ROC AUC: 0.7173

--- Training a model for emotion: Joy ---
  --- Fold 1/5 ---
    Fold 1 - Loss: 0.2001, Accuracy: 0.9474, Precision: 0.8000, Recall: 0.0301, F1-Score: 0.0580, ROC AUC: 0.6888
  --- Fold 2/5 ---
    Fold 2 - Loss: 0.1882, Accuracy: 0.9486, Precision: 0.8750, Recall: 0.0526, F1-Score: 0.099

In [23]:
# Calculating and printing the average results for each emotion across all folds
for emotion, fold_results_list in all_emotions_results_BERT.items():
    print(f"\nResults for emotion: {emotion}")

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
    roc_aucs = []

    for result_dict in fold_results_list:
        accuracies.append(result_dict['accuracy'])
        precisions.append(result_dict['precision'])
        recalls.append(result_dict['recall'])
        f1_scores.append(result_dict['f1_score'])
        roc_aucs.append(result_dict['roc_auc'])

    # Calculate mean and standard deviation for each metric
    avg_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    avg_precision = np.mean(precisions)
    std_precision = np.std(precisions)

    avg_recall = np.mean(recalls)
    std_recall = np.std(recalls)

    avg_f1_score = np.mean(f1_scores)
    std_f1_score = np.std(f1_scores)

    avg_roc_auc = np.nanmean(roc_aucs)
    std_roc_auc = np.nanstd(roc_aucs)

    print(f"  Accuracy:  {avg_accuracy:.4f} +/- {std_accuracy:.4f}")
    print(f"  Precision: {avg_precision:.4f} +/- {std_precision:.4f}")
    print(f"  Recall:    {avg_recall:.4f} +/- {std_recall:.4f}")
    print(f"  F1-Score:  {avg_f1_score:.4f} +/- {std_f1_score:.4f}")

    if not np.isnan(avg_roc_auc):
        print(f"  ROC AUC:   {avg_roc_auc:.4f} +/- {std_roc_auc:.4f}")
    else:
        print("  ROC AUC:   N/A (not enough classes in validation folds)")

    print("-" * 30)


Results for emotion: Anger
  Accuracy:  0.9109 +/- 0.0020
  Precision: 0.7512 +/- 0.1483
  Recall:    0.0552 +/- 0.0262
  F1-Score:  0.1013 +/- 0.0459
  ROC AUC:   0.7197 +/- 0.0041
------------------------------

Results for emotion: Joy
  Accuracy:  0.9477 +/- 0.0010
  Precision: 0.6320 +/- 0.3233
  Recall:    0.0331 +/- 0.0245
  F1-Score:  0.0623 +/- 0.0453
  ROC AUC:   0.7003 +/- 0.0176
------------------------------
