**Nina Dobša, zadnje uređivano 7.7.2025.**

## Imports

In [25]:
import pandas as pd
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertForMaskedLM
import torch
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

## ISEAR dataset import

In [26]:
# Read the pipe-separated ISEAR export; rows that cannot be parsed are skipped
file_path = "data/isear_3.txt"
ISEAR_data = pd.read_csv(
    file_path,
    delimiter="|",
    header=0,
    on_bad_lines="skip",
    engine="python",
)

# Keep only the text ('SIT') and emotion ('Field1') columns, under clearer names
ISEAR_data = ISEAR_data[['SIT', 'Field1']].rename(
    columns={'SIT': 'text', 'Field1': 'emotion'}
)

ISEAR_data.shape

(7505, 2)

In [27]:
# Turn the emotion names into integer class ids, stored in a new column
label_encoder = LabelEncoder()
ISEAR_data['emotion_label'] = label_encoder.fit_transform(ISEAR_data['emotion'])

# Show which id was assigned to which emotion name
print({
    emotion_name: emotion_id
    for emotion_name, emotion_id in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
})

{'anger': 0, 'disgust': 1, 'fear': 2, 'guilt': 3, 'joy': 4, 'sadness': 5, 'shame': 6}


In [28]:
# Preview the first rows to check the text / emotion / emotion_label columns
ISEAR_data.head()

Unnamed: 0,text,emotion,emotion_label
0,"During the period of falling in love, each tim...",joy,4
1,When I was involved in a traffic accident.,fear,2
2,When I was driving home after several days of...,anger,0
3,When I lost the person who meant the most to me.,sadness,5
4,The time I knocked a deer down - the sight of ...,disgust,1


## Word2Vec model

In [29]:
# Load the two fine-tuned Word2Vec models from disk: skip-gram (SG) first, then CBOW.
# NOTE(review): gensim's load() unpickles the file, so only load models from a trusted source.
model_path_SG = "fine_tuned_word2vec_sg/fine_tuned_word2vec_sg.model"
word2vec_model_SG = Word2Vec.load(model_path_SG)

model_path_CBOW = "fine_tuned_word2vec_cbow/fine_tuned_word2vec_cbow.model"
word2vec_model_CBOW = Word2Vec.load(model_path_CBOW)

In [30]:
# Function that returns the word2vec embedding for a given sentence
def get_word2vec_embedding(sentence, word2vec_model):
    """Embed a sentence as the average of its in-vocabulary word vectors.

    Parameters
    ----------
    sentence : str
        Text to embed; it is split on whitespace. Any non-string value
        (e.g. None, or the float NaN pandas uses for missing text) is treated
        as a sentence with no known words.
    word2vec_model : object
        Model exposing ``.wv`` (supports ``word in wv`` and ``wv[word]``) and
        ``.vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``word2vec_model.vector_size``: the mean of the
        vectors of all tokens found in the vocabulary, or a zero vector when
        no token is found.
    """
    keyed_vectors = word2vec_model.wv  # look the vocabulary up once, not per token

    # Missing text has no .split(); fall back to "no tokens" instead of raising.
    tokens = sentence.split() if isinstance(sentence, str) else []

    # NOTE(review): whitespace tokenisation keeps punctuation and casing
    # ("accident." is not "accident"); confirm this matches how the model's
    # vocabulary was built, otherwise such tokens are silently dropped here.
    token_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

    if not token_vectors:
        # float32 matches the dtype gensim stores its vectors in, so one
        # out-of-vocabulary sentence does not upcast the whole feature matrix.
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)

    # Mean over the token axis gives the sentence embedding
    return np.mean(token_vectors, axis=0)

In [31]:
# Compute one sentence vector per row for each Word2Vec variant and store the
# results in the "word2vec_embedding_SG" and "word2vec_embedding_CBOW" columns.
# The model is forwarded to get_word2vec_embedding as a keyword argument.

ISEAR_data['word2vec_embedding_SG'] = ISEAR_data['text'].apply(
    get_word2vec_embedding, word2vec_model=word2vec_model_SG
)

ISEAR_data['word2vec_embedding_CBOW'] = ISEAR_data['text'].apply(
    get_word2vec_embedding, word2vec_model=word2vec_model_CBOW
)

In [32]:
# Preview again: the frame now also holds the SG and CBOW sentence embeddings
ISEAR_data.head()

Unnamed: 0,text,emotion,emotion_label,word2vec_embedding_SG,word2vec_embedding_CBOW
0,"During the period of falling in love, each tim...",joy,4,"[-0.049276937, 0.13258511, -0.08547431, 0.0617...","[-0.12927012, -0.39089495, 0.5890891, -0.44857..."
1,When I was involved in a traffic accident.,fear,2,"[-0.0098165795, 0.15902604, -0.068388596, -0.0...","[0.19186452, -0.037524782, 0.6663514, -0.10714..."
2,When I was driving home after several days of...,anger,0,"[0.0412013, 0.24119537, -0.07577773, 0.0033492...","[0.035169117, -0.06991781, 0.6389158, -0.00986..."
3,When I lost the person who meant the most to me.,sadness,5,"[-0.09790471, 0.12274621, -0.08426029, 0.05351...","[-0.079096586, -0.24543297, 0.48515487, -0.104..."
4,The time I knocked a deer down - the sight of ...,disgust,1,"[0.0071739783, 0.1709428, -0.07293872, 0.07204...","[0.112791084, 0.0432981, 0.31216973, -0.117237..."


## ML predictions for Word2Vec embeddings

In [33]:
# Defining the Random Forest classifier: 100 trees, seeded for repeatable results.
# This same estimator object is passed to every cross-validation call below.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [34]:
# Defining 5-fold cross-validation: stratified on the label, shuffled with a fixed seed.
# The same splitter is reused for every embedding type evaluated below.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### Skip Gram method

In [35]:
# Target vector and feature matrix for the skip-gram embeddings
# (the per-row vectors are stacked into a single array)
y = ISEAR_data['emotion_label']
X_SG = np.asarray(list(ISEAR_data['word2vec_embedding_SG']))

In [36]:
# Cross-validate the classifier on the skip-gram features, collecting four metrics per fold
scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
scores_SG = cross_validate(
    estimator=clf,
    X=X_SG,
    y=y,
    scoring=scoring_metrics,
    cv=kf,
)

In [None]:
# Report the mean and standard deviation across folds for every metric
for metric in scoring_metrics:
    fold_scores_SG = scores_SG[f'test_{metric}']
    avg_score_SG, std_dev_SG = np.mean(fold_scores_SG), np.std(fold_scores_SG)
    print(f"{metric}: {avg_score_SG:.4f} +/- {std_dev_SG:.4f}")

accuracy: 0.3928 +/- 0.005892963194535902
precision_macro: 0.3890 +/- 0.006447946386094139
recall_macro: 0.3930 +/- 0.005951894938131344
f1_macro: 0.3889 +/- 0.005286286793141579


In [38]:
# Collect cross-validated predictions for every sample, then compare them with the true labels
y_pred_SG = cross_val_predict(estimator=clf, X=X_SG, y=y, cv=kf)
cm = confusion_matrix(y_true=y, y_pred=y_pred_SG)

print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[332 152 114 170  97  91 114]
 [170 395 157  88  73  67 108]
 [ 83  96 596  90  75  70  69]
 [197  89 112 345  95  99 133]
 [ 84  61  64  66 607 117  66]
 [112  70  94 129 126 440 109]
 [179 133 150 202  99  87 233]]


### CBOW method

In [39]:
# Target vector and feature matrix for the CBOW embeddings
# (the per-row vectors are stacked into a single array)
y = ISEAR_data['emotion_label']
X_CBOW = np.asarray(list(ISEAR_data['word2vec_embedding_CBOW']))

In [40]:
# Cross-validate the classifier on the CBOW features, collecting four metrics per fold
scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
scores_CBOW = cross_validate(
    estimator=clf,
    X=X_CBOW,
    y=y,
    scoring=scoring_metrics,
    cv=kf,
)

In [None]:
# Report the mean and standard deviation across folds for every metric
for metric in scoring_metrics:
    fold_scores_CBOW = scores_CBOW[f'test_{metric}']
    avg_score_CBOW, std_dev_CBOW = np.mean(fold_scores_CBOW), np.std(fold_scores_CBOW)
    print(f"{metric}: {avg_score_CBOW:.4f} +/- {std_dev_CBOW:.4f}")

accuracy: 0.3560 +/- 0.012672946315176197
precision_macro: 0.3509 +/- 0.01259861919672469
recall_macro: 0.3562 +/- 0.012743103499281296
f1_macro: 0.3510 +/- 0.012279997900399263


In [42]:
# Collect cross-validated predictions for every sample, then compare them with the true labels
y_pred_CBOW = cross_val_predict(estimator=clf, X=X_CBOW, y=y, cv=kf)
cm = confusion_matrix(y_true=y, y_pred=y_pred_CBOW)

print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[315 146 131 176  97  90 115]
 [176 351 156 102  95  69 109]
 [ 99  94 565  87  87  76  71]
 [188 115 107 302 101 119 138]
 [ 98  88  76  65 560 116  62]
 [130  85  98 126 166 369 106]
 [172 149 149 188 112 103 210]]


## BERT model

In [43]:
# Load the fine-tuned BERT checkpoint: model and tokenizer both come from the
# same local directory.
model_path = "fine_tuned_bert"
bert_model = BertForMaskedLM.from_pretrained(model_path)
bert_tokenizer = BertTokenizer.from_pretrained(model_path)

In [44]:
# Function that returns bert embedding for an input sentence
def get_bert_embedding_sentence(sentence, model, tokenizer):
    """Return a mean-pooled sentence embedding from the model's last hidden layer.

    Parameters
    ----------
    sentence : str or list of str
        Text to embed; handed to ``tokenizer`` unchanged.
    model : torch.nn.Module
        Called as ``model(**inputs, output_hidden_states=True)``; the result
        must expose ``hidden_states``, whose last entry is pooled.
    tokenizer : callable
        Called as ``tokenizer(sentence, return_tensors='pt', truncation=True,
        padding=True)``; must return a mapping of tensors.

    Returns
    -------
    numpy.ndarray
        The pooled embedding with the leading batch dimension removed when a
        single sentence is passed; one row per sentence otherwise.

    Notes
    -----
    Pooling is weighted by the tokenizer's ``attention_mask`` when one is
    returned, so padded positions do not dilute the average. For a single
    sentence the mask is all ones and the result equals a plain mean.
    The caller is responsible for putting the model in eval mode.
    """
    # Tokenize the input sentence
    inputs = dict(tokenizer(
        sentence,
        return_tensors='pt',
        truncation=True,
        padding=True
    ))

    # Run on whatever device the model's weights live on
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    # Forward pass and pooling without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

        # Last entry of hidden_states = output of the final layer
        last_hidden = outputs.hidden_states[-1]

        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            # No mask available: plain mean over the token dimension
            pooled = last_hidden.mean(dim=1)
        else:
            # Masked mean: zero out padded tokens and divide by the real token count
            weights = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
            token_counts = weights.sum(dim=1).clamp(min=1)  # guard against division by zero
            pooled = (last_hidden * weights).sum(dim=1) / token_counts

    # Drop the batch dimension for a single sentence; move to CPU before converting
    return pooled.squeeze(0).cpu().numpy()

In [45]:
# Switch the model to evaluation mode, then embed every sentence of the ISEAR
# dataset with get_bert_embedding_sentence and store the vectors in "bert_embedding".
bert_model.eval()
ISEAR_data["bert_embedding"] = ISEAR_data["text"].apply(
    get_bert_embedding_sentence, model=bert_model, tokenizer=bert_tokenizer
)

## ML predictions for BERT embeddings

In [46]:
# Target vector and feature matrix for the BERT embeddings
# (the per-row vectors are stacked into a single array)
y = ISEAR_data['emotion_label']
X_bert = np.asarray(list(ISEAR_data['bert_embedding']))

In [47]:
# Cross-validate the classifier on the BERT features, collecting four metrics per fold
scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
scores_bert = cross_validate(
    estimator=clf,
    X=X_bert,
    y=y,
    scoring=scoring_metrics,
    cv=kf,
)

In [48]:
# Report the mean and standard deviation across folds for every metric
for metric in scoring_metrics:
    fold_scores_BERT = scores_bert[f'test_{metric}']
    avg_score_BERT, std_dev_BERT = np.mean(fold_scores_BERT), np.std(fold_scores_BERT)
    print(f"{metric}: {avg_score_BERT:.4f} +/- {std_dev_BERT:.4f}")

accuracy: 0.5183 +/- 0.0122
precision_macro: 0.5135 +/- 0.0121
recall_macro: 0.5187 +/- 0.0122
f1_macro: 0.5136 +/- 0.0122


In [49]:
# Collect cross-validated predictions for every sample, then compare them with the true labels
y_pred_bert = cross_val_predict(estimator=clf, X=X_bert, y=y, cv=kf)
cm = confusion_matrix(y_true=y, y_pred=y_pred_bert)

print("Confusion Matrix:", cm, sep="\n")

Confusion Matrix:
[[424 156  72 184  60  52 122]
 [197 555  68  65  45  42  86]
 [ 55  71 690  53  76  83  51]
 [194  65  81 417  54  88 171]
 [ 21  17  29  36 848  79  35]
 [ 82  60  54  69 160 588  67]
 [182 121  98 186  86  42 368]]
