**Nina Dobša, zadnje uređivano 7.7.2025.** 

In [1]:
#!pip install datasets

# Imports

In [2]:
import pandas as pd
import numpy as np
import torch
import time
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertForMaskedLM
from datasets import load_dataset
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec, KeyedVectors
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


# Data import - NRC Emotion Lexicon

In [3]:
# Read the NRC Emotion Lexicon translation sheet from the local Excel workbook
lexicon_file = "data/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx"
lexicon_sheet = "NRC-Lex-v0.92-word-translations"
emotion_lexicon = pd.read_excel(lexicon_file, sheet_name=lexicon_sheet)

In [4]:
# Display the lexicon's dimensions as a (rows, columns) tuple
emotion_lexicon.shape

(14182, 115)

In [5]:
# Display the column labels of the lexicon (language columns followed by emotion columns)
emotion_lexicon.columns

Index(['English (en)', 'Afrikaans (af)', 'Albanian (sq)', 'Amharic (am)',
       'Arabic (ar)', 'Armenian (hy)', 'Azeerbaijani (az)', 'Basque (eu)',
       'Belarusian (be)', 'Bengali (bn)',
       ...
       'Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust', 'Fear',
       'Joy', 'Sadness', 'Surprise', 'Trust'],
      dtype='object', length=115)

# Word2Vec model

In [8]:
# Load the two fine-tuned Word2Vec models (skip-gram and CBOW) via Word2Vec.load
model_path_SG = "fine_tuned_word2vec_sg/fine_tuned_word2vec_sg.model"
model_path_CBOW = "fine_tuned_word2vec_cbow/fine_tuned_word2vec_cbow.model"
word2vec_model_SG, word2vec_model_CBOW = (
    Word2Vec.load(saved_model) for saved_model in (model_path_SG, model_path_CBOW)
)

Checking how many words from the emotion lexicon are covered by each Word2Vec model's vocabulary

In [9]:
# Unique English lexicon entries and the skip-gram model's vocabulary
lexicon_words = set(emotion_lexicon["English (en)"])
word2vec_vocab_SG = set(word2vec_model_SG.wv.key_to_index)

# Lexicon entries that also have a skip-gram vector, and their share of the lexicon
covered_words = lexicon_words.intersection(word2vec_vocab_SG)
coverage_percentage = len(covered_words) / len(lexicon_words) * 100

print(
    f"Lexicon coverage: {coverage_percentage:.2f}%",
    f"Covered words: {len(covered_words)}",
    sep="\n",
)

Lexicon coverage: 87.17%
Covered words: 12362


In [10]:
# Vocabulary of the CBOW model
word2vec_vocab_CBOW = set(word2vec_model_CBOW.wv.key_to_index)

# Lexicon entries that also have a CBOW vector, and their share of the lexicon
covered_words = lexicon_words.intersection(word2vec_vocab_CBOW)
coverage_percentage = len(covered_words) / len(lexicon_words) * 100

print(
    f"Lexicon coverage: {coverage_percentage:.2f}%",
    f"Covered words: {len(covered_words)}",
    sep="\n",
)

Lexicon coverage: 87.17%
Covered words: 12362


In [11]:
# Lexicon words that are present in BOTH the skip-gram and the CBOW vocabulary
intersection = lexicon_words.intersection(word2vec_vocab_SG, word2vec_vocab_CBOW)

print(f"Covered words: {len(intersection)}")

Covered words: 12362


Keeping only the lexicon words covered by the Word2Vec vocabulary, and only these columns: word, anger, joy, sadness, disgust, surprise

In [12]:
# Keep only lexicon rows whose English word has a vector in BOTH Word2Vec models.
# Filtering on the skip-gram vocabulary alone would let a word that is missing
# from CBOW through, and it would then silently receive an all-zero CBOW embedding.
shared_vocab = word2vec_vocab_SG & word2vec_vocab_CBOW
kept_columns = ["English (en)", "Anger", "Joy", "Sadness", "Disgust", "Surprise"]
filtered_lexicon = (
    emotion_lexicon.loc[emotion_lexicon["English (en)"].isin(shared_vocab), kept_columns]
    .reset_index(drop=True)
    .copy()  # independent frame: embedding columns are assigned to it later
)
filtered_lexicon.head()

Unnamed: 0,English (en),Anger,Joy,Sadness,Disgust,Surprise
0,aback,0,0,0,0,0
1,abandon,0,0,1,0,0
2,abandoned,1,0,1,0,0
3,abandonment,1,0,1,0,1
4,abate,0,0,0,0,0


Word2vec embeddings

In [13]:
# Look up the Word2Vec vector of a single lexicon word
def get_word2vec_embedding(word, model):
    """Return the vector stored in ``model.wv`` for ``word``.

    If the lookup raises ``KeyError`` (the word is not in the vocabulary), a
    zero vector of length ``model.vector_size`` is returned instead. The
    lexicon is filtered against the vocabulary beforehand, so this fallback is
    not expected to trigger.
    """
    try:
        vector = model.wv[word]
    except KeyError:
        vector = np.zeros(model.vector_size)
    return vector

In [14]:
# Add one embedding column per Word2Vec model by looking up every lexicon word
for column_name, w2v_model in (
    ('word2vec_embedding_SG', word2vec_model_SG),
    ('word2vec_embedding_CBOW', word2vec_model_CBOW),
):
    # `args` forwards the model as the second positional argument of the lookup
    filtered_lexicon[column_name] = filtered_lexicon['English (en)'].apply(
        get_word2vec_embedding, args=(w2v_model,)
    )

filtered_lexicon.head()

Unnamed: 0,English (en),Anger,Joy,Sadness,Disgust,Surprise,word2vec_embedding_SG,word2vec_embedding_CBOW
0,aback,0,0,0,0,0,"[-0.05127184, 0.09056614, 0.18572424, 0.125842...","[0.042214748, -0.034687907, 0.15201779, 0.1485..."
1,abandon,0,0,1,0,0,"[-0.099102534, 0.19064683, -0.14152387, 0.0222...","[-0.046076052, 0.367438, -0.17085469, 0.096703..."
2,abandoned,1,0,1,0,0,"[-0.015122078, 0.4042172, -0.009172466, -0.211...","[-0.13989627, 0.41552946, 0.1068828, -0.094600..."
3,abandonment,1,0,1,0,1,"[-0.13099883, 0.18350211, -0.023655038, 0.1231...","[-0.118318245, 0.30210206, 0.10299667, 0.23302..."
4,abate,0,0,0,0,0,"[0.028459722, 0.075622424, 0.0515427, 0.031762...","[0.014462058, 0.024446249, 0.037190884, 0.0148..."


# ML algorithm for Word2Vec embeddings

In [30]:
# Defining the Random Forest classifier: 100 trees, fixed seed for repeatable runs.
# NOTE(review): the cross-validation outputs in this notebook show recall_macro of
# about 0.50 next to accuracy above 0.90 for every emotion, which suggests the
# model mostly predicts the majority class — consider class_weight="balanced".
clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [31]:
# Defining 5-fold stratified cross-validation; shuffling uses a fixed seed so the
# fold assignment is the same on every run
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### SGNS embeddings

In [32]:
# Feature matrix: one row per lexicon word, built from the skip-gram vectors
X_SG = np.array(list(filtered_lexicon['word2vec_embedding_SG']))

# One label column per emotion; these targets are reused for every embedding type
y_anger, y_joy, y_sadness, y_disgust, y_surprise = (
    filtered_lexicon[emotion]
    for emotion in ('Anger', 'Joy', 'Sadness', 'Disgust', 'Surprise')
)

In [33]:
# Run the same 5-fold cross-validation once per emotion on the skip-gram features
scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
(
    scores_anger_sg,
    scores_joy_sg,
    scores_sadness_sg,
    scores_disgust_sg,
    scores_surprise_sg,
) = (
    cross_validate(clf, X_SG, target, cv=kf, scoring=scoring_metrics)
    for target in (y_anger, y_joy, y_sadness, y_disgust, y_surprise)
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
# Report the mean +/- standard deviation (np.std) across folds of every metric,
# for each emotion, on the skip-gram features
emotion_scores_sg = {
    "anger": scores_anger_sg,
    "joy": scores_joy_sg,
    "sadness": scores_sadness_sg,
    "disgust": scores_disgust_sg,
    "surprise": scores_surprise_sg,
}

for metric in scoring_metrics:
    for emotion, cv_scores in emotion_scores_sg.items():
        fold_values = cv_scores[f'test_{metric}']  # per-fold test scores for this metric
        print(f"{metric} {emotion}: {np.mean(fold_values):.4f} +/- {np.std(fold_values):.4f}")
    print("\n")  # blank separator between metrics

accuracy anger: 0.9079 +/- 0.0006
accuracy joy: 0.9467 +/- 0.0005
accuracy sadness: 0.9123 +/- 0.0007
accuracy disgust: 0.9216 +/- 0.0005
accuracy surprise: 0.9589 +/- 0.0002


precision_macro anger: 0.7225 +/- 0.1724
precision_macro joy: 0.8234 +/- 0.2001
precision_macro sadness: 0.8527 +/- 0.0949
precision_macro disgust: 0.8359 +/- 0.1938
precision_macro surprise: 0.5795 +/- 0.2001


recall_macro anger: 0.5037 +/- 0.0026
recall_macro joy: 0.5052 +/- 0.0039
recall_macro sadness: 0.5088 +/- 0.0021
recall_macro disgust: 0.5040 +/- 0.0026
recall_macro surprise: 0.5010 +/- 0.0020


f1_macro anger: 0.4836 +/- 0.0051
f1_macro joy: 0.4967 +/- 0.0076
f1_macro sadness: 0.4948 +/- 0.0040
f1_macro disgust: 0.4877 +/- 0.0052
f1_macro surprise: 0.4915 +/- 0.0039




### CBOW embeddings

In [35]:
# Build the CBOW feature matrix from the per-word embedding column
# (the emotion targets y_* defined for the skip-gram run are reused)
X_CBOW = np.array(filtered_lexicon['word2vec_embedding_CBOW'].tolist())  # Word embeddings as features

In [36]:
# Run the same 5-fold cross-validation once per emotion on the CBOW features
scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
(
    scores_anger_cbow,
    scores_joy_cbow,
    scores_sadness_cbow,
    scores_disgust_cbow,
    scores_surprise_cbow,
) = (
    cross_validate(clf, X_CBOW, target, cv=kf, scoring=scoring_metrics)
    for target in (y_anger, y_joy, y_sadness, y_disgust, y_surprise)
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
# Report the mean +/- standard deviation (np.std) across folds of every metric,
# for each emotion, on the CBOW features
emotion_scores_cbow = {
    "anger": scores_anger_cbow,
    "joy": scores_joy_cbow,
    "sadness": scores_sadness_cbow,
    "disgust": scores_disgust_cbow,
    "surprise": scores_surprise_cbow,
}

for metric in scoring_metrics:
    for emotion, cv_scores in emotion_scores_cbow.items():
        fold_values = cv_scores[f'test_{metric}']  # per-fold test scores for this metric
        print(f"{metric} {emotion}: {np.mean(fold_values):.4f} +/- {np.std(fold_values):.4f}")
    print("\n")  # blank separator between metrics

accuracy anger: 0.9083 +/- 0.0005
accuracy joy: 0.9474 +/- 0.0007
accuracy sadness: 0.9121 +/- 0.0011
accuracy disgust: 0.9214 +/- 0.0004
accuracy surprise: 0.9587 +/- 0.0004


precision_macro anger: 0.8193 +/- 0.0862
precision_macro joy: 0.8506 +/- 0.0729
precision_macro sadness: 0.7698 +/- 0.0856
precision_macro disgust: 0.7774 +/- 0.1857
precision_macro surprise: 0.4794 +/- 0.0001


recall_macro anger: 0.5059 +/- 0.0021
recall_macro joy: 0.5148 +/- 0.0054
recall_macro sadness: 0.5112 +/- 0.0049
recall_macro disgust: 0.5029 +/- 0.0020
recall_macro surprise: 0.4999 +/- 0.0002


f1_macro anger: 0.4880 +/- 0.0042
f1_macro joy: 0.5154 +/- 0.0102
f1_macro sadness: 0.4997 +/- 0.0094
f1_macro disgust: 0.4856 +/- 0.0039
f1_macro surprise: 0.4894 +/- 0.0001




# BERT model

In [23]:
# Load the fine-tuned BERT tokenizer and masked-LM model; both are read from the
# same local directory
model_path = "fine_tuned_bert"
bert_tokenizer = BertTokenizer.from_pretrained(model_path)
bert_model = BertForMaskedLM.from_pretrained(model_path)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [24]:
# Embed a single lexicon word with BERT by averaging its sub-token vectors
def get_bert_embedding(word, model, tokenizer):
    """Return one vector for ``word``: the mean of its last-layer token states.

    The word is encoded without special tokens, passed through ``model.bert``
    (the encoder only, so the masked-LM head is not applied) with gradients
    disabled, and the last hidden state is averaged over the token dimension.
    The result is returned as a NumPy array.
    """
    token_ids = tokenizer.encode(word, add_special_tokens=False, return_tensors="pt")

    # No gradients are needed for feature extraction
    with torch.no_grad():
        hidden_states = model.bert(token_ids).last_hidden_state

    # dim=1 is the token axis; squeeze drops the leftover singleton dimension
    return hidden_states.mean(dim=1).squeeze().numpy()

In [25]:
# Compute a BERT embedding for every lexicon word
bert_model.eval()  # switch the model to evaluation mode before the forward passes
filtered_lexicon['bert_embedding'] = filtered_lexicon['English (en)'].apply(
    get_bert_embedding, args=(bert_model, bert_tokenizer)
)

# ML algorithm for BERT embeddings

In [38]:
# Build the BERT feature matrix from the per-word embedding column
# (the emotion targets y_* defined for the skip-gram run are reused)
X_BERT = np.array(filtered_lexicon['bert_embedding'].tolist())  # Word embeddings as features

In [39]:
# Run the same 5-fold cross-validation once per emotion on the BERT features
scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
(
    scores_anger_bert,
    scores_joy_bert,
    scores_sadness_bert,
    scores_disgust_bert,
    scores_surprise_bert,
) = (
    cross_validate(clf, X_BERT, target, cv=kf, scoring=scoring_metrics)
    for target in (y_anger, y_joy, y_sadness, y_disgust, y_surprise)
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [40]:
# Report the mean +/- standard deviation (np.std) across folds of every metric,
# for each emotion, on the BERT features
emotion_scores_bert = {
    "anger": scores_anger_bert,
    "joy": scores_joy_bert,
    "sadness": scores_sadness_bert,
    "disgust": scores_disgust_bert,
    "surprise": scores_surprise_bert,
}

for metric in scoring_metrics:
    for emotion, cv_scores in emotion_scores_bert.items():
        fold_values = cv_scores[f'test_{metric}']  # per-fold test scores for this metric
        print(f"{metric} {emotion}: {np.mean(fold_values):.4f} +/- {np.std(fold_values):.4f}")
    print("\n")  # blank separator between metrics

accuracy anger: 0.9075 +/- 0.0005
accuracy joy: 0.9463 +/- 0.0002
accuracy sadness: 0.9113 +/- 0.0003
accuracy disgust: 0.9211 +/- 0.0000
accuracy surprise: 0.9588 +/- 0.0002


precision_macro anger: 0.4538 +/- 0.0001
precision_macro joy: 0.4731 +/- 0.0001
precision_macro sadness: 0.5556 +/- 0.2001
precision_macro disgust: 0.4606 +/- 0.0000
precision_macro surprise: 0.4794 +/- 0.0001


recall_macro anger: 0.4999 +/- 0.0002
recall_macro joy: 0.5000 +/- 0.0000
recall_macro sadness: 0.5005 +/- 0.0009
recall_macro disgust: 0.5000 +/- 0.0000
recall_macro surprise: 0.5000 +/- 0.0000


f1_macro anger: 0.4757 +/- 0.0001
f1_macro joy: 0.4862 +/- 0.0000
f1_macro sadness: 0.4777 +/- 0.0019
f1_macro disgust: 0.4795 +/- 0.0000
f1_macro surprise: 0.4895 +/- 0.0000


