# Compute Go emotions ground truth

In [6]:
from tqdm.auto import tqdm
import numpy as np
import tensorflow_datasets as tfds

train_ds = tfds.load('huggingface:go_emotions/simplified', split='test')
emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']

num_items, num_labels = len(train_ds), len(emotions)
train_ds = train_ds.as_numpy_iterator()

y_targets = np.zeros((num_items, num_labels), dtype=int)
for i in tqdm(range(num_items)):
    x = train_ds.next()
    if i < 5:
        print(x['text'])
    labels = x['labels']
    for j in labels:
        y_targets[i, j] = 1

np.save("data/model_eval/y_targets.npy", y_targets)

2023-11-23 15:20:02.626805: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
 10%|█         | 543/5427 [00:00<00:01, 2873.99it/s]

b'I have a bad feeling I\xe2\x80\x99m gonna regret not tuning into this'
b'So the article is pretty much nonsense.'
b"What happened to one of your star players?? i saw the headline 'freak injury' or someting like that.. but i was too scared to see someting gruesome.."
b'What u doing dame'
b'we deserve to lose after that'


100%|██████████| 5427/5427 [00:03<00:00, 1720.39it/s]


# Compute roberta predictions

In [None]:
from transformers import pipeline
import tensorflow_datasets as tfds
import numpy as np
from tqdm.auto import tqdm
classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None, max_length=512, truncation=True)
train_ds = tfds.load('huggingface:go_emotions/simplified', split='test')

emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']
opt_thresholds = [0.25, 0.45, 0.15, 0.1, 0.3, 0.4, 0.55, 0.25, 0.25, 0.4, 0.3, 0.2, 0.1, 0.35, 0.4, 0.45, 0.05, 0.4, 0.25, 0.25, 0.2, 0.1, 0.15, 0.05, 0.1, 0.4, 0.15, 0.25]

num_items, num_labels = len(train_ds), len(emotions)
train_ds_iter = train_ds.as_numpy_iterator()

def get_detected_emotions(model_output, thresholds=0.5):
    th = np.asarray(thresholds)
    emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']
    outputs = {d['label']: d['score'] for d in model_output[0]}
    output_list = [outputs[em] for em in emotions]
    emotion_ids = np.where(np.asarray(output_list) > th)[0]
    emotions = np.asarray(emotions)[emotion_ids]
    return emotion_ids, emotions 

##### Compute scores

In [15]:
y_bert_pred = np.zeros((num_items, num_labels), dtype=int)
for i in tqdm(range(num_items)):
    x = train_ds_iter.next()
    if i < 5:
        print(x['text'])
    labels = classifier(x['text'].decode('utf-8'))[0]
    labels, _ = get_detected_emotions([labels], thresholds=opt_thresholds)
    for j in labels:
        y_bert_pred[i, j] = 1

np.save("data/model_eval/y_bert_pred_optth.npy", y_bert_pred)

train_ds_iter = train_ds.as_numpy_iterator()
y_bert_pred = np.zeros((num_items, num_labels), dtype=int)
for i in tqdm(range(num_items)):
    x = train_ds_iter.next()
    if i < 5:
        print(x['text'])
    labels = classifier(x['text'].decode('utf-8'))[0]
    labels, _ = get_detected_emotions([labels], thresholds=0.5)
    for j in labels:
        y_bert_pred[i, j] = 1

np.save("data/model_eval/y_bert_pred_05.npy", y_bert_pred)

  0%|          | 0/5427 [00:00<?, ?it/s]

b'I have a bad feeling I\xe2\x80\x99m gonna regret not tuning into this'


  0%|          | 1/5427 [00:00<1:15:15,  1.20it/s]

b'So the article is pretty much nonsense.'


  0%|          | 2/5427 [00:01<44:18,  2.04it/s]  

b"What happened to one of your star players?? i saw the headline 'freak injury' or someting like that.. but i was too scared to see someting gruesome.."


  0%|          | 3/5427 [00:01<38:22,  2.36it/s]

b'What u doing dame'


  0%|          | 5/5427 [00:01<23:57,  3.77it/s]

b'we deserve to lose after that'


100%|██████████| 5427/5427 [11:24<00:00,  7.92it/s] 
  0%|          | 0/5427 [00:00<?, ?it/s]

b'I have a bad feeling I\xe2\x80\x99m gonna regret not tuning into this'


  0%|          | 2/5427 [00:00<17:40,  5.12it/s]

b'So the article is pretty much nonsense.'
b"What happened to one of your star players?? i saw the headline 'freak injury' or someting like that.. but i was too scared to see someting gruesome.."


  0%|          | 4/5427 [00:00<16:48,  5.38it/s]

b'What u doing dame'
b'we deserve to lose after that'


100%|██████████| 5427/5427 [08:56<00:00, 10.12it/s]


# Compute LLM predictions

In [1]:
import ctypes

from llama_cpp import llama_log_set

def my_log_callback(level, message, user_data):
    pass

log_callback = ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)(my_log_callback)
llama_log_set(log_callback, ctypes.c_void_p())

In [2]:
from llama_cpp import Llama
import tensorflow_datasets as tfds
import numpy as np
from tqdm.auto import tqdm

emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']
test_ds = tfds.load('huggingface:go_emotions/simplified', split='test')
num_items, num_labels = len(test_ds), len(emotions)

path = "./llama-2-7b-chat.Q5_K_M.gguf"
llm = Llama(model_path=path, verbose=False)

high_recall_prompt = "Write all the emotions present in the following text. Do not write emotions that are not present. You can only use the following words: admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, neutral, optimism, pride, realization, relief, remorse, sadness, surprise"
high_f1_prompt = "Which are the emotions present in the following text?"

def get_llm_predictions(text, prompt):
    eval_prompt = "Q: " + prompt + " Text: " + text + " A:"
    output = llm(eval_prompt, max_tokens=128, stop=["Q:", "\n"], echo=False)
    pred_emotions_idx = [i for i, em in enumerate(emotions) if em in output['choices'][0]['text'].lower()]
    y = np.zeros(len(emotions), dtype=int)
    y[pred_emotions_idx] = 1
    return y

  from .autonotebook import tqdm as notebook_tqdm
  hf_names = hf_datasets.list_datasets()
2023-12-18 19:39:14.285687: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-18 19:39:14.336630: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 19:39:14.696042: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 19:39:14.698509: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-18 

In [3]:
# High f1 score prompt evaluation
#y_llm_pred = np.zeros((num_items, num_labels), dtype=int)
#test_ds_iter = test_ds.as_numpy_iterator()

if False:
    for i in tqdm(range(num_items)):
        x = test_ds_iter.next()
        text = x['text'].decode('utf-8')
        labels = get_llm_predictions(text, high_f1_prompt)
        y_llm_pred[i, :] = labels
        
        if i < 5:
            print(x['text'])
            print(labels)

    np.save("data/model_eval/y_llm_f1.npy", y_llm_pred)

# High recall prompt evaluation
#y_llm_pred = np.zeros((num_items, num_labels), dtype=int)
i0 = 600 + 3100 + 700
y_llm_pred = np.load("data/model_eval/y_llm_recall.npy")
test_ds_iter = test_ds.as_numpy_iterator()
for _ in range(i0+1):
    x = test_ds_iter.next()

#for i in tqdm(range(num_items)):
for i in tqdm(range(i0+1, num_items)):
    x = test_ds_iter.next()
    text = x['text'].decode('utf-8')
    labels = get_llm_predictions(text, high_recall_prompt)
    y_llm_pred[i, :] = labels
    
    if i < 5:
        print(x['text'])
        print(labels)

    if i % 100 == 0:
        np.save("data/model_eval/y_llm_recall.npy", y_llm_pred)

np.save("data/model_eval/y_llm_recall.npy", y_llm_pred)

100%|██████████| 1026/1026 [3:57:34<00:00, 13.89s/it] 


# Build tables with metrics

In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']

def calc_label_metrics(label, y_targets, y_preds, threshold):
    return {
        "label": label,
        "accuracy": metrics.accuracy_score(y_targets, y_preds),
        "precision": metrics.precision_score(y_targets, y_preds, zero_division=0),
        "recall": metrics.recall_score(y_targets, y_preds, zero_division=0),
        "f1": metrics.f1_score(y_targets, y_preds, zero_division=0),
        "mcc": metrics.matthews_corrcoef(y_targets, y_preds),
        "support": y_targets.sum(),
        "threshold": threshold,
    }

def compare_models(y_gt, y_pred, threshold=0.5):
    emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']
    results = []
    for label_index, label in enumerate(emotions):
        y_targets, y_preds = y_gt[:, label_index], y_pred[:, label_index]
        results.append(calc_label_metrics(label, y_targets, y_preds, threshold))
    return results

In [5]:
models = [
    ("ROBERTA_0.5", "data/model_eval/y_bert_pred_05.npy", "roberta_05"),
    ("ROBERTA_opt", "data/model_eval/y_bert_pred_optth.npy", "roberta_opt"),
    ("LLM_recall", "data/model_eval/y_llm_recall.npy", "llm_recall"),
    ("LLM_f1", "data/model_eval/y_llm_f1.npy", "llm_f1"),
]

mean_scores_log = []
for model_name, pred_path, savefile_name in models:

    # Compute results
    y_gt = np.load("data/model_eval/y_targets.npy")
    y_preds = np.load(pred_path)
    results = compare_models(y_gt, y_preds)

    per_label_results = pd.DataFrame(results, index=emotions)
    per_label_results = per_label_results.drop(columns=["label"])
    display(per_label_results.round(3))

    # Save model mean scores
    mean_scores = per_label_results.mean(axis=0)
    mean_scores_log.append(mean_scores)
    mean_scores.to_csv(f"data/model_eval/mean_scores_{savefile_name}.csv")
    print(mean_scores)

    # Save latex table
    latex = per_label_results.round(3).to_latex(index=False, formatters={"name": str.upper}, float_format="{:.1f}".format)
    with open(f"data/model_eval/metrics_per_label_{savefile_name}.tex", "w") as f:
        f.write(latex)

    # Save html table with colors
    results_hm = per_label_results.style.background_gradient(cmap='Blues', subset=["accuracy", "recall", "precision", "f1", "mcc"])
    display(results_hm)

    with open(f"data/prompt_search/metrics_per_label_{savefile_name}.html", "w") as f:
        f.write(results_hm.to_html())

# Save mean scores and tables
model_names = [m[0] for m in models]
mean_scores_df = pd.DataFrame(mean_scores_log, index=model_names)
mean_scores_df = mean_scores_df.drop(columns=["support", "threshold"])
display(mean_scores_df.round(3))

# Save latex table
latex = mean_scores_df.round(3).to_latex(index=True, formatters={"name": str.upper}, float_format="{:.1f}".format)
with open(f"data/model_eval/mean_scores.tex", "w") as f:
    f.write(latex)

# Save html table with colors
results_hm = mean_scores_df.style.background_gradient(cmap='Blues', subset=["accuracy", "recall", "precision", "f1", "mcc"])
display(results_hm)

with open(f"data/model_eval/mean_scores.html", "w") as f:
    f.write(results_hm.to_html())

latex = mean_scores_df.round(3).to_latex(index=True, formatters={"name": str.upper}, float_format="{:.3f}".format)
with open(f"data/model_eval/mean_scores.tex", "w") as f:
    f.write(latex)


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.946,0.725,0.675,0.699,0.67,504,0.5
amusement,0.982,0.79,0.871,0.829,0.821,264,0.5
anger,0.97,0.652,0.379,0.479,0.483,198,0.5
annoyance,0.94,0.472,0.159,0.238,0.25,320,0.5
approval,0.942,0.609,0.302,0.404,0.403,351,0.5
caring,0.973,0.448,0.319,0.372,0.364,135,0.5
confusion,0.972,0.5,0.431,0.463,0.45,153,0.5
curiosity,0.95,0.537,0.356,0.428,0.412,284,0.5
desire,0.987,0.63,0.41,0.496,0.502,83,0.5
disappointment,0.974,0.625,0.199,0.302,0.343,151,0.5


accuracy       0.952875
precision      0.451283
recall         0.308002
f1             0.348578
mcc            0.345403
support      226.035714
threshold      0.500000
dtype: float64


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.946011,0.724947,0.674603,0.698869,0.669782,504,0.5
amusement,0.982495,0.790378,0.871212,0.828829,0.820707,264,0.5
anger,0.969965,0.652174,0.378788,0.479233,0.48317,198,0.5
annoyance,0.93993,0.472222,0.159375,0.238318,0.249988,320,0.5
approval,0.942325,0.609195,0.301994,0.40381,0.40293,351,0.5
caring,0.973282,0.447917,0.318519,0.372294,0.364499,135,0.5
confusion,0.971808,0.5,0.431373,0.463158,0.450052,153,0.5
curiosity,0.950249,0.537234,0.355634,0.427966,0.412479,284,0.5
desire,0.987286,0.62963,0.409639,0.49635,0.501859,83,0.5
disappointment,0.974387,0.625,0.198675,0.301508,0.342998,151,0.5


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.94,0.651,0.776,0.708,0.678,504,0.5
amusement,0.982,0.781,0.89,0.832,0.825,264,0.5
anger,0.959,0.454,0.601,0.517,0.502,198,0.5
annoyance,0.864,0.243,0.619,0.349,0.328,320,0.5
approval,0.926,0.432,0.442,0.437,0.397,351,0.5
caring,0.972,0.426,0.385,0.405,0.391,135,0.5
confusion,0.974,0.548,0.412,0.47,0.462,153,0.5
curiosity,0.943,0.473,0.711,0.568,0.552,284,0.5
desire,0.985,0.518,0.53,0.524,0.516,83,0.5
disappointment,0.974,0.562,0.298,0.39,0.398,151,0.5


accuracy       0.935712
precision      0.406477
recall         0.428797
f1             0.397704
mcc            0.380985
support      226.035714
threshold      0.500000
dtype: float64


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.940483,0.650582,0.775794,0.707692,0.678085,504,0.5
amusement,0.982495,0.780731,0.890152,0.831858,0.824636,264,0.5
anger,0.959093,0.454198,0.60101,0.517391,0.501779,198,0.5
annoyance,0.863645,0.242647,0.61875,0.348592,0.328033,320,0.5
approval,0.926294,0.431755,0.441595,0.43662,0.397222,351,0.5
caring,0.971808,0.42623,0.385185,0.404669,0.390794,135,0.5
confusion,0.973835,0.547826,0.411765,0.470149,0.461914,153,0.5
curiosity,0.943431,0.473068,0.711268,0.568214,0.552118,284,0.5
desire,0.985259,0.517647,0.53012,0.52381,0.516362,83,0.5
disappointment,0.974019,0.5625,0.298013,0.38961,0.397649,151,0.5


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.365,0.104,0.764,0.183,0.055,504,0.5
amusement,0.391,0.05,0.636,0.092,0.006,264,0.5
anger,0.38,0.036,0.626,0.069,-0.001,198,0.5
annoyance,0.419,0.06,0.6,0.109,0.004,320,0.5
approval,0.36,0.066,0.672,0.12,0.005,351,0.5
caring,0.366,0.026,0.659,0.049,0.006,135,0.5
confusion,0.351,0.025,0.569,0.047,-0.03,153,0.5
curiosity,0.372,0.043,0.518,0.079,-0.055,284,0.5
desire,0.378,0.017,0.687,0.033,0.015,83,0.5
disappointment,0.382,0.031,0.689,0.058,0.021,151,0.5


accuracy       0.475756
precision      0.042974
recall         0.537683
f1             0.070783
mcc            0.008419
support      226.035714
threshold      0.500000
dtype: float64


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.365027,0.103718,0.763889,0.182638,0.054989,504,0.5
amusement,0.390824,0.049734,0.636364,0.092257,0.006493,264,0.5
anger,0.380136,0.036321,0.626263,0.06866,-0.001134,198,0.5
annoyance,0.418832,0.059664,0.6,0.108536,0.003586,320,0.5
approval,0.359683,0.065628,0.672365,0.119584,0.005424,351,0.5
caring,0.366317,0.02556,0.659259,0.049212,0.00588,135,0.5
confusion,0.351207,0.024562,0.568627,0.047091,-0.030062,153,0.5
curiosity,0.372213,0.04302,0.517606,0.079438,-0.054513,284,0.5
desire,0.378478,0.016745,0.686747,0.032693,0.015339,83,0.5
disappointment,0.382348,0.030507,0.688742,0.058427,0.021208,151,0.5


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.907,0.478,0.044,0.08,0.123,504,0.5
amusement,0.941,0.325,0.201,0.248,0.226,264,0.5
anger,0.853,0.123,0.495,0.198,0.192,198,0.5
annoyance,0.908,0.129,0.097,0.111,0.064,320,0.5
approval,0.924,0.095,0.02,0.033,0.014,351,0.5
caring,0.974,0.0,0.0,0.0,-0.005,135,0.5
confusion,0.935,0.119,0.203,0.15,0.123,153,0.5
curiosity,0.921,0.227,0.215,0.221,0.179,284,0.5
desire,0.973,0.168,0.193,0.18,0.167,83,0.5
disappointment,0.928,0.102,0.205,0.136,0.11,151,0.5


accuracy       0.936060
precision      0.165265
recall         0.143782
f1             0.117354
mcc            0.103977
support      226.035714
threshold      0.500000
dtype: float64


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.906762,0.478261,0.043651,0.08,0.122766,504,0.5
amusement,0.940851,0.325153,0.200758,0.248244,0.226178,264,0.5
anger,0.853326,0.123426,0.494949,0.197581,0.191966,198,0.5
annoyance,0.908052,0.128631,0.096875,0.110517,0.063755,320,0.5
approval,0.924268,0.094595,0.019943,0.032941,0.014302,351,0.5
caring,0.974203,0.0,0.0,0.0,-0.00485,135,0.5
confusion,0.935139,0.118774,0.202614,0.149758,0.123006,153,0.5
curiosity,0.920582,0.226766,0.214789,0.220615,0.17888,284,0.5
desire,0.973097,0.168421,0.192771,0.179775,0.166555,83,0.5
disappointment,0.927584,0.101974,0.205298,0.136264,0.109826,151,0.5


Unnamed: 0,accuracy,precision,recall,f1,mcc
ROBERTA_0.5,0.953,0.451,0.308,0.349,0.345
ROBERTA_opt,0.936,0.406,0.429,0.398,0.381
LLM_recall,0.476,0.043,0.538,0.071,0.008
LLM_f1,0.936,0.165,0.144,0.117,0.104


Unnamed: 0,accuracy,precision,recall,f1,mcc
ROBERTA_0.5,0.952875,0.451283,0.308002,0.348578,0.345403
ROBERTA_opt,0.935712,0.406477,0.428797,0.397704,0.380985
LLM_recall,0.475756,0.042974,0.537683,0.070783,0.008419
LLM_f1,0.93606,0.165265,0.143782,0.117354,0.103977


In [14]:
y_gt = np.load("data/model_eval/y_targets.npy")[:]
y_preds = np.load("data/model_eval/y_bert_pred_optth.npy")[:]
print(y_gt.shape, y_preds.shape)
results = compare_models(y_gt, y_preds)

per_label_results = pd.DataFrame(results, index=emotions)
per_label_results = per_label_results.drop(columns=["label"])
display(per_label_results.round(3))

# Save model mean scores
mean_scores = per_label_results.mean(axis=0)
print(mean_scores)
mean_scores.to_csv("data/model_eval/mean_scores_roberta_optth.csv")

# Save results in display format
latex = per_label_results.round(3).to_latex(index=False, formatters={"name": str.upper}, float_format="{:.1f}".format)
with open("data/model_eval/metrics_per_label_roberta.tex", "w") as f:
    f.write(latex)

results_hm = per_label_results.style.background_gradient(cmap='Blues', subset=["accuracy", "recall", "precision", "f1", "mcc"])
display(results_hm)

with open(f"data/prompt_search/metrics_per_label_roberta.html", "w") as f:
    f.write(results_hm.to_html())

(5427, 28) (5427, 28)


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.94,0.651,0.776,0.708,0.678,504,0.5
amusement,0.982,0.781,0.89,0.832,0.825,264,0.5
anger,0.959,0.454,0.601,0.517,0.502,198,0.5
annoyance,0.864,0.243,0.619,0.349,0.328,320,0.5
approval,0.926,0.432,0.442,0.437,0.397,351,0.5
caring,0.972,0.426,0.385,0.405,0.391,135,0.5
confusion,0.974,0.548,0.412,0.47,0.462,153,0.5
curiosity,0.943,0.473,0.711,0.568,0.552,284,0.5
desire,0.985,0.518,0.53,0.524,0.516,83,0.5
disappointment,0.974,0.562,0.298,0.39,0.398,151,0.5


accuracy       0.935712
precision      0.406477
recall         0.428797
f1             0.397704
mcc            0.380985
support      226.035714
threshold      0.500000
dtype: float64


Unnamed: 0,accuracy,precision,recall,f1,mcc,support,threshold
admiration,0.940483,0.650582,0.775794,0.707692,0.678085,504,0.5
amusement,0.982495,0.780731,0.890152,0.831858,0.824636,264,0.5
anger,0.959093,0.454198,0.60101,0.517391,0.501779,198,0.5
annoyance,0.863645,0.242647,0.61875,0.348592,0.328033,320,0.5
approval,0.926294,0.431755,0.441595,0.43662,0.397222,351,0.5
caring,0.971808,0.42623,0.385185,0.404669,0.390794,135,0.5
confusion,0.973835,0.547826,0.411765,0.470149,0.461914,153,0.5
curiosity,0.943431,0.473068,0.711268,0.568214,0.552118,284,0.5
desire,0.985259,0.517647,0.53012,0.52381,0.516362,83,0.5
disappointment,0.974019,0.5625,0.298013,0.38961,0.397649,151,0.5


### TODO:
- Run bert predictions
- Implement LLM prompt search
- Run it for results