# Compute GoEmotions ground truth

In [None]:
from tqdm.auto import tqdm
import numpy as np
import tensorflow_datasets as tfds

# NOTE: despite the `train_ds` name, the split loaded here is 'test'.
train_ds = tfds.load('huggingface:go_emotions/simplified', split='test')
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride',
    'realization', 'relief', 'remorse', 'sadness', 'surprise',
]

num_items = len(train_ds)
num_labels = len(emotions)
train_ds = train_ds.as_numpy_iterator()

# One row per example, one column per emotion; a 1 marks every label index
# listed in the example's 'labels' field.
y_targets = np.zeros((num_items, num_labels), dtype=int)
for i in tqdm(range(num_items)):
    x = next(train_ds)
    if i < 5:
        # Show the first few raw texts as a quick sanity check.
        print(x['text'])
    labels = x['labels']
    y_targets[i, labels] = 1

np.save("data/model_eval/y_targets.npy", y_targets)

# Compute RoBERTa predictions

In [None]:
from transformers import pipeline
import tensorflow_datasets as tfds
import numpy as np
from tqdm.auto import tqdm
# Text-classification pipeline for the GoEmotions RoBERTa checkpoint.
classifier = pipeline(
    task="text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,
    max_length=512,
    truncation=True,
)
# NOTE: despite the `train_ds` name, the split loaded here is 'test'.
train_ds = tfds.load('huggingface:go_emotions/simplified', split='test')

emotions = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride',
    'realization', 'relief', 'remorse', 'sadness', 'surprise',
]
# One threshold per label, in the same order as `emotions`.
opt_thresholds = [
    0.25, 0.45, 0.15, 0.1, 0.3, 0.4, 0.55,
    0.25, 0.25, 0.4, 0.3, 0.2, 0.1, 0.35,
    0.4, 0.45, 0.05, 0.4, 0.25, 0.25, 0.2,
    0.1, 0.15, 0.05, 0.1, 0.4, 0.15, 0.25,
]

num_items = len(train_ds)
num_labels = len(emotions)
train_ds_iter = train_ds.as_numpy_iterator()

def get_detected_emotions(model_output, thresholds=0.5):
    """Return the emotions whose score is strictly above its threshold.

    Args:
        model_output: Sequence whose first element is an iterable of
            ``{'label': str, 'score': float}`` dicts. It must contain an
            entry for each of the 28 emotion labels listed in this function;
            the order of the entries does not matter.
        thresholds: Either a single number applied to every label, or a
            sequence with exactly one threshold per label, ordered like the
            emotion list in this function.

    Returns:
        A tuple ``(emotion_ids, emotions)``: a NumPy array with the indices of
        the detected labels, and a NumPy array with the matching label names.

    Raises:
        KeyError: If ``model_output[0]`` lacks one of the emotion labels.
        ValueError: If ``thresholds`` is neither a scalar nor one value per
            label.
    """
    emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']

    # Validate the thresholds up front: a wrong-length or 2-D input would
    # otherwise either fail with an opaque broadcasting error or make
    # np.where(...)[0] return row indices instead of label indices.
    try:
        th = np.broadcast_to(np.asarray(thresholds), (len(emotions),))
    except ValueError:
        raise ValueError(
            "thresholds must be a scalar or a sequence of "
            f"{len(emotions)} values, got shape {np.shape(thresholds)}"
        ) from None

    # Re-order the scores so that position i corresponds to emotions[i].
    outputs = {d['label']: d['score'] for d in model_output[0]}
    scores = np.asarray([outputs[em] for em in emotions])

    emotion_ids = np.where(scores > th)[0]
    return emotion_ids, np.asarray(emotions)[emotion_ids]

##### Compute scores

In [None]:
# Run the classifier once over the split and derive both prediction matrices
# (per-label thresholds and a flat 0.5 threshold) from the same scores, rather
# than running the model over the whole dataset twice.
# A fresh iterator is created here so this cell can be re-run on its own.
train_ds_iter = train_ds.as_numpy_iterator()
y_bert_pred_optth = np.zeros((num_items, num_labels), dtype=int)
y_bert_pred_05 = np.zeros((num_items, num_labels), dtype=int)
for i in tqdm(range(num_items)):
    x = next(train_ds_iter)
    if i < 5:
        # Show the first few raw texts as a quick sanity check.
        print(x['text'])
    scores = classifier(x['text'].decode('utf-8'))[0]
    optth_ids, _ = get_detected_emotions([scores], thresholds=opt_thresholds)
    flat_ids, _ = get_detected_emotions([scores], thresholds=0.5)
    y_bert_pred_optth[i, optth_ids] = 1
    y_bert_pred_05[i, flat_ids] = 1

np.save("data/model_eval/y_bert_pred_optth.npy", y_bert_pred_optth)
np.save("data/model_eval/y_bert_pred_05.npy", y_bert_pred_05)

# The original cell left the 0.5-threshold matrix bound to `y_bert_pred`;
# keep that name available for any later ad-hoc use.
y_bert_pred = y_bert_pred_05

# Compute LLM predictions

In [None]:
import ctypes

from llama_cpp import llama_log_set

def my_log_callback(level, message, user_data):
    """Log callback that ignores every message passed to it."""
    return None


# Wrap the Python function as a C callable taking (int, char *, void *) and
# returning nothing, then register it through llama_log_set.
# `log_callback` stays bound at module level so the wrapper object remains
# referenced after registration.
_log_callback_type = ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)
log_callback = _log_callback_type(my_log_callback)
llama_log_set(log_callback, ctypes.c_void_p())

In [None]:
from llama_cpp import Llama
import tensorflow_datasets as tfds
import numpy as np
from tqdm.auto import tqdm

emotions = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride',
    'realization', 'relief', 'remorse', 'sadness', 'surprise',
]
test_ds = tfds.load('huggingface:go_emotions/simplified', split='test')
num_items = len(test_ds)
num_labels = len(emotions)

# Local GGUF model file, loaded with llama.cpp's own logging disabled.
path = "./llama-2-7b-chat.Q5_K_M.gguf"
llm = Llama(model_path=path, verbose=False)

# Two prompt variants; the first one spells out the allowed label vocabulary.
high_recall_prompt = "Write all the emotions present in the following text. Do not write emotions that are not present. You can only use the following words: admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire, disappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness, neutral, optimism, pride, realization, relief, remorse, sadness, surprise"
high_f1_prompt = "Which are the emotions present in the following text?"

def get_llm_predictions(text, prompt):
    """Query the LLM about `text` and turn its answer into a label vector.

    The model is asked ``"Q: <prompt> Text: <text> A:"`` and its completion is
    lower-cased and scanned for each name in the module-level `emotions` list.

    A label only counts when it appears at the start of a word. Plain
    substring matching would flag "approval" whenever the answer contains
    "disapproval", and "joy" whenever it contains "enjoy". Suffixes are still
    accepted, so "fears" or "joyful" continue to match "fear" and "joy".

    Args:
        text: The input text to analyse (str).
        prompt: The instruction placed before the text (str).

    Returns:
        A 1-D integer NumPy array of length ``len(emotions)`` with a 1 at the
        position of every emotion found in the answer and 0 elsewhere.
    """
    import re  # local import: keeps this cell self-contained

    eval_prompt = "Q: " + prompt + " Text: " + text + " A:"
    output = llm(eval_prompt, max_tokens=128, stop=["Q:", "\n"], echo=False)
    answer = output['choices'][0]['text'].lower()

    y = np.zeros(len(emotions), dtype=int)
    for i, em in enumerate(emotions):
        # \b anchors the label at a word start only (see docstring).
        if re.search(r"\b" + re.escape(em), answer):
            y[i] = 1
    return y

In [None]:
import os


def run_llm_evaluation(prompt, save_path, start_index=0, checkpoint_every=100):
    """Run the LLM over the test split and save a prediction matrix.

    Args:
        prompt: Instruction passed to `get_llm_predictions` for every text.
        save_path: ``.npy`` file the prediction matrix is written to. When
            `start_index` is greater than 0 the existing matrix is loaded
            from this path and filled in from that row on.
        start_index: Index of the first example to process. Rows before it
            are left as loaded from `save_path`.
        checkpoint_every: Save the matrix whenever the row index is a multiple
            of this value; pass ``None`` to save only once at the end.

    Returns:
        The ``(num_items, num_labels)`` integer prediction matrix.
    """
    if start_index > 0:
        y_pred = np.load(save_path)
    else:
        y_pred = np.zeros((num_items, num_labels), dtype=int)

    # Advance the dataset iterator past the rows that are already done.
    ds_iter = test_ds.as_numpy_iterator()
    for _ in range(start_index):
        next(ds_iter)

    for i in tqdm(range(start_index, num_items)):
        x = next(ds_iter)
        text = x['text'].decode('utf-8')
        labels = get_llm_predictions(text, prompt)
        y_pred[i, :] = labels

        if i < 5:
            # Show the first few examples and their predictions.
            print(x['text'])
            print(labels)

        if checkpoint_every and i % checkpoint_every == 0:
            np.save(save_path, y_pred)

    np.save(save_path, y_pred)
    return y_pred


# High f1 score prompt evaluation (disabled, as in the original cell).
RUN_HIGH_F1_EVAL = False
if RUN_HIGH_F1_EVAL:
    run_llm_evaluation(high_f1_prompt, "data/model_eval/y_llm_f1.npy", checkpoint_every=None)

# High recall prompt evaluation
RECALL_SAVE_PATH = "data/model_eval/y_llm_recall.npy"
# NOTE(review): presumably the index of the last row already covered by the
# saved checkpoint from earlier partial runs — confirm before resuming.
i0 = 600 + 3100 + 700
# Resume after i0 when a checkpoint exists; otherwise start from scratch so a
# fresh "Restart & Run All" does not fail on the missing file.
resume_from = i0 + 1 if os.path.exists(RECALL_SAVE_PATH) else 0
y_llm_pred = run_llm_evaluation(high_recall_prompt, RECALL_SAVE_PATH, start_index=resume_from)

# Build tables with metrics

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
# Label names, in the column order used by the saved prediction matrices.
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride',
    'realization', 'relief', 'remorse', 'sadness', 'surprise',
]

def calc_label_metrics(label, y_targets, y_preds, threshold):
    """Collect the evaluation metrics of one label into a dict.

    Args:
        label: Name of the label; stored under ``"label"``.
        y_targets: Ground-truth values for this label.
        y_preds: Predicted values for this label.
        threshold: Stored unchanged under ``"threshold"``; it is not used in
            any computation here.

    Returns:
        Dict with the keys ``label``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``mcc``, ``support`` (sum of `y_targets`) and ``threshold``.
    """
    scores = {"label": label}
    scores["accuracy"] = metrics.accuracy_score(y_targets, y_preds)
    # zero_division=0: report 0 instead of warning when a ratio is undefined.
    scores["precision"] = metrics.precision_score(y_targets, y_preds, zero_division=0)
    scores["recall"] = metrics.recall_score(y_targets, y_preds, zero_division=0)
    scores["f1"] = metrics.f1_score(y_targets, y_preds, zero_division=0)
    scores["mcc"] = metrics.matthews_corrcoef(y_targets, y_preds)
    scores["support"] = y_targets.sum()
    scores["threshold"] = threshold
    return scores

def compare_models(y_gt, y_pred, threshold=0.5):
    """Compute per-label metrics for a full prediction matrix.

    Args:
        y_gt: 2-D ground-truth array; column k belongs to the k-th emotion of
            the list defined in this function.
        y_pred: 2-D prediction array with the same column layout.
        threshold: Passed through to `calc_label_metrics` for every label.

    Returns:
        A list with one `calc_label_metrics` result per emotion, in label
        order.
    """
    emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']
    return [
        calc_label_metrics(name, y_gt[:, col], y_pred[:, col], threshold)
        for col, name in enumerate(emotions)
    ]

In [None]:
# (display name, predictions file, suffix used in the output file names)
models = [
    ("ROBERTA_0.5", "data/model_eval/y_bert_pred_05.npy", "roberta_05"),
    ("ROBERTA_opt", "data/model_eval/y_bert_pred_optth.npy", "roberta_opt"),
    ("LLM_recall", "data/model_eval/y_llm_recall.npy", "llm_recall"),
    ("LLM_f1", "data/model_eval/y_llm_f1.npy", "llm_f1"),
]

# The ground truth is the same for every model, so load it once.
y_gt = np.load("data/model_eval/y_targets.npy")

mean_scores_log = []
for model_name, pred_path, savefile_name in models:

    # Compute results
    y_preds = np.load(pred_path)
    results = compare_models(y_gt, y_preds)

    per_label_results = pd.DataFrame(results, index=emotions)
    per_label_results = per_label_results.drop(columns=["label"])
    display(per_label_results.round(3))

    # Save model mean scores
    mean_scores = per_label_results.mean(axis=0)
    mean_scores_log.append(mean_scores)
    mean_scores.to_csv(f"data/model_eval/mean_scores_{savefile_name}.csv")
    print(mean_scores)

    # Save latex table. The index (emotion names) is kept so rows stay
    # identifiable, and floats use three decimals to match the rounding;
    # one decimal would collapse most metric values in [0, 1].
    latex = per_label_results.round(3).to_latex(index=True, float_format="{:.3f}".format)
    with open(f"data/model_eval/metrics_per_label_{savefile_name}.tex", "w") as f:
        f.write(latex)

    # Save html table with colors
    results_hm = per_label_results.style.background_gradient(cmap='Blues', subset=["accuracy", "recall", "precision", "f1", "mcc"])
    display(results_hm)

    # NOTE(review): this is the only per-model output written under
    # data/prompt_search/ instead of data/model_eval/ — confirm it is intended.
    with open(f"data/prompt_search/metrics_per_label_{savefile_name}.html", "w") as f:
        f.write(results_hm.to_html())

# Save mean scores and tables
model_names = [m[0] for m in models]
mean_scores_df = pd.DataFrame(mean_scores_log, index=model_names)
mean_scores_df = mean_scores_df.drop(columns=["support", "threshold"])
display(mean_scores_df.round(3))

# Save latex table (written once, with three decimals; the earlier
# one-decimal version of this file was overwritten immediately anyway).
latex = mean_scores_df.round(3).to_latex(index=True, float_format="{:.3f}".format)
with open("data/model_eval/mean_scores.tex", "w") as f:
    f.write(latex)

# Save html table with colors
results_hm = mean_scores_df.style.background_gradient(cmap='Blues', subset=["accuracy", "recall", "precision", "f1", "mcc"])
display(results_hm)

with open("data/model_eval/mean_scores.html", "w") as f:
    f.write(results_hm.to_html())


In [None]:
# Per-label metrics for the RoBERTa predictions made with the per-label
# thresholds.
y_gt = np.load("data/model_eval/y_targets.npy")
y_preds = np.load("data/model_eval/y_bert_pred_optth.npy")
print(y_gt.shape, y_preds.shape)
results = compare_models(y_gt, y_preds)

per_label_results = pd.DataFrame(results, index=emotions)
per_label_results = per_label_results.drop(columns=["label"])
display(per_label_results.round(3))

# Save model mean scores
mean_scores = per_label_results.mean(axis=0)
print(mean_scores)
mean_scores.to_csv("data/model_eval/mean_scores_roberta_optth.csv")

# Save results in display format. The index (emotion names) is kept so rows
# stay identifiable, and floats use three decimals to match the rounding;
# one decimal would collapse most metric values in [0, 1].
latex = per_label_results.round(3).to_latex(index=True, float_format="{:.3f}".format)
with open("data/model_eval/metrics_per_label_roberta.tex", "w") as f:
    f.write(latex)

results_hm = per_label_results.style.background_gradient(cmap='Blues', subset=["accuracy", "recall", "precision", "f1", "mcc"])
display(results_hm)

# NOTE(review): written under data/prompt_search/ while the other outputs of
# this cell go to data/model_eval/ — confirm it is intended.
with open("data/prompt_search/metrics_per_label_roberta.html", "w") as f:
    f.write(results_hm.to_html())

### TODO:
- Run the RoBERTa predictions
- Implement the LLM prompt search
- Run the prompt search to produce results