In [None]:
import ast
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer

def load_data(path="../analysis/eval_gold_and_pred.txt"):
    """Read a tab-separated file into a list of rows.

    Args:
        path: Location of the file to read. Defaults to the evaluation dump
            this script was written against.

    Returns:
        A list with one entry per line of the file (every line, including
        the first); each entry is the list of tab-separated fields on that
        line. Empty fields are preserved as empty strings, including
        trailing ones.
    """
    with open(path, "r", encoding="utf-8") as file:
        # Remove only the line terminator. A bare ``strip()`` would also eat
        # trailing tabs, silently dropping empty final columns and making
        # positional indexing into the row fail.
        return [line.rstrip("\r\n").split("\t") for line in file]

import ast

def parse_label(label):
    """Turn a raw label field into a list of labels.

    The field is trimmed, doubled double-quotes are collapsed to single
    ones, one pair of enclosing double quotes is dropped, and the remainder
    is evaluated as a Python literal.

    Args:
        label: Raw text of one label field.

    Returns:
        ``[]`` for an empty field (or a field that is exactly ``""``),
        a one-element list when the literal is a string, or the elements
        of the literal when it is a tuple or list.

    Raises:
        ValueError: If the literal is neither a string, a tuple nor a list.
            Errors raised by ``ast.literal_eval`` itself propagate unchanged.
    """
    text = label.strip()

    # Nothing to parse: the field carries no labels at all.
    if not text or text == '""':
        return []

    # Collapse doubled quotes, then peel off one enclosing quote pair.
    text = text.replace('""', '"')
    if text.startswith('"') and text.endswith('"'):
        text = text[1:-1]

    value = ast.literal_eval(text)

    if isinstance(value, (tuple, list)):
        return list(value)
    if isinstance(value, str):
        return [value]
    raise ValueError(f"Unexpected label format: {value}")

# Load every row of the evaluation file; the first row is skipped below.
data = load_data()

# Pull the raw gold (column 2) and predicted (column 3) fields out of each row
# before any parsing happens.
raw_pairs = [(row[2], row[3]) for row in data[1:]]

# Parse each raw field into a list of label strings.
gold_labels = [parse_label(gold) for gold, _ in raw_pairs]
predictions = [parse_label(pred) for _, pred in raw_pairs]

# Fit the binarizer on gold and predicted labels together so that both
# indicator matrices share the same set of classes.
mlb = MultiLabelBinarizer().fit(gold_labels + predictions)

gold_binarized = mlb.transform(gold_labels)
pred_binarized = mlb.transform(predictions)

# Per-class and averaged precision / recall / F1.
report = classification_report(
    gold_binarized,
    pred_binarized,
    target_names=mlb.classes_,
)
print(report)


              precision    recall  f1-score   support

  admiration       0.73      0.85      0.79        13
   agitation       0.67      0.89      0.76        18
       anger       1.00      0.62      0.77         8
     disgust       0.50      0.25      0.33         4
        fear       0.87      0.72      0.79        18
         joy       0.80      0.50      0.62         8
        love       0.33      1.00      0.50         1
     neutral       0.50      0.25      0.33         4
     sadness       0.81      1.00      0.89        17
       shame       0.50      1.00      0.67         1

   micro avg       0.74      0.76      0.75        92
   macro avg       0.67      0.71      0.64        92
weighted avg       0.76      0.76      0.74        92
 samples avg       0.75      0.76      0.73        92

