In [1]:
from collections import Counter
from sklearn.metrics import f1_score

def subset_from_csv(file_path):
    """Load a tab-separated ``<label><TAB><text>`` file into a column dict.

    Each non-blank line is split on its first tab only, so the text part may
    itself contain tabs. The line terminator is stripped, so stored texts do
    not carry a trailing newline.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A dict ``{"category": [...], "text": [...]}`` whose two lists are
        aligned by line, in file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    subset = {"category": [], "text": []}

    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            line = line.rstrip("\n")
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line at EOF).
                continue

            class_, separator, text = line.partition("\t")
            if not separator:
                raise ValueError(
                    f"{file_path}:{line_number}: expected '<label>\\t<text>', got {line!r}"
                )

            subset["category"].append(class_)
            subset["text"].append(text)

    return subset

In [8]:
# Load the three splits; each is a {"category": [...], "text": [...]} dict.
emotion_v0_train = subset_from_csv("../data/BenchmarkingZeroShot/emotion/train_pu_half_v0.txt")
emotion_test = subset_from_csv("../data/BenchmarkingZeroShot/emotion/test.txt")
emotion_dev = subset_from_csv("../data/BenchmarkingZeroShot/emotion/dev.txt")

# "Seen" classes are the labels that occur in train_pu_half_v0.txt.
seen_classes = set(emotion_v0_train["category"])

# Label frequencies over the whole test split (seen and unseen classes alike).
counts = Counter(emotion_test["category"])
top_entry = counts.most_common(1)[0]
most_common_class = top_entry[0]
most_common_class_count = top_entry[1]

print("Most common class      : ", most_common_class)
print("Most common class count: ", most_common_class_count)
print("All counts")

# Displayed as the cell result. The reported joy count (3100) does not match
# the count found in this file (3000).
counts.most_common()

Most common class      :  joy
Most common class count:  3000
All counts


[('joy', 3000),
 ('fear', 2700),
 ('anger', 2500),
 ('sadness', 2300),
 ('noemo', 2000),
 ('surprise', 1200),
 ('love', 1100),
 ('disgust', 600),
 ('shame', 300),
 ('guilt', 300)]

In [3]:
# Labels that appear in an evaluation split but not among the seen classes.
test_label_set = set(emotion_test["category"])
dev_label_set = set(emotion_dev["category"])
unseen_classes = test_label_set.difference(seen_classes)

# Sanity check: dev and test must expose exactly the same unseen classes.
assert dev_label_set.difference(seen_classes) == unseen_classes

unseen_classes

{'disgust', 'guilt', 'joy', 'noemo', 'surprise'}

# Majority baseline computed with sklearn's `f1_score`

In [4]:
print("Emotion v0 **test** unseen classes majority baseline")

# Keep only the test labels that belong to an unseen class.
emotion_labels_test_v0_unseen = [
    label for label in emotion_test["category"] if label in unseen_classes
]

print("Reported: 13.3")

# Majority baseline: predict `most_common_class` for every retained example.
test_majority_predictions = [most_common_class] * len(emotion_labels_test_v0_unseen)
res = f1_score(
    y_true=emotion_labels_test_v0_unseen,
    y_pred=test_majority_predictions,
    average="weighted",
)
print(f"Ours    : {res * 100}")

Emotion v0 **test** unseen classes majority baseline
Reported: 13.3
Ours    : 25.101101659461726


In [5]:
print("Emotion v0 **dev** unseen classes majority baseline")

# Keep only the dev labels that belong to an unseen class.
emotion_labels_dev_v0_unseen = [
    label for label in emotion_dev["category"] if label in unseen_classes
]

print("Reported: 13.3")

# Majority baseline: predict `most_common_class` for every retained example.
# NOTE(review): `most_common_class` was derived from the *test* split's label
# counts, so the dev baseline predicts the test majority class - confirm this
# is intended.
dev_majority_predictions = [most_common_class] * len(emotion_labels_dev_v0_unseen)
res = f1_score(
    y_true=emotion_labels_dev_v0_unseen,
    y_pred=dev_majority_predictions,
    average="weighted",
)
print(f"Ours    : {res * 100}")

Emotion v0 **dev** unseen classes majority baseline
Reported: 13.3
Ours    : 28.68347338935574


# Majority baseline computed by hand (cross-check of the sklearn result)

In [23]:
# The shortcut below is only valid when the predicted (majority) class is
# itself an unseen class; this is the case for emotion v0, which is tested here.
assert most_common_class in unseen_classes

# Test-split label counts restricted to the unseen classes.
unseen_classes_counts = {c: n for c, n in counts.items() if c in unseen_classes}
print(unseen_classes_counts)

# Number of test examples with an unseen label. Computed once and reused,
# instead of re-summing the counts for every class.
unseen_total = sum(unseen_classes_counts.values())

# Every class other than the majority class is never predicted, so its F1 is 0.
# For the majority class, every example is predicted as that class, hence:
#   precision = majority count / number of unseen-class examples
#   recall    = 1
p_majority = most_common_class_count / unseen_total
r_majority = 1.0
f1_majority = (2 * p_majority * r_majority) / (p_majority + r_majority)

# Weighted averaging weights each class's F1 by that class's share of examples.
weights = {c: n / unseen_total for c, n in unseen_classes_counts.items()}
f1_weighted = f1_majority * weights[most_common_class]  # other classes are zero

print("Reported: 13.3")
print("Ours    :", f1_weighted * 100)  # same as sklearn

{'joy': 3000, 'disgust': 600, 'guilt': 300, 'noemo': 2000, 'surprise': 1200}
Reported: 13.3
Ours    : 25.101101659461722
