In [1]:
# Run from the repository root so that `config` and the relative `results/`
# paths used by the later cells resolve.
import os

# Remember where the kernel started. A bare os.chdir("..") is not idempotent:
# every re-execution of this cell would climb one directory higher. Anchoring
# on the recorded start directory makes re-running the cell harmless.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [19]:
from config import *

In [7]:
import random
import numpy as np
import pickle as pkl
from collections import defaultdict
from sklearn.metrics import (
    precision_recall_curve,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)


def compute(labels, preds):
    """Score hard predictions against the reference labels.

    Args:
        labels: ground-truth labels.
        preds: predicted labels, aligned with ``labels``.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1_score',
        each holding the value of the matching scikit-learn metric called
        with its default arguments.
    """
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    return {name: scorer(labels, preds) for name, scorer in scorers}

def findMean(data):
    """Collapse each list of numbers into a "mean ± std" string.

    Args:
        data: mapping from a metric name to a sequence of numeric values.

    Returns:
        dict with the same keys (in the same order); each value is the mean
        and the standard deviation of the sequence, both computed in float32
        and rounded to three decimals, joined by " ± ".
    """
    def _summarise(values):
        samples = np.array(values).astype(np.float32)
        mu = np.round(np.mean(samples), 3)
        sigma = np.round(np.std(samples), 3)
        # Explicit str(): numpy scalars format differently through f-strings.
        return str(mu) + u" \u00B1 " + str(sigma)

    return {name: _summarise(values) for name, values in data.items()}

def evaluate(data, n_seeds=10, val_fraction=0.1):
    """Estimate thresholded classification metrics over repeated random splits.

    For every seed in ``range(n_seeds)`` the (label, score) pairs are
    shuffled, the first ``val_fraction`` of them is used to pick the score
    cutoff with the highest F1, and the remaining pairs are scored at that
    cutoff. The per-seed values are then summarised with ``findMean``.

    Args:
        data: mapping with a 'label' sequence and a 'prediction' sequence.
            Presumably binary labels and continuous scores - confirm against
            whatever produced the results file.
        n_seeds: number of shuffles; seeds ``0 .. n_seeds - 1`` are used.
        val_fraction: share of the pairs used to select the cutoff.

    Returns:
        dict mapping 'accuracy', 'precision', 'recall', 'f1_score',
        'auc_roc_score' and 'cutoff' to the "mean ± std" string produced by
        ``findMean`` over the seeds.

    Raises:
        ValueError: if the split would leave the validation or the test
            part empty.
    """
    labels = data['label']
    preds = data['prediction']

    pairs = list(zip(labels, preds))
    val_size = int(val_fraction * len(pairs))
    if not 0 < val_size < len(pairs):
        raise ValueError(
            "cannot split %d samples with val_fraction=%r: "
            "validation and test parts must both be non-empty"
            % (len(pairs), val_fraction)
        )

    cresult = defaultdict(list)

    for seed in range(n_seeds):
        # A private generator yields the same permutation as
        # random.seed(seed) + random.shuffle(...), but leaves the global
        # random state of the notebook untouched.
        shuffled = list(pairs)
        random.Random(seed).shuffle(shuffled)

        vlabels, vpreds = zip(*shuffled[:val_size])
        tlabels, tpreds = zip(*shuffled[val_size:])

        precision, recall, thresholds = precision_recall_curve(vlabels, vpreds)
        f1_scores = 2 * recall * precision / (recall + precision + 1e-8)
        # precision/recall carry one trailing point that has no threshold;
        # drop it so the argmax is always a valid index into `thresholds`.
        cutoff = thresholds[np.argmax(f1_scores[:-1])]

        # precision_recall_curve evaluates each threshold as
        # `score >= threshold`, so the same inclusive comparison is applied
        # here; a strict `>` would score a different operating point than
        # the one that was selected.
        rpreds = (np.asarray(tpreds) >= cutoff).astype(float)

        _result = compute(tlabels, rpreds)
        _result['auc_roc_score'] = roc_auc_score(tlabels, tpreds)
        _result['cutoff'] = cutoff

        for key, value in _result.items():
            cresult[key].append(value)

    return findMean(cresult)



In [15]:
# Load the saved results with a context manager so the file handle is
# closed deterministically.
# NOTE: unpickling can execute arbitrary code; only load result files that
# were produced by this project.
with open("results/bert_results.pkl", "rb") as results_file:
    results = pkl.load(results_file)

In [20]:
# Domain names, in the key order of DOMAIN_INDEX_MAPPING (presumably provided
# by the `from config import *` cell).
domains = [*DOMAIN_INDEX_MAPPING.keys()]

In [21]:
# Per-domain summary: domain name -> {metric name: "mean ± std" string}.
# A plain dict is used on purpose: looking up an unknown domain raises
# KeyError rather than silently inserting an empty entry the way a
# defaultdict would.
presults = {domain: evaluate(results[domain]) for domain in domains}


In [22]:
import csv

# `presults` maps each domain to its {metric: "mean ± std"} summary.
data = presults

# Column order of the exported table.
metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'auc_roc_score', 'cutoff']

# Header row: a blank corner cell followed by one column per metric.
headers_main = [""] + metrics

# One row per domain: the domain name, then its metrics in header order.
domains = list(data.keys())
csv_rows = [headers_main] + [
    [domain] + [data[domain][metric] for metric in metrics]
    for domain in domains
]

# The cells contain the non-ASCII "±" sign, so the encoding is pinned to
# UTF-8 instead of the platform default; pandas.read_csv, which reads this
# file back below, decodes UTF-8 by default.
with open("results/BERToutput.csv", "w", newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(csv_rows)


## Analyze the results

In [26]:
import pandas as pd
# Read the exported table (first column as the index) and transpose it.
df = pd.read_csv("results/BERToutput.csv", index_col=0).T

In [29]:
# Display the first six rows of the transposed table.
df.head(n=6)

Unnamed: 0,"Discrimination, Exclusion, Toxicity",Misinformation,HCI harms,Malicious Uses
accuracy,0.585 ± 0.011,0.513 ± 0.011,0.502 ± 0.002,0.534 ± 0.023
precision,0.553 ± 0.009,0.507 ± 0.007,0.501 ± 0.002,0.52 ± 0.015
recall,0.897 ± 0.027,0.971 ± 0.03,0.99 ± 0.008,0.929 ± 0.05
f1_score,0.683 ± 0.003,0.666 ± 0.002,0.666 ± 0.002,0.666 ± 0.002
auc_roc_score,0.614 ± 0.002,0.638 ± 0.001,0.573 ± 0.002,0.64 ± 0.001
cutoff,0.335 ± 0.031,0.045 ± 0.015,0.063 ± 0.025,0.103 ± 0.035
