In [1]:
# Run from the root directory: the cells below read "./results/" relative to
# the current working directory.
import os

# Only step up when the results folder is not already visible, so re-running
# this cell does not climb one directory higher on every execution.
if not os.path.isdir("./results"):
    os.chdir("..")

In [116]:
# os.listdir returns entries in arbitrary order; sort them so the configs are
# processed (and their CSV columns emitted) in the same order on every run.
configs = sorted(
    name
    for name in os.listdir("./results/")
    if name.endswith(".pkl") and 'baseline' not in name
)

In [117]:
import random
import numpy as np
import pickle as pkl
from collections import defaultdict
from sklearn.metrics import (
    precision_recall_curve,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)


def compute(labels, preds):
    """Return accuracy, precision, recall and F1 for hard predictions.

    Args:
        labels: Ground-truth labels, passed unchanged to the sklearn scorers.
        preds: Predicted labels, passed unchanged to the sklearn scorers.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1_score',
        in that order.
    """
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    return {name: scorer(labels, preds) for name, scorer in scorers}

def findMean(data):
    """Summarise every list of values as a "mean ± std" string.

    Args:
        data: Mapping from a name to a sequence of numeric values.

    Returns:
        dict with the same keys, each mapped to the float32 mean and standard
        deviation of its values, rounded to three decimals and joined by " ± ".
    """
    summary = {}
    for name, values in data.items():
        as_float = np.array(values).astype(np.float32)
        # str() is applied to the rounded numpy scalars on purpose: it prints
        # the short float32 form (e.g. "0.816") rather than widened digits.
        centre = str(np.round(np.mean(as_float), 3))
        spread = str(np.round(np.std(as_float), 3))
        summary[name] = u" \u00B1 ".join((centre, spread))
    return summary

def evaluate(data, n_seeds=10, val_fraction=0.1):
    """Score one set of predictions over repeated random validation/test splits.

    For every seed the (label, prediction) pairs are shuffled, the leading
    ``val_fraction`` of them is used to pick the decision threshold with the
    best F1, and the remaining pairs are scored at that threshold.

    Args:
        data: Mapping with a 'label' sequence and a 'prediction' sequence that
            are paired element-wise. Labels are presumably binary and
            predictions continuous scores (that is what the sklearn metrics
            called here expect) -- TODO confirm against the code that writes
            the result pickles.
        n_seeds: Number of shuffles; seeds ``0 .. n_seeds - 1`` are used.
        val_fraction: Share of the pairs held out for threshold selection.

    Returns:
        dict mapping 'accuracy', 'precision', 'recall', 'f1_score',
        'auc_roc_score' and 'cutoff' to the "mean ± std" string that
        ``findMean`` builds from the per-seed values.

    Raises:
        ValueError: If the validation or the test split would be empty.
    """
    labels = data['label']
    preds = data['prediction']

    # Pair labels with scores once; every seed shuffles its own copy.
    pairs = list(zip(labels, preds))
    val_size = int(val_fraction * len(pairs))
    if val_size < 1 or val_size >= len(pairs):
        raise ValueError(
            f"cannot split {len(pairs)} samples with val_fraction={val_fraction}: "
            "validation and test splits must both be non-empty"
        )

    cresult = defaultdict(list)

    for seed in range(n_seeds):
        shuffled = list(pairs)
        # A private generator yields the same permutation as random.seed(seed)
        # followed by random.shuffle, without reseeding the global RNG.
        random.Random(seed).shuffle(shuffled)

        vlabels, vpreds = zip(*shuffled[:val_size])
        tlabels, tpreds = zip(*shuffled[val_size:])

        precision, recall, threshold = precision_recall_curve(vlabels, vpreds)
        # precision/recall carry one trailing entry that has no matching
        # threshold; drop it so argmax always indexes a real threshold.
        precision, recall = precision[:-1], recall[:-1]
        f1_scores = 2 * recall * precision / (recall + precision + 1e-8)
        cutoff = threshold[np.argmax(f1_scores)]

        # precision[i]/recall[i] describe predictions with score >= threshold[i],
        # so the test scores are cut with >= to apply the selected operating point.
        rpreds = (np.asarray(tpreds) >= cutoff).astype(float)
        _result = compute(tlabels, rpreds)
        _result['auc_roc_score'] = roc_auc_score(tlabels, tpreds)
        _result['cutoff'] = cutoff

        for key, value in _result.items():
            cresult[key].append(value)

    return findMean(cresult)



In [122]:
presults = defaultdict(lambda: defaultdict(dict))

# NOTE(review): unpickling can execute arbitrary code; only load result files
# produced by this project.
# The context manager closes the file handle as soon as the data is loaded.
with open("./results/bert.pkl", "rb") as handle:
    data = pkl.load(handle)

# One summary dict (metric -> "mean ± std") per domain found in the pickle.
for domain in data.keys():
    presults[domain] = evaluate(data[domain])


In [128]:
data = presults

import csv

# Prepare headers: an empty corner cell followed by one column per metric.
metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'auc_roc_score', 'cutoff']
# NOTE(review): empty_cell is not read by any visible cell; kept in case
# another cell relies on it -- TODO confirm and remove.
empty_cell = len(metrics) - 1

headers_main = [""] + metrics
csv_rows = [headers_main]

# Prepare rows by category: the domain name followed by its metric summaries.
domains = list(data.keys())
for domain in domains:
    csv_rows.append([domain] + [data[domain][metric] for metric in metrics])

# Write to CSV. The cells contain "±", so the encoding is pinned to UTF-8
# instead of depending on the platform's locale default.
with open("BERToutput.csv", "w", newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(csv_rows)


In [100]:
presults = defaultdict(lambda: defaultdict(dict))

for config in configs:
    # NOTE(review): unpickling can execute arbitrary code; only load result
    # files produced by this project.
    # The context manager closes each file handle once its data is loaded.
    with open(f"./results/{config}", "rb") as handle:
        data = pkl.load(handle)
    # Drop the file extension to get the config name used as the result key.
    config_name = os.path.splitext(str(config))[0]
    print(config_name)

    for domain in data.keys():
        presults[config_name][domain] = evaluate(data[domain])


individual_20
individual_28
fusion_mean
individual_32
fusion_max
individual_24
individual_16


In [101]:
# Alias `presults` as `data`; the CSV export cell that follows reads the
# results through this name.
data = presults

In [102]:
import csv

# Prepare headers: "Category" followed by one "<config>_<metric>" column for
# every config/metric pair.
headers_main = ["Category"]
metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'auc_roc_score', 'cutoff']
# NOTE(review): empty_cell is not read by any visible cell; kept in case
# another cell relies on it -- TODO confirm and remove.
empty_cell = len(metrics) - 1

for individual in data.keys():
    headers_main.extend(individual + "_" + metric for metric in metrics)

csv_rows = [headers_main]

# Prepare rows by category. The category list comes from 'individual_20' when
# that config was evaluated; otherwise fall back to the first config, because
# indexing a missing key on the defaultdict would silently create an empty
# entry and produce a header-only file.
reference = 'individual_20' if 'individual_20' in data else next(iter(data), None)
categories = list(data[reference].keys()) if reference is not None else []
for category in categories:
    row = [category]
    for individual in data.keys():
        row.extend(data[individual][category][metric] for metric in metrics)
    csv_rows.append(row)

# Write to CSV. The cells contain "±", so the encoding is pinned to UTF-8 to
# match the default that pandas.read_csv uses when the file is loaded again.
with open("output.csv", "w", newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(csv_rows)


## Analyze the results

In [129]:
import pandas as pd

# Load the exported table with its first column as the index, then flip it so
# every original column becomes a row. The row labels are copied into a
# regular "Category" column so they can be filtered with string methods.
df = pd.read_csv("output.csv", index_col=0).T
df["Category"] = df.index

In [130]:
# Keep only the rows whose label contains "roc" and list them alphabetically.
roc_rows = df.loc[df["Category"].str.contains("roc")]
roc_rows.sort_values("Category")

Category,"Discrimination, Exclusion, Toxicity",Misinformation,HCI harms,Malicious Uses,Category.1
fusion_max_auc_roc_score,0.843 ± 0.002,0.684 ± 0.002,0.452 ± 0.015,0.774 ± 0.004,fusion_max_auc_roc_score
fusion_mean_auc_roc_score,0.857 ± 0.003,0.686 ± 0.004,0.461 ± 0.014,0.806 ± 0.006,fusion_mean_auc_roc_score
individual_16_auc_roc_score,0.861 ± 0.002,0.692 ± 0.002,0.417 ± 0.006,0.806 ± 0.005,individual_16_auc_roc_score
individual_20_auc_roc_score,0.867 ± 0.003,0.682 ± 0.002,0.41 ± 0.005,0.806 ± 0.007,individual_20_auc_roc_score
individual_24_auc_roc_score,0.847 ± 0.002,0.683 ± 0.003,0.475 ± 0.013,0.815 ± 0.007,individual_24_auc_roc_score
individual_28_auc_roc_score,0.864 ± 0.003,0.689 ± 0.004,0.483 ± 0.005,0.728 ± 0.008,individual_28_auc_roc_score
individual_32_auc_roc_score,0.828 ± 0.003,0.671 ± 0.003,0.467 ± 0.006,0.767 ± 0.003,individual_32_auc_roc_score
