In [1]:
import pandas as pd
from sklearn.metrics import f1_score, classification_report
import sklearn
import os

In [2]:
PREDICTIONS_DIR = '/data1/moscato/personalised-hate-boundaries-data/data/final_predictions/'


def _broadcast_majority_predictions(sepheads_df, majority_df):
    """Attach the majority-vote prediction of each text to every row of ``sepheads_df``.

    Parameters
    ----------
    sepheads_df : pandas.DataFrame
        Frame with a ``text_id`` column; every one of its rows is kept.
    majority_df : pandas.DataFrame
        Frame with ``text_id`` and ``majority_vote_model_predicted_toxic_score``.

    Returns
    -------
    pandas.DataFrame
        ``sepheads_df`` plus a ``majority_vote_model_predicted_toxic_score`` column.

    Raises
    ------
    pandas.errors.MergeError
        If ``majority_df`` holds more than one row for a ``text_id``; without
        this check the left join would silently duplicate rows of ``sepheads_df``.
    """
    return pd.merge(
        left=sepheads_df,
        right=majority_df[['text_id', 'majority_vote_model_predicted_toxic_score']],
        how='left',
        on='text_id',
        validate='many_to_one',
    )


# Read the predictions of the majority-vote and SepHeads models.
kumar_sep = pd.read_csv(os.path.join(PREDICTIONS_DIR, "kumar_sepheads_predictions_on_kumar.csv"))
mhs_sep = pd.read_csv(os.path.join(PREDICTIONS_DIR, "mhs_sepheads_predictions_on_mhs.csv"))

# The merged frames are always rebuilt in memory. This cell used to be guarded
# by the existence of a cached CSV under final_predictions/, but the step that
# wrote that CSV is disabled, and on a cache hit `kumar_maj` / `mhs_maj` were
# left undefined for the cells below.
kumar_maj = _broadcast_majority_predictions(
    sepheads_df=kumar_sep,
    majority_df=pd.read_csv(os.path.join(PREDICTIONS_DIR, "kumar_majority_vote_predictions_on_kumar.csv")),
)
mhs_maj = _broadcast_majority_predictions(
    sepheads_df=mhs_sep,
    majority_df=pd.read_csv(os.path.join(PREDICTIONS_DIR, "mhs_majority_vote_predictions_on_mhs.csv")),
)

In [3]:
# Prediction file names, keyed as files[training dataset][model type][eval set].
files = dict(
    kumar=dict(
        majority=dict(
            kumar="kumar_majority_vote_predictions_on_personalizedkumar.csv",
            hatecheck="kumar_majority_vote_predictions_on_hatecheck.csv",
        ),
        sepheads=dict(
            kumar="kumar_sepheads_predictions_on_kumar.csv",
            hatecheck="kumar_sepheads_predictions_on_hatecheck.csv",
        ),
    ),
    mhs=dict(
        majority=dict(
            mhs="kumar_sepheads_predictions_on_personalizedmhs.csv",
            hatecheck="mhs_majority_vote_predictions_on_hatecheck.csv",
        ),
        sepheads=dict(
            mhs="mhs_sepheads_predictions_on_mhs.csv",
            hatecheck="mhs_sepheads_predictions_on_hatecheck.csv",
        ),
    ),
)

def load_and_evaluate(path, model_type, eval_set, average_type="macro", prediction_dir='final_predictions/'):
    """Load one predictions CSV and score it against its ground-truth labels.

    Parameters
    ----------
    path : str
        File name of the predictions CSV, resolved against ``prediction_dir``.
    model_type : str
        ``"majority"`` scores the ``majority_vote_model_predicted_toxic_score``
        column; any other value scores ``sepheads_predicted_toxic_score``.
    eval_set : str
        ``"hatecheck"`` uses ``label_gold`` as ground truth and returns no
        per-annotator-group breakdown; any other value uses ``toxic_score``.
    average_type : str
        Averaging mode forwarded to ``f1_score``. ``"positive"`` is accepted
        as an alias of sklearn's ``"binary"`` (F1 of the positive class).
    prediction_dir : str
        Directory that contains ``path``.

    Returns
    -------
    tuple
        For ``eval_set == "hatecheck"``: ``(df, f1, confusion_matrix)``.
        Otherwise: ``(df, f1, f1_breakdown, breakdown_confusion_matrix)`` where
        ``f1_breakdown`` maps each ``extreme_annotator`` value to its F1 and
        ``breakdown_confusion_matrix`` has the keys ``'all'``, ``0``
        (``extreme_annotator == False``) and ``1`` (``extreme_annotator == True``).
        ``df`` is the loaded frame with a ``toxic_score`` ground-truth column.
    """
    pred_column = (
        "majority_vote_model_predicted_toxic_score"
        if model_type == "majority"
        else "sepheads_predicted_toxic_score"
    )
    # sklearn has no "positive" average; positive-class F1 is called "binary".
    f1_average = "binary" if average_type == "positive" else average_type

    df = pd.read_csv(os.path.join(prediction_dir, path))
    preds = df[pred_column]

    if eval_set == "hatecheck":
        df["toxic_score"] = df["label_gold"]
    elif model_type == "majority" and "toxic_score_sep" in df.columns:
        # Frames merged with ('_maj', '_sep') suffixes keep the annotator-level
        # label in `toxic_score_sep`; frames without that column are expected
        # to carry `toxic_score` directly.
        df["toxic_score"] = df["toxic_score_sep"]
    y_true = df["toxic_score"]

    f1_macro = f1_score(y_true, preds, average=f1_average)
    confusion_matrix = sklearn.metrics.confusion_matrix(y_true, preds)
    print(classification_report(y_true, preds, output_dict=True))

    if eval_set == "hatecheck":
        return df, f1_macro, confusion_matrix

    # Break the scores down by annotator group. The breakdown must use the
    # same prediction column as the overall score: using the SepHeads column
    # unconditionally would report SepHeads numbers for the majority model.
    f1_breakdown = {
        group_key: f1_score(group["toxic_score"], group[pred_column], average=f1_average)
        for group_key, group in df.groupby("extreme_annotator")
    }

    breakdown_confusion_matrix = {'all': confusion_matrix}
    for matrix_key, extreme_flag in ((0, False), (1, True)):
        subset = df[df["extreme_annotator"] == extreme_flag]
        breakdown_confusion_matrix[matrix_key] = sklearn.metrics.confusion_matrix(
            subset["toxic_score"],
            subset[pred_column]
        )

    return df, f1_macro, f1_breakdown, breakdown_confusion_matrix

# Averaging modes forwarded to sklearn's f1_score: "macro" averages the F1 of
# both classes, "binary" is the F1 of the positive class only. ("positive" is
# not a valid sklearn average and makes f1_score raise.)
AVERAGE_TYPES = ["macro", "binary"]

# Collect results
results = []

# Scored prediction frames: results_df[dataset][model_type][eval_set].
results_df = {
    dataset: {model_type: {} for model_type in models}
    for dataset, models in files.items()
}

# Confusion matrices: confusion_matrix[dataset][model_type][eval_set][key], with
# key 'all' for every eval set plus 0 / 1 (non-extreme / extreme annotators)
# for the annotator-level eval sets.
confusion_matrix = {
    dataset: {
        model_type: {eval_set: {} for eval_set in sources}
        for model_type, sources in models.items()
    }
    for dataset, models in files.items()
}

for dataset, models in files.items():
    for model_type, sources in models.items():
        for eval_set, file_path in sources.items():
            model_name = f"{dataset}_{model_type}_on_{eval_set}"
            for average_type in AVERAGE_TYPES:
                print(model_name)
                evaluation = load_and_evaluate(file_path, model_type, eval_set, average_type)

                # The "macro_f1" column name is kept for compatibility; the
                # "average" column says which averaging the value really uses.
                row = {
                    "model": model_name,
                    "average": average_type,
                    "macro_f1": evaluation[1],
                }

                if eval_set != "hatecheck":
                    scored_df, _, breakdown, matrices = evaluation
                    confusion_matrix[dataset][model_type][eval_set] = matrices
                    row["extreme_annotator_0_f1"] = breakdown.get(False)
                    row["extreme_annotator_1_f1"] = breakdown.get(True)
                else:
                    scored_df, _, matrix = evaluation
                    confusion_matrix[dataset][model_type][eval_set]['all'] = matrix

                results_df[dataset][model_type][eval_set] = scored_df
                results.append(row)

# Convert results to DataFrame
df_scores = pd.DataFrame(results)
df_scores


kumar_majority_on_kumar


FileNotFoundError: [Errno 2] No such file or directory: 'final_predictions/kumar_majority_vote_predictions_on_personalizedkumar.csv'

In [28]:

# Confusion matrix of the Kumar majority-vote model over all Kumar samples.
_cm_majority_all = confusion_matrix["kumar"]["majority"]["kumar"]['all']
_majority_row_totals = _cm_majority_all.sum(axis=1)

# Row totals, first as counts and then as a share of all samples.
print(_majority_row_totals)
print(_majority_row_totals / _cm_majority_all.sum())
pd.DataFrame(_cm_majority_all).style.background_gradient(cmap='Blues', axis=None)


[19973 15474]
[0.56346094 0.43653906]


Unnamed: 0,0,1
0,13720,6253
1,4684,10790


In [25]:

# Confusion matrix of the Kumar SepHeads model over all Kumar samples.
_cm_sepheads_all = confusion_matrix["kumar"]["sepheads"]["kumar"]['all']
_sepheads_row_totals = _cm_sepheads_all.sum(axis=1)

# Row totals, first as counts and then as a share of all samples.
print(_sepheads_row_totals)
print(_sepheads_row_totals / _cm_sepheads_all.sum())
pd.DataFrame(_cm_sepheads_all).style.background_gradient(cmap='Blues', axis=None)

[19973 15474]
[0.56346094 0.43653906]


Unnamed: 0,0,1
0,15184,4789
1,4099,11375


### Preds VS Kumar annotator-level labels

In [4]:
# Majority vote model preds against annotator-level labels (Kumar).
labels = kumar_maj['toxic_score']
predictions = kumar_maj['majority_vote_model_predicted_toxic_score']

cm = sklearn.metrics.confusion_matrix(y_true=labels, y_pred=predictions)

print(cm)
# Row totals as a share of all samples.
print(cm.sum(axis=1) / cm.sum())

print(classification_report(y_true=labels, y_pred=predictions, digits=3))

# Same report, restricted first to non-extreme and then to extreme annotators.
extreme_mask = kumar_maj['extreme_annotator']
for heading, rows in (
    ('Non-extreme annotators', ~extreme_mask),
    ('Extreme annotators', extreme_mask),
):
    print(heading)
    print(classification_report(y_true=labels[rows], y_pred=predictions[rows], digits=3))

[[14095  5878]
 [ 4986 10488]]
[0.56346094 0.43653906]
              precision    recall  f1-score   support

           0      0.739     0.706     0.722     19973
           1      0.641     0.678     0.659     15474

    accuracy                          0.694     35447
   macro avg      0.690     0.692     0.690     35447
weighted avg      0.696     0.694     0.694     35447

Non-extreme annotators
              precision    recall  f1-score   support

           0      0.736     0.711     0.723     19498
           1      0.648     0.676     0.662     15347

    accuracy                          0.696     34845
   macro avg      0.692     0.694     0.693     34845
weighted avg      0.697     0.696     0.696     34845

Extreme annotators
              precision    recall  f1-score   support

           0      0.931     0.480     0.633       475
           1      0.308     0.866     0.455       127

    accuracy                          0.561       602
   macro avg      0.619     0.6

In [5]:
# SepHeads preds against annotator-level labels (Kumar).
labels = kumar_maj['toxic_score']
predictions = kumar_maj['sepheads_predicted_toxic_score']

cm = sklearn.metrics.confusion_matrix(y_true=labels, y_pred=predictions)

print(cm)
# Row totals as a share of all samples.
print(cm.sum(axis=1) / cm.sum())

print(classification_report(y_true=labels, y_pred=predictions, digits=3))

# Same report, restricted first to non-extreme and then to extreme annotators.
extreme_mask = kumar_maj['extreme_annotator']
for heading, rows in (
    ('Non-extreme annotators', ~extreme_mask),
    ('Extreme annotators', extreme_mask),
):
    print(heading)
    print(classification_report(y_true=labels[rows], y_pred=predictions[rows], digits=3))

[[15184  4789]
 [ 4099 11375]]
[0.56346094 0.43653906]
              precision    recall  f1-score   support

           0      0.787     0.760     0.774     19973
           1      0.704     0.735     0.719     15474

    accuracy                          0.749     35447
   macro avg      0.746     0.748     0.746     35447
weighted avg      0.751     0.749     0.750     35447

Non-extreme annotators
              precision    recall  f1-score   support

           0      0.786     0.757     0.771     19498
           1      0.705     0.738     0.721     15347

    accuracy                          0.748     34845
   macro avg      0.745     0.747     0.746     34845
weighted avg      0.750     0.748     0.749     34845

Extreme annotators
              precision    recall  f1-score   support

           0      0.856     0.888     0.872       475
           1      0.514     0.441     0.475       127

    accuracy                          0.794       602
   macro avg      0.685     0.6

In [105]:
# Boundary model preds against annotator-level labels (Kumar).
labels = kumar_maj['toxic_score']
predictions = kumar_maj['boundary_model_predicted_toxic_score']

# All annotators.
print(classification_report(y_true=labels, y_pred=predictions, digits=3))

# Same report, restricted first to non-extreme and then to extreme annotators.
extreme_mask = kumar_maj['extreme_annotator']
for heading, rows in (
    ('Non-extreme annotators', ~extreme_mask),
    ('Extreme annotators', extreme_mask),
):
    print(heading)
    print(classification_report(y_true=labels[rows], y_pred=predictions[rows], digits=3))

              precision    recall  f1-score   support

           0      0.625     0.915     0.742     19973
           1      0.725     0.290     0.415     15474

    accuracy                          0.642     35447
   macro avg      0.675     0.603     0.579     35447
weighted avg      0.668     0.642     0.599     35447

Non-extreme annotators
              precision    recall  f1-score   support

           0      0.621     0.917     0.740     19498
           1      0.732     0.289     0.415     15347

    accuracy                          0.640     34845
   macro avg      0.677     0.603     0.578     34845
weighted avg      0.670     0.640     0.597     34845

Extreme annotators
              precision    recall  f1-score   support

           0      0.851     0.819     0.835       475
           1      0.407     0.465     0.434       127

    accuracy                          0.744       602
   macro avg      0.629     0.642     0.634       602
weighted avg      0.757     0.74

### Preds VS MHS annotator-level labels

In [6]:
# Majority vote model preds against annotator-level labels (MHS).
labels = mhs_maj['toxic_score']
predictions = mhs_maj['majority_vote_model_predicted_toxic_score']

cm = sklearn.metrics.confusion_matrix(y_true=labels, y_pred=predictions)

print(cm)
# Row totals as a share of all samples.
print(cm.sum(axis=1) / cm.sum())

print(classification_report(y_true=labels, y_pred=predictions, digits=3))

# Same report, restricted first to non-extreme and then to extreme annotators.
extreme_mask = mhs_maj['extreme_annotator']
for heading, rows in (
    ('Non-extreme annotators', ~extreme_mask),
    ('Extreme annotators', extreme_mask),
):
    print(heading)
    print(classification_report(y_true=labels[rows], y_pred=predictions[rows], digits=3))

[[1058  153]
 [ 215  831]]
[0.53655295 0.46344705]
              precision    recall  f1-score   support

           0      0.831     0.874     0.852      1211
           1      0.845     0.794     0.819      1046

    accuracy                          0.837      2257
   macro avg      0.838     0.834     0.835      2257
weighted avg      0.837     0.837     0.836      2257

Non-extreme annotators
              precision    recall  f1-score   support

           0      0.817     0.877     0.846      1028
           1      0.857     0.790     0.822       960

    accuracy                          0.835      1988
   macro avg      0.837     0.834     0.834      1988
weighted avg      0.837     0.835     0.835      1988

Extreme annotators
              precision    recall  f1-score   support

           0      0.923     0.852     0.886       183
           1      0.730     0.849     0.785        86

    accuracy                          0.851       269
   macro avg      0.827     0.851  

In [7]:
# SepHeads preds against annotator-level labels (MHS).
labels = mhs_maj['toxic_score']
predictions = mhs_maj['sepheads_predicted_toxic_score']

cm = sklearn.metrics.confusion_matrix(y_true=labels, y_pred=predictions)

print(cm)
# Row totals as a share of all samples.
print(cm.sum(axis=1) / cm.sum())

print(classification_report(y_true=labels, y_pred=predictions, digits=3))

# Same report, restricted first to non-extreme and then to extreme annotators.
extreme_mask = mhs_maj['extreme_annotator']
for heading, rows in (
    ('Non-extreme annotators', ~extreme_mask),
    ('Extreme annotators', extreme_mask),
):
    print(heading)
    print(classification_report(y_true=labels[rows], y_pred=predictions[rows], digits=3))

[[1113   98]
 [ 381  665]]
[0.53655295 0.46344705]
              precision    recall  f1-score   support

           0      0.745     0.919     0.823      1211
           1      0.872     0.636     0.735      1046

    accuracy                          0.788      2257
   macro avg      0.808     0.777     0.779      2257
weighted avg      0.804     0.788     0.782      2257

Non-extreme annotators
              precision    recall  f1-score   support

           0      0.731     0.917     0.814      1028
           1      0.878     0.639     0.739       960

    accuracy                          0.783      1988
   macro avg      0.805     0.778     0.777      1988
weighted avg      0.802     0.783     0.778      1988

Extreme annotators
              precision    recall  f1-score   support

           0      0.833     0.929     0.879       183
           1      0.800     0.605     0.689        86

    accuracy                          0.825       269
   macro avg      0.817     0.767  

In [106]:
# Boundary model preds against annotator-level labels (MHS).
labels = mhs_maj['toxic_score']
predictions = mhs_maj['boundary_model_predicted_toxic_score']

# All annotators.
print(classification_report(y_true=labels, y_pred=predictions, digits=3))

# Same report, restricted first to non-extreme and then to extreme annotators.
extreme_mask = mhs_maj['extreme_annotator']
for heading, rows in (
    ('Non-extreme annotators', ~extreme_mask),
    ('Extreme annotators', extreme_mask),
):
    print(heading)
    print(classification_report(y_true=labels[rows], y_pred=predictions[rows], digits=3))

              precision    recall  f1-score   support

           0      0.972     0.572     0.720      1211
           1      0.665     0.981     0.792      1046

    accuracy                          0.762      2257
   macro avg      0.818     0.777     0.756      2257
weighted avg      0.829     0.762     0.754      2257

Non-extreme annotators
              precision    recall  f1-score   support

           0      0.967     0.574     0.720      1028
           1      0.682     0.979     0.804       960

    accuracy                          0.770      1988
   macro avg      0.825     0.777     0.762      1988
weighted avg      0.830     0.770     0.761      1988

Extreme annotators
              precision    recall  f1-score   support

           0      1.000     0.563     0.720       183
           1      0.518     1.000     0.683        86

    accuracy                          0.703       269
   macro avg      0.759     0.781     0.701       269
weighted avg      0.846     0.70

### Preds VS HateCheck

In [8]:
# Load the HateCheck predictions of all four trained models from PREDICTIONS_DIR.
(
    maj_kumar_on_hatecheck,
    sepheads_kumar_on_hatecheck,
    maj_mhs_on_hatecheck,
    sepheads_mhs_on_hatecheck,
) = (
    pd.read_csv(os.path.join(PREDICTIONS_DIR, file_name))
    for file_name in (
        'kumar_majority_vote_predictions_on_hatecheck.csv',
        'kumar_sepheads_predictions_on_hatecheck.csv',
        'mhs_majority_vote_predictions_on_hatecheck.csv',
        'mhs_sepheads_predictions_on_hatecheck.csv',
    )
)

In [9]:
# Majority vote model trained on Kumar.
print(classification_report(
    y_true=maj_kumar_on_hatecheck['label_gold'],
    y_pred=maj_kumar_on_hatecheck['majority_vote_model_predicted_toxic_score'],
    digits=3
))

              precision    recall  f1-score   support

           0      0.684     0.279     0.396      1165
           1      0.742     0.941     0.830      2563

    accuracy                          0.734      3728
   macro avg      0.713     0.610     0.613      3728
weighted avg      0.724     0.734     0.694      3728



In [13]:
# SepHeads model trained on Kumar, scored against the HateCheck gold labels.
gold_labels = sepheads_kumar_on_hatecheck['label_gold']
predictions = sepheads_kumar_on_hatecheck['sepheads_predicted_toxic_score']
extreme_mask = sepheads_kumar_on_hatecheck['extreme_annotator']

# One report over all rows, then one per annotator group.
print('All annotators')
print(classification_report(y_true=gold_labels, y_pred=predictions, digits=3))

for heading, rows in (
    ('Non-extreme annotators', ~extreme_mask),
    ('Extreme annotators', extreme_mask),
):
    print(heading)
    print(classification_report(y_true=gold_labels[rows], y_pred=predictions[rows], digits=3))

All annotators
              precision    recall  f1-score   support

           0      0.554     0.433     0.486   3358695
           1      0.766     0.842     0.802   7389129

    accuracy                          0.714  10747824
   macro avg      0.660     0.637     0.644  10747824
weighted avg      0.700     0.714     0.703  10747824

Non-extreme annotators
              precision    recall  f1-score   support

           0      0.583     0.412     0.483   3178120
           1      0.764     0.866     0.812   6991864

    accuracy                          0.724  10169984
   macro avg      0.674     0.639     0.648  10169984
weighted avg      0.708     0.724     0.709  10169984

Extreme annotators
              precision    recall  f1-score   support

           0      0.383     0.794     0.516    180575
           1      0.817     0.418     0.553    397265

    accuracy                          0.535    577840
   macro avg      0.600     0.606     0.535    577840
weighted avg     

In [11]:
# Majority vote model trained on MHS, scored against the HateCheck gold labels.
gold_labels = maj_mhs_on_hatecheck['label_gold']
predictions = maj_mhs_on_hatecheck['majority_vote_model_predicted_toxic_score']
report_text = classification_report(y_true=gold_labels, y_pred=predictions, digits=3)
print(report_text)

              precision    recall  f1-score   support

           0      0.347     0.919     0.504      1165
           1      0.854     0.214     0.342      2563

    accuracy                          0.435      3728
   macro avg      0.600     0.567     0.423      3728
weighted avg      0.695     0.435     0.393      3728



In [14]:
# SepHeads model trained on MHS, scored against the HateCheck gold labels.
gold_labels = sepheads_mhs_on_hatecheck['label_gold']
predictions = sepheads_mhs_on_hatecheck['sepheads_predicted_toxic_score']
extreme_mask = sepheads_mhs_on_hatecheck['extreme_annotator']

# One report over all rows, then one per annotator group.
print('All annotators')
print(classification_report(y_true=gold_labels, y_pred=predictions, digits=3))

for heading, rows in (
    ('Non-extreme annotators', ~extreme_mask),
    ('Extreme annotators', extreme_mask),
):
    print(heading)
    print(classification_report(y_true=gold_labels[rows], y_pred=predictions[rows], digits=3))

All annotators
              precision    recall  f1-score   support

           0      0.319     0.904     0.472    634925
           1      0.741     0.124     0.213   1396835

    accuracy                          0.368   2031760
   macro avg      0.530     0.514     0.342   2031760
weighted avg      0.609     0.368     0.294   2031760

Non-extreme annotators
              precision    recall  f1-score   support

           0      0.319     0.904     0.472    633760
           1      0.741     0.124     0.213   1394272

    accuracy                          0.368   2028032
   macro avg      0.530     0.514     0.342   2028032
weighted avg      0.609     0.368     0.294   2028032

Extreme annotators
              precision    recall  f1-score   support

           0      0.319     0.917     0.473      1165
           1      0.744     0.110     0.192      2563

    accuracy                          0.362      3728
   macro avg      0.531     0.513     0.332      3728
weighted avg     

In [116]:
# Boundary model.
# The SepHeads frame holds several rows per HateCheck case; keep the first
# boundary-model prediction and gold label found for each `case_id`.
boundary_model_on_hatecheck = sepheads_kumar_on_hatecheck.groupby('case_id').agg(
    boundary_model_predicted_toxic_score=('boundary_model_predicted_toxic_score', 'first'),
    label_gold=('label_gold', 'first'),
)

boundary_report = classification_report(
    y_true=boundary_model_on_hatecheck['label_gold'],
    y_pred=boundary_model_on_hatecheck['boundary_model_predicted_toxic_score'],
    digits=3,
)
print(boundary_report)

              precision    recall  f1-score   support

           0      0.990     0.840     0.909      1165
           1      0.932     0.996     0.963      2563

    accuracy                          0.947      3728
   macro avg      0.961     0.918     0.936      3728
weighted avg      0.950     0.947     0.946      3728



## Checks

In [20]:

# Confusion matrix of the Kumar SepHeads model, stored under key 1 (extreme annotators).
_cm_sepheads_extreme = confusion_matrix["kumar"]["sepheads"]["kumar"][1]
_extreme_row_totals = _cm_sepheads_extreme.sum(axis=1)

# Row totals, first as counts and then as a share of all samples.
print(_extreme_row_totals)
print(_extreme_row_totals / _cm_sepheads_extreme.sum())
pd.DataFrame(_cm_sepheads_extreme).style.background_gradient(cmap='Blues', axis=None)

[475 127]
[0.78903654 0.21096346]


Unnamed: 0,0,1
0,422,53
1,71,56


In [30]:
# Compare the ground-truth labels carried by the two scored Kumar frames.
a, b = (results_df["kumar"][model_key]["kumar"] for model_key in ("majority", "sepheads"))

# Display the rows of `a` whose toxic score equals the one in `b`.
labels_match = a["toxic_score"] == b["toxic_score"]
a[labels_match]


Unnamed: 0,text_id,text,toxic_score_maj,majority_vote_model_predicted_toxic_score,comment,worker_id,toxic_score_sep,annotator_id,extreme_annotator,sepheads_predicted_toxic_score,boundary_model_predicted_toxic_score,boundary_model_confidence_score,toxic_score
0,5,#AtoZQuiz A05 Bar [any mention of baa],0,0,#AtoZQuiz A05 Bar [any mention of baa],dbc501198ada6725d8e8cc6f0101824f04d4b4b8935059...,0,1,False,0,0,90,0
1,5,#AtoZQuiz A05 Bar [any mention of baa],0,0,#AtoZQuiz A05 Bar [any mention of baa],29a3513367445e0fd3c53d61da1fcbebbf4efc6e0de0b9...,0,2,False,0,0,90,0
2,5,#AtoZQuiz A05 Bar [any mention of baa],0,0,#AtoZQuiz A05 Bar [any mention of baa],26523080557217fc3b42c882aecab5863966ccfbe31c3f...,0,3,False,0,0,90,0
3,11,Robert Manion????HA!More like ROBER manion bcs...,0,0,Robert Manion????HA!More like ROBER manion bcs...,dbc501198ada6725d8e8cc6f0101824f04d4b4b8935059...,0,1,False,0,0,90,0
4,11,Robert Manion????HA!More like ROBER manion bcs...,0,0,Robert Manion????HA!More like ROBER manion bcs...,29a3513367445e0fd3c53d61da1fcbebbf4efc6e0de0b9...,0,2,False,0,0,90,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35442,105986,Same. Jesus Christ he’s been horrible this year,1,0,Same. Jesus Christ he’s been horrible this year,8bcb34d67e6969f6e2c3f4d96db021ea3bedb73831522e...,1,3024,False,1,0,90,1
35443,105986,Same. Jesus Christ he’s been horrible this year,1,0,Same. Jesus Christ he’s been horrible this year,50d355ebffb4a40ef84da9137b206ac3a54b00bcc94b6c...,1,10446,False,1,0,90,1
35444,105992,Last time I checked they don't measure radiati...,0,0,Last time I checked they don't measure radiati...,93bb39808c33e806cf7fc28190caeca8662561dca6ca2a...,0,640,False,0,0,95,0
35445,105992,Last time I checked they don't measure radiati...,0,0,Last time I checked they don't measure radiati...,8bcb34d67e6969f6e2c3f4d96db021ea3bedb73831522e...,0,3024,False,0,0,95,0


## Check: re-compute all metrics

Trained models' metrics.

In [121]:
def _annotator_group_reports(df, truth_col, pred_col, split_by_extreme=True):
    """Compute sklearn classification reports (dict form) of ``pred_col`` against ``truth_col``.

    Returns a dict that always has an ``'all'`` report over every row of
    ``df``. When ``split_by_extreme`` is true it also has ``'nonextreme'`` and
    ``'extreme'`` reports, computed on the rows where the boolean
    ``extreme_annotator`` column is False / True respectively.
    """
    subsets = {'all': df}
    if split_by_extreme:
        is_extreme = df['extreme_annotator']
        subsets['nonextreme'] = df[~is_extreme]
        subsets['extreme'] = df[is_extreme]

    return {
        subset_name: classification_report(
            y_true=subset[truth_col],
            y_pred=subset[pred_col],
            digits=3,
            output_dict=True
        )
        for subset_name, subset in subsets.items()
    }


def _print_f1_and_recall(reports):
    """Print, on one line, the macro-F1 of every report followed by the class-'1' recall of every report."""
    ordered = list(reports.values())
    print(
        *(round(report['macro avg']['f1-score'], 3) for report in ordered),
        *(round(report['1']['recall'], 3) for report in ordered),
    )


BOUNDARY_PRED_COLUMN = 'boundary_model_predicted_toxic_score'

classification_reports = {}

for training_dataset in ['kumar', 'mhs']:
    classification_reports[training_dataset] = {}

    # Each model is evaluated on its own multi-annotator dataset and on HateCheck.
    multi_annotator_dataset = training_dataset

    for eval_dataset in [multi_annotator_dataset, 'hatecheck']:
        classification_reports[training_dataset][eval_dataset] = {}

        on_hatecheck = eval_dataset == 'hatecheck'
        ground_truth_column = 'label_gold' if on_hatecheck else 'toxic_score'

        for model in ['majority_vote', 'sepheads']:
            print(f'Training dataset: {training_dataset} | Eval dataset: {eval_dataset} | Model: {model}')

            # Load predictions.
            pred_df = pd.read_csv(os.path.join(PREDICTIONS_DIR, f'{training_dataset}_{model}_predictions_on_{eval_dataset}.csv'))

            # If the eval dataset is a multi-annotator one and the model is a
            # majority-vote one, broadcast (join) the model's predictions
            # across all annotators.
            if (not on_hatecheck) and (model == 'majority_vote'):
                # Load the corresponding predictions from SepHeads (just to have all the annotators).
                annotator_level_data = pd.read_csv(os.path.join(PREDICTIONS_DIR, f'{training_dataset}_sepheads_predictions_on_{eval_dataset}.csv'))[
                    ['text_id', 'annotator_id', 'extreme_annotator', 'toxic_score']
                ]

                # Join with the predictions from the majority vote model.
                # `validate` makes the join fail loudly if a text_id has more
                # than one majority-vote prediction, instead of duplicating
                # annotator rows.
                pred_df = pd.merge(
                    left=annotator_level_data,
                    right=pred_df[['text_id', 'majority_vote_model_predicted_toxic_score']],
                    how='left',
                    on='text_id',
                    validate='many_to_one'
                )

            pred_column = 'majority_vote_model_predicted_toxic_score' if model == 'majority_vote' else 'sepheads_predicted_toxic_score'

            # The majority-vote model on HateCheck is the only combination
            # that gets no per-annotator-group breakdown.
            reports = _annotator_group_reports(
                pred_df,
                ground_truth_column,
                pred_column,
                split_by_extreme=(not on_hatecheck) or (model != 'majority_vote')
            )
            classification_reports[training_dataset][eval_dataset][model] = reports

            _print_f1_and_recall(reports)
            print('\n')

            if model != 'sepheads':
                continue

            # The boundary model's predictions are read from columns of the
            # SepHeads prediction frame loaded above.
            print(f'Training dataset: - | Eval dataset: {eval_dataset} | Model: boundary')

            if on_hatecheck:
                # Collapse to one row per HateCheck case before scoring.
                boundary_model_pred_df = pred_df.groupby('case_id').agg(
                    boundary_model_predicted_toxic_score=pd.NamedAgg(BOUNDARY_PRED_COLUMN, 'first'),
                    label_gold=pd.NamedAgg('label_gold', 'first')
                ).reset_index()

                boundary_reports = _annotator_group_reports(
                    boundary_model_pred_df,
                    'label_gold',
                    BOUNDARY_PRED_COLUMN,
                    split_by_extreme=False
                )
            else:
                boundary_reports = _annotator_group_reports(
                    pred_df,
                    ground_truth_column,
                    BOUNDARY_PRED_COLUMN
                )

            _print_f1_and_recall(boundary_reports)
            print('\n')

Training dataset: kumar | Eval dataset: kumar | Model: majority_vote
0.69 0.693 0.544 0.678 0.676 0.866


Training dataset: kumar | Eval dataset: kumar | Model: sepheads
0.746 0.746 0.673 0.735 0.738 0.441


Training dataset: - | Eval dataset: kumar | Model: boundary
0.579 0.578 0.634 0.29 0.289 0.465


Training dataset: kumar | Eval dataset: hatecheck | Model: majority_vote
0.613 0.941


Training dataset: kumar | Eval dataset: hatecheck | Model: sepheads
0.644 0.648 0.535 0.842 0.866 0.418


Training dataset: - | Eval dataset: hatecheck | Model: boundary
0.936 0.996


Training dataset: mhs | Eval dataset: mhs | Model: majority_vote
0.835 0.834 0.836 0.794 0.79 0.849


Training dataset: mhs | Eval dataset: mhs | Model: sepheads
0.779 0.777 0.784 0.636 0.639 0.605


Training dataset: - | Eval dataset: mhs | Model: boundary
0.756 0.762 0.701 0.981 0.979 1.0


Training dataset: mhs | Eval dataset: hatecheck | Model: majority_vote
0.423 0.214


Training dataset: mhs | Eval dataset: hateche

Dummy model's metrics.

In [80]:
from sklearn.dummy import DummyClassifier

In [88]:
# Fixed seed so the random baseline's metrics are reproducible across
# Restart & Run All (without it every execution prints different numbers).
RANDOM_BASELINE_SEED = 42

# Which sepheads prediction file supplies the evaluation samples for each
# eval dataset. Only the ground-truth column and the `extreme_annotator`
# column of that file are used below; the model's own predictions are ignored.
# NOTE(review): for hatecheck the file produced by the MHS-trained model is
# loaded (an earlier comment said "the Kumar one") -- confirm which is intended.
dummy_training_dataset = {
    'kumar': 'kumar',
    'mhs': 'mhs',
    'hatecheck': 'mhs'
}

for eval_dataset in ['kumar', 'mhs', 'hatecheck']:
    print(f'Eval dataset: {eval_dataset} | Model: random')

    pred_df = pd.read_csv(os.path.join(PREDICTIONS_DIR, f'{dummy_training_dataset[eval_dataset]}_sepheads_predictions_on_{eval_dataset}.csv'))

    ground_truth_column = 'toxic_score' if eval_dataset != 'hatecheck' else 'label_gold'

    # A 'uniform' DummyClassifier draws each prediction uniformly at random
    # from the classes seen in `fit`; the features passed in are ignored, so
    # the ground-truth column is reused as a placeholder for X.
    random_model = DummyClassifier(strategy='uniform', random_state=RANDOM_BASELINE_SEED)
    random_model.fit(pred_df[ground_truth_column], pred_df[ground_truth_column])

    pred_df['random_model_predicted_toxic_score'] = random_model.predict(pred_df[ground_truth_column])

    # Split once into the rows flagged by `extreme_annotator` and the rest,
    # instead of re-filtering the frame for every report.
    extreme_mask = pred_df['extreme_annotator']
    nonextreme_df = pred_df[~extreme_mask]
    extreme_df = pred_df[extreme_mask]

    cr_all = classification_report(
        y_true=pred_df[ground_truth_column],
        y_pred=pred_df['random_model_predicted_toxic_score'],
        digits=3,
        output_dict=True
    )

    cr_nonextreme = classification_report(
        y_true=nonextreme_df[ground_truth_column],
        y_pred=nonextreme_df['random_model_predicted_toxic_score'],
        digits=3,
        output_dict=True
    )

    cr_extreme = classification_report(
        y_true=extreme_df[ground_truth_column],
        y_pred=extreme_df['random_model_predicted_toxic_score'],
        digits=3,
        output_dict=True
    )

    # Printed order: macro-F1 (all, non-extreme, extreme), then recall of
    # class '1' (all, non-extreme, extreme).
    print(
        round(cr_all['macro avg']['f1-score'], 3),
        round(cr_nonextreme['macro avg']['f1-score'], 3),
        round(cr_extreme['macro avg']['f1-score'], 3),
        round(cr_all['1']['recall'], 3),
        round(cr_nonextreme['1']['recall'], 3),
        round(cr_extreme['1']['recall'], 3)
    )

Eval dataset: kumar | Model: random
0.493 0.494 0.438 0.495 0.495 0.425
Eval dataset: mhs | Model: random
0.489 0.491 0.468 0.487 0.486 0.488
Eval dataset: hatecheck | Model: random
0.482 0.482 0.491 0.5 0.5 0.5
