In [1]:
from pathlib import Path
import sys

# Resolve the directory one level above the current working directory and
# put it at the front of sys.path (only once, so re-running the cell does not
# stack duplicate entries).
PROJECT_ROOT = Path("..").resolve()
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path[:0] = [_root_entry]

print("PROJECT_ROOT:", PROJECT_ROOT)


PROJECT_ROOT: /app


## Load predictions

In [2]:
import pandas as pd

# Predictions are read from reports/val_predictions.csv under PROJECT_ROOT
# (presumably the validation split, going by the file name).
preds_path = PROJECT_ROOT.joinpath("reports", "val_predictions.csv")
preds = pd.read_csv(filepath_or_buffer=preds_path)

# Quick look at the first five rows and the available columns.
preds.head(5)


Unnamed: 0,text,true_label_id,true_label_name,pred_label_id,pred_label_name,prob_safe,prob_toxic,prob_hate
0,remove or deface Spectre's comments. I have,0,safe,0,safe,0.994821,0.004096,0.001084
1,"""\ni don't have time to read your personally-s...",0,safe,0,safe,0.995348,0.003548,0.001104
2,Friends In High Places \n\nPutting your finger...,0,safe,0,safe,0.628034,0.344412,0.027554
3,"LOL \n\nyou made me laugh, thanks.",0,safe,0,safe,0.972933,0.022957,0.004111
4,maybe I should also bring up where you will ta...,0,safe,0,safe,0.994957,0.003988,0.001056


In [3]:
# Compute every summary statistic first, then report them in one place.
n_rows = len(preds)
true_counts = preds["true_label_name"].value_counts()
pred_counts = preds["pred_label_name"].value_counts()

# Fraction of rows whose predicted label name equals the true label name.
overall_acc = preds["true_label_name"].eq(preds["pred_label_name"]).mean()

print("Rows:", n_rows)
print("\nTrue label distribution:")
print(true_counts)

# value_counts() lists only labels that actually occur, so a class the model
# never predicts is simply absent from this table.
print("\nPredicted label distribution:")
print(pred_counts)

print("\nOverall accuracy:", overall_acc)


Rows: 15958

True label distribution:
true_label_name
safe     14335
toxic     1482
hate       141
Name: count, dtype: int64

Predicted label distribution:
pred_label_name
safe     14782
toxic     1176
Name: count, dtype: int64

Overall accuracy: 0.9483017922045369


In [4]:
# Number of rows where the predicted label is "hate".
preds["pred_label_name"].eq("hate").sum()


np.int64(0)

## Build false positives / false negatives

In [5]:
# Labels treated as "harmful"; everything is compared against the "safe" label.
harmful = ["toxic", "hate"]

# Row-wise boolean masks over the true and predicted label names.
truly_safe = preds["true_label_name"].eq("safe")
truly_harmful = preds["true_label_name"].isin(harmful)
flagged_harmful = preds["pred_label_name"].isin(harmful)
passed_as_safe = preds["pred_label_name"].eq("safe")

# False positive: truly safe, but predicted as one of the harmful labels.
false_positives = preds.loc[truly_safe & flagged_harmful]

# False negative: truly harmful, but predicted safe.
false_negatives = preds.loc[truly_harmful & passed_as_safe]

print("False positives:", len(false_positives))
print("False negatives:", len(false_negatives))


False positives: 134
False negatives: 581


In [8]:
# View false positives (safe → toxic/hate)
# Columns shown in every error-inspection table: the text, both labels, and
# the three class probabilities.
cols = ["text", "true_label_name", "pred_label_name", "prob_safe", "prob_toxic", "prob_hate"]

# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without
# replacement), which would break this cell for a model with few false
# positives. random_state keeps the displayed sample reproducible.
false_positives.sample(min(15, len(false_positives)), random_state=0)[cols]


Unnamed: 0,text,true_label_name,pred_label_name,prob_safe,prob_toxic,prob_hate
11271,Firstly I note that you posted a header disput...,safe,toxic,0.245852,0.702524,0.051623
8214,my god your all heartless!,safe,toxic,0.128235,0.760427,0.111339
2910,Wil Wheaton\nWil Wheaton loves star wars you jerk,safe,toxic,0.039304,0.854005,0.106691
824,"Ok, seriously \n\nHow the hell was the Antlion...",safe,toxic,0.254168,0.69909,0.046742
12239,""": Absence of statement is not statement of ab...",safe,toxic,0.476611,0.487988,0.035401
4995,You Need to be Banned!\n\nCorvus cornix why do...,safe,toxic,0.064958,0.855556,0.079486
7660,I think that hating this Album became a stupid...,safe,toxic,0.107599,0.819846,0.072554
6044,Compromise \n\nHere's the deal: I'll stop vand...,safe,toxic,0.131916,0.797868,0.070216
5133,Only a fool turns his back on history.,safe,toxic,0.187266,0.728067,0.084667
13418,"To Richard \n\nOh and also, I find some of the...",safe,toxic,0.13292,0.785332,0.081749


In [9]:
# View false negatives (toxic/hate → safe)
# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling without
# replacement). random_state keeps the displayed sample reproducible.
false_negatives.sample(min(15, len(false_negatives)), random_state=1)[cols]


Unnamed: 0,text,true_label_name,pred_label_name,prob_safe,prob_toxic,prob_hate
10788,"Yeah, he said he'd never heard of Ferrara, wha...",toxic,safe,0.755159,0.218743,0.026098
13431,"""\n\n As much as I'd like to see you (and all ...",toxic,safe,0.937349,0.056807,0.005844
1523,"""\n\nI agree with your opinion. Such an ignora...",toxic,safe,0.972814,0.023261,0.003925
11575,. And then we would have to believe that the E...,toxic,safe,0.821407,0.16336,0.015233
14776,"""\n\nWhat the hell does that mean? Talk Englis...",toxic,safe,0.98408,0.014105,0.001815
15858,"does morons like you got a place here? If no, ...",toxic,safe,0.732885,0.247966,0.019149
10866,", or if this lousy bot that I use to leave ann...",toxic,safe,0.824751,0.162842,0.012408
11956,I will continue vandalising until someone bloc...,toxic,safe,0.987317,0.010992,0.00169
2587,Wrongful abuse. \n\nHey im probably gonna piss...,toxic,safe,0.850424,0.137894,0.011682
14416,They're about to start construction on new lin...,toxic,safe,0.991243,0.006895,0.001862


In [10]:
# Focus on TRUE HATE examples

# All rows whose ground-truth label is "hate".
is_true_hate = preds["true_label_name"].eq("hate")
hate_cases = preds.loc[is_true_hate]
print("True hate cases:", len(hate_cases))

# Of those, the rows where the model predicted anything other than "hate".
not_predicted_hate = hate_cases["pred_label_name"].ne("hate")
hate_missed = hate_cases.loc[not_predicted_hate]
print("Hate missed:", len(hate_missed))

# Show at most 20 missed cases; the cap avoids asking sample() for more rows
# than exist.
n_shown = min(20, len(hate_missed))
hate_missed.sample(n=n_shown, random_state=42)[cols]


True hate cases: 141
Hate missed: 141


Unnamed: 0,text,true_label_name,pred_label_name,prob_safe,prob_toxic,prob_hate
7845,This is just a perfect example of how Wikipedi...,hate,toxic,0.03475,0.868919,0.096331
12200,""" name was Willie, and he said he was standing...",hate,toxic,0.397475,0.534791,0.067734
3368,"The Israelis are committing massacres in Gaza,...",hate,safe,0.990188,0.008049,0.001763
5612,Fuck you niggerkite. y you deleting so much sh...,hate,toxic,0.026494,0.852429,0.121077
10325,Hi afghani beggar refugee i love to suck the p...,hate,safe,0.740668,0.229029,0.030303
15799,Leave our country. \n\nWe don't want you Arabs...,hate,safe,0.982489,0.015003,0.002508
9123,Go fuck yourself!!! you fucking delteted it an...,hate,toxic,0.030046,0.867485,0.102469
12820,CKCKCKCK\nYo i see u tryin 2 change shit wit C...,hate,toxic,0.056328,0.819441,0.124232
15483,Bla bla bla....suck it Irishguy =),hate,safe,0.748369,0.219892,0.031738
783,before calling me a bitch on my own talk at le...,hate,toxic,0.027002,0.842145,0.130853


## Confidence analysis

In [11]:
# Confidently wrong safe predictions: harmful comments predicted safe, ranked
# by prob_safe from highest to lowest, keeping the top 20.
fn_confident = (
    false_negatives
    .sort_values(by="prob_safe", ascending=False)
    .iloc[:20]
)
fn_confident.loc[:, cols]


Unnamed: 0,text,true_label_name,pred_label_name,prob_safe,prob_toxic,prob_hate
10759,"""\n\nCrusades\nHey, S. Schulz, amigo, stop scr...",toxic,safe,0.995555,0.003301,0.001143
6169,"""\n\nNeither you nor I are experts in the arti...",toxic,safe,0.995407,0.003479,0.001114
9312,I refer to people's constantly taking the word...,toxic,safe,0.995366,0.003514,0.00112
10819,"""\n""""Years after Hanson's death, will her biog...",toxic,safe,0.995334,0.003542,0.001124
8849,""":::It appears yo are blowing more Sh*&^ out o...",toxic,safe,0.995322,0.003634,0.001044
14406,"""\n\n==SUSPICIOUS MASS EDITS-IS ANY ADMINISTRA...",toxic,safe,0.995292,0.003597,0.001111
11513,Admins have free reign? \n\nPeople keep removi...,toxic,safe,0.995291,0.0036,0.001109
2824,Why should I pay any attention to anything you...,toxic,safe,0.99529,0.003631,0.001079
14803,"""\n\nAs for the arguements that abo should be ...",toxic,safe,0.995287,0.003616,0.001097
3967,""" You think just because you are an admin you...",toxic,safe,0.995247,0.003752,0.001001


In [12]:
# Confidently wrong toxic predictions: safe comments flagged as harmful, ranked
# by prob_toxic from highest to lowest, keeping the top 20.
# NOTE(review): false_positives also admits rows predicted "hate"; ranking by
# prob_toxic alone would under-rank those if the model ever predicts hate —
# confirm this is intended.
fp_confident = (
    false_positives
    .sort_values(by="prob_toxic", ascending=False)
    .iloc[:20]
)
fp_confident.loc[:, cols]


Unnamed: 0,text,true_label_name,pred_label_name,prob_safe,prob_toxic,prob_hate
14948,"Why do not the two of you, the mentally retard...",safe,toxic,0.048282,0.879203,0.072515
15675,"See? He understands, and doesn't want interfe...",safe,toxic,0.048882,0.873847,0.077271
3510,"""\n\nSciencewatcher, please read the work of t...",safe,toxic,0.055598,0.873682,0.07072
13634,Damn straight. Get your facts right next time.,safe,toxic,0.040521,0.873657,0.085823
8552,LOL PENIS! \n\nLOL!\n\nI AM TALKING TO YOU SO ...,safe,toxic,0.030385,0.871467,0.098149
6324,Clean this crap up \n\nWhy are there so many r...,safe,toxic,0.064082,0.870649,0.065269
8229,""" \nShare on printPrint Share on emailEmail ...",safe,toxic,0.037962,0.867033,0.095005
12996,Vandalism?? You call my changing the Apollo ar...,safe,toxic,0.05037,0.865789,0.08384
10341,Maybe you're right. Barneca is a really mean a...,safe,toxic,0.054866,0.865491,0.079643
1334,"""\n\n Gregalton, You're Still A Douchebag, Sig...",safe,toxic,0.061429,0.864944,0.073627


## Observed Failure Modes

### 1) Class imbalance → hate class collapse
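- `hate` is a tiny fraction of the dataset: 141 of 15,958 validation rows (~0.9%).
- The current model never predicts `hate` on validation (0 predicted hate), so recall for hate is exactly 0 (all 141 true hate comments are missed).
- Overall accuracy looks high (~94.8%) but is misleading for safety applications: always predicting `safe` would already score ~89.8% (14,335 / 15,958).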

### 2) Sarcasm / joking insults (false positives)
- Some playful or sarcastic comments are flagged as toxic.

### 3) Borderline rude criticism vs harassment
- Blunt criticism can be classified as toxic depending on phrasing.

### 4) Subtle or indirect hate (false negatives)
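- Because the model never predicts `hate`, every true hate comment is downgraded to `toxic` or `safe`. In the sampled cases, comments with explicit slurs or profanity tend to land in `toxic`, while hate without explicit slurs (e.g. "Leave our country…") is often predicted `safe` with high confidence.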

### 5) Context dependence
- Some comments are impossible to classify correctly without conversation context.
