In [None]:
import sys, os
sys.path.append(os.getcwd())

import json
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from src.config import DATA_DIR, MODELS_DIR, REPORTS_DIR, LABELS, MAX_SEQ_LEN
from src.data_utils import load_raw_jigsaw, train_valid_test_split, basic_text_clean, build_dataloaders_rnn
from src.metrics import compute_classification_metrics
from src.models.rnn_models import BiLSTMClassifier, BiLSTMAttentionClassifier
from src.models.transformer_models import create_bert_base, create_distilbert, LexiconHybridBert
from src.models.contrastive_model import ContrastiveBertEncoder


# Error Analysis for Toxic Comment Classification

In this notebook we go beyond global metrics and look at **how** and **where** our models fail.

We focus on the following models:

- Classical: Logistic Regression with TF–IDF
- Deep baselines: BiLSTM, BiLSTM + Attention
- Transformers: DistilBERT, BERT-base
- Hybrid: Lexicon-augmented BERT
- Representation learning: Contrastive-BERT classifier

We will:

1. Recreate the validation split used during training.
2. Compute predictions for each model on the same validation set.
3. Compare macro and per-label F1 scores.
4. Inspect typical failure cases:
   - False negatives for rare labels (severe_toxic, threat, identity_hate).
   - False positives on identity mentions.
5. Save prediction arrays so that ensemble analysis can be done separately.


In [None]:
# Read the raw Jigsaw training CSV and store a cleaned copy of every comment
# in a new "clean" column (the original "comment_text" column is kept).
df = load_raw_jigsaw(DATA_DIR / "jigsaw_train.csv")
df["clean"] = df["comment_text"].astype(str).apply(basic_text_clean)

# Partition into train / validation / test frames with the project helper.
train_df, valid_df, test_df = train_valid_test_split(df)

# Show the size of each partition, then preview the validation frame.
print(*[frame.shape for frame in (train_df, valid_df, test_df)])
valid_df.head()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

def get_logreg_predictions(train_df, valid_df):
    """Fit the TF-IDF + one-vs-rest logistic regression baseline and score the validation split.

    The model is trained from scratch on the ``"clean"`` column of ``train_df``
    against the ``LABELS`` columns, then applied to the ``"clean"`` column of
    ``valid_df``.

    Returns:
        A 3-tuple ``(y_va, y_prob, clf)``: the validation label matrix taken from
        ``valid_df[LABELS]``, the ``predict_proba`` output as a NumPy array, and
        the fitted scikit-learn pipeline.
    """
    pipeline = make_pipeline(
        TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, sublinear_tf=True),
        OneVsRestClassifier(
            LogisticRegression(max_iter=1000, class_weight="balanced"),
            n_jobs=-1,
        ),
    )
    pipeline.fit(train_df["clean"].tolist(), train_df[LABELS].values)

    valid_targets = valid_df[LABELS].values
    valid_scores = pipeline.predict_proba(valid_df["clean"].tolist())
    return valid_targets, np.array(valid_scores), pipeline


In [None]:
def get_bilstm_predictions(train_df, valid_df):
    """Score the validation split with the saved BiLSTM baseline.

    The validation loader comes from ``build_dataloaders_rnn`` (which returns
    ``(train_loader, valid_loader, vocab)``); the model architecture and weights
    come from ``MODELS_DIR / "bilstm_baseline.pt"``.

    Returns:
        ``(y_true, y_prob)``: labels and sigmoid probabilities, each concatenated
        over all validation batches along axis 0.
    """
    # Only the validation loader is needed here; the training loader and the
    # freshly built vocab are discarded.
    # NOTE(review): the loader encodes text with the vocab built by
    # build_dataloaders_rnn, while the embedding size / pad index below come
    # from checkpoint["vocab"]. This assumes both vocabularies are identical —
    # TODO confirm, otherwise token ids will not line up with the embeddings.
    _, valid_loader, _ = build_dataloaders_rnn(train_df, valid_df, max_len=100)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(MODELS_DIR / "bilstm_baseline.pt", map_location=device)
    saved_vocab = checkpoint["vocab"]
    model = BiLSTMClassifier(
        vocab_size=len(saved_vocab),
        embed_dim=128,
        hidden_dim=128,
        num_labels=len(LABELS),
        pad_idx=saved_vocab["<pad>"],
    )
    model.load_state_dict(checkpoint["state_dict"])
    model.to(device)
    model.eval()

    all_probs = []
    all_labels = []
    with torch.no_grad():
        for x, y in valid_loader:
            logits = model(x.to(device))
            all_probs.append(torch.sigmoid(logits).cpu().numpy())
            # Labels are only collected, never fed to the model, so they do not
            # need a round trip through the accelerator.
            all_labels.append(y.cpu().numpy())

    y_true = np.concatenate(all_labels, axis=0)
    y_prob = np.concatenate(all_probs, axis=0)
    return y_true, y_prob

def get_bilstm_attn_predictions(train_df, valid_df):
    """Score the validation split with the saved BiLSTM + attention model.

    Weights are read from ``MODELS_DIR / "bilstm_attention.pt"``; the validation
    loader and vocab come from ``build_dataloaders_rnn``.

    Returns:
        ``(y_true, y_prob, attn, vocab)``: labels, sigmoid probabilities and the
        model's second output (attention), each concatenated over batches along
        axis 0, plus the vocab returned by ``build_dataloaders_rnn``.
    """
    _train_loader, val_loader, vocab = build_dataloaders_rnn(train_df, valid_df, max_len=100)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ckpt = torch.load(MODELS_DIR / "bilstm_attention.pt", map_location=device)
    saved_vocab = ckpt["vocab"]

    net = BiLSTMAttentionClassifier(
        vocab_size=len(saved_vocab),
        embed_dim=128,
        hidden_dim=128,
        num_labels=len(LABELS),
        pad_idx=saved_vocab["<pad>"],
    )
    net.load_state_dict(ckpt["state_dict"])
    net.to(device)
    net.eval()

    label_chunks, prob_chunks, attn_chunks = [], [], []
    with torch.no_grad():
        for tokens, targets in val_loader:
            tokens, targets = tokens.to(device), targets.to(device)
            # The model returns a pair: classification logits and attention.
            scores, weights = net(tokens)
            prob_chunks.append(torch.sigmoid(scores).cpu().numpy())
            label_chunks.append(targets.cpu().numpy())
            attn_chunks.append(weights.cpu().numpy())

    return (
        np.concatenate(label_chunks, axis=0),
        np.concatenate(prob_chunks, axis=0),
        np.concatenate(attn_chunks, axis=0),
        vocab,
    )


In [None]:
class JigsawBertDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Each item is a dict with ``"input_ids"`` and ``"attention_mask"`` (the
    tokenizer outputs with their leading dimension squeezed away) and a
    float32 ``"labels"`` tensor built from the matching row of ``labels``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.tokenizer, self.max_len = texts, tokenizer, max_len
        # Labels are stored as float32 up front.
        self.labels = labels.astype("float32")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Pad/truncate every comment to exactly max_len tokens so items can be batched.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer is asked for "pt" tensors with a leading dimension; drop it.
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

class JigsawLexiconDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each tokenized comment with its lexicon features.

    Items carry the same ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
    entries as the plain BERT dataset, plus a float32 ``"lex_feats"`` tensor
    taken from the matching row of ``lex_feats``.
    """

    def __init__(self, texts, labels, tokenizer, max_len, lex_feats):
        self.texts, self.tokenizer, self.max_len = texts, tokenizer, max_len
        # Both numeric arrays are stored as float32 up front.
        self.labels = labels.astype("float32")
        self.lex_feats = lex_feats.astype("float32")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Pad/truncate every comment to exactly max_len tokens so items can be batched.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer is asked for "pt" tensors with a leading dimension; drop it.
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        item["lex_feats"] = torch.tensor(self.lex_feats[idx], dtype=torch.float32)
        return item

def _run_transformer_inference(model, loader, device, extra_input_keys=()):
    """Run ``model`` over ``loader`` without gradients and collect labels and sigmoid probabilities.

    Each batch must be a dict with ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"``; any keys listed in ``extra_input_keys`` are passed to the model
    as additional positional inputs, in the given order.
    """
    model.to(device)
    model.eval()

    prob_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in loader:
            inputs = [batch["input_ids"].to(device), batch["attention_mask"].to(device)]
            inputs.extend(batch[key].to(device) for key in extra_input_keys)
            logits = model(*inputs)
            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
            # Labels are only collected, so they never need to leave the host.
            label_chunks.append(batch["labels"].cpu().numpy())

    return np.concatenate(label_chunks, axis=0), np.concatenate(prob_chunks, axis=0)


def get_transformer_predictions(model_name: str, model_type: str, valid_df, with_lexicon=False):
    """Score the validation split with a saved transformer classifier.

    Args:
        model_name: Checkpoint file name, resolved against ``MODELS_DIR``.
        model_type: One of ``"bert"``, ``"distilbert"`` or ``"lexicon_bert"``.
        valid_df: Frame providing ``"comment_text"`` and the ``LABELS`` columns.
        with_lexicon: Accepted for backward compatibility; not read by this
            function (the lexicon path is selected by ``model_type``).

    Returns:
        ``(y_true, y_prob)``: labels and sigmoid probabilities, concatenated over
        batches along axis 0.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values. This is
            checked first, before any tokenizer or checkpoint is loaded.
    """
    if model_type not in ("bert", "distilbert", "lexicon_bert"):
        raise ValueError("Unsupported transformer type")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Raw comment text is used here (not the "clean" column).
    texts = valid_df["comment_text"].tolist()
    y_true = valid_df[LABELS].values

    if model_type == "lexicon_bert":
        from src.lexicon_utils import extract_lexicon_features

        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        lex_feats = extract_lexicon_features(texts)
        ds = JigsawLexiconDataset(texts, y_true, tokenizer, MAX_SEQ_LEN, lex_feats)
        model = LexiconHybridBert.from_pretrained(
            "bert-base-uncased",
            num_labels=len(LABELS),
            lexicon_dim=lex_feats.shape[1],
        )
        extra_input_keys = ("lex_feats",)
    else:
        tok_name = "bert-base-uncased" if model_type == "bert" else "distilbert-base-uncased"
        tokenizer = AutoTokenizer.from_pretrained(tok_name)
        ds = JigsawBertDataset(texts, y_true, tokenizer, MAX_SEQ_LEN)
        if model_type == "bert":
            model = create_bert_base(len(LABELS))
        else:
            model = create_distilbert(len(LABELS))
        extra_input_keys = ()

    # shuffle=False keeps predictions in the row order of valid_df.
    loader = DataLoader(ds, batch_size=16, shuffle=False, num_workers=2)
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(MODELS_DIR / model_name, map_location=device))
    return _run_transformer_inference(model, loader, device, extra_input_keys)


In [None]:
from src.training.train_classifier_from_encoder import ContrastiveClassifier

def get_contrastive_predictions(valid_df):
    """Score the validation split with the contrastive-BERT classifier.

    The encoder weights are read from ``MODELS_DIR / "contrastive_encoder.pt"``
    and the full classifier weights from
    ``MODELS_DIR / "contrastive_bert_classifier.pt"``.

    Args:
        valid_df: Frame providing ``"comment_text"`` and the ``LABELS`` columns.

    Returns:
        ``(y_true, y_prob)``: labels and sigmoid probabilities, concatenated over
        batches along axis 0.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Raw comment text is used here (not the "clean" column).
    texts = valid_df["comment_text"].tolist()
    y_true = valid_df[LABELS].values

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Reuse the notebook-level JigsawBertDataset instead of a function-local
    # copy: the items are identical, the max_len argument is actually honoured,
    # and a class defined inside a function cannot be pickled if DataLoader
    # workers are started with the "spawn" method.
    ds = JigsawBertDataset(texts, y_true, tokenizer, MAX_SEQ_LEN)
    # shuffle=False keeps predictions in the row order of valid_df.
    loader = DataLoader(ds, batch_size=16, shuffle=False, num_workers=2)

    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    encoder = ContrastiveBertEncoder("bert-base-uncased")
    encoder.load_state_dict(torch.load(MODELS_DIR / "contrastive_encoder.pt", map_location=device))
    model = ContrastiveClassifier(encoder, num_labels=len(LABELS))
    model.load_state_dict(torch.load(MODELS_DIR / "contrastive_bert_classifier.pt", map_location=device))
    model.to(device)
    model.eval()

    all_probs, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            logits = model(ids, mask)
            all_probs.append(torch.sigmoid(logits).cpu().numpy())
            # Labels are only collected, so they never need to leave the host.
            all_labels.append(batch["labels"].cpu().numpy())

    y_true = np.concatenate(all_labels, axis=0)
    y_prob = np.concatenate(all_probs, axis=0)
    return y_true, y_prob


In [None]:
def _result_entry(y_true, y_prob):
    """Bundle labels, probabilities and metrics (threshold 0.5) for one model."""
    return {
        "y_true": y_true,
        "y_prob": y_prob,
        "metrics": compute_classification_metrics(y_true, y_prob, threshold=0.5, label_names=LABELS),
    }


# Maps model_name -> dict(y_true, y_prob, metrics).
results = {}

# Logistic Regression (trained in place; the fitted pipeline is kept as well)
y_true_lr, y_prob_lr, logreg_model = get_logreg_predictions(train_df, valid_df)
results["logreg"] = _result_entry(y_true_lr, y_prob_lr)

# BiLSTM
y_true_bi, y_prob_bi = get_bilstm_predictions(train_df, valid_df)
results["bilstm"] = _result_entry(y_true_bi, y_prob_bi)

# BiLSTM + Attention (attention output and vocab are kept for later inspection)
y_true_bia, y_prob_bia, attn_bia, vocab_bia = get_bilstm_attn_predictions(train_df, valid_df)
results["bilstm_attn"] = _result_entry(y_true_bia, y_prob_bia)

# DistilBERT
y_true_db, y_prob_db = get_transformer_predictions("distilbert_toxic.pt", "distilbert", valid_df)
results["distilbert"] = _result_entry(y_true_db, y_prob_db)

# BERT-base
y_true_bert, y_prob_bert = get_transformer_predictions("bert_toxic.pt", "bert", valid_df)
results["bert"] = _result_entry(y_true_bert, y_prob_bert)

# Lexicon-BERT
y_true_lex, y_prob_lex = get_transformer_predictions("bert_lexicon_hybrid.pt", "lexicon_bert", valid_df)
results["lexicon_bert"] = _result_entry(y_true_lex, y_prob_lex)

# Contrastive-BERT classifier
y_true_con, y_prob_con = get_contrastive_predictions(valid_df)
results["contrastive_bert"] = _result_entry(y_true_con, y_prob_con)


In [None]:
# One summary row per model: its name plus the two aggregate F1 scores.
rows = [
    {
        "model": name,
        "macro_f1": info["metrics"]["macro_f1"],
        "micro_f1": info["metrics"]["micro_f1"],
    }
    for name, info in results.items()
]
# Best macro F1 first.
pd.DataFrame(rows).sort_values(by="macro_f1", ascending=False)


We see the expected hierarchy:

- Lexicon-BERT and BERT achieve the highest macro F1 among all models.
- DistilBERT is slightly behind BERT, but still clearly better than classical baselines.
- Logistic Regression with TF–IDF outperforms plain BiLSTM, confirming that lexical cues are very strong for this dataset.
- BiLSTM with attention narrows the gap but still trails Transformers.
- The contrastive-BERT classifier performs substantially worse on macro F1 (no statistical significance test is run here). It is good at separating toxic vs non-toxic overall, but struggles with fine-grained labels, which we confirm below.


In [None]:
# Long-format table: one row per (model, label) with that label's F1 / precision / recall.
per_label_rows = []
for name, info in results.items():
    scores_by_label = info["metrics"]["per_label"]
    per_label_rows.extend(
        {
            "model": name,
            "label": label,
            "f1": scores_by_label[label]["f1"],
            "precision": scores_by_label[label]["precision"],
            "recall": scores_by_label[label]["recall"],
        }
        for label in LABELS
    )

per_label_df = pd.DataFrame(per_label_rows)
# Wide view for side-by-side comparison: labels as rows, models as columns, F1 as values.
per_label_df_pivot = per_label_df.pivot(index="label", columns="model", values="f1")
per_label_df_pivot


For the easy label `toxic`, almost all models achieve strong F1.

The more interesting story is in the rare labels:

- `severe_toxic` and `threat` remain difficult across all models.
- Lexicon-BERT tends to improve F1 for `severe_toxic`, `threat`, and `identity_hate` compared to vanilla BERT.
- Contrastive-BERT collapses on `threat` and `identity_hate`, which supports the hypothesis that contrastive training at the level of "toxic vs non-toxic" is not well aligned with multi-label severity distinctions.


In [None]:
def get_fp_fn_indices(y_true, y_prob, label_idx, threshold=0.5):
    """Return row positions of false positives and false negatives for one label.

    A row is predicted positive when its probability in column ``label_idx`` is
    ``>= threshold`` (the same convention as the ``probs >= 0.5`` test used for
    the identity analysis in this notebook).

    Args:
        y_true: 2-D label matrix; column ``label_idx`` holds 0/1 values.
        y_prob: 2-D probability matrix aligned row-for-row with ``y_true``.
        label_idx: Column to inspect.
        threshold: Decision threshold on the probability.

    Returns:
        ``(fp, fn)`` integer index arrays. ``fp`` is ordered from highest to
        lowest probability and ``fn`` from lowest to highest, so the first
        entries of each are the most confident mistakes.
    """
    labels = np.asarray(y_true)[:, label_idx]
    scores = np.asarray(y_prob)[:, label_idx]
    predicted_positive = scores >= threshold

    fp = np.flatnonzero(predicted_positive & (labels == 0))
    fn = np.flatnonzero(~predicted_positive & (labels == 1))

    # Stable sorts keep the original row order among ties.
    fp = fp[np.argsort(-scores[fp], kind="stable")]
    fn = fn[np.argsort(scores[fn], kind="stable")]
    return fp, fn


label = "threat"
label_idx = LABELS.index(label)
model_names = ["bert", "lexicon_bert", "contrastive_bert"]
n_examples = 5

for name in model_names:
    info = results[name]
    y_true = info["y_true"]
    y_prob = info["y_prob"]
    fp, fn = get_fp_fn_indices(y_true, y_prob, label_idx, threshold=0.5)

    # NOTE(review): rows are looked up positionally in valid_df, which assumes
    # the stored predictions are in valid_df row order — confirm for each loader.
    for error_kind, indices in (("false positives", fp), ("false negatives", fn)):
        print(f"\n==== {name} - label: {label} - top {n_examples} {error_kind} ====")
        for i in indices[:n_examples]:
            row = valid_df.iloc[i]
            print(f"[prob={y_prob[i, label_idx]:.3f}] TRUE={int(y_true[i, label_idx])}")
            print(row["comment_text"])
            print("labels:", row[LABELS].to_dict())
            print("---")


From these examples we can observe:

- Many **false negatives** for `threat` correspond to implicit or indirect threats, such as "you will regret this" or "wait until I see you in person", which are difficult even for humans.
- **Lexicon-BERT** tends to pick up explicit threat words such as "kill", "beat", "destroy", which explains its slightly higher recall on `threat`.
- **Contrastive-BERT** often misses threats entirely or assigns low probability, confirming that the representation learned from generic toxic vs non-toxic pairs does not capture intensity or subtype information reliably.


In [None]:
# Lower-case identity words used to select comments for the bias inspection below.
identity_terms = ["gay", "muslim", "jew", "black", "asian", "christian", "women", "men"]

def contains_identity(text, terms):
    """Return True if ``text`` mentions any of ``terms`` as a whole word.

    Matching is case-insensitive and allows an optional trailing "s" so simple
    plurals ("muslims", "jews") still count. Whole-word matching matters here:
    plain substring search would flag "comment" and "statement" for "men",
    "caucasian" for "asian" and "blackmail" for "black".

    Args:
        text: The comment; coerced with ``str()`` so non-string values (e.g. NaN)
            do not raise.
        terms: Iterable of identity words. Empty strings are ignored.

    Returns:
        bool: whether at least one term occurs as a standalone word.
    """
    import re

    escaped = [re.escape(term) for term in terms if term]
    if not escaped:
        # Without this guard an empty alternation would match everywhere.
        return False
    # Lookarounds rather than \b so terms need not start/end with a word character.
    pattern = r"(?<!\w)(?:" + "|".join(escaped) + r")s?(?!\w)"
    return re.search(pattern, str(text), flags=re.IGNORECASE) is not None

# Positions (0-based, in valid_df row order) of comments that mention an identity term.
subset_idx = []
for position, comment in enumerate(valid_df["comment_text"].tolist()):
    if contains_identity(comment, identity_terms):
        subset_idx.append(position)
len(subset_idx)


In [None]:
def inspect_identity_fp(model_name, label="identity_hate", top_n=10):
    """Print a model's most confident false positives among identity-mentioning comments.

    Only rows listed in the notebook-level ``subset_idx`` are considered. A row
    counts as a false positive when its true value for ``label`` is 0 and the
    predicted probability is at least 0.5.

    Args:
        model_name: Key into the notebook-level ``results`` dict.
        label: Label name to inspect; must be in ``LABELS``.
        top_n: Maximum number of examples to print.
    """
    col = LABELS.index(label)
    entry = results[model_name]
    all_true, all_prob = entry["y_true"], entry["y_prob"]

    # Restrict to the identity-mention subset, keeping the original row positions.
    candidates = np.array(subset_idx)
    cand_prob = all_prob[subset_idx, col]
    cand_true = all_true[subset_idx, col]

    # False positives within the subset, ranked from highest to lowest probability.
    is_fp = (cand_true == 0) & (cand_prob >= 0.5)
    ranking = np.argsort(cand_prob[is_fp])[::-1]
    flagged = candidates[is_fp][ranking]

    print(f"\nModel: {model_name} | identity-related false positives: {len(flagged)}")
    for pos in flagged[:top_n]:
        comment_row = valid_df.iloc[pos]
        print(f"[prob={all_prob[pos, col]:.3f}] TRUE={int(all_true[pos, col])}")
        print(comment_row["comment_text"])
        print("labels:", comment_row[LABELS].to_dict())
        print("---")

# Same inspection for the three BERT-family models, five examples each.
for bert_variant in ("bert", "lexicon_bert", "contrastive_bert"):
    inspect_identity_fp(bert_variant, label="identity_hate", top_n=5)


We see that:

- All models still sometimes flag **neutral identity mentions** as identity-hate, especially in comments that contain mild disagreement plus a group reference.
- Lexicon-BERT helps when slurs are explicit, but does not fully solve the bias problem.
- Contrastive-BERT is unpredictable in identity-related contexts, reinforcing the need for a dedicated fairness analysis and possibly counterfactual augmentation.


In [None]:
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# A single label file is saved for all models, so every model's y_true must be
# the same matrix in the same row order. Verify that before writing anything:
# a downstream notebook that aligns probabilities with these labels by row
# would otherwise produce silently wrong results.
y_true_val = results["bert"]["y_true"]
misaligned = [
    name
    for name, info in results.items()
    if not np.array_equal(np.asarray(info["y_true"]), np.asarray(y_true_val))
]
if misaligned:
    raise ValueError(
        f"y_true differs from the 'bert' reference for: {misaligned}; "
        "predictions are not row-aligned and must not be saved together."
    )

np.save(REPORTS_DIR / "val_y_true.npy", y_true_val)

# One probability file per model, named after its key in `results`.
for name, info in results.items():
    np.save(REPORTS_DIR / f"val_{name}_probs.npy", info["y_prob"])


We saved:

- `reports/val_y_true.npy`
- `reports/val_<model>_probs.npy` for each model

These will be used in a separate **ensemble and ablation** notebook.
