# 🧪 Toxic Comment Classification (Jigsaw) — End-to-End Notebook
# Single-model optimization with a quick 2-model benchmark phase



---

## 0. Setup & Config

In [None]:
!pip -q install transformers datasets accelerate evaluate scikit-learn matplotlib seaborn gradio torchmetrics --upgrade

ERROR: Could not find a version that satisfies the requirement accelerate (from versions: none)
ERROR: No matching distribution found for accelerate


In [None]:
import os, sys, math, json, random, gc, time, shutil
from dataclasses import dataclass
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn
from torch.utils.data import DataLoader

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import evaluate

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    get_linear_schedule_with_warmup,
)
import transformers

# Log library versions and GPU availability so a run can be reproduced later.
print("Transformers:", transformers.__version__)
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

# Run on the GPU when one is present, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed Python, NumPy and PyTorch (CPU, plus every CUDA device when available).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# ====== PATHS: set this to your local location ======
DATA_PATH = "./data/train.csv"  # <- change if needed
OUT_DIR = "./outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Label columns of the Jigsaw dataset, in the order used for model outputs.
LABELS = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Candidate checkpoints for the benchmark, each with its tokenization length.
BASE_MODELS = {
    checkpoint: {"max_length": 192}
    for checkpoint in ("distilbert-base-uncased", "bert-base-uncased")
}

# Quick-benchmark subset sizes (train, val); smaller on CPU to stay practical.
QUICK_TRAIN_SIZE, QUICK_VAL_SIZE = (8000, 2000) if DEVICE == "cuda" else (2000, 800)

# Final training settings (Phase 2) — you can increase these if a GPU is available.
FINAL_TRAIN_FRACTION = 0.9  # 90% of the train split is used for training; rest for val
EPOCHS_BENCHMARK = 1        # quick head-to-head
EPOCHS_FINAL = 3            # main optimization run
BATCH_SIZE = 16 if DEVICE == "cuda" else 8
LR_BENCHMARK = 2e-5
LR_FINAL = 2e-5
WEIGHT_DECAY = 0.01
PATIENCE = 2                # early stopping patience for the final run
---

## 1. Load & Inspect Data

In [None]:
# Fail early with a clear error when the dataset is missing. An explicit raise
# is used instead of `assert` because asserts are stripped under `python -O`.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"File not found: {DATA_PATH}\nMake sure train.csv is available.")

df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

In [None]:
# Sanity check: every expected label column must be present in the CSV.
missing_labels = [col for col in LABELS if col not in df.columns]
assert not missing_labels, f"Missing label column '{missing_labels[0]}' in train.csv"

# Number of labels set per comment.
df["total_labels"] = df[LABELS].sum(axis=1)

# Class prevalence (mean of each 0/1 column), most frequent first.
label_prevalence = df[LABELS].mean().sort_values(ascending=False)
print(label_prevalence)

plt.figure(figsize=(7, 4))
label_prevalence.plot(kind="bar")
plt.title("Label Prevalence (mean of 0/1)")
plt.ylabel("Proportion")
plt.grid(axis='y')
plt.show()

# Comment length in characters.
plt.figure(figsize=(7, 4))
df["comment_text_len"] = df["comment_text"].astype(str).str.len()
sns.histplot(df["comment_text_len"], bins=50, kde=False)
plt.title("Comment length distribution")
plt.xlabel("chars")
plt.grid(axis='y')
plt.show()

---

## 2. Train/Validation/Test Split

In [None]:
# Hold out 10% of the training CSV as a test set (Kaggle's own test set lives on
# the platform). The split is stratified on whether a comment has any label.
any_label_mask = df[LABELS].sum(axis=1) > 0
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=SEED,
    stratify=any_label_mask,
)

# Smaller sampled sets for quick benchmark
def sample_balanced(df_in, n):
    """Return a seeded random sample of at most ``n`` rows from ``df_in``.

    Despite the name this is a plain random sample (chosen for speed); it is
    not stratified. The request is capped at the frame's length.
    """
    sample_size = min(n, len(df_in))
    sampled = df_in.sample(n=sample_size, random_state=SEED)
    return sampled

# Quick-benchmark subsets: training rows come from train_df, validation rows
# from the held-out test_df.
train_quick, val_quick = (
    sample_balanced(frame, size)
    for frame, size in ((train_df, QUICK_TRAIN_SIZE), (test_df, QUICK_VAL_SIZE))
)

print("Quick train:", train_quick.shape, "Quick val:", val_quick.shape)

---

## 3. Hugging Face Datasets & Tokenization

In [None]:
def to_hf_dataset(df_in):
    """Convert a DataFrame into a Hugging Face ``Dataset``.

    Only the ``comment_text`` column and the label columns are kept, and the
    index is reset so it is not carried into the dataset.
    """
    wanted_columns = ["comment_text", *LABELS]
    trimmed = df_in[wanted_columns].reset_index(drop=True)
    return Dataset.from_pandas(trimmed)

def tokenize_function(examples, tokenizer, max_length):
    """Tokenize the ``comment_text`` field of a batch.

    Sequences are truncated to ``max_length`` and left unpadded; padding is
    deferred to whoever batches the examples.
    """
    encode_options = {
        "padding": False,
        "truncation": True,
        "max_length": max_length,
    }
    texts = examples["comment_text"]
    return tokenizer(texts, **encode_options)

def build_hf_splits(train_df, val_df, tokenizer, max_length):
    """Build a tokenized ``DatasetDict`` with "train" and "validation" splits.

    Each frame is converted with ``to_hf_dataset`` and tokenized in batches;
    the raw ``comment_text`` column is dropped once it has been tokenized.
    """
    def _encode(batch):
        return tokenize_function(batch, tokenizer, max_length)

    tokenized = {}
    for split_name, frame in (("train", train_df), ("validation", val_df)):
        tokenized[split_name] = to_hf_dataset(frame).map(
            _encode,
            batched=True,
            remove_columns=["comment_text"],
        )
    return DatasetDict(tokenized)

---

## 4. Class Weights (to mitigate imbalance)

In [None]:
# Inverse-frequency weight per label, derived from the quick training subset.
# The positive rate is floored at 1e-6 so a label with no positives cannot
# cause a division by zero; weights are then rescaled to have mean 1.
pos_rates = train_quick[LABELS].mean().to_numpy()
floored_rates = np.clip(pos_rates, 1e-6, None)
class_weights = 1.0 / floored_rates
class_weights = class_weights / class_weights.mean()
CLASS_WEIGHTS_TENSOR = torch.tensor(class_weights, dtype=torch.float32).to(DEVICE)

# Show the label -> weight mapping.
dict(zip(LABELS, class_weights))

---

## 5. Metrics (Multi-Label: macro-F1, weighted-F1, macro ROC-AUC, per-label F1)

In [None]:
from sklearn.metrics import f1_score

# Metric objects from the `evaluate` library. Accuracy is not very meaningful
# for multi-label, but can serve as a thresholded exact-match accuracy.
metric_accuracy = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def sigmoid(x):
    """Return the element-wise logistic sigmoid of ``x``, numerically stably.

    The naive ``1 / (1 + exp(-x))`` overflows in ``exp`` for large negative
    inputs. Here the exponential is only ever taken of a non-positive number,
    so it cannot overflow.

    Args:
        x: A scalar or array-like of logits.

    Returns:
        Values in [0, 1] with the same shape as ``x`` (a NumPy scalar for
        scalar input).
    """
    x = np.asarray(x)
    # z = exp(-|x|) lies in (0, 1], so neither branch below can overflow.
    z = np.exp(-np.abs(x))
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () turns a 0-d array into a scalar and leaves arrays as-is.
    return out[()]

def compute_metrics_fn(eval_pred):
    """Compute multi-label metrics from a (logits, labels) evaluation pair.

    Args:
        eval_pred: Unpacks to ``(logits, labels)``. ``labels`` may be a single
            ``(n_samples, n_labels)`` array, or a tuple/list holding one
            ``(n_samples,)`` array per label, which is how labels arrive when
            the trainer is configured with several label columns.

    Returns:
        A dict with ``f1_macro``, ``f1_weight``, ``roc_auc_macro`` (NaN when it
        cannot be computed) and one ``f1_<label>`` entry per name in LABELS.
        Predictions are thresholded at probability 0.5.
    """
    logits, labels = eval_pred

    # Stack per-label columns into one (n_samples, n_labels) matrix; without
    # this the metrics would see a (n_labels, n_samples) target.
    if isinstance(labels, (tuple, list)):
        labels = np.stack([np.asarray(column) for column in labels], axis=1)
    y_true = np.asarray(labels).astype(int)

    probs = sigmoid(logits)
    y_pred = (probs >= 0.5).astype(int)

    # Macro/weighted F1 across labels
    f1_macro = f1_score(y_true, y_pred, average="macro", zero_division=0)
    f1_weight = f1_score(y_true, y_pred, average="weighted", zero_division=0)

    # Macro ROC-AUC; roc_auc_score raises ValueError when it cannot score the
    # targets (e.g. a label column containing a single class).
    try:
        roc_macro = roc_auc_score(y_true, probs, average="macro")
    except ValueError:
        roc_macro = float("nan")

    # Per-label F1, in LABELS order.
    f1_per_label = f1_score(y_true, y_pred, average=None, zero_division=0)
    metrics = {
        "f1_macro": f1_macro,
        "f1_weight": f1_weight,
        "roc_auc_macro": roc_macro,
    }
    for i, label_name in enumerate(LABELS):
        metrics[f"f1_{label_name}"] = f1_per_label[i]
    return metrics

---

## 6. Custom Loss: BCEWithLogits + Class Weights

In [None]:
class MultiLabelTrainer(Trainer):
    """Trainer that optimizes a weighted ``BCEWithLogitsLoss`` over LABELS.

    Each batch is expected to carry one column per entry in LABELS; they are
    stacked into a ``(batch, n_labels)`` float target.

    Args:
        class_weights: Optional per-label tensor passed as ``pos_weight`` to
            ``BCEWithLogitsLoss``; ``None`` disables weighting.
        **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, class_weights=None, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted BCE loss (and the model outputs if requested).

        ``num_items_in_batch`` is accepted for compatibility with transformers
        releases that pass it to ``compute_loss``; it is not used here.

        Raises:
            KeyError: If any LABELS column is missing from ``inputs``.
        """
        missing = [name for name in LABELS if name not in inputs]
        if missing:
            raise KeyError(
                f"Label columns {missing} are missing from the batch; "
                "pass label_names=LABELS to TrainingArguments so the Trainer keeps them."
            )

        # Remove the label columns before the forward pass: the model's
        # forward() does not accept them as keyword arguments.
        label_columns = [inputs.pop(name) for name in LABELS]
        outputs = model(**inputs)
        logits = outputs.logits

        # Place targets and weights on the logits' device rather than reading
        # model.device, which a wrapped model may not expose.
        labels = torch.stack(label_columns, dim=1).float().to(logits.device)
        pos_weight = self.class_weights
        if pos_weight is not None:
            pos_weight = pos_weight.to(logits.device)

        # BCEWithLogits with per-label weights
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

---

## 7. Phase 1 – Quick Head-to-Head (DistilBERT vs BERT-base)

In [None]:
# Phase 1: fine-tune each candidate briefly on the quick subset and record its
# evaluation metrics, keyed by checkpoint name.
results_benchmark = {}

for model_name, cfg in BASE_MODELS.items():
    print(f"\n===== Benchmarking: {model_name} =====")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    ds_quick = build_hf_splits(train_quick, val_quick, tokenizer, cfg["max_length"])
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    num_labels = len(LABELS)
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, problem_type="multi_label_classification")
    model  = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    args = TrainingArguments(
        output_dir=os.path.join(OUT_DIR, f"{model_name.replace('/','_')}_bench"),
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LR_BENCHMARK,
        num_train_epochs=EPOCHS_BENCHMARK,
        evaluation_strategy="epoch",
        save_strategy="no",
        logging_steps=50,
        fp16=(DEVICE=="cuda"),
        report_to="none",
        load_best_model_at_end=False,
        disable_tqdm=False,
        # The dataset stores one column per label. Declaring them here keeps
        # the Trainer from dropping them as unused columns and makes it hand
        # them to compute_metrics during evaluation.
        label_names=LABELS,
    )

    trainer = MultiLabelTrainer(
        model=model,
        args=args,
        train_dataset=ds_quick["train"],
        eval_dataset=ds_quick["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics_fn,
        class_weights=CLASS_WEIGHTS_TENSOR
    )
    train_out = trainer.train()
    eval_out  = trainer.evaluate()
    results_benchmark[model_name] = eval_out
    print(f"Eval: {eval_out}")

    # Release this candidate before loading the next one so two models never
    # sit in (GPU) memory at the same time.
    del trainer, model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\nBenchmark summary:")
print(json.dumps(results_benchmark, indent=2))

> **Choose the final model**: pick the one with higher `f1_macro` (and `roc_auc_macro`). Typically DistilBERT is faster; BERT-base may be slightly better if you have GPU.

In [None]:
# Pick the benchmarked checkpoint with the highest evaluation macro-F1.
def _benchmark_f1(name):
    return results_benchmark[name]["eval_f1_macro"]

best_model_name = max(results_benchmark, key=_benchmark_f1)
best_model_name

---

## 8. Phase 2 – Final Training & Optimization (single chosen model)

In [None]:
FINAL_MODEL = best_model_name  # or set manually: "distilbert-base-uncased"
print("Final chosen model:", FINAL_MODEL)

# Split train_df into the final train/validation sets, stratified on whether a
# comment has any label. test_df is not touched here.
train_full, val_full = train_test_split(
    train_df, test_size=(1 - FINAL_TRAIN_FRACTION), random_state=SEED,
    stratify=(train_df[LABELS].sum(axis=1)>0)
)

tokenizer = AutoTokenizer.from_pretrained(FINAL_MODEL)
max_len   = BASE_MODELS[FINAL_MODEL]["max_length"]
ds_full   = build_hf_splits(train_full, val_full, tokenizer, max_len)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

num_labels = len(LABELS)
config = AutoConfig.from_pretrained(FINAL_MODEL, num_labels=num_labels, problem_type="multi_label_classification")
model  = AutoModelForSequenceClassification.from_pretrained(FINAL_MODEL, config=config)

# Informational only (not passed to the Trainer). Rounded up because the last,
# partial batch of each epoch still counts as a step.
total_train_steps = math.ceil(len(ds_full["train"]) / BATCH_SIZE) * EPOCHS_FINAL
print("Train samples:", len(ds_full["train"]), "Val samples:", len(ds_full["validation"]))

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args_final = TrainingArguments(
    output_dir=os.path.join(OUT_DIR, f"{FINAL_MODEL.replace('/','_')}_final"),
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR_FINAL,
    num_train_epochs=EPOCHS_FINAL,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    load_best_model_at_end=True,
    logging_steps=50,
    fp16=(DEVICE=="cuda"),
    report_to="none",
    disable_tqdm=False,
    # The dataset stores one column per label. Declaring them here keeps the
    # Trainer from dropping them as unused columns and makes it hand them to
    # compute_metrics, which early stopping and best-model selection rely on.
    label_names=LABELS,
)

# Stop when f1_macro has not improved for PATIENCE consecutive evaluations.
early_stop = EarlyStoppingCallback(early_stopping_patience=PATIENCE, early_stopping_threshold=0.0)

trainer_final = MultiLabelTrainer(
    model=model,
    args=args_final,
    train_dataset=ds_full["train"],
    eval_dataset=ds_full["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_fn,
    callbacks=[early_stop],
    class_weights=CLASS_WEIGHTS_TENSOR
)

train_res = trainer_final.train()
best_metrics = trainer_final.evaluate()
best_metrics


---

## 9. Detailed Evaluation: Per-Label Reports, Curves

In [None]:
# Predictions on validation set
raw = trainer_final.predict(ds_full["validation"])
logits = raw.predictions
probs  = sigmoid(logits)  # same helper the training metrics use
# Rebuild the (n_samples, n_labels) target matrix from the per-label columns.
y_true = np.stack([ds_full["validation"][l] for l in LABELS], axis=1)
y_pred = (probs >= 0.5).astype(int)

# Full per-label precision/recall/F1 table (not just macro F1).
print("Classification report:")
print(classification_report(y_true, y_pred, target_names=LABELS, zero_division=0))

# ROC-AUC per label. roc_auc_score raises ValueError when a label has only one
# class in y_true; report that instead of aborting the cell.
try:
    roc_per_label = {lab: roc_auc_score(y_true[:, i], probs[:, i]) for i, lab in enumerate(LABELS)}
except ValueError as e:
    print("ROC-AUC per label error:", e)
else:
    # Printed explicitly: a bare expression inside a block is not displayed.
    print(pd.Series(roc_per_label).sort_values(ascending=False))

# Plot probability histograms per label
fig, axes = plt.subplots(2,3, figsize=(14,8))
axes = axes.ravel()
for i, lab in enumerate(LABELS):
    sns.histplot(probs[:,i], bins=30, ax=axes[i], color="#3A8BFF")
    axes[i].set_title(f"Predicted P({lab})")
    axes[i].grid(axis='y')
plt.tight_layout(); plt.show()

---

## 10. Save Artifacts & Inference Function

In [None]:
# Save the final model and its tokenizer side by side under OUT_DIR.
best_dir_name = FINAL_MODEL.replace("/", "_") + "_BEST"
SAVE_DIR = os.path.join(OUT_DIR, best_dir_name)
for artifact_saver in (trainer_final.save_model, tokenizer.save_pretrained):
    artifact_saver(SAVE_DIR)

# (tokenizer, model) pairs already loaded for inference, keyed by checkpoint
# path, so repeated predictions do not reload the weights from disk.
_INFERENCE_CACHE = {}


def _load_inference_bundle(path):
    """Return the cached (tokenizer, eval-mode model) pair for ``path``."""
    if path not in _INFERENCE_CACHE:
        tok = AutoTokenizer.from_pretrained(path)
        mdl = AutoModelForSequenceClassification.from_pretrained(path).to(DEVICE)
        mdl.eval()
        _INFERENCE_CACHE[path] = (tok, mdl)
    return _INFERENCE_CACHE[path]


def predict_texts(texts, path=SAVE_DIR, threshold=0.5, max_length=None):
    """Score texts with the saved classifier.

    Args:
        texts: A single string or an iterable of strings.
        path: Directory holding the saved model and tokenizer.
        threshold: Probability at or above which a label counts as predicted.
        max_length: Truncation length for tokenization; defaults to the
            notebook-level ``max_len`` used for training.

    Returns:
        ``(probs, preds)``: sigmoid probabilities and their 0/1 thresholded
        form, each with one row per text and one column per label.
    """
    # A bare string would otherwise be treated as a single unbatched input.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)

    tok, mdl = _load_inference_bundle(path)

    # Nothing to score: return correctly shaped empty arrays.
    if not texts:
        empty = np.zeros((0, mdl.config.num_labels), dtype=np.float32)
        return empty, empty.astype(int)

    if max_length is None:
        max_length = max_len

    enc = tok(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = mdl(**enc).logits
    probs = torch.sigmoid(logits).cpu().numpy()
    preds = (probs >= threshold).astype(int)
    return probs, preds

# Smoke-test the inference helper on one threatening and one benign comment.
sample_texts = [
    "I will find you and I will hurt you.",
    "Have a wonderful day!",
]
probs, preds = predict_texts(sample_texts)
pd.DataFrame(probs, columns=LABELS)

---

## 11. Gradio Demo (Local App)

In [None]:
import gradio as gr

def classify_comment(text, threshold=0.5):
    """Classify one comment for the Gradio UI.

    Returns two dicts keyed by label name: the predicted probabilities as
    floats, and the 0/1 decisions at ``threshold`` as ints.
    """
    prob_rows, decision_rows = predict_texts([text], path=SAVE_DIR, threshold=float(threshold))
    row_probs = prob_rows[0]
    row_flags = decision_rows[0]

    result = {}
    preds = {}
    for idx, label_name in enumerate(LABELS):
        result[label_name] = float(row_probs[idx])
        preds[label_name] = int(row_flags[idx])
    return result, preds

# Gradio app: a comment box and a threshold slider in; per-label probabilities
# and the thresholded 0/1 decisions out.
demo = gr.Interface(
    fn=classify_comment,
    inputs=[gr.Textbox(lines=4, label="Comment"), gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="Decision threshold")],
    # The JSON label previously showed mis-decoded bytes in place of "≥".
    outputs=[gr.Label(num_top_classes=6, label="Probabilities"), gr.JSON(label="Binary predictions (≥ threshold)")],
    title="Jigsaw Toxic Comment Classifier",
    description="DistilBERT/BERT multi-label classifier with sigmoid outputs."
)

# Uncomment to launch locally
# demo.launch(share=False)

---

## 12. Report Pointers (for exam write-up)

- **Problem framing**: Online moderation, multi-label toxicity detection.  
- **Data**: Jigsaw (2018), size, class imbalance, preprocessing decisions.  
- **Method**: Phase 1 benchmark (DistilBERT vs BERT-base) → Phase 2 optimization (chosen model).  
- **Loss**: `BCEWithLogitsLoss` with per-label `pos_weight`.  
- **Optimization**: LR 2e-5, batch 16 (GPU), early stopping, weight decay.  
- **Metrics**: Macro-F1 primary, Weighted-F1 secondary, Macro ROC-AUC; per-label F1 table.  
- **Results**: Show curves, tables, sample predictions.  
- **Ethics**: Bias, fairness, explainability (optional: SHAP on token importance), threshold choice & moderation policy.  
- **Reproducibility**: random seeds, environment, versions, saved artifacts. 

---

## 13. (Optional) Tips if CPU-only

- Use **DistilBERT** only; set `QUICK_TRAIN_SIZE=1000`, `QUICK_VAL_SIZE=400`, `EPOCHS_FINAL=2`.  
- Reduce `max_length` to **128**.  
- Consider **gradient accumulation** to emulate larger batch sizes:
  - Add `gradient_accumulation_steps=2` in `TrainingArguments`.
- Expect much slower training; use the notebook to validate pipeline, then scale on Colab GPU.