In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(backend)
print(f"Using: {device}")

In [None]:
# Fetch the pretrained checkpoint together with its matching tokenizer.
model_name = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move the weights onto the selected device, then switch the module to
# evaluation mode (the call's return value is this cell's displayed output).
model = model.to(device)
model.eval()

In [None]:
# Load the input comments and their ground-truth labels.
# Both CSVs live in the same directory; edit DATA_DIR to point at another copy of the dataset.
DATA_DIR = "/home/kronbii/repos/content-violence-detection/datasets/text/jigsaw"
input_df = pd.read_csv(f"{DATA_DIR}/test.csv")       # contains 'id' and 'comment_text' (the column the inference cell reads)
gt_df = pd.read_csv(f"{DATA_DIR}/test_labels.csv")   # contains 'id' and 6 labels

# Inner-join on 'id' so every comment is paired with its own label row.
# validate="one_to_one" makes pandas raise MergeError if an id is duplicated on
# either side, instead of silently multiplying rows and misaligning the evaluation.
data = input_df.merge(gt_df, on="id", how="inner", validate="one_to_one")

print(data.head())

In [None]:
# The six label columns that are inspected for the -1 marker.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Keep a row only when none of its labels equals -1, then renumber the index from 0.
keep_mask = data[label_cols].ne(-1).all(axis=1)
clean_data = data.loc[keep_mask].reset_index(drop=True)

print(f"Rows before cleaning: {len(data)}")
print(f"Rows after removing -1 labels: {len(clean_data)}")

In [None]:
# Run batched inference over the cleaned comments and collect the per-label
# sigmoid probabilities into `all_preds` (one row per comment).
batch_size = 32
texts = clean_data['comment_text'].tolist()
ids = clean_data['id'].tolist()

# Per-batch arrays are accumulated under their own name so that `all_preds`
# is only ever the final stacked array, never a list.
batch_probs = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]
    # Pad to the longest comment in the batch; anything beyond 128 tokens is cut off.
    encodings = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    encodings = {k: v.to(device) for k, v in encodings.items()}

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(**encodings).logits
        probs = torch.sigmoid(logits).cpu().numpy()
    batch_probs.append(probs)

# Combine predictions into the final array. np.vstack raises ValueError on an
# empty list, so an empty input yields an explicit (0, num_labels) array instead.
if batch_probs:
    all_preds = np.vstack(batch_probs)
else:
    all_preds = np.empty((0, model.config.num_labels))

In [None]:
# Assemble the prediction table: one probability column per label,
# with the comment id moved to the front.
pred_df = pd.DataFrame(all_preds, columns=label_cols).assign(id=ids)
pred_df = pred_df[['id', *label_cols]]

# Persist the table without the positional index.
pred_df.to_csv("predictions.csv", index=False)
print("Predictions saved to predictions.csv")

In [None]:
# Evaluate thresholded predictions against the ground truth, one label at a time.
y_true = clean_data[label_cols].values
y_pred_probs = pred_df[label_cols].values

# Rows containing -1 were already dropped when clean_data was built, so every
# label must be 0 or 1 here. Fail loudly instead of remapping -1 to 1, which
# would silently count any such row as a positive example.
assert np.isin(y_true, (0, 1)).all(), "ground-truth labels must be 0 or 1"

# Binarize predictions: a probability at or above the threshold counts as positive.
threshold = 0.5
y_pred = (y_pred_probs >= threshold).astype(int)

print("Per-label Evaluation:\n")

for i, label in enumerate(label_cols):
    true = y_true[:, i]
    pred = y_pred[:, i]

    total = len(true)
    # With binary labels, agreeing entries are exactly the true positives plus true negatives.
    matches = (true == pred).sum()
    acc = accuracy_score(true, pred)
    # zero_division=0 reports F1 as 0 (without a warning) when the score is undefined for a label.
    f1 = f1_score(true, pred, average='binary', zero_division=0)

    print(f"Label: {label}")
    print(f"  Total samples:     {total}")
    print(f"  Matches dets:      {matches}")
    print(f"  Accuracy:          {acc:.4f}")
    print(f"  F1 Score:          {f1:.4f}")
    print("-" * 40)