## Imports

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import ast
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader
import torch
import numpy as np
from transformers import AutoModel
import torch.nn as nn
from sklearn.metrics import f1_score, roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


## Process data
1. **SemEval-2021 Task 5: Toxic Spans Detection**, to detect the offensive part of the message
2. **Jigsaw**, to detect toxic messages

In [2]:
# Helper function for the SemEval-2021 Task 5: Toxic Spans Detection dataset
def _parse_positions(raw_positions):
    """Return the toxic character offsets of one sample as a sorted, de-duplicated list.

    `raw_positions` may be the string form of a list (e.g. "[3, 4, 5]") or an
    already-parsed list/tuple/set of offsets. Anything that cannot be
    interpreted as a collection of offsets yields an empty list, so a
    malformed sample is treated as "no toxic span" instead of aborting.
    """
    if isinstance(raw_positions, str):
        try:
            raw_positions = ast.literal_eval(raw_positions)
        except (ValueError, TypeError, SyntaxError):
            # Unparseable annotation: best-effort fallback to "no toxic offsets".
            return []
    if not isinstance(raw_positions, (list, tuple, set, frozenset)):
        return []
    # De-duplicate so a repeated offset cannot open a second, overlapping span.
    return sorted(set(raw_positions))


def _positions_to_spans(positions):
    """Merge sorted, unique offsets into half-open [start, end) spans."""
    spans = []
    for pos in positions:
        if spans and pos == spans[-1][1]:
            # `pos` directly continues the current span: extend its end.
            spans[-1][1] = pos + 1
        else:
            # Gap (or first offset): open a new one-character span.
            spans.append([pos, pos + 1])
    return spans


def extract_text_and_spans(dataset_split):
    """Collect post texts and their toxic character spans from a dataset split.

    Args:
        dataset_split: Iterable of samples. Each sample is indexed with
            "text_of_post" (the text) and "position" (the toxic character
            offsets, either as the string form of a list or as an
            already-parsed sequence).

    Returns:
        A tuple (X, y). X is the list of texts. y is a parallel list in which
        each entry is a list of [start, end) spans (end exclusive) built from
        runs of consecutive offsets; it is empty when the sample has no
        (parseable) offsets.
    """
    X = []  # texts
    y = []  # spans

    for sample in dataset_split:
        X.append(sample["text_of_post"])
        y.append(_positions_to_spans(_parse_positions(sample["position"])))

    return X, y

In [3]:
# --- SemEval-2021 Task 5: Toxic Spans Detection ---
# Load both splits and turn each one into parallel (texts, spans) lists.
dataset = load_dataset("heegyu/toxic-spans")
train, test = dataset["train"], dataset["test"]
X_train_span, y_train_span = extract_text_and_spans(train)
X_test_span, y_test_span = extract_text_and_spans(test)

# --- Jigsaw: train / validation ---
subcategories = ["severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_data = pd.read_csv("jigsaw-toxic-comment-data/train.csv")
# Collapse to one binary label: a row is toxic when "toxic" or any
# subcategory column is 1.
train_data["toxic"] = train_data[["toxic", *subcategories]].max(axis=1)
X_train_toxic = train_data["comment_text"]
y_train_toxic = train_data["toxic"]
# Hold out 15% for validation, stratified on the binary label (fixed seed).
X_train_toxic, X_val_toxic, y_train_toxic, y_val_toxic = train_test_split(
    X_train_toxic,
    y_train_toxic,
    test_size=0.15,
    stratify=y_train_toxic,
    random_state=2025,
)

# --- Jigsaw: test ---
test_text = pd.read_csv("jigsaw-toxic-comment-data/test.csv")
test_labels = pd.read_csv("jigsaw-toxic-comment-data/test_labels.csv")
# Drop rows whose "toxic" label is -1. The same boolean mask is applied to
# both frames, so they are expected to be row-aligned.
mask = test_labels["toxic"].ne(-1)
test_text = test_text.loc[mask].reset_index(drop=True)
test_labels = test_labels.loc[mask].reset_index(drop=True)
# Same binary collapse as for the training data.
test_labels["toxic"] = test_labels[["toxic", *subcategories]].max(axis=1)
X_test_toxic = test_text["comment_text"]
y_test_toxic = test_labels["toxic"]

In [12]:
# Ad-hoc check of the container type holding the Jigsaw training texts.
# NOTE(review): this cell's execution count is out of sequence and its recorded
# output (`list`) presumably comes from an earlier kernel state; the cell
# above assigns a pandas column here. Re-run after a kernel restart or remove.
type(X_train_toxic)

list

In [None]:
# Create datasets
# SemEval-2021 Task 5: Toxic Spans Detection
# TODO: build the span-detection datasets.

# Jigsaw
AVG_SAMPLE_SIZE = 2000   # number of texts used to estimate the average token count
TOXIC_BATCH_SIZE = 32    # batch size shared by the train / val / test loaders


def _as_text_array(texts):
    """Return `texts` as a 1-D NumPy object array of Python strings.

    An object array is used on purpose: `dtype=str` would allocate a
    fixed-width Unicode array in which every row is as wide as the longest
    text, which wastes a lot of memory when a few texts are very long.
    """
    return np.array([str(text) for text in texts], dtype=object)


def _encode_texts(tokenizer, texts, max_length):
    """Tokenize `texts`, truncating/padding every row to exactly `max_length` tokens."""
    return tokenizer(
        texts.tolist(),  # the tokenizer is given a plain list of strings
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )


def _make_toxic_dataset(encodings, labels):
    """Bundle input ids, attention masks and float32 labels into a TensorDataset."""
    return TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)


# The splits arrive as pandas Series; convert them to NumPy arrays.
X_train_toxic = _as_text_array(X_train_toxic)
X_val_toxic = _as_text_array(X_val_toxic)
X_test_toxic = _as_text_array(X_test_toxic)

y_train_toxic = np.array(y_train_toxic, dtype=np.float32)
y_val_toxic = np.array(y_val_toxic, dtype=np.float32)
y_test_toxic = np.array(y_test_toxic, dtype=np.float32)

# Load tokenizer for DeBERTa-v3-base
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# Estimate the average text length (in tokens, special tokens included) from
# the first AVG_SAMPLE_SIZE texts of the concatenated train + val arrays.
sample_texts = np.concatenate([X_train_toxic, X_val_toxic])[:AVG_SAMPLE_SIZE]
avg_tokens = round(np.mean([len(tokenizer.encode(t, add_special_tokens=True)) for t in sample_texts]))
print(avg_tokens)

# Tokenization
# NOTE(review): max_length is the *average* length, so every text longer than
# average is truncated; consider a high percentile if that loses too much.
train_encodings = _encode_texts(tokenizer, X_train_toxic, avg_tokens)
val_encodings = _encode_texts(tokenizer, X_val_toxic, avg_tokens)
test_encodings = _encode_texts(tokenizer, X_test_toxic, avg_tokens)

y_train_toxic_tensor = torch.tensor(y_train_toxic, dtype=torch.float32)
y_val_toxic_tensor = torch.tensor(y_val_toxic, dtype=torch.float32)
y_test_toxic_tensor = torch.tensor(y_test_toxic, dtype=torch.float32)

train_toxic_dataset = _make_toxic_dataset(train_encodings, y_train_toxic_tensor)
val_toxic_dataset = _make_toxic_dataset(val_encodings, y_val_toxic_tensor)
test_toxic_dataset = _make_toxic_dataset(test_encodings, y_test_toxic_tensor)

# Only the training loader shuffles; val/test keep a fixed order.
train_toxic_loader = DataLoader(train_toxic_dataset, batch_size=TOXIC_BATCH_SIZE, shuffle=True)
val_toxic_loader = DataLoader(val_toxic_dataset, batch_size=TOXIC_BATCH_SIZE, shuffle=False)
test_toxic_loader = DataLoader(test_toxic_dataset, batch_size=TOXIC_BATCH_SIZE, shuffle=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


90


## Model architecture and training/evaluation

In [None]:
# Model needs to contain 2 different heads (return values) one for the classification problem
# and one for the span
class ToxicityModel(nn.Module):
    """Transformer backbone shared by a sequence-level and a token-level head.

    The sequence head produces one logit per input (toxic / non-toxic
    classification); the token head produces two logits per token (per-token
    classes for span detection).

    Args:
        model_name: Checkpoint name passed to `AutoModel.from_pretrained`.
    """

    def __init__(self, model_name="microsoft/deberta-v3-base"):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        hidden = self.backbone.config.hidden_size
        self.dropout = nn.Dropout(0.1)

        # Heads
        self.seq_head = nn.Linear(hidden, 1)   # [batch, 1]
        self.tok_head = nn.Linear(hidden, 2)   # [batch, seq_len, 2] -> CE over classes

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Run the backbone and both heads.

        Returns:
            (seq_logits, tok_logits): raw logits of shape [B, 1] and [B, T, 2].
        """
        out = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=True
        )
        hidden_states = out.last_hidden_state                # [B, T, H]

        # Dropout is applied exactly once to each head's input. Pooling from an
        # already-dropped tensor and dropping again would regularize (and
        # rescale) the CLS vector twice during training.
        cls_pooled = self.dropout(hidden_states[:, 0])       # first-token (CLS) pooling, [B, H]
        tok_features = self.dropout(hidden_states)           # [B, T, H]

        seq_logits = self.seq_head(cls_pooled)               # [B, 1]
        tok_logits = self.tok_head(tok_features)             # [B, T, 2]
        return seq_logits, tok_logits


In [None]:
# Helper functions
def freeze_backbone(model, freeze=True):
    """Toggle gradient tracking for every parameter of `model.backbone`.

    With `freeze=True` the backbone parameters stop requiring gradients;
    with `freeze=False` they require gradients again. Parameters outside
    `model.backbone` are not touched.
    """
    trainable = not freeze
    for param in model.backbone.parameters():
        param.requires_grad_(trainable)

@torch.no_grad()
def bin_acc_from_logits(logits, labels):
    """
    Binary accuracy of raw logits against 0/1 labels, as a Python float.

    logits: [B, 1], raw (pre-sigmoid)
    labels: [B, 1] or [B], 0/1

    A sample is predicted positive when sigmoid(logit) >= 0.5.
    """
    predicted = torch.sigmoid(logits).ge(0.5).long()
    # Reshape the labels to the prediction shape so [B] and [B, 1] both work.
    targets = labels.view_as(predicted).long()
    return predicted.eq(targets).float().mean().item()

In [None]:
# Stage 1: fine-tune the full model (backbone + sequence head) on the binary
# toxic / non-toxic task, using train_toxic_loader and val_toxic_loader.

import torch
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
import matplotlib.pyplot as plt
from tqdm import tqdm


def run_classification_epoch(model, loader, criterion, device, optimizer=None, desc=""):
    """Run one pass over `loader` for the sequence-classification head.

    Args:
        model: Module returning (seq_logits, tok_logits); only seq_logits is used.
        loader: Yields (input_ids, attention_mask, labels) batches.
        criterion: Loss applied to (seq_logits, labels shaped [B, 1]).
        device: Device the batches are moved to.
        optimizer: When given, the model is put in train mode and updated
            after every batch; when None, the pass runs in eval mode with
            gradients disabled.
        desc: Progress-bar label.

    Returns:
        (mean_loss, mean_accuracy), both averaged per sample.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    loss_sum, acc_sum, n_seen = 0.0, 0.0, 0
    with torch.set_grad_enabled(is_training):
        for input_ids, attention_mask, labels in tqdm(loader, desc=desc):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            # BCEWithLogitsLoss needs float targets with the logits' shape [B, 1].
            labels = labels.to(device).float().unsqueeze(1)

            if is_training:
                optimizer.zero_grad()
            seq_logits, _ = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(seq_logits, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            # Weight by batch size so a smaller last batch does not skew the mean.
            bs = labels.size(0)
            loss_sum += loss.item() * bs
            acc_sum += bin_acc_from_logits(seq_logits.detach(), labels) * bs
            n_seen += bs

    return loss_sum / n_seen, acc_sum / n_seen


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = ToxicityModel().to(device)

# Train full model on classification first
freeze_backbone(model, freeze=False)

clf_criterion = BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs_stage1 = 3
tr_losses, va_losses = [], []
tr_accs,   va_accs   = [], []

for epoch in range(1, num_epochs_stage1+1):
    # ---- train ----
    tr_loss, tr_acc = run_classification_epoch(
        model, train_toxic_loader, clf_criterion, device,
        optimizer=optimizer, desc=f"Stage 1 | Epoch {epoch} [train]",
    )
    tr_losses.append(tr_loss)
    tr_accs.append(tr_acc)

    # ---- validate ----
    va_loss, va_acc = run_classification_epoch(
        model, val_toxic_loader, clf_criterion, device,
        desc=f"Stage 1 | Epoch {epoch} [val]",
    )
    va_losses.append(va_loss)
    va_accs.append(va_acc)

    print(f"[Epoch {epoch}] TrainLoss {tr_losses[-1]:.4f} | TrainAcc {tr_accs[-1]:.4f} | "
          f"ValLoss {va_losses[-1]:.4f} | ValAcc {va_accs[-1]:.4f}")

# Plot Stage 1 curves
plt.figure(figsize=(11,4))
plt.subplot(1,2,1)
plt.plot(tr_losses, label="Train")
plt.plot(va_losses, label="Val")
plt.title("Stage 1: Loss")
plt.xlabel("Epoch")
plt.legend()
plt.subplot(1,2,2)
plt.plot(tr_accs, label="Train")
plt.plot(va_accs, label="Val")
plt.title("Stage 1: Accuracy")
plt.xlabel("Epoch")
plt.legend()
plt.show()

# Save after Stage 1
# ToxicityModel is a plain nn.Module, so it has no `save_pretrained`; persist
# the weights of the backbone and both heads through the state dict instead.
# Reload with: model.load_state_dict(torch.load("toxicity_dualhead_stage1.pt"))
torch.save(model.state_dict(), "toxicity_dualhead_stage1.pt")
