# AURA Simple: DeBERTa-v3-base Reporting-Aware Toxicity Detection

**Philosophy**: A more powerful model can learn nuances directly.

**Architecture**: `microsoft/deberta-v3-base` → Binary Classification Head

**Dataset**: Unified Toxicity + Reporting (18k samples)
- Label 0: Non-toxic OR Reporting (safe content)
- Label 1: Toxic direct speech

**Goal**: The model should learn that *reporting* toxic content is NOT toxic.

---

In [None]:
# Cell 1: Setup & GPU Check
# Report whether a CUDA device is visible; when one is, print the name and
# total memory (in units of 1e9 bytes) of device 0.
import torch

print("üîß Checking GPU...")
if not torch.cuda.is_available():
    print("‚ö†Ô∏è No GPU! Enable in Settings ‚Üí Accelerator ‚Üí GPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"‚úÖ GPU: {gpu_name}")
    gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   Memory: {gpu_mem_gb:.1f} GB")

In [None]:
# Cell 2: Imports
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: seed the NumPy and torch global RNGs with one fixed value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use CUDA when it is available, otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'üîß Device: {device}')

In [None]:
# Cell 3: Configuration
# Hyperparameters read by the data-loading, model-setup and training cells.
CONFIG = dict(
    model_name='microsoft/deberta-v3-base',
    max_length=128,
    batch_size=16,
    epochs=5,
    lr=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    patience=3,
)

# Directory that holds the train/validation CSV files.
DATA_DIR = '/kaggle/input/aura-deberta-data'

# Echo every setting, one per line, in insertion order.
print('üìã Configuration:')
print('\n'.join(f'   {name}: {value}' for name, value in CONFIG.items()))

In [None]:
# Cell 4: Dataset Class
class ToxicityDataset(Dataset):
    """Map-style dataset that tokenizes one CSV row per access.

    The CSV is read once with ``pd.read_csv`` and must provide a ``text``
    column and a ``label`` column.
    """

    def __init__(self, path, tokenizer, max_len):
        """Load the CSV at *path* and remember the tokenizer settings.

        Args:
            path: Location of the CSV file, handed to ``pd.read_csv``.
            tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
                padding='max_length', truncation=True, return_tensors='pt')``;
                its result must expose ``input_ids`` and ``attention_mask``
                tensors.
            max_len: Value passed as ``max_length`` on every tokenizer call.
        """
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row *idx* and return its tensors.

        Returns:
            Dict with flattened ``input_ids`` and ``attention_mask`` tensors
            and a scalar ``label`` tensor of dtype ``torch.long``.
        """
        row = self.df.iloc[idx]
        raw_text = row['text']
        # pd.read_csv parses empty cells as NaN; str(NaN) would hand the
        # literal word "nan" to the tokenizer, so treat missing text as empty.
        text = '' if pd.isna(raw_text) else str(raw_text)
        enc = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # The tokenizer output carries a leading batch axis; flatten it away.
            'input_ids': enc['input_ids'].flatten(),
            'attention_mask': enc['attention_mask'].flatten(),
            'label': torch.tensor(int(row['label']), dtype=torch.long)
        }

print('üì¶ Dataset class defined.')

In [None]:
# Cell 5: Load Data
# Build the tokenizer, wrap the train/validation CSVs in datasets and loaders,
# then print the split sizes and the training label balance.
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

train_ds = ToxicityDataset(f"{DATA_DIR}/deberta_unified_train.csv", tokenizer, CONFIG['max_length'])
val_ds = ToxicityDataset(f"{DATA_DIR}/deberta_unified_val.csv", tokenizer, CONFIG['max_length'])

# Only the training loader shuffles; the validation loader keeps file order.
train_loader = DataLoader(train_ds, batch_size=CONFIG['batch_size'], shuffle=True, num_workers=2)
val_loader = DataLoader(val_ds, batch_size=CONFIG['batch_size'], shuffle=False)

print('='*60)
print('üìä DATASET SUMMARY')
print('='*60)
print(f'Training samples: {len(train_ds):,}')
print(f'Validation samples: {len(val_ds):,}')

# Check distribution
# Reuse the frame the training dataset already parsed instead of reading the
# same CSV from disk a second time.
train_df = train_ds.df
print(f'\nLabel distribution (train):')
print(f'   0 (Safe/Reporting): {(train_df["label"] == 0).sum():,}')
print(f'   1 (Toxic Direct):   {(train_df["label"] == 1).sum():,}')

In [None]:
# Cell 6: Load Model
# Sequence-classification model with two output labels, moved to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG['model_name'], num_labels=2
)
model = model.to(device)

# Class weights for imbalance: class 0 is scaled by the toxic/safe count
# ratio, class 1 keeps weight 1.0.
n_safe = train_df['label'].eq(0).sum()
n_toxic = train_df['label'].eq(1).sum()
class_weights = torch.tensor(
    [n_toxic / n_safe, 1.0],
    dtype=torch.float32,
    device=device,
)
print(f'‚öñÔ∏è Class weights: {class_weights}')

criterion = nn.CrossEntropyLoss(weight=class_weights)

# Optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CONFIG['lr'],
    weight_decay=CONFIG['weight_decay'],
)

# Scheduler: one step per training batch, with the first `warmup_ratio`
# fraction of all steps used as warmup.
total_steps = CONFIG['epochs'] * len(train_loader)
warmup_steps = int(total_steps * CONFIG['warmup_ratio'])
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

n_params = sum(param.numel() for param in model.parameters())
print(f'\nüèóÔ∏è Model loaded: {CONFIG["model_name"]}')
print(f'   Parameters: {n_params:,}')

In [None]:
# Cell 7: Training Functions
def train_epoch(model, loader, optimizer, scheduler, criterion):
    """Run one training pass over *loader* and return the mean batch loss.

    For every batch the tensors are moved to the module-level ``device``, the
    loss is back-propagated, gradients are clipped to a norm of 1.0, and both
    the optimizer and the scheduler take one step.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        loader: Iterable of batches with ``input_ids``, ``attention_mask`` and
            ``label`` entries; must support ``len()``.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler stepped once per batch.
        criterion: Loss callable taking ``(logits, labels)``.

    Returns:
        Sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(loader, desc='Training'):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(ids, attention_mask=mask).logits
        loss = criterion(logits, targets)

        loss.backward()
        # Clip before stepping so one large gradient cannot dominate the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)

def evaluate(model, loader):
    """Predict every batch in *loader* and score the result with macro F1.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        loader: Iterable of batches with ``input_ids``, ``attention_mask`` and
            ``label`` entries.

    Returns:
        Tuple ``(f1, predictions, labels)``: the macro-averaged F1 score plus
        the flat lists of predicted and true labels it was computed from.
    """
    model.eval()
    predictions = []
    targets = []

    # No gradients are needed for inference.
    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            logits = model(ids, attention_mask=mask).logits
            batch_preds = torch.argmax(logits, dim=1)

            predictions.extend(batch_preds.cpu().numpy())
            # Labels are read straight from the batch, without a device move.
            targets.extend(batch['label'].numpy())

    macro_f1 = f1_score(targets, predictions, average='macro')
    return macro_f1, predictions, targets

print('‚úÖ Training functions defined.')

In [None]:
# Cell 8: Training Loop
# Train for up to CONFIG['epochs'] epochs. The model is checkpointed whenever
# validation F1 improves; training stops after CONFIG['patience'] consecutive
# epochs without improvement.
print('=' * 60)
print('üöÄ TRAINING START')
print('=' * 60)

best_f1 = 0
patience_counter = 0
history = dict(train_loss=[], val_f1=[])

for epoch in range(1, CONFIG['epochs'] + 1):
    print(f'\n--- Epoch {epoch}/{CONFIG["epochs"]} ---')

    train_loss = train_epoch(model, train_loader, optimizer, scheduler, criterion)
    val_f1 = evaluate(model, val_loader)[0]

    history['train_loss'].append(train_loss)
    history['val_f1'].append(val_f1)

    print(f'Train Loss: {train_loss:.4f} | Val F1: {val_f1:.4f}')

    if val_f1 > best_f1:
        # New best score: reset the patience counter and checkpoint the weights.
        best_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), '/kaggle/working/deberta_best.pt')
        print('>>> BEST MODEL SAVED <<<')
        continue

    # No improvement this epoch.
    patience_counter += 1
    print(f'(Patience: {patience_counter}/{CONFIG["patience"]})')
    if patience_counter >= CONFIG['patience']:
        print('üõë Early stopping.')
        break

print(f'\nüèÜ Best F1: {best_f1:.4f}')

In [None]:
# Cell 9: Final Evaluation
# Restore the best checkpoint, then print a classification report and plot the
# confusion matrix for the validation set.
print('='*60)
print('üî¨ FINAL EVALUATION')
print('='*60)

# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU still loads when this session has no GPU.
model.load_state_dict(torch.load('/kaggle/working/deberta_best.pt', map_location=device))
f1, preds, labels = evaluate(model, val_loader)

print('\n--- Classification Report ---')
print(classification_report(labels, preds, target_names=['Safe/Reporting', 'Toxic']))

# Confusion Matrix (rows: actual label, columns: predicted label)
cm = confusion_matrix(labels, preds)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Safe', 'Toxic'], yticklabels=['Safe', 'Toxic'])
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()

In [None]:
# Cell 10: Qualitative Stress Test - Does it understand Reporting?
# Run a hand-written list of sentences through the model one at a time and
# compare each prediction with its expected label.
print('=' * 80)
print('üß™ QUALITATIVE STRESS TEST: REPORTING AWARENESS')
print('=' * 80)

# Each entry is (text, expected label, category).
test_cases = [
    # Reporting (should be SAFE)
    ("He said you are a disgusting person", "Safe", "Reporting"),
    ("According to the witness, 'you are an idiot'", "Safe", "Reporting"),
    ("She claims he called her a failure", "Safe", "Reporting"),
    ("The document records the insult: 'you are worthless'", "Safe", "Reporting"),

    # Direct Toxic (should be TOXIC)
    ("You are a disgusting person", "Toxic", "Direct"),
    ("I am telling you that you are an idiot", "Toxic", "Direct"),
    ("You are a failure and everyone knows it", "Toxic", "Direct"),
    ("You are worthless garbage", "Toxic", "Direct"),

    # Implicit Hate (should be TOXIC)
    ("People like you belong in a cage", "Toxic", "Implicit"),
    ("Your kind is inferior", "Toxic", "Implicit"),

    # Non-toxic (should be SAFE)
    ("I hate rainy Mondays", "Safe", "General"),
    ("This soup is disgusting", "Safe", "Object"),
    ("The movie was terrible", "Safe", "Opinion"),
]

print(f"{'Text':<50} | {'Expected':<8} | {'Pred':<8} | {'Status'}")
print('-' * 80)

correct = 0
model.eval()

with torch.no_grad():
    for sentence, expected_label, _category in test_cases:
        encoded = tokenizer(
            sentence,
            max_length=CONFIG['max_length'],
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        ids = encoded['input_ids'].to(device)
        mask = encoded['attention_mask'].to(device)
        logits = model(ids, attention_mask=mask).logits

        # Index 1 is reported as Toxic; any other index is reported as Safe.
        pred_idx = logits.argmax(dim=1).item()
        pred_label = 'Safe' if pred_idx != 1 else 'Toxic'

        is_hit = pred_label == expected_label
        correct += int(is_hit)
        status = '‚úÖ' if is_hit else '‚ùå'

        # The text column shows only the first 48 characters.
        print(f"{sentence[:48]:<50} | {expected_label:<8} | {pred_label:<8} | {status}")

print('-' * 80)
accuracy_pct = correct / len(test_cases) * 100
print(f'\nüéØ Stress Test Accuracy: {correct}/{len(test_cases)} ({accuracy_pct:.1f}%)')
print('\nüìù Key Question: Does it correctly identify REPORTING as SAFE?')

In [None]:
# Cell 11: Save Artifacts
# Write the per-epoch training history to JSON and list the output locations.
print('üíæ Saving artifacts...')

import json
with open('/kaggle/working/deberta_history.json', 'w') as history_file:
    history_file.write(json.dumps(history))

for summary_line in (
    '‚úÖ Model: /kaggle/working/deberta_best.pt',
    '‚úÖ History: /kaggle/working/deberta_history.json',
    f'\nüèÜ Final Best F1: {best_f1:.4f}',
):
    print(summary_line)