# AURA V5: Production Model

---
## PRIMA DI ESEGUIRE:
1. **Settings** -> **Accelerator** -> **GPU T4 x2**
2. **Add Input** -> Carica `aura-data-v5`
---

### V5 Configuration
| Component | Value |
|-----------|-------|
| Backbone | BERT-base (110M) |
| Emotions | 5 (anger, disgust, fear, joy, neutral) |
| Toxicity Loss | Focal Loss (γ=2.0) |
| Learning Rate | 1e-5 |
| Dropout | 0.3 |
| Data | Balanced, single-label, noise-filtered |

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from torch.optim.lr_scheduler import OneCycleLR
from transformers import BertModel, BertTokenizer
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, classification_report
import pandas as pd
import numpy as np
import os
import warnings
# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# This notebook refuses to run without a CUDA device.
print("="*60)
if not torch.cuda.is_available():
    # `device` is still bound before aborting, exactly as the CPU branch always did.
    device = torch.device('cpu')
    raise RuntimeError("ATTIVA LA GPU!")
device = torch.device('cuda')
print(f"GPU: {torch.cuda.get_device_name(0)}")
print("="*60)

# Fixed seeds for the NumPy and PyTorch random number generators.
np.random.seed(42)
torch.manual_seed(42)

In [None]:
# ============================================================
# V5 CONFIGURATION - OPTIMIZED
# ============================================================
# Hyperparameters read by the model, the loss functions and the training loop.
CONFIG = {
    'encoder': 'bert-base-uncased',
    'max_length': 128,
    'num_emotion_classes': 5,  # V5: 5 emotions only
    'dropout': 0.3,            # Increased from 0.1
    'batch_size': 16,
    'gradient_accumulation': 2,
    'epochs': 8,               # Increased from 5
    'lr': 1e-5,                # Reduced from 2e-5
    'weight_decay': 0.02,      # Increased from 0.01
    'patience': 3,             # Increased from 2
    'mc_samples': 10,
    'focal_gamma': 2.0,
    'output_dir': '/kaggle/working'
}

# Label columns read from the GoEmotions CSV (V5 keeps five, toxicity-relevant).
EMO_COLS = ['anger', 'disgust', 'fear', 'joy', 'neutral']

print("V5 Configuration:")
print(f"  Emotions: {EMO_COLS}")
print(f"  LR: {CONFIG['lr']} | Dropout: {CONFIG['dropout']}")
print(f"  Focal Gamma: {CONFIG['focal_gamma']}")

# Data paths: the first candidate directory that contains the GoEmotions CSV wins.
DATA_DIR = None
for path in ['/kaggle/input/aura-data-v5', '/kaggle/input/aura_data_v5', 'data/kaggle_upload_v5']:
    # A file inside `path` can only exist if `path` itself exists, so one check suffices.
    if not os.path.exists(os.path.join(path, 'goemotions_v5.csv')):
        continue
    DATA_DIR = path
    GOEMO_FILE = 'goemotions_v5.csv'
    break

if DATA_DIR is None:
    raise FileNotFoundError("V5 Dataset not found! Upload aura-data-v5")

print(f"\nDataset: {DATA_DIR}")

In [None]:
# ============================================================
# MODEL: AURA Bayesian (BERT + Kendall Uncertainty)
# ============================================================
class AURA_Bayesian(nn.Module):
    """BERT encoder with a toxicity head, an emotion head and two learnable log-variances.

    Args:
        config: Mapping that provides ``'encoder'`` (pretrained BERT name),
            ``'dropout'`` (dropout probability) and ``'num_emotion_classes'``
            (output width of the emotion head).
    """

    def __init__(self, config):
        super().__init__()
        self.bert = BertModel.from_pretrained(config['encoder'])
        encoder_dim = self.bert.config.hidden_size
        self.dropout = nn.Dropout(config['dropout'])

        # Two linear heads on top of the same pooled representation.
        self.toxicity_head = nn.Linear(encoder_dim, 2)
        self.emotion_head = nn.Linear(encoder_dim, config['num_emotion_classes'])

        # Homoscedastic Uncertainty (Kendall 2018): one learnable log-variance per task.
        self.tox_log_var = nn.Parameter(torch.zeros(1))
        self.emo_log_var = nn.Parameter(torch.zeros(1))

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return both heads' logits plus the log-variance parameters.

        Returns:
            Tuple ``(tox_logits, emo_logits, tox_log_var, emo_log_var)``.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(encoded.pooler_output)
        return (
            self.toxicity_head(features),
            self.emotion_head(features),
            self.tox_log_var,
            self.emo_log_var,
        )

In [None]:
# ============================================================
# LOSS FUNCTIONS
# ============================================================
def focal_loss_with_uncertainty(logits, log_var, targets, gamma=2.0, T=10):
    """Focal loss on Monte Carlo-averaged softmax probabilities, plus a log-variance penalty.

    Focal Loss + Kendall Uncertainty (Lin 2017 + Kendall 2018).

    Args:
        logits: Class scores; expanded along a new leading axis of size ``T``,
            so a 2-D ``(batch, classes)`` tensor is expected.
        log_var: Log-variance tensor; clamped to ``[-10, 10]`` before use.
        targets: Integer class index per row of ``logits``.
        gamma: Focusing exponent applied to ``1 - p_t``.
        T: Number of Gaussian-noise samples drawn per logit.

    Returns:
        Mean focal loss plus ``0.5 * clamp(log_var)``; has the shape of ``log_var``.
    """
    bounded_log_var = torch.clamp(log_var, min=-10, max=10)
    sigma = torch.exp(0.5 * bounded_log_var)

    # Draw T noisy copies of the logits and average the resulting probabilities.
    tiled = logits.unsqueeze(0).expand(T, -1, -1)
    perturbed = tiled + torch.randn_like(tiled) * sigma
    mean_probs = F.softmax(perturbed, dim=-1).mean(dim=0)

    # Probability assigned to the true class of each row.
    true_class_prob = mean_probs[range(len(targets)), targets]
    modulator = (1 - true_class_prob) ** gamma
    nll = -torch.log(true_class_prob + 1e-8)  # epsilon keeps log() finite at p_t == 0

    return (modulator * nll).mean() + 0.5 * bounded_log_var


def mc_uncertainty_loss_multilabel(logits, log_var, targets, T=10):
    """Binary cross-entropy on Monte Carlo-averaged sigmoid probabilities, plus a log-variance penalty.

    Monte Carlo Uncertainty for Multi-Label (Emotions).

    Args:
        logits: Per-label scores; expanded along a new leading axis of size ``T``,
            so a 2-D ``(batch, labels)`` tensor is expected.
        log_var: Log-variance tensor; clamped to ``[-10, 10]`` before use.
        targets: Float targets with the same shape as ``logits``.
        T: Number of Gaussian-noise samples drawn per logit.

    Returns:
        Mean BCE plus ``0.5 * clamp(log_var)``; has the shape of ``log_var``.
    """
    bounded_log_var = torch.clamp(log_var, min=-10, max=10)
    sigma = torch.exp(0.5 * bounded_log_var)

    # Draw T noisy copies of the logits and average the per-label probabilities.
    tiled = logits.unsqueeze(0).expand(T, -1, -1)
    perturbed = tiled + torch.randn_like(tiled) * sigma
    mean_probs = torch.sigmoid(perturbed).mean(dim=0)

    return F.binary_cross_entropy(mean_probs, targets, reduction='mean') + 0.5 * bounded_log_var

In [None]:
# ============================================================
# DATASET
# ============================================================
class AURADataset(Dataset):
    """CSV-backed dataset that yields tokenised text with toxicity or emotion targets.

    Args:
        csv_path: CSV file loaded with ``pandas.read_csv``.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Padding/truncation length passed to the tokenizer.
        is_toxicity: When true, rows provide a toxicity label; otherwise emotion labels.
        emo_cols: Emotion column names; falls back to the module-level ``EMO_COLS``.
    """

    def __init__(self, csv_path, tokenizer, max_length, is_toxicity=True, emo_cols=None):
        self.df = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_toxicity = is_toxicity
        self.emo_cols = emo_cols if emo_cols else EMO_COLS

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        The target of the task a row does not belong to is filled with -1.
        """
        record = self.df.iloc[idx]
        # Text lives in a 'text' column, or 'tweet' as a fallback.
        text = str(record.get('text', record.get('tweet', '')))

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        if self.is_toxicity:
            # Accept either a 'label' column or a 'subtask_a' column ('OFF' / 1 => toxic).
            raw_label = record['label'] if 'label' in record else record.get('subtask_a', 'NOT')
            tox_label = 1 if raw_label in [1, 'OFF'] else 0
            emo_label = torch.full((len(self.emo_cols),), -1.0)
        else:
            tox_label = -1
            emo_label = torch.tensor(
                [float(record[name]) for name in self.emo_cols],
                dtype=torch.float32,
            )

        task_flag = 1 if self.is_toxicity else 0
        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'toxicity_target': torch.tensor(tox_label, dtype=torch.long),
            'emotion_target': emo_label,
            'is_toxicity_task': torch.tensor(task_flag, dtype=torch.long)
        }

In [None]:
# ============================================================
# DATA LOADING
# ============================================================
tokenizer = BertTokenizer.from_pretrained(CONFIG['encoder'])

# Load V5 datasets: two OLID toxicity splits and the GoEmotions emotion corpus.
olid_train = AURADataset(
    f"{DATA_DIR}/olid_train.csv",
    tokenizer,
    CONFIG['max_length'],
    is_toxicity=True,
)
olid_val = AURADataset(
    f"{DATA_DIR}/olid_validation.csv",
    tokenizer,
    CONFIG['max_length'],
    is_toxicity=True,
)
goemo = AURADataset(
    f"{DATA_DIR}/{GOEMO_FILE}",
    tokenizer,
    CONFIG['max_length'],
    is_toxicity=False,
    emo_cols=EMO_COLS,
)

# Verify data: report dataset sizes and the column sum of every emotion present.
goemo_df = pd.read_csv(f"{DATA_DIR}/{GOEMO_FILE}")
print("V5 Data Verification:")
print(f"  OLID Train: {len(olid_train)} samples")
print(f"  OLID Val: {len(olid_val)} samples")
print(f"  GoEmotions: {len(goemo)} samples")
print(f"  Emotions: {EMO_COLS}")
for col in EMO_COLS:
    if col not in goemo_df.columns:
        continue
    print(f"    {col}: {int(goemo_df[col].sum())} samples")

# Combine both tasks into one shuffled training stream; validation is OLID only.
train_set = ConcatDataset([olid_train, goemo])
train_loader = DataLoader(
    train_set,
    batch_size=CONFIG['batch_size'],
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)
val_loader = DataLoader(
    olid_val,
    batch_size=CONFIG['batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

print(f"\nTotal Training: {len(train_set)} samples")

In [None]:
# ============================================================
# TRAINING FUNCTION
# ============================================================
def train_epoch(model, loader, optimizer, scheduler, epoch, config):
    """Run one training epoch over a mixed toxicity/emotion loader.

    Each batch may contain samples of both tasks; ``is_toxicity_task`` selects
    which loss a sample contributes to. Gradients are accumulated over
    ``config['gradient_accumulation']`` micro-batches before each optimizer update.

    Args:
        model: Module returning ``(tox_logits, emo_logits, tox_log_var, emo_log_var)``.
        loader: Iterable of batches with keys ``input_ids``, ``attention_mask``,
            ``toxicity_target``, ``emotion_target`` and ``is_toxicity_task``;
            must support ``len()``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: LR scheduler, stepped once per full accumulation window.
        epoch: Epoch number, used only for the progress-bar label.
        config: Mapping with ``gradient_accumulation``, ``focal_gamma`` and ``mc_samples``.

    Returns:
        Tuple ``(avg_loss, train_f1)``: mean unscaled batch loss and macro-F1 of the
        toxicity predictions (0 when no toxicity sample was seen).
    """
    model.train()
    accum_steps = config['gradient_accumulation']
    num_batches = len(loader)
    total_loss = 0
    tox_preds, tox_labels = [], []

    loop = tqdm(loader, desc=f"Epoch {epoch}", leave=True)
    optimizer.zero_grad()

    for step, batch in enumerate(loop):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tox_targets = batch['toxicity_target'].to(device)
        emo_targets = batch['emotion_target'].to(device)
        is_tox_task = batch['is_toxicity_task'].to(device)

        tox_logits, emo_logits, tox_log_var, emo_log_var = model(input_ids, attention_mask)

        loss = torch.tensor(0.0, device=device)

        # Toxicity Loss (Focal) on the toxicity samples of this batch only.
        tox_mask = is_tox_task == 1
        if tox_mask.any():
            tox_loss = focal_loss_with_uncertainty(
                tox_logits[tox_mask],
                tox_log_var,
                tox_targets[tox_mask],
                gamma=config['focal_gamma'],
                T=config['mc_samples']
            )
            loss = loss + tox_loss

            preds = torch.argmax(tox_logits[tox_mask], dim=1).cpu().numpy()
            tox_preds.extend(preds)
            tox_labels.extend(tox_targets[tox_mask].cpu().numpy())

        # Emotion Loss on the emotion samples of this batch only.
        emo_mask = is_tox_task == 0
        if emo_mask.any():
            emo_loss = mc_uncertainty_loss_multilabel(
                emo_logits[emo_mask],
                emo_log_var,
                emo_targets[emo_mask],
                T=config['mc_samples']
            )
            loss = loss + emo_loss

        # Scale so the accumulated gradient matches one larger batch.
        loss = loss / accum_steps
        loss.backward()

        window_full = (step + 1) % accum_steps == 0
        last_batch = (step + 1) == num_batches
        if window_full or last_batch:
            # Flush on the final batch too: otherwise the gradients of a trailing
            # partial window would be discarded by the next epoch's zero_grad().
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if window_full:
                # Only full windows tick the scheduler, so the number of scheduler
                # steps per epoch stays len(loader) // accum_steps and a step budget
                # computed with floor division is never exceeded.
                scheduler.step()
            optimizer.zero_grad()

        # Undo the accumulation scaling so logged and averaged losses are comparable.
        batch_loss = loss.item() * accum_steps
        total_loss += batch_loss

        sigma_tox = torch.exp(0.5 * tox_log_var).item()
        sigma_emo = torch.exp(0.5 * emo_log_var).item()
        loop.set_postfix(loss=batch_loss, s_tox=f"{sigma_tox:.3f}", s_emo=f"{sigma_emo:.3f}")

    avg_loss = total_loss / num_batches
    train_f1 = f1_score(tox_labels, tox_preds, average='macro') if tox_labels else 0

    return avg_loss, train_f1

In [None]:
# ============================================================
# VALIDATION FUNCTION
# ============================================================
@torch.no_grad()
def validate(model, loader):
    """Evaluate the toxicity head on a validation loader.

    Args:
        model: Module whose first output is the toxicity logits.
        loader: Iterable of batches with keys ``input_ids``, ``attention_mask``
            and ``toxicity_target``.

    Returns:
        Tuple ``(avg_loss, val_f1, all_preds, all_labels)``: per-sample mean
        cross-entropy, macro-F1, and the flat lists of predictions and labels.
    """
    model.eval()
    loss_sum = 0.0
    n_samples = 0
    all_preds, all_labels = [], []

    for batch in tqdm(loader, desc="Validating", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tox_targets = batch['toxicity_target'].to(device)

        tox_logits, _, _, _ = model(input_ids, attention_mask)

        # Accumulate the summed loss so that a smaller final batch is not
        # over-weighted, as it would be when averaging per-batch means.
        loss_sum += F.cross_entropy(tox_logits, tox_targets, reduction='sum').item()
        n_samples += tox_targets.size(0)

        preds = torch.argmax(tox_logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(tox_targets.cpu().numpy())

    avg_loss = loss_sum / n_samples
    val_f1 = f1_score(all_labels, all_preds, average='macro')

    return avg_loss, val_f1, all_preds, all_labels

In [None]:
# ============================================================
# MAIN TRAINING LOOP
# ============================================================
model = AURA_Bayesian(CONFIG).to(device)
print(f"Model: BERT (110M params) on {device}")

optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['lr'], weight_decay=CONFIG['weight_decay'])

# The scheduler is stepped once per accumulation window, so its step budget is
# the number of micro-batches over all epochs divided by the accumulation factor.
total_steps = len(train_loader) * CONFIG['epochs'] // CONFIG['gradient_accumulation']
scheduler = OneCycleLR(optimizer, max_lr=CONFIG['lr'], total_steps=total_steps, pct_start=0.1)

# Create the checkpoint directory up front: a missing or unwritable folder then
# fails here rather than at the first torch.save after a full epoch of training.
os.makedirs(CONFIG['output_dir'], exist_ok=True)
best_model_path = f"{CONFIG['output_dir']}/aura_v5_best.pt"

# Start below any reachable F1 so the first epoch always writes a checkpoint,
# even if validation F1 is exactly 0.0.
best_f1 = float('-inf')
patience_counter = 0

print("\n" + "="*60)
print("STARTING V5 TRAINING (Balanced Data + Focal Loss + Optimized HP)")
print("="*60)

for epoch in range(1, CONFIG['epochs'] + 1):
    print(f"\nEpoch {epoch}/{CONFIG['epochs']}")

    train_loss, train_f1 = train_epoch(model, train_loader, optimizer, scheduler, epoch, CONFIG)
    val_loss, val_f1, preds, labels = validate(model, val_loader)

    # Learned task standard deviations, derived from the log-variance parameters.
    sigma_tox = torch.exp(0.5 * model.tox_log_var).item()
    sigma_emo = torch.exp(0.5 * model.emo_log_var).item()
    # Train/validation macro-F1 gap in percentage points.
    gap = abs(train_f1 - val_f1) * 100

    print(f"   Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f}")
    print(f"   Val Loss:   {val_loss:.4f} | Val F1:   {val_f1:.4f}")
    print(f"   Gap: {gap:.1f}% | sigma_Tox: {sigma_tox:.4f} | sigma_Emo: {sigma_emo:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), best_model_path)
        print(f"   NEW BEST! (F1: {best_f1:.4f})")
    else:
        patience_counter += 1
        print(f"   No improvement ({patience_counter}/{CONFIG['patience']})")

    # Early stopping once validation F1 has stalled for `patience` epochs in a row.
    if patience_counter >= CONFIG['patience']:
        print(f"\nEarly stopping at epoch {epoch}")
        break

print("\n" + "="*60)
print(f"TRAINING COMPLETE | Best Val F1: {best_f1:.4f}")
print("="*60)

In [None]:
# ============================================================
# FINAL EVALUATION
# ============================================================
# Restore the best checkpoint written during training and re-score the validation set.
model.load_state_dict(
    torch.load(f"{CONFIG['output_dir']}/aura_v5_best.pt")
)
val_loss, val_f1, preds, labels = validate(model, val_loader)

# Per-class precision/recall/F1; index 0 is reported as NOT, index 1 as OFF.
final_report = classification_report(labels, preds, target_names=['NOT', 'OFF'])

print("\nFINAL CLASSIFICATION REPORT (V5 - Production Model)")
print("="*60)
print(final_report)

print(f"\nFinal Macro-F1: {val_f1:.4f}")
print(f"Model saved: {CONFIG['output_dir']}/aura_v5_best.pt")