# 🏛️ AURA: Bayesian Multi-Task Learning (Final Production)

---
## ⚠️ PRIMA DI ESEGUIRE:
1. **Settings** (⚙️) → **Accelerator** → **GPU T4 x2**
2. **Add Input** → Carica `aura-data` (deve contenere `goemotions_processed.csv`, `olid_train.csv`, `olid_validation.csv`)
---

### 🔬 Scientific Specifications
- **Architecture**: BERT-Base (Shared Encoder) + Task-Specific Heads
- **Uncertainty**: Homoscedastic (Task-Level) Variance Parameters [Kendall 2018]
- **Loss Function**: Monte Carlo Sampled NLL + Regularization
- **Optimizer**: AdamW + OneCycleLR

## 1️⃣ Setup & Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from torch.optim.lr_scheduler import OneCycleLR
from transformers import BertModel, BertTokenizer
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, classification_report
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# Select the compute device. The notebook refuses to continue without CUDA:
# `device` is still bound to the CPU before the error is raised.
print("=" * 50)
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print("‚ùå NO GPU! Vai su Settings ‚Üí Accelerator ‚Üí GPU T4")
    raise RuntimeError("Attiva la GPU prima di procedere!")
device = torch.device('cuda')
print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")
print("=" * 50)

# Seed the PyTorch and NumPy global random number generators.
torch.manual_seed(42)
np.random.seed(42)

## 2️⃣ Configuration

In [None]:
# Hyper-parameters and paths shared by the cells below.
CONFIG = {
    'encoder': 'bert-base-uncased',  # checkpoint name for the encoder and its tokenizer
    'max_length': 128,               # token length every text is padded/truncated to
    'num_emotion_classes': 7,        # output width of the emotion head
    'dropout': 0.1,                  # dropout probability on the pooled encoder output
    'batch_size': 16,                # DataLoader batch size (train and validation)
    'gradient_accumulation': 2,      # micro-batches accumulated per optimizer step
    'epochs': 5,                     # maximum number of training epochs
    'lr': 2e-5,                      # AdamW learning rate and OneCycleLR peak
    'weight_decay': 0.01,            # AdamW weight decay
    'patience': 2,                   # epochs without validation-F1 gain before early stop
    'mc_samples': 10,  # T=10 Monte Carlo Samples
    'output_dir': '/kaggle/working'  # where the best checkpoint is written
}

# Auto-detect the data directory: the first candidate that contains *every*
# CSV the notebook reads is used. Checking all three files (instead of only
# `olid_train.csv`) turns an incomplete upload into an immediate, clear error
# rather than a late failure in `pd.read_csv`.
REQUIRED_FILES = ('olid_train.csv', 'olid_validation.csv', 'goemotions_processed.csv')
DATA_DIR = None
for path in ['/kaggle/input/aura-data', '/kaggle/input/aura-data/kaggle_upload', 'data/processed']:
    # `isfile` is False for missing directories and for candidates that are
    # themselves files, so no separate existence check is needed.
    if all(os.path.isfile(os.path.join(path, name)) for name in REQUIRED_FILES):
        DATA_DIR = path
        break

if DATA_DIR is None:
    raise FileNotFoundError("Dataset non trovato! Assicurati di aver collegato 'aura-data'.")

print(f"‚úÖ Dataset trovato: {DATA_DIR}")

## 3️⃣ Model Architecture: AURA Bayesian

Implements **Homoscedastic Uncertainty** via learnable `log_var` parameters (one per task).

In [None]:
class AURA_Bayesian(nn.Module):
    """
    Multi-task classifier: one shared BERT encoder feeding two linear heads.

    Components:
    - ``bert``: pretrained encoder loaded from ``config['encoder']``
    - ``toxicity_head``: linear layer producing 2 logits
    - ``emotion_head``: linear layer producing ``config['num_emotion_classes']`` logits
    - ``tox_log_var`` / ``emo_log_var``: one learnable log-variance per task,
      each a single-element parameter initialised to zero
    """

    def __init__(self, config):
        """Build the encoder, dropout, both heads and the two log-variance parameters.

        Args:
            config: mapping providing 'encoder', 'dropout' and 'num_emotion_classes'.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(config['encoder'])
        self.dropout = nn.Dropout(config['dropout'])

        # Both heads read the encoder's pooled output, so they share its width.
        encoder_width = self.bert.config.hidden_size
        self.toxicity_head = nn.Linear(encoder_width, 2)
        self.emotion_head = nn.Linear(encoder_width, config['num_emotion_classes'])

        # Task-level (not per-sample) log-variances, starting at log_var = 0.
        self.tox_log_var = nn.Parameter(torch.zeros(1))
        self.emo_log_var = nn.Parameter(torch.zeros(1))

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return both heads' logits plus the log-variances.

        Returns:
            Tuple ``(tox_logits, emo_logits, tox_log_var, emo_log_var)``.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(encoded.pooler_output)
        return (
            self.toxicity_head(features),
            self.emotion_head(features),
            self.tox_log_var,
            self.emo_log_var,
        )

## 4️⃣ Loss Functions

### Monte Carlo Uncertainty Loss (Kendall et al. 2018, Eq. 12)
For classification, we sample T corrupted logits, average the softmax probabilities, then compute NLL.

In [None]:
def monte_carlo_uncertainty_loss_classification(logits, log_var, targets, T=10):
    """
    Monte Carlo uncertainty loss for single-label classification.

    The logits are corrupted T times with Gaussian noise of standard deviation
    ``exp(0.5 * log_var)``; the softmax probabilities of the T samples are
    averaged and scored with NLL, then ``0.5 * log_var`` is added.

    The averaging is carried out in log space (``log_softmax`` followed by
    ``logsumexp``) rather than as ``log(mean(softmax) + eps)``. The result is
    the same quantity, but a confidently wrong prediction whose probability
    underflows to zero still yields a finite loss with a usable gradient
    instead of the constant ``-log(eps)``.

    Args:
        logits: [B, C] - Raw model predictions
        log_var: [1] - Task-level log variance (homoscedastic)
        targets: [B] - Ground truth class indices
        T: int - Number of Monte Carlo samples (must be >= 1)

    Returns:
        loss: scalar (0-dim tensor) - MC NLL + Regularization

    Raises:
        ValueError: if T is smaller than 1.
    """
    import math  # local import: `math` is not among the file's top-level imports

    if T < 1:
        raise ValueError(f"T must be a positive integer, got {T}")

    # 1. Clamp for numerical stability
    log_var_clamped = torch.clamp(log_var, min=-10, max=10)
    std = torch.exp(0.5 * log_var_clamped)  # sigma = sqrt(exp(log_var))

    # 2. Monte Carlo Sampling: [T, B, C]
    logits_expanded = logits.unsqueeze(0).expand(T, -1, -1)
    noise = torch.randn_like(logits_expanded)
    corrupted_logits = logits_expanded + (noise * std)

    # 3. log(mean_t softmax_t) = logsumexp_t(log_softmax_t) - log(T)  -> [B, C]
    log_probs = F.log_softmax(corrupted_logits, dim=-1)
    log_avg_probs = torch.logsumexp(log_probs, dim=0) - math.log(T)

    # 4. NLL of the averaged distribution
    nll = F.nll_loss(log_avg_probs, targets)

    # 5. Regularization term; squeeze the [1] parameter so the sum is a scalar
    regularization = 0.5 * log_var_clamped.squeeze()

    return nll + regularization


def monte_carlo_uncertainty_loss_multilabel(logits, log_var, targets, T=10):
    """
    Monte Carlo uncertainty loss for multi-label classification (emotions).

    The logits are corrupted T times with Gaussian noise of standard deviation
    ``exp(0.5 * log_var)``; the per-label sigmoid probabilities are averaged
    over the T samples and scored with binary cross-entropy, then
    ``0.5 * log_var`` is added.

    Both ``log(p)`` and ``log(1 - p)`` of the averaged probability are obtained
    in log space (``logsigmoid`` + ``logsumexp``, using
    ``1 - sigmoid(z) = sigmoid(-z)``). This avoids the saturation of
    ``sigmoid`` for large |logit|, where BCE on probabilities hits its clamp
    and the gradient vanishes.

    Args:
        logits: [B, C] - Raw model predictions
        log_var: [1] - Task-level log variance
        targets: [B, C] - Multi-label targets (0 or 1), floating point
        T: int - Number of Monte Carlo samples (must be >= 1)

    Returns:
        loss: scalar (0-dim tensor) - MC BCE + Regularization

    Raises:
        ValueError: if T is smaller than 1 or targets and logits differ in shape.
    """
    import math  # local import: `math` is not among the file's top-level imports

    if T < 1:
        raise ValueError(f"T must be a positive integer, got {T}")
    if targets.shape != logits.shape:
        # Same failure mode as F.binary_cross_entropy, instead of silent broadcasting.
        raise ValueError(
            f"targets shape {tuple(targets.shape)} must match logits shape {tuple(logits.shape)}"
        )

    log_var_clamped = torch.clamp(log_var, min=-10, max=10)
    std = torch.exp(0.5 * log_var_clamped)

    # Monte Carlo Sampling: [T, B, C]
    logits_expanded = logits.unsqueeze(0).expand(T, -1, -1)
    noise = torch.randn_like(logits_expanded)
    corrupted_logits = logits_expanded + (noise * std)

    # log of the sample-averaged probability and of its complement: [B, C]
    log_t = math.log(T)
    log_p = torch.logsumexp(F.logsigmoid(corrupted_logits), dim=0) - log_t
    log_not_p = torch.logsumexp(F.logsigmoid(-corrupted_logits), dim=0) - log_t

    # Mean binary cross-entropy over all batch elements and labels
    bce = -(targets * log_p + (1.0 - targets) * log_not_p).mean()

    # Regularization term; squeeze the [1] parameter so the sum is a scalar
    regularization = 0.5 * log_var_clamped.squeeze()

    return bce + regularization

## 5️⃣ Dataset Class

In [None]:
class AURADataset(Dataset):
    """
    Unified Dataset for both Toxicity (OLID) and Emotion (GoEmotions) tasks.

    Every item carries targets for both tasks; the task a CSV does not belong
    to gets a placeholder (-1 for toxicity, a vector of -1.0 for emotions).
    The 'is_toxicity_task' flag tells consumers which target is real.
    """

    def __init__(self, csv_path, tokenizer, max_length, is_toxicity=True):
        """
        Args:
            csv_path: path of the CSV file, read eagerly with pandas.
            tokenizer: callable tokenizer returning 'input_ids' and
                'attention_mask' tensors.
            max_length: length every text is padded/truncated to.
            is_toxicity: True for a toxicity CSV, False for an emotion CSV.
        """
        self.df = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_toxicity = is_toxicity
        self.emo_cols = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']

    def __len__(self):
        """Number of rows in the underlying CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row `idx` and return its tensors as a dict.

        Keys: 'input_ids', 'attention_mask' (1-D, flattened), 'toxicity_target'
        (long scalar), 'emotion_target' (float vector, one entry per emotion
        column) and 'is_toxicity_task' (long scalar, 1 or 0).
        """
        row = self.df.iloc[idx]
        # The text lives in 'text' or, failing that, 'tweet'.
        text = str(row.get('text', row.get('tweet', '')))

        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        enc = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        if self.is_toxicity:
            # Accept either a 'label' column or OLID's 'subtask_a' column.
            label_raw = row['label'] if 'label' in row else row.get('subtask_a', 'NOT')
            tox_label = 1 if label_raw in [1, 'OFF'] else 0
            # Placeholder sized from the emotion column list, not a literal 7.
            emo_label = torch.full((len(self.emo_cols),), -1.0)
        else:
            tox_label = -1  # placeholder: this row has no toxicity label
            emo_label = torch.tensor([float(row[c]) for c in self.emo_cols], dtype=torch.float32)

        return {
            'input_ids': enc['input_ids'].flatten(),
            'attention_mask': enc['attention_mask'].flatten(),
            'toxicity_target': torch.tensor(tox_label, dtype=torch.long),
            'emotion_target': emo_label,
            'is_toxicity_task': torch.tensor(1 if self.is_toxicity else 0, dtype=torch.long)
        }

## 6️⃣ Data Loading

In [None]:
tokenizer = BertTokenizer.from_pretrained(CONFIG['encoder'])

# One AURADataset per CSV; only the GoEmotions file is an emotion-task dataset.
olid_train = AURADataset(
    f"{DATA_DIR}/olid_train.csv", tokenizer, CONFIG['max_length'], is_toxicity=True
)
olid_val = AURADataset(
    f"{DATA_DIR}/olid_validation.csv", tokenizer, CONFIG['max_length'], is_toxicity=True
)
goemo_full = AURADataset(
    f"{DATA_DIR}/goemotions_processed.csv", tokenizer, CONFIG['max_length'], is_toxicity=False
)

# Keep at most 30,000 distinct GoEmotions rows, drawn without replacement.
goemo_indices = np.random.choice(
    len(goemo_full),
    min(30000, len(goemo_full)),
    replace=False,
)
goemo_subset = torch.utils.data.Subset(goemo_full, goemo_indices)

# Training mixes both tasks in shuffled batches; validation is OLID only, in file order.
train_set = ConcatDataset([olid_train, goemo_subset])
loader_kwargs = dict(batch_size=CONFIG['batch_size'], num_workers=2, pin_memory=True)
train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
val_loader = DataLoader(olid_val, shuffle=False, **loader_kwargs)

print(f"‚úÖ Training set: {len(train_set)} samples")
print(f"‚úÖ Validation set: {len(olid_val)} samples")

## 7️⃣ Training Function

In [None]:
def train_epoch(model, loader, optimizer, scheduler, epoch):
    """Run one training epoch over a mixed toxicity/emotion loader.

    Each batch may contain samples of both tasks; the 'is_toxicity_task' flag
    selects which rows feed which loss. Gradients are accumulated over
    ``CONFIG['gradient_accumulation']`` micro-batches per optimizer step.

    Args:
        model: network returning (tox_logits, emo_logits, tox_log_var, emo_log_var).
        loader: iterable of batches as produced by AURADataset.
        optimizer: optimizer stepped once per accumulation window.
        scheduler: LR scheduler stepped once per full accumulation window.
        epoch: epoch number, used only for the progress-bar label.

    Returns:
        (avg_loss, train_f1): mean un-scaled batch loss, and macro-F1 of the
        toxicity predictions made during the epoch (0 if none were made).
    """
    model.train()
    accum_steps = CONFIG['gradient_accumulation']
    total_loss = 0
    tox_preds, tox_labels = [], []
    pending_grads = False  # True while accumulated gradients await an optimizer step

    loop = tqdm(loader, desc=f"Epoch {epoch}", leave=True)
    optimizer.zero_grad()

    for step, batch in enumerate(loop):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tox_targets = batch['toxicity_target'].to(device)
        emo_targets = batch['emotion_target'].to(device)
        is_tox_task = batch['is_toxicity_task'].to(device)

        # Forward
        tox_logits, emo_logits, tox_log_var, emo_log_var = model(input_ids, attention_mask)

        # Compute losses (masked)
        loss = torch.tensor(0.0, device=device)

        # Toxicity Loss (only where is_tox_task == 1)
        tox_mask = is_tox_task == 1
        if tox_mask.any():
            tox_loss = monte_carlo_uncertainty_loss_classification(
                tox_logits[tox_mask],
                tox_log_var,
                tox_targets[tox_mask],
                T=CONFIG['mc_samples']
            )
            loss = loss + tox_loss

            # Track predictions for the epoch-level F1
            preds = torch.argmax(tox_logits[tox_mask], dim=1).cpu().numpy()
            tox_preds.extend(preds)
            tox_labels.extend(tox_targets[tox_mask].cpu().numpy())

        # Emotion Loss (only where is_tox_task == 0)
        emo_mask = is_tox_task == 0
        if emo_mask.any():
            emo_loss = monte_carlo_uncertainty_loss_multilabel(
                emo_logits[emo_mask],
                emo_log_var,
                emo_targets[emo_mask],
                T=CONFIG['mc_samples']
            )
            loss = loss + emo_loss

        # Gradient Accumulation: scale so the accumulated gradient is an average
        loss = loss / accum_steps
        loss.backward()
        pending_grads = True

        if (step + 1) % accum_steps == 0:
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            pending_grads = False

        # Undo the accumulation scaling so the reported loss is per batch
        total_loss += loss.item() * accum_steps

        # Update progress bar
        sigma_tox = torch.exp(0.5 * tox_log_var).item()
        sigma_emo = torch.exp(0.5 * emo_log_var).item()
        loop.set_postfix(loss=loss.item(), œÉ_tox=f"{sigma_tox:.3f}", œÉ_emo=f"{sigma_emo:.3f}")

    # Flush a trailing, incomplete accumulation window. Without this the last
    # micro-batches' gradients are never applied and are erased by the next
    # epoch's zero_grad(). The scheduler is deliberately NOT stepped here: the
    # caller sizes fixed-length schedules from full windows only, and an extra
    # scheduler step could run past that budget.
    if pending_grads:
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        optimizer.zero_grad()

    # Calculate epoch metrics
    avg_loss = total_loss / len(loader)
    train_f1 = f1_score(tox_labels, tox_preds, average='macro') if tox_labels else 0

    return avg_loss, train_f1

## 8️⃣ Validation Function

In [None]:
@torch.no_grad()
def validate(model, loader):
    """Evaluate the toxicity head on a loader of toxicity-labelled batches.

    Uses plain cross-entropy on the un-corrupted logits (no Monte Carlo
    sampling). The loss is summed per sample and divided by the number of
    samples, so a smaller final batch is not over-weighted as it would be
    when averaging per-batch means.

    Args:
        model: network returning (tox_logits, emo_logits, tox_log_var, emo_log_var).
        loader: iterable of batches with 'input_ids', 'attention_mask' and
            'toxicity_target'.

    Returns:
        (avg_loss, val_f1, all_preds, all_labels): per-sample mean loss,
        macro-F1, and the flat lists of predictions and labels.
    """
    model.eval()
    loss_sum = 0.0
    n_samples = 0
    all_preds, all_labels = [], []

    for batch in tqdm(loader, desc="Validating", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tox_targets = batch['toxicity_target'].to(device)

        # Only the toxicity logits are needed for evaluation
        tox_logits, _, _, _ = model(input_ids, attention_mask)

        loss_sum += F.cross_entropy(tox_logits, tox_targets, reduction='sum').item()
        n_samples += tox_targets.size(0)

        preds = torch.argmax(tox_logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(tox_targets.cpu().numpy())

    avg_loss = loss_sum / n_samples
    val_f1 = f1_score(all_labels, all_preds, average='macro')

    return avg_loss, val_f1, all_preds, all_labels

## 9️⃣ Main Training Loop

In [None]:
# Initialize Model
model = AURA_Bayesian(CONFIG).to(device)
print(f"‚úÖ Model loaded on {device}")

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['lr'], weight_decay=CONFIG['weight_decay'])

# Scheduler: one step per optimizer update, i.e. per full accumulation window
total_steps = len(train_loader) * CONFIG['epochs'] // CONFIG['gradient_accumulation']
scheduler = OneCycleLR(optimizer, max_lr=CONFIG['lr'], total_steps=total_steps, pct_start=0.1)

# Make sure the checkpoint directory exists *before* training, so a missing
# directory fails immediately instead of at the first torch.save after a full
# epoch (e.g. when running outside Kaggle with the local data fallback).
os.makedirs(CONFIG['output_dir'], exist_ok=True)
checkpoint_path = f"{CONFIG['output_dir']}/aura_bayesian_best.pt"

# Training state for best-model tracking and early stopping
best_f1 = 0
patience_counter = 0

print("\n" + "="*60)
print("üöÄ STARTING BAYESIAN TRAINING")
print("="*60)

for epoch in range(1, CONFIG['epochs'] + 1):
    print(f"\nüìç Epoch {epoch}/{CONFIG['epochs']}")

    train_loss, train_f1 = train_epoch(model, train_loader, optimizer, scheduler, epoch)
    val_loss, val_f1, preds, labels = validate(model, val_loader)

    # Log Sigma values (sigma = exp(0.5 * log_var) for each task)
    sigma_tox = torch.exp(0.5 * model.tox_log_var).item()
    sigma_emo = torch.exp(0.5 * model.emo_log_var).item()

    print(f"   Train Loss: {train_loss:.4f} | Train F1: {train_f1:.4f}")
    print(f"   Val Loss:   {val_loss:.4f} | Val F1:   {val_f1:.4f}")
    print(f"   œÉ_Tox: {sigma_tox:.4f} | œÉ_Emo: {sigma_emo:.4f}")

    # Save best model (selected on validation macro-F1)
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), checkpoint_path)
        print(f"   üíæ New best model saved! (F1: {best_f1:.4f})")
    else:
        patience_counter += 1
        print(f"   ‚è≥ No improvement ({patience_counter}/{CONFIG['patience']})")

    # Early stopping
    if patience_counter >= CONFIG['patience']:
        print(f"\nüõë Early stopping triggered at epoch {epoch}")
        break

print("\n" + "="*60)
print(f"‚úÖ TRAINING COMPLETE | Best Val F1: {best_f1:.4f}")
print("="*60)

## 🔟 Final Evaluation & Report

In [None]:
# Load the best checkpoint written during training. `map_location` places the
# tensors on the active device regardless of where they were saved from.
model.load_state_dict(
    torch.load(f"{CONFIG['output_dir']}/aura_bayesian_best.pt", map_location=device)
)
val_loss, val_f1, preds, labels = validate(model, val_loader)

print("\nüìä FINAL CLASSIFICATION REPORT (Toxicity Task)")
print("="*50)
# Class index 0 is reported as 'NOT' and index 1 as 'OFF'
print(classification_report(labels, preds, target_names=['NOT', 'OFF']))

print(f"\nüèÜ Final Macro-F1: {val_f1:.4f}")
print(f"\nüì¶ Model saved to: {CONFIG['output_dir']}/aura_bayesian_best.pt")