# AURA V11 — RoBERTa Baseline (Ablation Study)

**Purpose**: Controlled ablation to measure the contribution of Task-Specific Multi-Head Attention.

This notebook is **identical** to `AURA_V10.2_Kaggle.ipynb` in every respect except the model:

| | AURA V10.2 | This Baseline |
|---|---|---|
| Encoder | RoBERTa-base | RoBERTa-base |
| Task Attention | 4× MHSA (8 heads each) | ❌ None |
| Pooling | Mean pool per-task attention output | Mean pool shared encoder output |
| Heads | 4 linear classifiers | 4 linear classifiers (identical) |
| Loss | Focal + Kendall (Softplus variant) | Focal + Kendall (Softplus variant) |
| Data | aura-v11-data | aura-v11-data |
| Config | Identical | Identical |
| Seed | 42 | 42 |

The **only independent variable** is the presence of Task-Specific MHSA.

---

In [None]:
# Cell 1: Imports & Seed — IDENTICAL TO V10.2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from transformers import RobertaModel, RobertaTokenizer, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from sklearn.metrics import (
    f1_score, classification_report, confusion_matrix, 
    multilabel_confusion_matrix, precision_recall_fscore_support
)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Reproducibility — same seed as V10.2.
# NumPy and PyTorch are seeded from one constant; cuDNN autotuning is turned
# off and deterministic kernels are requested so repeated runs are comparable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'üîß Device: {device}')
if device.type == 'cuda':
    # Report the model name and total memory (in GB) of GPU 0.
    print(f'   GPU: {torch.cuda.get_device_name(0)}')
    print(f'   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

In [None]:
# Cell 2: Configuration — IDENTICAL TO V10.2
# Single source of hyper-parameters for the model, training loop and
# regularisation. Key order is kept because the summary below prints in
# insertion order.
CONFIG = {
    # --- Model ---
    'encoder': 'roberta-base',
    'hidden_dim': 768,
    'n_heads': 8,
    'num_emotion_classes': 7,
    'max_length': 128,
    'dropout': 0.3,

    # --- Training ---
    'batch_size': 16,
    'gradient_accumulation': 4,  # 16 * 4 = effective batch of 64
    'epochs': 10,  # same as the V10.2 final run
    'lr_encoder': 1e-5,
    'lr_heads': 5e-5,
    'weight_decay': 0.01,
    'max_grad_norm': 1.0,
    'warmup_ratio': 0.1,

    # --- Regularization (Module 3) ---
    'focal_gamma': 2.0,
    'label_smoothing': 0.1,
    'patience': 5,
    'freezing_epochs': 1,
}

# Kaggle input directory and the emotion label columns (one column per class).
DATA_DIR = '/kaggle/input/aura-v11-data'
EMO_COLS = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']

# Echo the configuration, one "key: value" line per entry.
print('üìã Baseline Configuration (identical to V10.2):')
print('\n'.join(f'   {name}: {value}' for name, value in CONFIG.items()))

In [None]:
# Cell 3: Visualization Functions — IDENTICAL TO V10.2
def plot_class_distribution(df, label_col, title, ax=None):
    """Draw a bar chart of how often each value of ``label_col`` occurs (NB11 pattern).

    Args:
        df: DataFrame holding the label column.
        label_col: Name of the column whose values are counted.
        title: Title placed on the axes.
        ax: Existing matplotlib axes to draw on; a new 6x4 figure is created
            when omitted.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 4))

    # Counts ordered by class value so bars appear in a stable order.
    class_counts = df[label_col].value_counts().sort_index()
    rects = ax.bar(
        class_counts.index.astype(str),
        class_counts.values,
        color=['#66c2a5', '#fc8d62'],
    )
    ax.set_title(title)
    ax.set_xlabel('Class')
    ax.set_ylabel('Count')

    # Annotate every bar with its count, 50 units above the bar top.
    for rect, n_samples in zip(rects, class_counts.values):
        x_center = rect.get_x() + rect.get_width()/2
        y_text = rect.get_height() + 50
        ax.text(x_center, y_text, str(n_samples), ha='center', fontsize=10)
    return ax

def plot_confusion_matrix_heatmap(y_true, y_pred, labels, title='Confusion Matrix', ax=None):
    """Render the confusion matrix of ``y_true`` vs ``y_pred`` as a heatmap (NB10 pattern).

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        labels: Tick labels used on both axes.
        title: Title placed on the axes.
        ax: Existing matplotlib axes to draw on; a new 6x5 figure is created
            when omitted.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 5))

    heatmap_options = {
        'annot': True,
        'fmt': 'd',
        'cmap': 'Blues',
        'xticklabels': labels,
        'yticklabels': labels,
        'ax': ax,
        'cbar_kws': {'label': 'Count'},
    }
    sns.heatmap(confusion_matrix(y_true, y_pred), **heatmap_options)

    ax.set_title(title)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    return ax

def plot_multilabel_confusion_matrices(y_true, y_pred, labels, normalize=True):
    """Plot one 2x2 confusion matrix per label of a multilabel task (NB06 pattern).

    Args:
        y_true: Ground-truth multilabel indicator array.
        y_pred: Predicted multilabel indicator array.
        labels: Label names, one per column; used as subplot titles.
        normalize: When True each matrix row is divided by its row total so
            cells show rates; when False raw counts are shown.

    Rows whose total is zero (a label with no actual negatives or no actual
    positives) are rendered as 0.00 instead of NaN, and the figure title only
    says "(Normalized)" when normalisation was actually applied.
    """
    cms = multilabel_confusion_matrix(y_true, y_pred)
    n_labels = len(labels)
    cols = min(4, n_labels)
    rows = (n_labels + cols - 1) // cols  # ceil division: grid rows needed
    fig, axes = plt.subplots(rows, cols, figsize=(cols*3, rows*3))
    # plt.subplots returns a bare Axes (not an array) for a 1x1 grid.
    axes = axes.flatten() if n_labels > 1 else [axes]

    for i, (cm, label) in enumerate(zip(cms, labels)):
        ax = axes[i]
        if normalize:
            row_totals = cm.sum(axis=1, keepdims=True)
            # Guarded division: leave rows with no samples at zero rather
            # than producing NaN cells and a runtime warning.
            cm = np.divide(
                cm.astype('float'),
                row_totals,
                out=np.zeros(cm.shape, dtype=float),
                where=row_totals != 0,
            )
            fmt = '.2f'
        else:
            fmt = 'd'
        sns.heatmap(cm, annot=True, fmt=fmt, cmap='YlGnBu', ax=ax,
                    xticklabels=['Neg', 'Pos'], yticklabels=['Neg', 'Pos'],
                    vmin=0, vmax=1 if normalize else None, cbar=False)
        ax.set_title(label, fontsize=10)
        ax.set_ylabel('Actual')
        ax.set_xlabel('Predicted')

    # Hide unused axes
    for i in range(n_labels, len(axes)):
        axes[i].axis('off')

    # Only advertise normalisation when it was applied.
    if normalize:
        suptitle = 'Multilabel Confusion Matrices (Normalized)'
    else:
        suptitle = 'Multilabel Confusion Matrices'
    plt.suptitle(suptitle, fontsize=12)
    plt.tight_layout()
    plt.show()

def plot_training_history(history):
    """Show training loss, validation F1 and Kendall task weights per epoch (NB10 pattern).

    Args:
        history: Dict with the per-epoch lists ``'train_loss'``, ``'val_f1'``
            and ``'task_weights'`` (one weight vector per epoch).
    """
    _, (loss_ax, f1_ax, weight_ax) = plt.subplots(1, 3, figsize=(15, 4))

    def _decorate(ax, ylabel, title):
        # Shared axis cosmetics for all three panels.
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Panel 1: training loss
    train_losses = history['train_loss']
    loss_ax.plot(range(1, len(train_losses)+1), train_losses, 'b-o', label='Train')
    _decorate(loss_ax, 'Loss', 'Training Loss')

    # Panel 2: validation macro F1
    val_scores = history['val_f1']
    f1_ax.plot(range(1, len(val_scores)+1), val_scores, 'g-o', label='Val F1')
    _decorate(f1_ax, 'Macro F1', 'Validation F1 Score')

    # Panel 3: one curve per task weight column (Kendall)
    weight_matrix = np.array(history['task_weights'])
    epochs_axis = range(1, len(weight_matrix)+1)
    task_names = ['Toxicity', 'Emotion', 'Sentiment', 'Reporting']
    for column, task_name in enumerate(task_names):
        weight_ax.plot(epochs_axis, weight_matrix[:, column], '-o', label=task_name)
    _decorate(weight_ax, 'Weight (1/œÉ¬≤)', 'Kendall Task Weights')

    plt.tight_layout()
    plt.show()

print('üìä Visualization functions loaded.')

## ⚗️ Ablation: Baseline Model (No Task-Specific Attention)

The baseline model removes **all** Task-Specific Multi-Head Attention blocks.

Architecture comparison:

```
AURA V10.2:   RoBERTa → [tox_mha, emo_mha, sent_mha, rep_mha] → mean_pool → dropout → heads
Baseline:     RoBERTa → mean_pool → dropout → heads
```

All 4 task heads receive the **same pooled representation** from the shared encoder. This isolates whether the MHSA blocks provide any benefit over a simple shared-encoder MTL setup.

**Everything else is identical**: same loss functions, optimizer, scheduler, data, seed, freezing strategy.

In [None]:
# Cell 4: Baseline Model (No Task-Specific MHA)
#
# ABLATION VARIABLE: This replaces AURA_V10's 4 TaskSpecificMHA blocks
# with a single shared mean-pool representation fed to all 4 heads.
#
# Everything else (dropout, bias init, head dimensions) is IDENTICAL to V10.2.

class AURA_Baseline(nn.Module):
    """Shared RoBERTa encoder feeding four linear task heads.

    Ablation baseline for AURA V10.2: there are no TaskSpecificMHA modules,
    so every head consumes the same masked-mean-pooled encoder output.
    """

    def __init__(self, config):
        """Build the encoder, dropout and the four heads from ``config``.

        Reads ``config['encoder']``, ``config['hidden_dim']``,
        ``config['dropout']`` and ``config['num_emotion_classes']``.
        """
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(config['encoder'])
        width = config['hidden_dim']

        # Ablation variable: no per-task attention blocks are created here.

        self.dropout = nn.Dropout(config['dropout'])

        # Task heads (creation order is kept so parameter initialisation
        # consumes the RNG in the same sequence).
        self.toxicity_head = nn.Linear(width, 2)
        self.emotion_head = nn.Linear(width, config['num_emotion_classes'])
        self.sentiment_head = nn.Linear(width, 2)
        self.reporting_head = nn.Linear(width, 1)

        # Start the toxicity head biased toward class 0 (Non-Toxic) and
        # away from class 1 (Toxic).
        with torch.no_grad():
            self.toxicity_head.bias[0] = 2.5
            self.toxicity_head.bias[1] = -2.5

    def _mean_pool(self, seq, mask):
        """Average ``seq`` over dim 1, counting only positions where ``mask`` is set.

        The denominator is clamped to 1e-9 so an all-zero mask yields zeros
        instead of a division by zero.
        """
        token_weights = mask.unsqueeze(-1).expand(seq.size()).float()
        weighted_sum = (seq * token_weights).sum(dim=1)
        token_count = token_weights.sum(dim=1).clamp(min=1e-9)
        return weighted_sum / token_count

    def forward(self, input_ids, attention_mask):
        """Return a dict of logits keyed by task name.

        Keys: ``'toxicity'``, ``'emotion'``, ``'sentiment'`` and
        ``'reporting'`` (the latter with its trailing singleton dim removed).
        """
        # Shared encoder output for every token.
        token_states = self.roberta(input_ids, attention_mask).last_hidden_state

        # Single pooled vector per sample, shared by all heads.
        sentence_vec = self._mean_pool(token_states, attention_mask)
        sentence_vec = self.dropout(sentence_vec)

        logits = {}
        logits['toxicity'] = self.toxicity_head(sentence_vec)
        logits['emotion'] = self.emotion_head(sentence_vec)
        logits['sentiment'] = self.sentiment_head(sentence_vec)
        logits['reporting'] = self.reporting_head(sentence_vec).squeeze(-1)
        return logits

print('üß† AURA_Baseline model defined (No MHSA).')

In [None]:
# Cell 5: Loss Functions — IDENTICAL TO V10.2
def focal_loss(logits, targets, gamma=2.0, weight=None, smoothing=0.0):
    """Focal loss (NB11): down-weights examples the model already gets right.

    FL(p_t) = -(1 - p_t)^gamma * log(p_t)

    Args:
        logits: Unnormalised class scores passed to ``F.cross_entropy``.
        targets: Class-index targets.
        gamma: Focusing exponent; 0 reduces to plain mean cross-entropy.
        weight: Optional per-class weights forwarded to ``F.cross_entropy``.
        smoothing: Label-smoothing factor forwarded to ``F.cross_entropy``.

    Returns:
        Scalar tensor: the mean of the modulated per-sample losses.
    """
    per_sample_ce = F.cross_entropy(
        logits,
        targets,
        weight=weight,
        reduction='none',
        label_smoothing=smoothing,
    )
    # p_t is recovered from the per-sample cross-entropy value.
    p_t = torch.exp(-per_sample_ce)
    modulated = (1 - p_t) ** gamma * per_sample_ce
    return modulated.mean()

class UncertaintyLoss(nn.Module):
    """Homoscedastic-uncertainty task weighting (Kendall et al., 2018), Softplus variant.

    Each task ``i`` owns a learnable scalar ``s_i`` (``log_vars[i]``):

        L_total = sum_i [ L_i / softplus(s_i) + softplus(s_i) / 2 ]

    Softplus replaces exp() for numerical stability; a 1e-8 epsilon guards
    the reciprocal.
    """

    def __init__(self, n_tasks=4):
        super().__init__()
        # One learnable scalar per task, initialised to zero.
        self.log_vars = nn.Parameter(torch.zeros(n_tasks))

    def forward(self, losses):
        """Combine per-task losses (in task-index order) into one weighted value."""
        combined = 0
        for task_idx, task_loss in enumerate(losses):
            variance = F.softplus(self.log_vars[task_idx])
            inv_variance = 1.0 / (variance + 1e-8)
            # Weighted task loss plus the regulariser that keeps the
            # variance from growing without bound.
            combined = combined + (inv_variance * task_loss + variance * 0.5)
        return combined

    def get_weights(self):
        """Return the current per-task precisions as a detached NumPy array."""
        variances = F.softplus(self.log_vars) + 1e-8
        precisions = 1.0 / variances
        return precisions.detach().cpu().numpy()

print('‚öñÔ∏è Loss functions defined (Focal + Kendall Softplus) ‚Äî identical to V10.2.')

In [None]:
# Cell 6: Dataset Classes — IDENTICAL TO V10.2
class BaseDataset(Dataset):
    """CSV-backed dataset base class shared by the four task datasets.

    Loads the CSV at ``path`` into ``self.df`` and keeps the tokenizer and
    maximum sequence length for use by ``encode``.
    """

    def __init__(self, path, tokenizer, max_len):
        self.max_len = max_len
        self.tok = tokenizer
        self.df = pd.read_csv(path)

    def __len__(self):
        return self.df.shape[0]

    def encode(self, text):
        """Tokenize ``text`` (coerced to ``str``), padded/truncated to ``max_len``."""
        tokenizer_options = {
            'max_length': self.max_len,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
        }
        return self.tok(str(text), **tokenizer_options)

class ToxicityDataset(BaseDataset):
    """Toxicity samples (task id 0): reads ``text`` and the integer ``label`` column."""

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        encoded = self.encode(record['text'])
        tox_label = torch.tensor(int(record['label']), dtype=torch.long)
        # flatten() drops the batch dim added by return_tensors='pt'.
        return {
            'ids': encoded['input_ids'].flatten(),
            'mask': encoded['attention_mask'].flatten(),
            'tox': tox_label,
            'task': 0,
        }

class EmotionDataset(BaseDataset):
    """Multilabel emotion samples (task id 1): one float target per column in ``cols``."""

    def __init__(self, path, tokenizer, max_len, cols):
        super().__init__(path, tokenizer, max_len)
        self.cols = cols
        # When the CSV carries a 'label_sum' column, drop rows with no
        # positive label and renumber so positional indexing stays dense.
        if 'label_sum' in self.df.columns:
            has_any_label = self.df['label_sum'] > 0
            self.df = self.df[has_any_label].reset_index(drop=True)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        encoded = self.encode(record['text'])
        emotion_vector = torch.tensor(
            [float(record[col]) for col in self.cols],
            dtype=torch.float,
        )
        return {
            'ids': encoded['input_ids'].flatten(),
            'mask': encoded['attention_mask'].flatten(),
            'emo': emotion_vector,
            'task': 1,
        }

class SentimentDataset(BaseDataset):
    """Sentiment samples (task id 2): reads ``text`` and the integer ``label`` column."""

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        encoded = self.encode(record['text'])
        sentiment_label = torch.tensor(int(record['label']), dtype=torch.long)
        return {
            'ids': encoded['input_ids'].flatten(),
            'mask': encoded['attention_mask'].flatten(),
            'sent': sentiment_label,
            'task': 2,
        }

class ReportingDataset(BaseDataset):
    """Reporting samples (task id 3): reads ``text`` and the ``is_reporting`` flag."""

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        encoded = self.encode(record['text'])
        # Stored as an integer tensor here; the training step casts it to
        # float before the BCE loss.
        reporting_flag = torch.tensor(int(record['is_reporting']), dtype=torch.long)
        return {
            'ids': encoded['input_ids'].flatten(),
            'mask': encoded['attention_mask'].flatten(),
            'rep': reporting_flag,
            'task': 3,
        }

def collate_fn(batch):
    """Collate samples from mixed tasks into one batch dict.

    ``ids``, ``mask`` and ``tasks`` cover every sample in the batch. Each
    label entry (``tox``, ``emo``, ``sent``, ``rep``) stacks only the samples
    of its own task, in batch order, and is ``None`` when the batch has none.
    """
    collated = {
        'ids': torch.stack([sample['ids'] for sample in batch]),
        'mask': torch.stack([sample['mask'] for sample in batch]),
        'tasks': torch.tensor([sample['task'] for sample in batch]),
    }

    # (label key, task id) pairs, in the order the keys appear in the output.
    for label_key, task_id in (('tox', 0), ('emo', 1), ('sent', 2), ('rep', 3)):
        task_labels = [sample[label_key] for sample in batch if sample['task'] == task_id]
        collated[label_key] = torch.stack(task_labels) if task_labels else None

    return collated

print('üì¶ Dataset classes defined ‚Äî identical to V10.2.')

In [None]:
# Cell 7: Load Data — IDENTICAL TO V10.2
# Builds the tokenizer, the four per-task training datasets, the toxicity
# validation set and the two DataLoaders used by the training cells below.
tokenizer = RobertaTokenizer.from_pretrained(CONFIG['encoder'])

# Load all datasets (each constructor reads its CSV from DATA_DIR)
tox_train = ToxicityDataset(f'{DATA_DIR}/toxicity_train.csv', tokenizer, CONFIG['max_length'])
emo_train = EmotionDataset(f'{DATA_DIR}/emotions_train.csv', tokenizer, CONFIG['max_length'], EMO_COLS)
sent_train = SentimentDataset(f'{DATA_DIR}/sentiment_train.csv', tokenizer, CONFIG['max_length'])
rep_train = ReportingDataset(f'{DATA_DIR}/reporting_examples_augmented.csv', tokenizer, CONFIG['max_length'])
tox_val = ToxicityDataset(f'{DATA_DIR}/toxicity_val.csv', tokenizer, CONFIG['max_length'])

# All four tasks are concatenated and shuffled together, so a single batch
# can contain samples from several tasks; collate_fn splits the labels by task.
train_ds = ConcatDataset([tox_train, emo_train, sent_train, rep_train])
train_loader = DataLoader(train_ds, batch_size=CONFIG['batch_size'], shuffle=True, 
                          collate_fn=collate_fn, num_workers=2, pin_memory=True)
# Validation is toxicity-only and is not shuffled.
val_loader = DataLoader(tox_val, batch_size=CONFIG['batch_size'], collate_fn=collate_fn)

# Print per-task and total sample counts.
print('='*60)
print('üìä DATASET SUMMARY')
print('='*60)
print(f'Training Samples: {len(train_ds):,}')
print(f'  ‚îú‚îÄ Toxicity:  {len(tox_train):,}')
print(f'  ‚îú‚îÄ Emotion:   {len(emo_train):,}')
print(f'  ‚îú‚îÄ Sentiment: {len(sent_train):,}')
print(f'  ‚îî‚îÄ Reporting: {len(rep_train):,}')
print(f'Validation Samples: {len(tox_val):,} (Toxicity only)')

In [None]:
# Cell 8: Model & Optimizer Setup
#
# ABLATION NOTE: Uses AURA_Baseline instead of AURA_V10.
# Optimizer structure is adapted to match: encoder params get lr_encoder,
# all head params + loss params get lr_heads — same differential LR strategy.

model = AURA_Baseline(CONFIG).to(device)
loss_fn = UncertaintyLoss().to(device)
tox_weights = torch.tensor([0.5, 2.0], device=device)  # Class weights — IDENTICAL

# Optimizer with differential LR — SAME STRATEGY as V10.2
# V10.2 groups: [roberta params @ lr_encoder] + [MHA + heads + loss @ lr_heads]
# Baseline groups: [roberta params @ lr_encoder] + [heads + loss @ lr_heads]
# The learnable parameters of loss_fn are optimized in the heads group;
# weight_decay applies to both groups.
optimizer = torch.optim.AdamW([
    {'params': model.roberta.parameters(), 'lr': CONFIG['lr_encoder']},
    {'params': list(model.toxicity_head.parameters()) + list(model.emotion_head.parameters()) +
               list(model.sentiment_head.parameters()) + list(model.reporting_head.parameters()) +
               list(loss_fn.parameters()), 'lr': CONFIG['lr_heads']}
], weight_decay=CONFIG['weight_decay'])

# Scheduler with warmup — IDENTICAL to V10.2
# total_steps counts optimizer steps (one per gradient_accumulation
# micro-batches), floored by the integer division.
total_steps = len(train_loader) * CONFIG['epochs'] // CONFIG['gradient_accumulation']
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=int(total_steps * CONFIG['warmup_ratio']), 
    num_training_steps=total_steps
)

# Model summary
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print('='*60)
print('üèóÔ∏è BASELINE MODEL SETUP')
print('='*60)
print(f'Total parameters:     {total_params:,}')
print(f'Trainable parameters: {trainable_params:,}')
print(f'Total optimization steps: {total_steps}')
print(f'Warmup steps: {int(total_steps * CONFIG["warmup_ratio"])}')
print(f'Effective batch size: {CONFIG["batch_size"] * CONFIG["gradient_accumulation"]}')
print()
# The V10.2 parameter count below is a hard-coded approximation (128.7M),
# not measured in this notebook.
print('‚öóÔ∏è ABLATION NOTE:')
print(f'   AURA V10.2 params: ~128.7M (with 4√ó MHSA blocks)')
print(f'   Baseline params:   {total_params:,} (no MHSA)')
print(f'   Œî (MHSA overhead): ~{128_700_000 - total_params:,} parameters')

In [None]:
# Cell 9: Training Functions — IDENTICAL TO V10.2
def train_epoch(epoch):
    """Run one training epoch over ``train_loader`` and return the mean loss.

    Uses the module-level ``model``, ``loss_fn``, ``optimizer``, ``scheduler``,
    ``train_loader``, ``tox_weights``, ``device`` and ``CONFIG``.

    Args:
        epoch: 1-based epoch number; epochs up to ``CONFIG['freezing_epochs']``
            train with the RoBERTa encoder frozen.

    Returns:
        Sum of the (un-scaled) per-batch losses divided by
        ``len(train_loader)``. Skipped batches add nothing to the sum but are
        still counted in the denominator.
    """
    model.train()
    
    # Progressive Freezing — IDENTICAL to V10.2
    # requires_grad is (re)set on every call, so the encoder is frozen or
    # unfrozen according to the current epoch number.
    if epoch <= CONFIG['freezing_epochs']:
        print(f'‚ùÑÔ∏è Epoch {epoch}: RoBERTa FROZEN')
        for p in model.roberta.parameters(): 
             p.requires_grad = False
    else:
        print(f'üî• Epoch {epoch}: RoBERTa UNFROZEN')
        for p in model.roberta.parameters(): 
             p.requires_grad = True
    
    total_loss = 0
    # NOTE(review): this also discards any gradients left over from a
    # trailing partial accumulation window of the previous epoch (see below).
    optimizer.zero_grad()
    pbar = tqdm(train_loader, desc=f'Epoch {epoch}', mininterval=10.0)
    
    for step, batch in enumerate(pbar):
        ids = batch['ids'].to(device)
        mask = batch['mask'].to(device)
        # Task ids are not moved to `device`; they are used below to build
        # boolean masks that select each task's rows from the model outputs.
        tasks = batch['tasks']
        
        # Forward pass
        out = model(ids, mask)
        
        # Compute per-task losses — IDENTICAL to V10.2
        # The list always has four entries in task-id order; a task that is
        # absent from the batch contributes a constant zero. loss_fn still
        # adds its per-task regularisation term for such tasks.
        losses = []
        
        # Toxicity
        if batch['tox'] is not None and (tasks == 0).sum() > 0:
            losses.append(focal_loss(
                out['toxicity'][tasks == 0], batch['tox'].to(device), 
                weight=tox_weights, smoothing=CONFIG['label_smoothing']
            ))
        else: 
            losses.append(torch.tensor(0., device=device, requires_grad=False))
            
        # Emotion (Multilabel BCE)
        if batch['emo'] is not None and (tasks == 1).sum() > 0:
            losses.append(F.binary_cross_entropy_with_logits(
                out['emotion'][tasks == 1], batch['emo'].to(device)
            ))
        else: 
            losses.append(torch.tensor(0., device=device, requires_grad=False))
            
        # Sentiment
        if batch['sent'] is not None and (tasks == 2).sum() > 0:
            losses.append(focal_loss(
                out['sentiment'][tasks == 2], batch['sent'].to(device), 
                smoothing=CONFIG['label_smoothing']
            ))
        else: 
            losses.append(torch.tensor(0., device=device, requires_grad=False))
            
        # Reporting
        if batch['rep'] is not None and (tasks == 3).sum() > 0:
            # Use BCE with logits on float target
            losses.append(F.binary_cross_entropy_with_logits(
                out['reporting'][tasks == 3], batch['rep'].float().to(device)
            ))
        else: 
            losses.append(torch.tensor(0., device=device, requires_grad=False))
            
        # Check for empty batch — IDENTICAL to V10.2
        # NOTE(review): this check runs after the per-task losses were already
        # computed, and zero_grad() here also drops gradients accumulated from
        # earlier micro-batches in the current accumulation window.
        if all((tasks == i).sum() == 0 for i in range(4)):
            print(f"‚ö†Ô∏è Warning: Empty batch at step {step}, skipping")
            optimizer.zero_grad()
            continue

        # Kendall weighted loss — IDENTICAL to V10.2
        # Scaled down so gradients summed over the accumulation window
        # correspond to an average over its micro-batches.
        loss = loss_fn(losses) / CONFIG['gradient_accumulation']
        
        # NaN/Inf safety check — IDENTICAL to V10.2
        # NOTE(review): as above, zero_grad() discards the whole partially
        # accumulated window, not just this batch.
        if torch.isnan(loss) or torch.isinf(loss):
            print(f"‚ö†Ô∏è Warning: Invalid loss {loss.item():.4f} at step {step}, skipping batch")
            optimizer.zero_grad()
            continue

        # Backward pass
        loss.backward()
        
        # Gradient Accumulation — IDENTICAL to V10.2
        # NOTE(review): stepping is keyed on the batch index, so when
        # len(train_loader) is not a multiple of gradient_accumulation the
        # last few micro-batches of the epoch never reach optimizer.step().
        # Gradient clipping covers model parameters only, not loss_fn's.
        if (step + 1) % CONFIG['gradient_accumulation'] == 0:
            nn.utils.clip_grad_norm_(model.parameters(), CONFIG['max_grad_norm'])
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            
        # Undo the accumulation scaling so the logged value is the full batch loss.
        total_loss += loss.item() * CONFIG['gradient_accumulation']
        if step % 50 == 0: pbar.set_postfix({'loss': f'{loss.item() * CONFIG["gradient_accumulation"]:.3f}'})
        
    return total_loss / len(train_loader)

@torch.no_grad()
def evaluate():
    """Score the toxicity head on ``val_loader`` and return the macro F1.

    Uses the module-level ``model``, ``val_loader`` and ``device``; puts the
    model in eval mode and runs without gradient tracking.
    """
    model.eval()
    predicted_labels = []
    true_labels = []
    for val_batch in val_loader:
        input_ids = val_batch['ids'].to(device)
        attention_mask = val_batch['mask'].to(device)
        toxicity_logits = model(input_ids, attention_mask)['toxicity']
        # Predicted class = index of the larger toxicity logit.
        predicted_labels.extend(toxicity_logits.argmax(1).cpu().numpy())
        true_labels.extend(val_batch['tox'].numpy())
    return f1_score(true_labels, predicted_labels, average='macro', zero_division=0)

print('üéØ Training functions defined ‚Äî identical to V10.2.')

In [None]:
# Cell 10: Main Training Loop — IDENTICAL TO V10.2
# Trains for up to CONFIG['epochs'] epochs, checkpointing on validation-F1
# improvement and stopping early after CONFIG['patience'] stale epochs.
print('='*60)
print('üöÄ BASELINE ‚Äî TRAINING START')
print('='*60)

# best_f1 starts at 0, so a checkpoint is written only once the validation
# F1 is strictly positive.
best_f1 = 0
patience_counter = 0
# Per-epoch records consumed by plot_training_history and the artifact cell.
history = {'train_loss': [], 'val_f1': [], 'task_weights': []}

for epoch in range(1, CONFIG['epochs'] + 1):
    train_loss = train_epoch(epoch)
    val_f1 = evaluate()
    weights = loss_fn.get_weights()
    
    history['train_loss'].append(train_loss)
    history['val_f1'].append(val_f1)
    history['task_weights'].append(weights.copy())
    
    print(f'\nEpoch {epoch} Summary:')
    print(f'  Train Loss: {train_loss:.4f}')
    print(f'  Val F1:     {val_f1:.4f}')
    print(f'  Task Weights [Tox/Emo/Sent/Rep]: {weights.round(3)}')
    
    # Strict improvement resets patience and overwrites the checkpoint;
    # only the model weights are saved (not optimizer or loss_fn state).
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), 'baseline_best.pt')
        print('  >>> BEST MODEL SAVED <<<')
    else:
        patience_counter += 1
        print(f'  (No improvement. Patience: {patience_counter}/{CONFIG["patience"]})')
        if patience_counter >= CONFIG['patience']:
            print(f'\n‚ö†Ô∏è Early stopping at epoch {epoch}')
            break

print('\n' + '='*60)
print(f'‚úÖ Baseline Training Complete. Best Val F1: {best_f1:.4f}')
print('='*60)

In [None]:
# Cell 11: Training History Visualization
plot_training_history(history)

In [None]:
# Cell 12: Final Evaluation — Toxicity (IDENTICAL evaluation to V10.2)
# Reloads the best checkpoint and reports the toxicity classification report
# and confusion matrix on the validation set.
print('='*60)
print('üî¨ BASELINE FINAL EVALUATION: TOXICITY')
print('='*60)

# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on GPU can still be loaded in a CPU-only session.
model.load_state_dict(torch.load('baseline_best.pt', map_location=device))
model.eval()

# Collect argmax toxicity predictions and ground truth over the validation set.
preds, trues = [], []
with torch.no_grad():
    for batch in val_loader:
        out = model(batch['ids'].to(device), batch['mask'].to(device))
        preds.extend(out['toxicity'].argmax(1).cpu().numpy())
        trues.extend(batch['tox'].numpy())

# Classification Report
print('\n--- Classification Report ---')
print(classification_report(trues, preds, target_names=['Non-Toxic', 'Toxic']))

# Confusion Matrix
fig, ax = plt.subplots(figsize=(6, 5))
plot_confusion_matrix_heatmap(trues, preds, ['Non-Toxic', 'Toxic'], 'Baseline: Toxicity Confusion Matrix', ax)
plt.tight_layout()
plt.show()

In [None]:
# Cell 13: Qualitative Stress Test — IDENTICAL test cases to V10.2
# Runs a fixed list of hand-written sentences through the toxicity head and
# prints expected vs. predicted labels plus the overall hit rate.
print('='*60)
print('üß™ BASELINE QUALITATIVE STRESS TEST')
print('='*60)

# (text, expected label) pairs.
test_cases = [
    # Should be NON-TOXIC (The Traffic Paradox)
    ("I hate rainy Mondays", "Non-Toxic"),
    ("This soup is disgusting", "Non-Toxic"),
    ("I am so angry at the traffic", "Non-Toxic"),
    ("This movie was terrible", "Non-Toxic"),
    
    # Should be TOXIC
    ("You are an idiot", "Toxic"),
    ("I hate you so much", "Toxic"),
    ("Go kill yourself", "Toxic"),
    ("You are worthless garbage", "Toxic"),
    
    # Should be NON-TOXIC (Reporting)
    ("He said you are an idiot", "Non-Toxic"),
    ("The article discusses hate speech", "Non-Toxic"),
    ("Someone wrote 'go die' in the comments", "Non-Toxic"),
]

print(f"{'Text':<50} {'Expected':<12} {'Predicted':<12} {'Status'}")
print('-'*80)

correct = 0
model.eval()
with torch.no_grad():
    for text, expected in test_cases:
        # Tokenize with the same max length the datasets use for training,
        # taken from CONFIG rather than a hard-coded copy of the value.
        enc = tokenizer(text, max_length=CONFIG['max_length'], padding='max_length',
                        truncation=True, return_tensors='pt')
        out = model(enc['input_ids'].to(device), enc['attention_mask'].to(device))
        pred_idx = out['toxicity'].argmax(1).item()
        # Class index 1 is Toxic, index 0 is Non-Toxic.
        pred_label = 'Toxic' if pred_idx == 1 else 'Non-Toxic'
        status = '‚úÖ' if pred_label == expected else '‚ùå'
        if pred_label == expected:
            correct += 1
        # Text is cut to 48 characters so the table columns stay aligned.
        print(f"{text[:48]:<50} {expected:<12} {pred_label:<12} {status}")

print('-'*80)
print(f'Stress Test Accuracy: {correct}/{len(test_cases)} ({correct/len(test_cases)*100:.0f}%)')

In [None]:
# Cell 14: Ablation Comparison — AURA V10.2 vs Baseline
# Compares this run's best validation F1 against the hard-coded V10.2 result.
print('='*60)
print('‚öóÔ∏è ABLATION COMPARISON: AURA V10.2 vs BASELINE')
print('='*60)

# V10.2 known results (from AURA_V10.2_Kaggle.ipynb run)
AURA_V10_F1 = 0.7572
BASELINE_F1 = best_f1
DELTA = BASELINE_F1 - AURA_V10_F1  # positive => baseline scored higher

# Differences within this band (in either direction) are reported as
# negligible. The same threshold is applied to both models so the verdict is
# not biased toward the baseline for tiny positive deltas.
NEGLIGIBLE_DELTA = 0.01

print(f'\n{"Metric":<30} {"AURA V10.2 (MHSA)":<20} {"Baseline (No MHSA)":<20} {"Œî":<10}')
print('-'*80)
print(f'{"Toxicity Val F1 (macro)":<30} {AURA_V10_F1:<20.4f} {BASELINE_F1:<20.4f} {DELTA:+.4f}')
print(f'{"Architecture":<30} {"RoBERTa + 4√óMHSA":<20} {"RoBERTa only":<20} {"":<10}')
print(f'{"Extra Params (MHSA)":<30} {"~7M":<20} {"0":<20} {"":<10}')
print('-'*80)

if DELTA > NEGLIGIBLE_DELTA:
    print(f'\nüìä Result: Baseline is BETTER by {abs(DELTA):.4f} F1 points.')
    print('   ‚Üí Task-Specific MHSA may be HURTING performance.')
elif DELTA < -NEGLIGIBLE_DELTA:
    print(f'\nüìä Result: AURA V10.2 is BETTER by {abs(DELTA):.4f} F1 points.')
    print('   ‚Üí Task-Specific MHSA provides a measurable improvement.')
else:
    print(f'\nüìä Result: Difference is NEGLIGIBLE ({abs(DELTA):.4f} F1 points).')
    print('   ‚Üí Task-Specific MHSA adds ~7M parameters for marginal/no gain.')

In [None]:
# Cell 15: Save Artifacts
# Writes the training history, best F1 and configuration to
# baseline_history.json; the model checkpoint itself was already written by
# the training loop.
print('='*60)
print('üíæ SAVING BASELINE ARTIFACTS')
print('='*60)

import json
# task_weights holds NumPy arrays, which json cannot encode directly, so
# each one is converted to a plain list.
history_serializable = {
    'train_loss': history['train_loss'],
    'val_f1': history['val_f1'],
    'task_weights': [w.tolist() for w in history['task_weights']],
    'best_f1': best_f1,
    'config': CONFIG,
    'model_type': 'baseline_no_mhsa'
}
with open('baseline_history.json', 'w') as f:
    json.dump(history_serializable, f, indent=2)

# Final summary; AURA_V10_F1 and DELTA come from the ablation comparison cell.
print('‚úÖ Model saved: baseline_best.pt')
print('‚úÖ History saved: baseline_history.json')
print(f'\nüèÜ Baseline Best F1: {best_f1:.4f}')
print(f'üèÜ AURA V10.2 F1:    {AURA_V10_F1:.4f}')
print(f'üèÜ Œî (Baseline - AURA): {DELTA:+.4f}')