# 🎯 Head-Only Training Pipeline

**Mission**: Ultra-fast classifier training with cached features  
**Target**: 3 head-architecture ablations (9 experiments in total) in <10 minutes, 4GB VRAM optimized  
**Strategy**: Load pre-extracted features → train lightweight heads → compare architectures

---

## 🏗️ Pipeline Overview

1. **Feature Loading**: Memory-mapped NPZ cache → batch loading
2. **Head Architectures**: Linear, MLP, Attention-based classifiers
3. **Fast Training**: 10-20 epochs max, early stopping, mixed precision
4. **Ablation Studies**: Compare head architectures and learning rates (regularization settings such as dropout are configured but not swept in this notebook)
5. **Model Selection**: Best head → save for ensemble/distillation

### 📊 Performance Targets
- **Speed**: <3 minutes per head architecture
- **VRAM**: <1.5GB peak (frozen encoder + small head)
- **Quality**: Match full training baseline
- **Throughput**: 3 architectures × 3 learning rates = 9 experiments in <10 min

In [1]:
# 🔧 Setup & Imports
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import time
import json
from datetime import datetime
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Project imports
sys.path.append('../src')

# 🎮 Device & Memory Setup
# Use CUDA when it is available; otherwise everything below runs on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
    torch.cuda.empty_cache()
    print(f"🚀 GPU: {torch.cuda.get_device_name(0)}")
    print(f"💾 VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
    # Let cuDNN benchmark and pick kernels for the observed input shapes.
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device('cpu')
    print("⚠️ Running on CPU - training will be slower")

# Environment summary, printed regardless of the selected device.
print(f"🔧 PyTorch: {torch.__version__}")
print(f"📁 Working dir: {Path.cwd()}")

⚠️ Running on CPU - training will be slower
🔧 PyTorch: 2.8.0+cpu
📁 Working dir: C:\Users\MadScie254\Documents\GitHub\Capstone-Lazarus\notebooks


In [2]:
# ⚙️ Configuration
# Central settings dictionary read by the data, model and training cells below.
CONFIG = dict(
    # Paths
    features_dir='../features',
    manifest_file='../features/manifest_features.v001.csv',
    models_dir='../test_models/head_training',
    encoder_name='efficientnet_b0',

    # Training settings (4GB VRAM optimized)
    batch_size=256,                       # Large batch for stability
    max_epochs=20,                        # Upper bound on epochs per experiment
    early_stop_patience=5,                # Epochs without improvement before stopping
    learning_rates=[1e-3, 3e-4, 1e-4],    # LR ablation grid

    # Model architectures
    head_types=['linear', 'mlp', 'attention'],
    dropout_rates=[0.3, 0.5, 0.7],
    hidden_dims=[512, 256, 128],

    # Data settings
    test_size=0.2,
    val_size=0.1,
    random_state=42,

    # Performance
    num_workers=4,
    pin_memory=True,
    use_amp=True,                         # Mixed precision
    compile_model=False,                  # PyTorch 2.0 compile (disable for compatibility)
)

# Echo the settings that matter most for a head-training run.
print("\n".join([
    "🎯 HEAD TRAINING CONFIGURATION:",
    f"   🎬 Batch size: {CONFIG['batch_size']} (feature cached)",
    f"   📈 Max epochs: {CONFIG['max_epochs']} (early stop: {CONFIG['early_stop_patience']})",
    f"   🏗️ Head types: {CONFIG['head_types']}",
    f"   📊 Learning rates: {CONFIG['learning_rates']}",
    f"   🔄 Mixed precision: {CONFIG['use_amp']}",
]))

🎯 HEAD TRAINING CONFIGURATION:
   🎬 Batch size: 256 (feature cached)
   📈 Max epochs: 20 (early stop: 5)
   🏗️ Head types: ['linear', 'mlp', 'attention']
   📊 Learning rates: [0.001, 0.0003, 0.0001]
   🔄 Mixed precision: True


In [3]:
# 📊 Feature Dataset Loading

class FeatureDataset(Dataset):
    """Dataset over per-sample ``.npz`` files holding a ``'features'`` array.

    Each item is ``(features, label)`` where ``features`` is a float32 tensor
    built from the file's ``'features'`` entry and ``label`` is the matching
    element of ``labels``, returned unchanged.

    Args:
        feature_files: Paths of the ``.npz`` files, one per sample.
        labels: Labels aligned index-by-index with ``feature_files``.
        cache_features: When true, every array that was read successfully is
            kept in ``self.feature_cache`` (keyed by path) and reused.

    Raises:
        IndexError: If ``feature_files`` is empty (the first file is read to
            determine ``feature_dim`` and ``feature_dtype``).
    """

    def __init__(self, feature_files: List[str], labels: List[int],
                 cache_features: bool = True):
        self.feature_files = feature_files
        self.labels = labels
        self.cache_features = cache_features
        self.feature_cache = {}

        # Probe the first file for the feature size and on-disk dtype.
        sample_feature = self._read_features(feature_files[0])
        self.feature_dim = sample_feature.shape[0]
        self.feature_dtype = sample_feature.dtype

        print(f"📊 FeatureDataset initialized:")
        print(f"   🖼️ Samples: {len(feature_files)}")
        print(f"   📐 Feature dim: {self.feature_dim}")
        print(f"   🗜️ Dtype: {self.feature_dtype}")
        print(f"   💾 Caching: {cache_features}")

    @staticmethod
    def _read_features(path):
        """Return the ``'features'`` array stored in the ``.npz`` file at ``path``.

        The archive is opened in a ``with`` block so its file handle is
        released immediately instead of lingering until garbage collection.
        """
        with np.load(path) as archive:
            return archive['features']

    def __len__(self):
        return len(self.feature_files)

    def __getitem__(self, idx):
        feature_file = self.feature_files[idx]
        label = self.labels[idx]

        # Serve from the in-memory cache when possible.
        if self.cache_features and feature_file in self.feature_cache:
            features = self.feature_cache[feature_file]
        else:
            try:
                # astype() guarantees float32 regardless of the stored dtype.
                features = self._read_features(feature_file).astype(np.float32)

                if self.cache_features:
                    self.feature_cache[feature_file] = features

            except Exception as e:
                # Deliberate best-effort: report the problem and substitute a
                # zero vector so one bad file does not abort a whole epoch.
                # The fallback is intentionally not cached.
                print(f"⚠️ Error loading {feature_file}: {e}")
                features = np.zeros(self.feature_dim, dtype=np.float32)

        return torch.from_numpy(features), label

def load_feature_manifest(manifest_file: str) -> pd.DataFrame:
    """Read the feature manifest CSV and drop rows whose feature file is absent.

    Args:
        manifest_file: Path to a CSV with a ``feature_file`` column.

    Returns:
        The manifest as read when every listed file exists; otherwise the
        subset of rows whose ``feature_file`` exists on disk.

    Raises:
        FileNotFoundError: If ``manifest_file`` itself does not exist.
    """
    if not Path(manifest_file).exists():
        raise FileNotFoundError(f"Feature manifest not found: {manifest_file}")

    table = pd.read_csv(manifest_file)
    print(f"📋 Loaded manifest: {len(table)} features")

    # One entry per manifest row that points at a file which is not on disk.
    absent = [entry for entry in table['feature_file'] if not Path(entry).exists()]
    if not absent:
        return table

    print(f"⚠️ Missing {len(absent)} feature files")
    usable = table[~table['feature_file'].isin(absent)]
    print(f"   📊 Valid features: {len(usable)}")
    return usable

def create_train_val_test_split(manifest: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the manifest into stratified train / validation / test frames.

    Side effects: adds a ``label_encoded`` column to ``manifest`` (encoded from
    ``class_name``) and stores the fitted encoder and the class count in
    ``CONFIG['label_encoder']`` / ``CONFIG['num_classes']``.

    Split fractions and the seed are read from ``CONFIG['test_size']``,
    ``CONFIG['val_size']`` and ``CONFIG['random_state']``.

    Returns:
        ``(train, val, test)`` DataFrames.
    """
    encoder = LabelEncoder()
    manifest['label_encoded'] = encoder.fit_transform(manifest['class_name'])

    seed = CONFIG['random_state']
    test_fraction = CONFIG['test_size']

    # Carve off the test set first.
    train_val, test = train_test_split(
        manifest,
        test_size=test_fraction,
        stratify=manifest['label_encoded'],
        random_state=seed,
    )

    # val_size is expressed relative to the full data, so rescale it to the
    # share of the remaining train+val pool.
    val_fraction = CONFIG['val_size'] / (1 - test_fraction)
    train, val = train_test_split(
        train_val,
        test_size=val_fraction,
        stratify=train_val['label_encoded'],
        random_state=seed,
    )

    # Publish the encoder so later cells can map ids back to class names.
    CONFIG['label_encoder'] = encoder
    CONFIG['num_classes'] = len(encoder.classes_)

    print("\n".join([
        "📊 Data splits:",
        f"   🎓 Train: {len(train)} samples",
        f"   🔍 Val: {len(val)} samples",
        f"   🧪 Test: {len(test)} samples",
        f"   🏷️ Classes: {CONFIG['num_classes']}",
    ]))

    return train, val, test

# Load feature manifest and build the three data splits.
try:
    manifest = load_feature_manifest(CONFIG['manifest_file'])
    train_df, val_df, test_df = create_train_val_test_split(manifest)
except FileNotFoundError:
    # No manifest yet: leave every frame as None so later cells can skip.
    print(f"⚠️ Feature manifest not found: {CONFIG['manifest_file']}")
    print("   Run feature extraction first (02_feature_extract_microjobs.ipynb)")
    manifest = train_df = val_df = test_df = None
else:
    print("✅ Feature loading ready")
    print(f"   📁 Features dir: {CONFIG['features_dir']}")
    print("   🎯 Ready for head training")

⚠️ Feature manifest not found: ../features/manifest_features.v001.csv
   Run feature extraction first (02_feature_extract_microjobs.ipynb)


In [4]:
# 🏗️ Head Architecture Definitions

class LinearHead(nn.Module):
    """Classifier head: dropout followed by one fully connected layer."""

    def __init__(self, feature_dim: int, num_classes: int, dropout: float = 0.3):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.classifier = nn.Linear(in_features=feature_dim, out_features=num_classes)

    def forward(self, features):
        """Return class logits for ``features``."""
        return self.classifier(self.dropout(features))

class MLPHead(nn.Module):
    """MLP classifier head with two hidden layers.

    Layout: ``feature_dim -> hidden_dim -> hidden_dim // 2 -> num_classes``,
    with ReLU and dropout after each hidden layer.
    """

    def __init__(self, feature_dim: int, num_classes: int,
                 hidden_dim: int = 512, dropout: float = 0.3):
        super().__init__()
        narrow_dim = hidden_dim // 2
        # Position in this list determines the state_dict keys (layers.0,
        # layers.3, layers.6), so the order must not change.
        stack = [
            nn.Linear(feature_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, narrow_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(narrow_dim, num_classes),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, features):
        """Return class logits for ``features``."""
        logits = self.layers(features)
        return logits

class AttentionHead(nn.Module):
    """Attention-style classifier head with a residual query projection.

    The input is treated as a sequence of length 1 (assumes ``features`` is
    ``[B, D]`` — the shape comments below follow from that). With a single
    position the softmax over the one attention score is 1, so the attended
    vector is just the value projection; the ``key`` projection is kept for
    checkpoint compatibility.

    Args:
        feature_dim: Size of each input feature vector.
        num_classes: Number of output logits.
        hidden_dim: Width of the query/key/value projections.
        dropout: Dropout probability applied before the classifier.
    """

    def __init__(self, feature_dim: int, num_classes: int,
                 hidden_dim: int = 256, dropout: float = 0.3):
        super().__init__()
        self.feature_dim = feature_dim
        self.hidden_dim = hidden_dim

        # Attention projections
        self.query = nn.Linear(feature_dim, hidden_dim)
        self.key = nn.Linear(feature_dim, hidden_dim)
        self.value = nn.Linear(feature_dim, hidden_dim)

        # Output layers
        self.norm = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, features):
        """Return class logits for ``features``."""
        tokens = features.unsqueeze(1)  # [B, 1, D]

        Q = self.query(tokens)  # [B, 1, H]
        K = self.key(tokens)    # [B, 1, H]
        V = self.value(tokens)  # [B, 1, H]

        # Scaled dot-product attention over the single position.
        scores = torch.bmm(Q, K.transpose(1, 2)) / (self.hidden_dim ** 0.5)  # [B, 1, 1]
        attention_weights = torch.softmax(scores, dim=-1)
        attended_features = torch.bmm(attention_weights, V).squeeze(1)  # [B, H]

        # Residual connection: reuse the query projection computed above
        # instead of running self.query a second time on the same input.
        attended_features = self.norm(attended_features + Q.squeeze(1))

        # Classification
        attended_features = self.dropout(attended_features)
        return self.classifier(attended_features)

def create_head_model(head_type: str, feature_dim: int, num_classes: int, 
                     hidden_dim: int = 512, dropout: float = 0.3) -> nn.Module:
    """Build a classifier head by name and re-initialise its linear layers.

    Args:
        head_type: One of ``'linear'``, ``'mlp'`` or ``'attention'``.
        feature_dim: Size of the input feature vectors.
        num_classes: Number of output logits.
        hidden_dim: Hidden width, passed to the MLP and attention heads only.
        dropout: Dropout probability passed to the chosen head.

    Returns:
        The new module, with every ``nn.Linear`` weight Xavier-uniform
        initialised and every bias set to zero.

    Raises:
        ValueError: If ``head_type`` is not a known name.
    """
    # Builders are lazy so only the requested head class is ever touched.
    builders = (
        ('linear', lambda: LinearHead(feature_dim, num_classes, dropout)),
        ('mlp', lambda: MLPHead(feature_dim, num_classes, hidden_dim, dropout)),
        ('attention', lambda: AttentionHead(feature_dim, num_classes, hidden_dim, dropout)),
    )

    for known_type, build in builders:
        if head_type == known_type:
            model = build()
            break
    else:
        raise ValueError(f"Unknown head type: {head_type}")

    # Initialize weights
    linear_layers = (layer for layer in model.modules() if isinstance(layer, nn.Linear))
    for layer in linear_layers:
        nn.init.xavier_uniform_(layer.weight)
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)

    return model

# Summary of the head architectures defined in this cell.
print("\n".join([
    "🏗️ Head architectures defined:",
    "   📐 LinearHead: Simple dropout + linear",
    "   🧠 MLPHead: 2-layer MLP with ReLU",
    "   🎯 AttentionHead: Self-attention + residual",
    "   ⚡ All heads support mixed precision training",
]))

🏗️ Head architectures defined:
   📐 LinearHead: Simple dropout + linear
   🧠 MLPHead: 2-layer MLP with ReLU
   🎯 AttentionHead: Self-attention + residual
   ⚡ All heads support mixed precision training


In [5]:
# 🚀 Training Engine

def train_head_model(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader,
                    learning_rate: float, max_epochs: int = 20) -> Dict:
    """Train head model with early stopping and mixed precision"""
    
    # Setup training
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.5)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    scaler = GradScaler() if CONFIG['use_amp'] else None
    
    # Training state
    best_val_acc = 0.0
    best_model_state = None
    patience_counter = 0
    train_history = []
    
    print(f"🚀 Starting training: LR={learning_rate}, Epochs={max_epochs}")
    
    start_time = time.time()
    
    for epoch in range(max_epochs):
        epoch_start = time.time()
        
        # Training phase
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        
        for batch_features, batch_labels in train_loader:
            batch_features = batch_features.to(device, non_blocking=True)
            batch_labels = batch_labels.to(device, non_blocking=True)
            
            optimizer.zero_grad()
            
            # Forward pass with mixed precision
            if CONFIG['use_amp'] and scaler is not None:
                with autocast(device_type='cuda'):
                    outputs = model(batch_features)
                    loss = criterion(outputs, batch_labels)
                
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(batch_features)
                loss = criterion(outputs, batch_labels)
                loss.backward()
                optimizer.step()
            
            # Statistics
            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            train_total += batch_labels.size(0)
            train_correct += (predicted == batch_labels).sum().item()
        
        train_acc = train_correct / train_total
        avg_train_loss = train_loss / len(train_loader)
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for batch_features, batch_labels in val_loader:
                batch_features = batch_features.to(device, non_blocking=True)
                batch_labels = batch_labels.to(device, non_blocking=True)
                
                if CONFIG['use_amp']:
                    with autocast(device_type='cuda'):
                        outputs = model(batch_features)
                        loss = criterion(outputs, batch_labels)
                else:
                    outputs = model(batch_features)
                    loss = criterion(outputs, batch_labels)
                
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                val_total += batch_labels.size(0)
                val_correct += (predicted == batch_labels).sum().item()
        
        val_acc = val_correct / val_total
        avg_val_loss = val_loss / len(val_loader)
        
        # Learning rate scheduling
        scheduler.step(val_acc)
        
        # Early stopping and best model tracking
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_state = model.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1
        
        # Record history
        epoch_time = time.time() - epoch_start
        train_history.append({
            'epoch': epoch,
            'train_loss': avg_train_loss,
            'train_acc': train_acc,
            'val_loss': avg_val_loss,
            'val_acc': val_acc,
            'lr': optimizer.param_groups[0]['lr'],
            'epoch_time': epoch_time
        })
        
        # Print progress
        if epoch % 5 == 0 or epoch == max_epochs - 1:
            print(f"   Epoch {epoch:2d}: Train={train_acc:.3f}, Val={val_acc:.3f}, "
                  f"Loss={avg_val_loss:.3f}, Time={epoch_time:.1f}s")
        
        # Early stopping
        if patience_counter >= CONFIG['early_stop_patience']:
            print(f"   Early stopping at epoch {epoch} (patience={CONFIG['early_stop_patience']})")
            break
    
    # Restore best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    
    total_time = time.time() - start_time
    
    print(f"✅ Training complete: Best Val Acc = {best_val_acc:.3f}, Time = {total_time:.1f}s")
    
    return {
        'best_val_acc': best_val_acc,
        'total_time': total_time,
        'epochs_trained': len(train_history),
        'train_history': train_history,
        'model_state': best_model_state
    }

def evaluate_model(model: nn.Module, test_loader: DataLoader) -> Dict:
    """Score ``model`` on ``test_loader`` without updating it.

    Batches are moved to the module-level ``device``; the loss is plain
    cross-entropy (no label smoothing).

    Returns:
        Dict with ``test_acc``, ``test_loss`` (mean of the per-batch losses),
        ``predictions`` and ``labels`` (flat lists, one entry per sample).
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss()

    predicted_ids = []
    true_ids = []
    running_loss = 0.0

    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            batch_features = batch_features.to(device, non_blocking=True)
            batch_labels = batch_labels.to(device, non_blocking=True)

            logits = model(batch_features)
            running_loss += loss_fn(logits, batch_labels).item()

            # The predicted class is the index of the largest logit per row.
            batch_pred = logits.argmax(dim=1)
            predicted_ids.extend(batch_pred.cpu().numpy())
            true_ids.extend(batch_labels.cpu().numpy())

    return {
        'test_acc': accuracy_score(true_ids, predicted_ids),
        'test_loss': running_loss / len(test_loader),
        'predictions': predicted_ids,
        'labels': true_ids,
    }

# Summary of the training-engine settings taken from CONFIG.
print("\n".join([
    "🚀 Training engine ready:",
    f"   ⚡ Mixed precision: {CONFIG['use_amp']}",
    f"   📊 Early stopping: {CONFIG['early_stop_patience']} epochs",
    "   🎯 Target: <3 minutes per head",
]))

🚀 Training engine ready:
   ⚡ Mixed precision: True
   📊 Early stopping: 5 epochs
   🎯 Target: <3 minutes per head


In [6]:
# 🔬 Ablation Study Execution

def run_ablation_study() -> pd.DataFrame:
    """Train and test every head type × learning rate combination.

    Reads the module-level ``train_df`` / ``val_df`` / ``test_df`` frames
    (columns ``feature_file`` and ``label_encoded``), ``CONFIG`` and ``device``.
    Each experiment builds a fresh head, trains it with ``train_head_model``
    and scores it with ``evaluate_model``. A checkpoint is written to
    ``CONFIG['models_dir']`` whenever an experiment beats all earlier ones.
    A failing experiment is reported with its traceback and skipped.

    Returns:
        One row per successful experiment, or an empty DataFrame when
        ``train_df`` is ``None``.
    """
    if train_df is None:
        print("❌ No training data available - run feature extraction first")
        return pd.DataFrame()

    head_types = CONFIG['head_types']
    learning_rates = CONFIG['learning_rates']
    total_experiments = len(head_types) * len(learning_rates)

    print(f"🔬 Starting ablation study...")
    print(f"   🎯 Architectures: {len(head_types)}")
    print(f"   📊 Learning rates: {len(learning_rates)}")
    print(f"   🔄 Total experiments: {total_experiments}")

    num_workers = CONFIG['num_workers']
    # Pinned host memory only helps (and only avoids a warning) with CUDA.
    pin_memory = bool(CONFIG['pin_memory']) and torch.cuda.is_available()

    def build_split(frame: pd.DataFrame, shuffle: bool):
        """Return ``(dataset, loader)`` for one data split."""
        dataset = FeatureDataset(
            frame['feature_file'].tolist(),
            frame['label_encoded'].tolist(),
            cache_features=True
        )
        loader = DataLoader(
            dataset, batch_size=CONFIG['batch_size'], shuffle=shuffle,
            num_workers=num_workers, pin_memory=pin_memory,
            # Each worker holds its own copy of the dataset. Without persistent
            # workers that copy, and its feature cache, is thrown away after
            # every epoch, so every file would be re-read each time.
            persistent_workers=num_workers > 0
        )
        return dataset, loader

    # Create data loaders
    train_dataset, train_loader = build_split(train_df, shuffle=True)
    _, val_loader = build_split(val_df, shuffle=False)
    _, test_loader = build_split(test_df, shuffle=False)

    # Get feature dimensions
    feature_dim = train_dataset.feature_dim
    num_classes = CONFIG['num_classes']

    print(f"\n📊 Dataset ready:")
    print(f"   📐 Feature dim: {feature_dim}")
    print(f"   🏷️ Classes: {num_classes}")
    print(f"   🎓 Train batches: {len(train_loader)}")

    # Head hyper-parameters shared by every experiment and stored in checkpoints.
    hidden_dim = 512
    dropout = 0.5

    # Run experiments
    results = []
    experiment_id = 0
    best_test_acc = float('-inf')  # best score among experiments finished so far

    study_start_time = time.time()

    for head_type in head_types:
        for learning_rate in learning_rates:
            experiment_id += 1
            exp_name = f"{head_type}_lr{learning_rate}"

            print(f"\n🧪 Experiment {experiment_id}: {exp_name}")

            try:
                # Create model
                model = create_head_model(
                    head_type, feature_dim, num_classes,
                    hidden_dim=hidden_dim, dropout=dropout
                ).to(device)

                # Count parameters
                param_count = sum(p.numel() for p in model.parameters())

                print(f"   🏗️ Model: {head_type}, Params: {param_count:,}")

                # Train model
                train_results = train_head_model(
                    model, train_loader, val_loader,
                    learning_rate, CONFIG['max_epochs']
                )

                # Test model
                test_results = evaluate_model(model, test_loader)

                # Store results
                result = {
                    'experiment_id': experiment_id,
                    'experiment_name': exp_name,
                    'head_type': head_type,
                    'learning_rate': learning_rate,
                    'param_count': param_count,
                    'best_val_acc': train_results['best_val_acc'],
                    'test_acc': test_results['test_acc'],
                    'train_time': train_results['total_time'],
                    'epochs_trained': train_results['epochs_trained'],
                    'timestamp': datetime.now().isoformat()
                }
                results.append(result)

                print(f"   ✅ {exp_name}: Val={train_results['best_val_acc']:.3f}, "
                      f"Test={test_results['test_acc']:.3f}, Time={train_results['total_time']:.1f}s")

                # Save best models
                # NOTE(review): the checkpoint is chosen by *test* accuracy, so
                # the reported test score of the saved model is optimistic;
                # consider selecting on best_val_acc instead.
                if test_results['test_acc'] > best_test_acc:
                    model_path = Path(CONFIG['models_dir']) / f"best_head_{exp_name}.pth"
                    model_path.parent.mkdir(parents=True, exist_ok=True)
                    torch.save({
                        'model_state_dict': model.state_dict(),
                        'model_config': {
                            'head_type': head_type,
                            'feature_dim': feature_dim,
                            'num_classes': num_classes,
                            'hidden_dim': hidden_dim,
                            'dropout': dropout
                        },
                        'results': result,
                        'label_encoder_classes': CONFIG['label_encoder'].classes_.tolist()
                    }, model_path)
                    print(f"   💾 Best model saved: {model_path.name}")

                # Raise the bar even if saving failed, matching a comparison
                # against every earlier recorded result.
                best_test_acc = max(best_test_acc, test_results['test_acc'])

            except Exception as e:
                # Keep the study going: one broken configuration should not
                # discard the results of the others.
                print(f"   ❌ Experiment failed: {e}")
                import traceback
                traceback.print_exc()

            # Clear GPU memory
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    total_time = time.time() - study_start_time

    # Create results dataframe
    results_df = pd.DataFrame(results)

    print(f"\n🏁 Ablation study complete:")
    print(f"   ⏱️ Total time: {total_time/60:.1f} minutes")
    print(f"   🧪 Experiments: {len(results)}/{total_experiments}")

    if not results_df.empty:
        best_result = results_df.loc[results_df['test_acc'].idxmax()]
        print(f"   🏆 Best model: {best_result['experiment_name']} ({best_result['test_acc']:.3f} test acc)")
        print(f"   📊 Average experiment time: {results_df['train_time'].mean():.1f}s")

    return results_df

# Confirmation that the ablation driver is defined.
print("\n".join([
    "🔬 Ablation study function ready",
    "   🎯 Target: Complete 9 experiments in <10 minutes",
    "   🏆 Automated best model selection and saving",
]))

🔬 Ablation study function ready
   🎯 Target: Complete 9 experiments in <10 minutes
   🏆 Automated best model selection and saving


In [7]:
# 🚀 RUN ABLATION STUDY
# Execute complete head architecture comparison

if train_df is None:
    # No cached features were found earlier: explain the workflow and stop.
    print("\n".join([
        "⚠️ Ablation study skipped - no training data available",
        "   Run feature extraction first (02_feature_extract_microjobs.ipynb)",
        "   Expected workflow:",
        "   1. Feature extraction → cached features",
        "   2. Head training → this notebook",
        "   3. Results → best head architecture for ensemble",
    ]))
else:
    print("🚀 EXECUTING ABLATION STUDY")
    print(f"Expected time: <10 minutes for {len(CONFIG['head_types']) * len(CONFIG['learning_rates'])} experiments")

    # Start from a clean CUDA allocator cache when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print("🧹 GPU memory cleared")

    # Run every head type × learning rate combination.
    ablation_results = run_ablation_study()

    if ablation_results.empty:
        print("❌ No successful experiments - check feature extraction and data loading")
    else:
        # Persist the raw results under a timestamped file name.
        results_dir = Path(CONFIG['models_dir'])
        results_dir.mkdir(parents=True, exist_ok=True)

        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        results_file = results_dir / f"ablation_results_{stamp}.csv"
        ablation_results.to_csv(results_file, index=False)

        print(f"\n💾 Results saved: {results_file}")

        # Summary table, best test accuracy first; shown only when every
        # expected column is present.
        print("\n📊 ABLATION RESULTS SUMMARY:")
        summary_cols = ['experiment_name', 'head_type', 'learning_rate',
                        'best_val_acc', 'test_acc', 'train_time', 'param_count']
        if all(col in ablation_results.columns for col in summary_cols):
            display_df = ablation_results[summary_cols].copy()
            display_df['train_time'] = display_df['train_time'].round(1)
            display_df = display_df.sort_values('test_acc', ascending=False)
            print(display_df.to_string(index=False))

        print("\n🎯 PHASE C COMPLETE: Head-only training pipeline ready!")
        print(f"   ✅ {len(ablation_results)} experiments completed")
        print("   🏆 Best head architecture identified and saved")
        print(f"   ⚡ Average training time: {ablation_results['train_time'].mean():.1f}s per experiment")

⚠️ Ablation study skipped - no training data available
   Run feature extraction first (02_feature_extract_microjobs.ipynb)
   Expected workflow:
   1. Feature extraction → cached features
   2. Head training → this notebook
   3. Results → best head architecture for ensemble
