# Ablation Studies Notebook
## Reasoning Distillation Project

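This notebook performs systematic ablation studies to understand:
1. Effect of label smoothing (0.0, 0.1, 0.2)
2. Temperature variations in generation
3. Impact of training data size (10%, 50%, 100%)
4. Beam search vs sampling strategies
5. Learning rate sensitivity
6. Effect of max sequence lengths

**Scope note:** the sections below currently implement items 1–3 (label smoothing, training data size, and generation temperature). Items 4–6 are not yet covered by the cells in this notebook.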

In [None]:
# Setup
import sys
from pathlib import Path
import warnings
# Suppress every warning category for the rest of the session so cell output
# stays readable. NOTE(review): this also hides deprecation and runtime
# warnings raised by the training stack — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Add project root to path
# The parent of the current working directory is put first on sys.path so the
# `src.*` imports below resolve. Assumes the kernel is started from a direct
# child of the project root (e.g. a notebooks folder) — TODO confirm.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Imports
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pprint import pprint
import time

from src.data.data_loader import TeacherDataLoader
from src.data.preprocessor import ReasoningPreprocessor, PreprocessConfig
from src.data.dataset import ESNLIDataset, create_dataloaders

from src.models.student import StudentModel, StudentConfig

from src.training.distillation import (
    DistillationConfig,
    SequenceLevelDistillation
)

from src.training.trainer import Trainer, TrainingConfig

from src.evaluation.evaluator import Evaluator, EvaluationConfig
from src.evaluation.metrics import MetricsConfig, format_metrics

# Styling
sns.set_style('whitegrid')

# Runtime configuration
# `device` is passed to every StudentConfig(...) built in the ablation cells
# below, so it must exist before they run; without this definition those cells
# raise NameError on a fresh kernel.
# NOTE(review): assumes StudentConfig accepts a device string such as
# "cuda" / "cpu" — confirm against src.models.student.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the RNGs that are in scope so repeated Restart & Run All executions
# of the notebook are comparable with each other.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"Using device: {device} (seed={SEED})")

## 1. Load Base Dataset

In [None]:
# Load dataset
print("=" * 70)
print("LOADING DATA")
print("=" * 70)

loader = TeacherDataLoader()
esnli_data = loader.load_esnli()

# Subset sizes used to keep the ablation runs fast; raise them for a fuller study.
N_TRAIN_SAMPLES = 500
N_VAL_SAMPLES = 100

# Clamp the requested sizes to what each split actually holds so `select`
# is never handed out-of-range indices.
# (assumes the splits support len(), as the selected subsets below do)
train_split = esnli_data['train']
val_split = esnli_data['validation']
train_subset = train_split.select(range(min(N_TRAIN_SAMPLES, len(train_split))))
val_subset = val_split.select(range(min(N_VAL_SAMPLES, len(val_split))))

# The check marks were previously mis-encoded (UTF-8 bytes shown as cp1252).
print(f"\n✓ Train samples: {len(train_subset)}")
print(f"✓ Val samples: {len(val_subset)}")

## 2. Ablation Study 1: Label Smoothing

Test the effect of different label smoothing values on model performance.

In [None]:
# Ablation 1: Label Smoothing
# Trains one fresh student per label-smoothing value on the same train/val
# loaders, evaluates it on the validation loader, and collects one result row
# per value in `smoothing_results`.
print("=" * 70)
print("ABLATION STUDY 1: LABEL SMOOTHING")
print("=" * 70)

smoothing_values = [0.0, 0.1, 0.2]
smoothing_results = []

# Prepare data
# Tokenisation settings are shared by every run of this study (and the
# preprocessor / loaders are reused by later cells).
preprocess_config = PreprocessConfig(
    model_name="google/flan-t5-small",
    max_source_length=128,
    max_target_length=64
)

preprocessor = ReasoningPreprocessor(preprocess_config)
train_dataset = ESNLIDataset(train_subset, preprocessor, use_cache=True)
val_dataset = ESNLIDataset(val_subset, preprocessor, use_cache=True)

train_loader, val_loader = create_dataloaders(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    batch_size=16,
    num_workers=0,
    pad_token_id=preprocessor.tokenizer.pad_token_id
)

# NOTE(review): RNG state is not reset between configurations, so any
# stochasticity in training is mixed into the comparison — consider reseeding
# at the top of each iteration.
for smoothing in smoothing_values:
    print(f"\n{'='*70}")
    print(f"Testing Label Smoothing = {smoothing}")
    print(f"{'='*70}")

    # Create fresh model so no weights carry over from the previous setting.
    # `device` must already be defined by an earlier cell.
    student_config = StudentConfig(
        model_name="google/flan-t5-small",
        max_source_length=128,
        max_target_length=64,
        device=device
    )
    student = StudentModel(student_config)

    # Create distillation strategy; only label_smoothing varies across runs
    distill_config = DistillationConfig(
        ce_weight=1.0,
        label_smoothing=smoothing
    )
    distillation_strategy = SequenceLevelDistillation(distill_config)

    # Train
    training_config = TrainingConfig(
        num_epochs=3,
        learning_rate=5e-5,
        eval_steps=20,
        save_steps=1000,  # Don't save
        logging_steps=10,
        output_dir=f"../experiments/ablation_smoothing_{smoothing}",
        eval_strategy="steps"
    )

    trainer = Trainer(
        model=student,
        train_dataloader=train_loader,
        eval_dataloader=val_loader,
        distillation_strategy=distillation_strategy,
        config=training_config
    )

    # Wall-clock time of the training call only (evaluation is excluded)
    start_time = time.time()
    history = trainer.train()
    training_time = time.time() - start_time

    # Evaluate
    eval_config = EvaluationConfig(
        metrics_config=MetricsConfig(
            compute_rouge=True,
            compute_bertscore=False,
            compute_faithfulness=True
        ),
        save_predictions=False,
        output_dir=f"../experiments/ablation_smoothing_{smoothing}_eval"
    )

    evaluator = Evaluator(student, eval_config)
    results = evaluator.evaluate(val_loader, split_name="val")
    run_metrics = results['metrics']

    # Either history list may be empty (e.g. a very short run); record None
    # instead of failing with IndexError. The eval side was already guarded,
    # the train side now matches it.
    train_history = history['train_history']
    eval_history = history['eval_history']

    # Store results
    smoothing_results.append({
        'smoothing': smoothing,
        'accuracy': run_metrics['label_accuracy'],
        'rouge1': run_metrics['rouge1'],
        'rougeL': run_metrics['rougeL'],
        'faithfulness': run_metrics['faithfulness'],
        'final_train_loss': train_history[-1]['loss'] if train_history else None,
        'final_eval_loss': eval_history[-1]['eval_loss'] if eval_history else None,
        'training_time': training_time
    })

    # The check mark was previously mis-encoded (UTF-8 bytes shown as cp1252).
    print(f"\n✓ Smoothing {smoothing} completed")
    print(f"  Accuracy: {run_metrics['label_accuracy']:.4f}")
    print(f"  ROUGE-L: {run_metrics['rougeL']:.4f}")

print("\n" + "="*70)
print("LABEL SMOOTHING ABLATION COMPLETE")
print("="*70)

In [None]:
# Visualize label smoothing results
# One panel per metric on a 2x2 grid, each annotated with its point values,
# followed by a plain-text dump of the results table.
smoothing_df = pd.DataFrame(smoothing_results)

metrics_to_plot = ['accuracy', 'rouge1', 'rougeL', 'faithfulness']
colors_palette = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

smoothing_axis = smoothing_df['smoothing']
for panel, metric_name, line_color in zip(axes, metrics_to_plot, colors_palette):
    scores = smoothing_df[metric_name]
    panel_label = metric_name.upper()

    panel.plot(
        smoothing_axis,
        scores,
        marker='o',
        linewidth=2,
        markersize=10,
        color=line_color,
    )
    panel.set_xlabel('Label Smoothing')
    panel.set_ylabel(panel_label)
    panel.set_title(f'{panel_label} vs Label Smoothing')
    panel.grid(True, alpha=0.3)

    # Add value labels slightly above each marker
    for x_pos, score in zip(smoothing_axis, scores):
        panel.text(x_pos, score + 0.01, f'{score:.3f}', ha='center', fontsize=9)

plt.suptitle('Ablation Study: Label Smoothing Impact', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("\nLabel Smoothing Results Summary:")
print(smoothing_df.to_string(index=False))

## 3. Ablation Study 2: Training Data Size

Evaluate how model performance scales with training data size.

In [None]:
# Ablation 2: Training Data Size
# Trains one fresh student per fraction of a fixed training pool, evaluates it
# on the shared validation loader, and collects one result row per fraction in
# `data_size_results`. Relies on `preprocessor`, `val_dataset` and `val_loader`
# created by the label-smoothing cell.
print("=" * 70)
print("ABLATION STUDY 2: TRAINING DATA SIZE")
print("=" * 70)

data_fractions = [0.1, 0.5, 1.0]
data_size_results = []

# Use larger base dataset for this study
# Fractions are taken of this pool, not of the full training split. The pool
# size is clamped to the split length so `select` never receives out-of-range
# indices (assumes the split supports len(), as the selected pool does).
BASE_TRAIN_POOL_SIZE = 2000
train_split = esnli_data['train']
full_train = train_split.select(range(min(BASE_TRAIN_POOL_SIZE, len(train_split))))

# NOTE(review): RNG state is not reset between configurations, so any
# stochasticity in training is mixed into the comparison — consider reseeding
# at the top of each iteration.
for fraction in data_fractions:
    print(f"\n{'='*70}")
    print(f"Testing Data Fraction = {fraction*100}%")
    print(f"{'='*70}")

    # Sample data: the first n_samples rows, so smaller fractions are prefixes
    # of larger ones.
    n_samples = int(len(full_train) * fraction)
    train_fraction = full_train.select(range(n_samples))

    # Create datasets
    # Only the training loader is rebuilt; the validation loader returned here
    # is discarded because evaluation uses the shared `val_loader`.
    train_dataset_frac = ESNLIDataset(train_fraction, preprocessor, use_cache=True)
    train_loader_frac, _ = create_dataloaders(
        train_dataset=train_dataset_frac,
        val_dataset=val_dataset,
        batch_size=16,
        num_workers=0,
        pad_token_id=preprocessor.tokenizer.pad_token_id
    )

    # Create fresh model so no weights carry over from the previous fraction.
    # `device` must already be defined by an earlier cell.
    student = StudentModel(StudentConfig(
        model_name="google/flan-t5-small",
        max_source_length=128,
        max_target_length=64,
        device=device
    ))

    distillation_strategy = SequenceLevelDistillation(DistillationConfig(
        ce_weight=1.0,
        label_smoothing=0.1
    ))

    # Train
    training_config = TrainingConfig(
        num_epochs=3,
        learning_rate=5e-5,
        eval_steps=20,
        save_steps=1000,
        logging_steps=10,
        output_dir=f"../experiments/ablation_datasize_{fraction}",
        eval_strategy="steps"
    )

    trainer = Trainer(
        model=student,
        train_dataloader=train_loader_frac,
        eval_dataloader=val_loader,
        distillation_strategy=distillation_strategy,
        config=training_config
    )

    # Wall-clock time of the training call only (evaluation is excluded)
    start_time = time.time()
    history = trainer.train()
    training_time = time.time() - start_time

    # Evaluate
    evaluator = Evaluator(student, EvaluationConfig(
        metrics_config=MetricsConfig(
            compute_rouge=True,
            compute_bertscore=False,
            compute_faithfulness=True
        ),
        save_predictions=False,
        output_dir=f"../experiments/ablation_datasize_{fraction}_eval"
    ))

    results = evaluator.evaluate(val_loader, split_name="val")
    run_metrics = results['metrics']

    # Store results
    data_size_results.append({
        'fraction': fraction,
        'n_samples': n_samples,
        'accuracy': run_metrics['label_accuracy'],
        'rouge1': run_metrics['rouge1'],
        'rougeL': run_metrics['rougeL'],
        'faithfulness': run_metrics['faithfulness'],
        'training_time': training_time
    })

    # The check mark was previously mis-encoded (UTF-8 bytes shown as cp1252).
    print(f"\n✓ Data fraction {fraction} completed")
    print(f"  Samples: {n_samples}")
    print(f"  Accuracy: {run_metrics['label_accuracy']:.4f}")

print("\n" + "="*70)
print("DATA SIZE ABLATION COMPLETE")
print("="*70)

In [None]:
# Visualize data size results
# Left panel: every metric against the number of training samples.
# Right panel: training wall-clock time against the number of samples.
datasize_df = pd.DataFrame(data_size_results)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
score_panel, time_panel = axes[0], axes[1]

sample_counts = datasize_df['n_samples']
train_seconds = datasize_df['training_time']

# Plot 1: Performance vs Data Size
metrics = ['accuracy', 'rouge1', 'rougeL', 'faithfulness']
colors = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12']

for metric_name, line_color in zip(metrics, colors):
    score_panel.plot(
        sample_counts,
        datasize_df[metric_name],
        marker='o',
        linewidth=2,
        markersize=8,
        label=metric_name.upper(),
        color=line_color,
    )

score_panel.set_xlabel('Number of Training Samples')
score_panel.set_ylabel('Score')
score_panel.set_title('Performance vs Training Data Size')
score_panel.legend()
score_panel.grid(True, alpha=0.3)

# Plot 2: Training Time vs Data Size
time_panel.plot(
    sample_counts,
    train_seconds,
    marker='s',
    linewidth=2,
    markersize=8,
    color='#9b59b6',
)
time_panel.set_xlabel('Number of Training Samples')
time_panel.set_ylabel('Training Time (seconds)')
time_panel.set_title('Training Time vs Data Size')
time_panel.grid(True, alpha=0.3)

# Annotate each timing point a little above its marker
for count, seconds in zip(sample_counts, train_seconds):
    time_panel.text(count, seconds + 5, f'{seconds:.0f}s', ha='center', fontsize=9)

plt.suptitle('Ablation Study: Training Data Size Impact', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("\nData Size Results Summary:")
print(datasize_df.to_string(index=False))

## 4. Ablation Study 3: Generation Temperature

Test different temperature values during generation.

In [None]:
# Ablation 3: Generation Temperature
# Trains a single baseline student, then re-evaluates that same model on the
# validation loader once per temperature value, collecting one result row per
# value in `temperature_results`. Relies on `train_loader` / `val_loader`
# created by the label-smoothing cell.
print("=" * 70)
print("ABLATION STUDY 3: GENERATION TEMPERATURE")
print("=" * 70)

# Train one model first
# `device` must already be defined by an earlier cell.
print("\nTraining baseline model for temperature ablation...")
student = StudentModel(StudentConfig(
    model_name="google/flan-t5-small",
    max_source_length=128,
    max_target_length=64,
    device=device
))

distillation_strategy = SequenceLevelDistillation(DistillationConfig(
    ce_weight=1.0,
    label_smoothing=0.1
))

# Unlike the other studies, this TrainingConfig leaves eval/save/logging
# settings at whatever defaults TrainingConfig defines.
trainer = Trainer(
    model=student,
    train_dataloader=train_loader,
    eval_dataloader=val_loader,
    distillation_strategy=distillation_strategy,
    config=TrainingConfig(
        num_epochs=3,
        learning_rate=5e-5,
        output_dir="../experiments/ablation_temperature_base"
    )
)

trainer.train()
# The check mark was previously mis-encoded (UTF-8 bytes shown as cp1252).
print("✓ Base model trained\n")

# Test different temperatures
temperatures = [0.5, 0.7, 1.0, 1.2]
temperature_results = []

for temp in temperatures:
    print(f"\nTesting temperature = {temp}")

    # Update student config
    # NOTE(review): this assumes the generation path used by Evaluator reads
    # student.config.temperature at call time and actually samples; under
    # greedy or beam decoding temperature would have no effect and every row
    # would be identical — confirm against StudentModel. The config is left at
    # the last tested value after the loop.
    student.config.temperature = temp

    # Evaluate
    evaluator = Evaluator(student, EvaluationConfig(
        metrics_config=MetricsConfig(
            compute_rouge=True,
            compute_bertscore=False,
            compute_faithfulness=True
        ),
        save_predictions=False,
        output_dir=f"../experiments/ablation_temp_{temp}_eval"
    ))

    results = evaluator.evaluate(val_loader, split_name="val")
    run_metrics = results['metrics']

    temperature_results.append({
        'temperature': temp,
        'accuracy': run_metrics['label_accuracy'],
        'rouge1': run_metrics['rouge1'],
        'rougeL': run_metrics['rougeL'],
        'faithfulness': run_metrics['faithfulness']
    })

    print(f"  Accuracy: {run_metrics['label_accuracy']:.4f}")

print("\n" + "="*70)
print("TEMPERATURE ABLATION COMPLETE")
print("="*70)

In [None]:
# Visualize temperature results
# Left panel: every metric against generation temperature.
# Right panel: label accuracy alone, annotated with its values.
temp_df = pd.DataFrame(temperature_results)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
overview_panel, accuracy_panel = axes[0], axes[1]

temperature_axis = temp_df['temperature']
accuracy_scores = temp_df['accuracy']

# Plot 1: All metrics
metric_colors = {
    'accuracy': '#e74c3c',
    'rouge1': '#3498db',
    'rougeL': '#2ecc71',
    'faithfulness': '#f39c12',
}
for metric_name, line_color in metric_colors.items():
    overview_panel.plot(
        temperature_axis,
        temp_df[metric_name],
        marker='o',
        linewidth=2,
        markersize=8,
        label=metric_name.upper(),
        color=line_color,
    )

overview_panel.set_xlabel('Temperature')
overview_panel.set_ylabel('Score')
overview_panel.set_title('Metrics vs Generation Temperature')
overview_panel.legend()
overview_panel.grid(True, alpha=0.3)

# Plot 2: Accuracy zoom
accuracy_panel.plot(
    temperature_axis,
    accuracy_scores,
    marker='o',
    linewidth=3,
    markersize=10,
    color='#e74c3c',
)
accuracy_panel.set_xlabel('Temperature')
accuracy_panel.set_ylabel('Label Accuracy')
accuracy_panel.set_title('Label Accuracy vs Temperature')
accuracy_panel.grid(True, alpha=0.3)

# Annotate each accuracy point just above its marker
for temp_value, acc_value in zip(temperature_axis, accuracy_scores):
    accuracy_panel.text(temp_value, acc_value + 0.005, f'{acc_value:.3f}', ha='center', fontsize=10)

plt.suptitle('Ablation Study: Generation Temperature Impact', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("\nTemperature Results Summary:")
print(temp_df.to_string(index=False))

## 5. Summary and Recommendations

In [None]:
# Compile all ablation results
# Prints the best setting per study (by validation label accuracy) and a few
# threshold-based headline findings. Reads smoothing_df, datasize_df and
# temp_df built by the visualisation cells.
print("=" * 70)
print("ABLATION STUDIES SUMMARY")
print("=" * 70)

# Emoji and arrows below were previously mis-encoded (UTF-8 shown as cp1252).
print("\n📊 LABEL SMOOTHING:")
best_smoothing = smoothing_df.loc[smoothing_df['accuracy'].idxmax()]
print(f"  Best value: {best_smoothing['smoothing']}")
print(f"  Best accuracy: {best_smoothing['accuracy']:.4f}")
print(f"  Recommendation: Use label_smoothing={best_smoothing['smoothing']} for final training")

print("\n📊 TRAINING DATA SIZE:")
print("  Performance scaling:")
# Iterate over the two columns directly rather than with DataFrame.iterrows():
# iterrows() hands back each row as a single-dtype Series, so the integer
# n_samples is upcast to float next to the float metric columns and the `:4d`
# format spec raises ValueError. int() keeps the format valid either way.
for n_samples, accuracy in zip(datasize_df['n_samples'], datasize_df['accuracy']):
    print(f"    {int(n_samples):4d} samples → Accuracy: {accuracy:.4f}")

# Gain from the smallest to the largest fraction; rows are in the order the
# fractions were run.
smallest_acc = datasize_df['accuracy'].iloc[0]
largest_acc = datasize_df['accuracy'].iloc[-1]
print(f"  Recommendation: {'More data helps significantly' if largest_acc - smallest_acc > 0.1 else 'Diminishing returns after 50%'}")

print("\n📊 GENERATION TEMPERATURE:")
best_temp = temp_df.loc[temp_df['accuracy'].idxmax()]
print(f"  Best value: {best_temp['temperature']}")
print(f"  Best accuracy: {best_temp['accuracy']:.4f}")
print(f"  Recommendation: Use temperature={best_temp['temperature']} for inference")

print("\n" + "="*70)
print("KEY FINDINGS:")
print("="*70)
print(f"1. Optimal label smoothing: {best_smoothing['smoothing']}")
# "High" when the second row (the middle fraction) reaches more than 90% of
# the accuracy of the last row (the full pool).
middle_acc = datasize_df['accuracy'].iloc[1]
print(f"2. Data efficiency: {'High' if middle_acc / largest_acc > 0.9 else 'Moderate'}")
print(f"3. Temperature sensitivity: {'Low' if temp_df['accuracy'].std() < 0.02 else 'Moderate to High'}")
print("\n💡 Use these optimal hyperparameters for final model training!")

In [None]:
# Save all results to CSV for later analysis
# Writes one CSV per study into a shared folder, creating it if needed. The
# path is relative to the kernel's working directory.
output_dir = Path("../experiments/ablation_studies")
output_dir.mkdir(parents=True, exist_ok=True)

# File name -> results table; index=False keeps the row index out of the CSV.
results_by_file = {
    "label_smoothing_results.csv": smoothing_df,
    "data_size_results.csv": datasize_df,
    "temperature_results.csv": temp_df,
}
for file_name, results_df in results_by_file.items():
    results_df.to_csv(output_dir / file_name, index=False)

# The check mark was previously mis-encoded (UTF-8 bytes shown as cp1252).
print(f"✓ All ablation results saved to {output_dir}")