# 17 - VGG-16 & MobileNetV2 Training (Weak Baselines)

**Author:** Tan Ming Kai (24PMR12003)  
**Date:** 2025-11-26  
**Purpose:** Train weak baseline models (VGG-16, MobileNetV2) with 5 random seeds

**Hardware:** NVIDIA RTX 6000 Ada (24GB VRAM)

---

## Objectives
1. Train VGG-16 with seeds: 42, 123, 456, 789, 101112
2. Train MobileNetV2 with seeds: 42, 123, 456, 789, 101112
3. Log all runs to MLflow
4. Calculate the mean ± standard deviation of test accuracy across the 5 seeds
5. Compare against other baselines in Phase 3

---

## Optimizations for 24GB VRAM
- **Batch size: 64** (vs 8 on 8GB VRAM)
- **Gradient accumulation: 1** (not needed with large batch)
- **Num workers: 8** (faster data loading)
- **Mixed precision: True** (for speed)

**Estimated training time:**
- VGG-16: ~15 min/seed × 5 = 75 min
- MobileNetV2: ~8 min/seed × 5 = 40 min
- **Total: ~2 hours**

In [None]:
# Standard imports
import os, sys, random, time, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

import cv2
from PIL import Image

try:
    import mlflow
    import mlflow.pytorch
    MLFLOW_AVAILABLE = True
except ImportError:
    MLFLOW_AVAILABLE = False
    print("[WARNING] MLflow not available")

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

# Suppress every warning category for the rest of the session; note that this
# also hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')
# Matplotlib style sheet applied to all figures created in this notebook.
plt.style.use('seaborn-v0_8-darkgrid')

# Report the installed PyTorch version and whether a CUDA device is visible.
print(f"[OK] PyTorch {torch.__version__} | CUDA: {torch.cuda.is_available()}")

In [None]:
# Hardware verification
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU, and
# print the details of GPU 0 so the run log records the hardware used.
_cuda_ready = torch.cuda.is_available()
device = torch.device('cuda') if _cuda_ready else torch.device('cpu')
print(f"Device: {device}")
if _cuda_ready:
    _gpu_name = torch.cuda.get_device_name(0)
    _gpu_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {_gpu_name}")
    print(f"VRAM: {_gpu_bytes/1e9:.2f} GB")
    print(f"CUDA Version: {torch.version.cuda}")

In [None]:
# Configuration - MATCHED WITH OTHER TRAINING NOTEBOOKS
# Input manifests and output folders, relative to the notebook's working directory.
CSV_DIR = Path("../data/processed")
MODELS_DIR = Path("../experiments/phase2_models")
RESULTS_DIR = Path("../experiments/phase2_results")
for _out_dir in (MODELS_DIR, RESULTS_DIR):
    # Create output folders up front so later saves cannot fail on a missing path
    _out_dir.mkdir(parents=True, exist_ok=True)

CONFIG = dict(
    device=device,
    num_classes=4,
    image_size=224,
    class_names=['COVID', 'Normal', 'Lung_Opacity', 'Viral Pneumonia'],
    class_weights=[1.47, 0.52, 0.88, 3.95],

    # OPTIMIZED FOR 24GB+ VRAM (RTX 6000 Ada)
    batch_size=64,
    num_workers=8,

    # Training hyperparameters - MATCHED WITH OTHER MODELS
    learning_rate=1e-4,           # Same as ResNet, DenseNet, EfficientNet
    weight_decay=1e-4,            # Same as other models (was 0.05)
    max_epochs=30,                # Same as other models (was 50)
    early_stopping_patience=10,   # Same as other models (was 15)

    # Normalization (ImageNet)
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],

    # Mixed precision for speed
    mixed_precision=True,

    # Seeds - same as all other models
    seeds=[42, 123, 456, 789, 101112],
)

# Echo the settings that must agree with the other training notebooks.
print("Configuration matched with other training notebooks:")
for _label, _value in (
    ("Batch size", CONFIG['batch_size']),
    ("Learning rate", CONFIG['learning_rate']),
    ("Weight decay", CONFIG['weight_decay']),
    ("Max epochs", CONFIG['max_epochs']),
    ("Early stopping", CONFIG['early_stopping_patience']),
    ("Seeds", CONFIG['seeds']),
    ("Models dir", MODELS_DIR),
    ("Results dir", RESULTS_DIR),
):
    print(f"  {_label}: {_value}")

In [None]:
# MLflow setup
# The tracking URI has to be set BEFORE the experiment is selected:
# set_experiment() looks up (or creates) the experiment in whichever tracking
# store is active at the time it is called.
if MLFLOW_AVAILABLE:
    mlflow.set_tracking_uri("file:./mlruns")
    mlflow.set_experiment("crossvit-covid19-classification")
    print("[OK] MLflow configured")
else:
    print("[WARNING] MLflow not available - results will not be logged")

In [None]:
# Load data - Use train.csv/val.csv/test.csv (raw image paths, not processed)
# Each manifest has an 'image_path' column pointing to a raw image and a
# 'label' column; CLAHE is applied later, on-the-fly, in the Dataset class.
train_df, val_df, test_df = (
    pd.read_csv(CSV_DIR / f"{_split}.csv") for _split in ("train", "val", "test")
)
print(f"Train: {len(train_df):,} | Val: {len(val_df):,} | Test: {len(test_df):,}")

# Spot-check that the first training path resolves from the current directory
sample_path = train_df['image_path'].iloc[0]
_sample_found = Path(sample_path).exists()
print(f"\nSample path: {sample_path}")
print(f"Path exists: {_sample_found}")

# Per-class sample counts of the training split, ordered by label id
_train_label_counts = train_df['label'].value_counts().sort_index()
print("\nClass distribution (train):")
print(_train_label_counts)

In [None]:
# Dataset class with on-the-fly CLAHE preprocessing
class COVID19Dataset(Dataset):
    """
    Dataset that loads raw images and applies CLAHE preprocessing on-the-fly.

    Preprocessing:
    1. Load image (grayscale or BGR)
    2. Convert to grayscale if needed
    3. Apply CLAHE (clip_limit=2.0, tile_grid_size=(8,8))
    4. Convert to RGB (3 channels)
    5. Apply torchvision transforms

    The OpenCV CLAHE object is created lazily on first use and is never
    pickled, so the dataset can be handed to DataLoader worker processes
    that are started by pickling it (the "spawn" start method) as well as
    to forked workers.

    Args:
        dataframe: pandas DataFrame with an 'image_path' column and a
            'label' column.
        transform: optional callable applied to the PIL image.
    """

    # CLAHE parameters (must match existing preprocessing)
    CLAHE_CLIP_LIMIT = 2.0
    CLAHE_TILE_GRID_SIZE = (8, 8)

    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe.reset_index(drop=True)
        self.transform = transform
        self.image_paths = self.dataframe['image_path'].values  # Use 'image_path' column
        self.labels = self.dataframe['label'].values

        # Built on demand by the `clahe` property (one instance per process)
        self._clahe = None

        # Verify first path exists (warning only; a bad path raises in __getitem__)
        if len(self.image_paths) > 0:
            first_path = Path(self.image_paths[0])
            if not first_path.exists():
                print(f"[WARNING] First image path does not exist: {first_path}")
            else:
                print(f"[OK] Path verification passed: {first_path.name}")

    @property
    def clahe(self):
        """Return this process's CLAHE operator, creating it on first access."""
        if self._clahe is None:
            self._clahe = cv2.createCLAHE(
                clipLimit=self.CLAHE_CLIP_LIMIT,
                tileGridSize=self.CLAHE_TILE_GRID_SIZE,
            )
        return self._clahe

    def __getstate__(self):
        """Drop the native CLAHE handle so the dataset stays picklable."""
        state = self.__dict__.copy()
        state['_clahe'] = None
        return state

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """
        Load, enhance and transform the sample at position `idx`.

        Returns:
            (image, label): the transformed image and the label as a
            torch.long tensor.

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        img_path = self.image_paths[idx]

        # Load image (str() so path-like entries are accepted as well)
        image = cv2.imread(str(img_path))
        if image is None:
            raise FileNotFoundError(f"Could not load image: {img_path}")

        # Convert to grayscale for CLAHE
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image

        # Apply CLAHE
        enhanced = self.clahe.apply(gray)

        # Convert to RGB (3 channels for pretrained models)
        rgb_image = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2RGB)

        # Convert to PIL Image for torchvision transforms
        image = Image.fromarray(rgb_image)

        if self.transform:
            image = self.transform(image)

        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return image, label

print("[OK] Dataset class defined with on-the-fly CLAHE preprocessing")

In [None]:
# Transforms (MUST MATCH EXISTING PREPROCESSING)
# The target resolution is read from CONFIG so it cannot drift from the
# image_size value that is logged with each run.
_resize_to = (CONFIG['image_size'], CONFIG['image_size'])

# Training pipeline: resize, light geometric/photometric augmentation,
# then tensor conversion and normalization with the configured statistics.
train_transform = transforms.Compose([
    transforms.Resize(_resize_to),
    transforms.RandomRotation(10),
    transforms.RandomHorizontalFlip(0.5),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=CONFIG['mean'], std=CONFIG['std'])
])

# Evaluation pipeline (validation and test): no augmentation.
val_transform = transforms.Compose([
    transforms.Resize(_resize_to),
    transforms.ToTensor(),
    transforms.Normalize(mean=CONFIG['mean'], std=CONFIG['std'])
])

print("[OK] Transforms defined")

In [None]:
# Training functions
def set_seed(seed):
    """
    Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available, all GPU generators are seeded as well and cuDNN
    is switched to deterministic mode with auto-tuning (benchmark) disabled.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def train_one_epoch(model, loader, criterion, optimizer, device, scaler=None, epoch=0):
    """
    Run one optimisation pass over `loader`.

    Args:
        model: network to train (switched to train mode here).
        loader: iterable of (images, labels) batches.
        criterion: loss function called as criterion(outputs, labels).
        optimizer: optimizer stepped once per batch.
        device: device the batches are moved to.
        scaler: optional gradient scaler; when given, the forward pass runs
            under CUDA autocast and the step goes through the scaler.
        epoch: zero-based epoch index, used only for the progress-bar label.

    Returns:
        (mean of the per-batch losses, accuracy in percent).
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    use_amp = scaler is not None

    bar = tqdm(loader, desc=f"Epoch {epoch+1} [Train]")

    for step, (images, labels) in enumerate(bar, start=1):
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        if use_amp:
            # Mixed-precision path: scaled backward pass, scaler-driven step
            with torch.cuda.amp.autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            # Full-precision path
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        loss_sum += loss.item()
        predicted = outputs.max(1)[1]
        n_seen += labels.size(0)
        n_correct += predicted.eq(labels).sum().item()

        # Running averages over the batches processed so far
        bar.set_postfix({'loss': loss_sum / step, 'acc': 100. * n_correct / n_seen})

    return loss_sum / len(loader), 100. * n_correct / n_seen

def validate(model, loader, criterion, device, desc="Val"):
    """
    Evaluate `model` on `loader` without tracking gradients.

    Args:
        model: network to evaluate (switched to eval mode here).
        loader: iterable of (images, labels) batches.
        criterion: loss function called as criterion(outputs, labels).
        device: device the batches are moved to.
        desc: label shown on the progress bar.

    Returns:
        (mean of the per-batch losses, accuracy in percent,
         predictions as a NumPy array, labels as a NumPy array).
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for images, labels in tqdm(loader, desc=f"[{desc}]"):
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            outputs = model(images)
            loss_sum += criterion(outputs, labels).item()

            predicted = outputs.max(1)[1]
            n_seen += labels.size(0)
            n_correct += predicted.eq(labels).sum().item()

            # Keep per-sample results on the host for the metrics computed later
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    mean_loss = loss_sum / len(loader)
    accuracy = 100. * n_correct / n_seen
    return mean_loss, accuracy, np.array(all_preds), np.array(all_labels)

print("[OK] Training functions defined")

In [None]:
# Single seed training function - MATCHED WITH OTHER NOTEBOOKS
def _make_loader(dataset, config, shuffle):
    """Build a DataLoader with the batch/worker settings shared by all three splits."""
    n_workers = config['num_workers']
    return DataLoader(
        dataset,
        batch_size=config['batch_size'],
        shuffle=shuffle,
        num_workers=n_workers,
        pin_memory=True,
        # persistent_workers may only be enabled when worker processes exist
        persistent_workers=n_workers > 0,
    )


def train_model_single_seed(model_name, model_fn, seed, config):
    """
    Train a model with a single seed and evaluate it on the test split.

    The weights with the lowest validation loss are saved to MODELS_DIR and
    reloaded for the test evaluation. A confusion-matrix image is written to
    RESULTS_DIR. When MLflow is available, parameters, per-epoch metrics and
    test metrics are logged to a run that is always closed - with status
    FAILED if anything raises - so a failed seed cannot block the next one.

    Args:
        model_name: str, e.g., 'VGG-16' or 'MobileNetV2'
        model_fn: callable that returns a model
        seed: int, random seed
        config: dict, configuration. Reads 'batch_size', 'num_workers',
            'class_weights', 'class_names', 'learning_rate', 'weight_decay',
            'mixed_precision', 'image_size', 'max_epochs',
            'early_stopping_patience' and, if present, 'device'.

    Returns:
        dict with keys 'model', 'seed', 'test_acc', 'test_loss',
        'training_time_min' and 'best_val_loss'.

    Raises:
        Any exception raised during training or evaluation propagates to the
        caller after the MLflow run has been closed.
    """
    print(f"\n{'='*70}\nTRAINING {model_name.upper()} WITH SEED {seed}\n{'='*70}")

    set_seed(seed)

    # Prefer the device carried by the config; fall back to the notebook global
    run_device = config.get('device', device)
    file_stem = model_name.lower().replace('-', '')

    # Create dataloaders
    train_dataset = COVID19Dataset(train_df, transform=train_transform)
    val_dataset = COVID19Dataset(val_df, transform=val_transform)
    test_dataset = COVID19Dataset(test_df, transform=val_transform)

    train_loader = _make_loader(train_dataset, config, shuffle=True)
    val_loader = _make_loader(val_dataset, config, shuffle=False)
    test_loader = _make_loader(test_dataset, config, shuffle=False)

    # Load model
    model = model_fn()
    model = model.to(run_device)

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"[OK] {model_name} loaded: {total_params:,} parameters ({trainable_params:,} trainable)")

    # Loss, optimizer, scheduler - MATCHED WITH OTHER MODELS
    class_weights = torch.tensor(config['class_weights'], dtype=torch.float32).to(run_device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    # Use Adam (not AdamW) - same as ResNet, DenseNet, EfficientNet
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])

    # Use ReduceLROnPlateau - same as other models
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

    scaler = torch.cuda.amp.GradScaler() if config['mixed_precision'] else None

    best_val_loss = float('inf')
    patience_counter = 0
    checkpoint_written = False  # True once THIS run has saved its own weights
    best_model_path = MODELS_DIR / f"{file_stem}_seed{seed}.pth"

    # Class ids 0..N-1, passed explicitly so the confusion matrix and report
    # keep one row per class even if a class never appears in the test output
    class_ids = list(range(len(config['class_names'])))

    # MLflow
    if MLFLOW_AVAILABLE:
        mlflow.start_run(run_name=f"{file_stem}-seed-{seed}")
    run_status = "FAILED"  # flipped to FINISHED only when everything succeeded

    try:
        if MLFLOW_AVAILABLE:
            mlflow.log_param("model", model_name)
            mlflow.log_param("random_seed", seed)
            mlflow.log_param("batch_size", config['batch_size'])
            mlflow.log_param("learning_rate", config['learning_rate'])
            mlflow.log_param("weight_decay", config['weight_decay'])
            mlflow.log_param("optimizer", "Adam")
            mlflow.log_param("scheduler", "ReduceLROnPlateau")
            mlflow.log_param("image_size", config['image_size'])
            mlflow.set_tag("phase", "Phase 2 - Weak Baselines")
            mlflow.set_tag("hardware", "RTX 6000 Ada")

        # Training loop
        start_time = time.time()

        for epoch in range(config['max_epochs']):
            train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, run_device, scaler, epoch)
            val_loss, val_acc, _, _ = validate(model, val_loader, criterion, run_device)

            # ReduceLROnPlateau needs val_loss
            scheduler.step(val_loss)

            if MLFLOW_AVAILABLE:
                mlflow.log_metric("train_loss", train_loss, step=epoch)
                mlflow.log_metric("train_acc", train_acc, step=epoch)
                mlflow.log_metric("val_loss", val_loss, step=epoch)
                mlflow.log_metric("val_acc", val_acc, step=epoch)
                mlflow.log_metric("learning_rate", optimizer.param_groups[0]['lr'], step=epoch)

            print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f} Acc={train_acc:.2f}% | Val Loss={val_loss:.4f} Acc={val_acc:.2f}%")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                torch.save(model.state_dict(), best_model_path)
                checkpoint_written = True
                print(f"[OK] Best model saved! ({best_model_path.name})")
            else:
                patience_counter += 1
                if patience_counter >= config['early_stopping_patience']:
                    print(f"[STOP] Early stopping at epoch {epoch+1}")
                    break

        training_time = time.time() - start_time

        # Test evaluation
        if checkpoint_written:
            model.load_state_dict(torch.load(best_model_path, weights_only=True))
        else:
            # The validation loss never improved (e.g. it was NaN), so any file
            # at best_model_path comes from an earlier session - do not load it
            print(f"[WARNING] No checkpoint saved for seed {seed}; evaluating the final weights")
        test_loss, test_acc, test_preds, test_labels = validate(model, test_loader, criterion, run_device, desc="Test")

        # Confusion matrix
        cm = confusion_matrix(test_labels, test_preds, labels=class_ids)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=config['class_names'], yticklabels=config['class_names'])
        plt.ylabel('True')
        plt.xlabel('Predicted')
        plt.title(f"{model_name} Confusion Matrix (Seed {seed})")
        cm_path = RESULTS_DIR / f"{file_stem}_cm_seed{seed}.png"
        plt.savefig(cm_path, dpi=300, bbox_inches='tight')
        plt.close()

        # Classification report
        report = classification_report(
            test_labels,
            test_preds,
            labels=class_ids,
            target_names=config['class_names'],
            output_dict=True,
        )

        if MLFLOW_AVAILABLE:
            mlflow.log_metric("test_loss", test_loss)
            mlflow.log_metric("test_accuracy", test_acc)
            mlflow.log_metric("training_time_minutes", training_time / 60)
            mlflow.log_artifact(str(cm_path))

            # Log per-class metrics
            for class_name in config['class_names']:
                mlflow.log_metric(f"{class_name}_precision", report[class_name]['precision'])
                mlflow.log_metric(f"{class_name}_recall", report[class_name]['recall'])
                mlflow.log_metric(f"{class_name}_f1", report[class_name]['f1-score'])

        run_status = "FINISHED"
    finally:
        # Always close the run: an active run would make the next start_run() fail
        if MLFLOW_AVAILABLE:
            mlflow.end_run(status=run_status)

        # Clean up GPU memory
        del model
        torch.cuda.empty_cache()

    print(f"[OK] Seed {seed} complete: Test Acc = {test_acc:.2f}% | Time = {training_time/60:.1f} min")
    print(f"[OK] Model saved to: {best_model_path}")

    return {
        'model': model_name,
        'seed': seed,
        'test_acc': test_acc,
        'test_loss': test_loss,
        'training_time_min': training_time / 60,
        'best_val_loss': best_val_loss
    }

print("[OK] Single seed training function defined (matched with other notebooks)")

---
# VGG-16 Training
---

In [None]:
# VGG-16 model definition
def create_vgg16(num_classes=None):
    """
    Create VGG-16 model with pretrained ImageNet weights.
    Replace final classifier layer for COVID-19 classification.

    Args:
        num_classes: number of output classes; defaults to
            CONFIG['num_classes'] when omitted.

    Returns:
        The VGG-16 model with a freshly initialised final linear layer.
    """
    if num_classes is None:
        num_classes = CONFIG['num_classes']

    model = models.vgg16(weights='IMAGENET1K_V1')

    # Replace final layer, taking the input width from the layer being replaced
    head_in_features = model.classifier[6].in_features
    model.classifier[6] = nn.Linear(head_in_features, num_classes)

    return model

# Test model creation
test_model = create_vgg16()
print(f"[OK] VGG-16 created: {sum(p.numel() for p in test_model.parameters()):,} parameters")
del test_model

In [None]:
# Train VGG-16 with all seeds
# Each seed is trained independently; a failure is reported and the loop
# moves on to the next seed.
print(f"\n{'='*70}\nSTARTING MULTI-SEED VGG-16 TRAINING\n{'='*70}")
print(f"Seeds: {CONFIG['seeds']}\n")

vgg16_results = []
for seed in CONFIG['seeds']:
    try:
        result = train_model_single_seed('VGG-16', create_vgg16, seed, CONFIG)
        vgg16_results.append(result)
    except Exception as e:
        print(f"[ERROR] Error with VGG-16 seed {seed}: {e}")
        import traceback
        traceback.print_exc()
        # A run interrupted by the exception may still be active in MLflow;
        # close it so start_run() for the next seed does not fail.
        if MLFLOW_AVAILABLE and mlflow.active_run() is not None:
            mlflow.end_run(status="FAILED")
        continue

print(f"\n{'='*70}\nVGG-16 ALL SEEDS COMPLETED\n{'='*70}")

In [None]:
# VGG-16 Statistical analysis
# Summarise test accuracy over the seeds that actually completed.
vgg16_accuracies = [r['test_acc'] for r in vgg16_results]
if not vgg16_accuracies:
    # Every seed failed: stop with a clear message instead of a NumPy reduction error
    raise RuntimeError("No VGG-16 runs completed successfully; cannot compute statistics")

vgg16_mean_acc = np.mean(vgg16_accuracies)
# The sample standard deviation (ddof=1) is undefined for a single run
vgg16_std_acc = np.std(vgg16_accuracies, ddof=1) if len(vgg16_accuracies) > 1 else float('nan')

# Report the real number of runs rather than assuming all seeds succeeded
print(f"\n[STATS] VGG-16 Results ({len(vgg16_accuracies)} seeds):")
print(f"   Mean ± Std: {vgg16_mean_acc:.2f}% ± {vgg16_std_acc:.2f}%")
print(f"   Range: [{np.min(vgg16_accuracies):.2f}%, {np.max(vgg16_accuracies):.2f}%]")
print(f"   Median: {np.median(vgg16_accuracies):.2f}%")

# Save results
vgg16_df = pd.DataFrame(vgg16_results)
vgg16_results_path = RESULTS_DIR / "vgg16_results.csv"
vgg16_df.to_csv(vgg16_results_path, index=False)
print(f"\n[OK] Results saved to {vgg16_results_path}")
print("\n" + "="*70)
print(vgg16_df.to_string(index=False))
print("="*70)

---
# MobileNetV2 Training
---

In [None]:
# MobileNetV2 model definition
def create_mobilenetv2(num_classes=None):
    """
    Create MobileNetV2 model with pretrained ImageNet weights.
    Replace final classifier layer for COVID-19 classification.

    Args:
        num_classes: number of output classes; defaults to
            CONFIG['num_classes'] when omitted.

    Returns:
        The MobileNetV2 model with a freshly initialised final linear layer.
    """
    if num_classes is None:
        num_classes = CONFIG['num_classes']

    model = models.mobilenet_v2(weights='IMAGENET1K_V1')

    # Replace final layer, taking the input width from the layer being replaced
    head_in_features = model.classifier[1].in_features
    model.classifier[1] = nn.Linear(head_in_features, num_classes)

    return model

# Test model creation
test_model = create_mobilenetv2()
print(f"[OK] MobileNetV2 created: {sum(p.numel() for p in test_model.parameters()):,} parameters")
del test_model

In [None]:
# Train MobileNetV2 with all seeds
# Each seed is trained independently; a failure is reported and the loop
# moves on to the next seed.
print(f"\n{'='*70}\nSTARTING MULTI-SEED MOBILENETV2 TRAINING\n{'='*70}")
print(f"Seeds: {CONFIG['seeds']}\n")

mobilenetv2_results = []
for seed in CONFIG['seeds']:
    try:
        result = train_model_single_seed('MobileNetV2', create_mobilenetv2, seed, CONFIG)
        mobilenetv2_results.append(result)
    except Exception as e:
        print(f"[ERROR] Error with MobileNetV2 seed {seed}: {e}")
        import traceback
        traceback.print_exc()
        # A run interrupted by the exception may still be active in MLflow;
        # close it so start_run() for the next seed does not fail.
        if MLFLOW_AVAILABLE and mlflow.active_run() is not None:
            mlflow.end_run(status="FAILED")
        continue

print(f"\n{'='*70}\nMOBILENETV2 ALL SEEDS COMPLETED\n{'='*70}")

In [None]:
# MobileNetV2 Statistical analysis
# Summarise test accuracy over the seeds that actually completed.
mobilenetv2_accuracies = [r['test_acc'] for r in mobilenetv2_results]
if not mobilenetv2_accuracies:
    # Every seed failed: stop with a clear message instead of a NumPy reduction error
    raise RuntimeError("No MobileNetV2 runs completed successfully; cannot compute statistics")

mobilenetv2_mean_acc = np.mean(mobilenetv2_accuracies)
# The sample standard deviation (ddof=1) is undefined for a single run
mobilenetv2_std_acc = (
    np.std(mobilenetv2_accuracies, ddof=1) if len(mobilenetv2_accuracies) > 1 else float('nan')
)

# Report the real number of runs rather than assuming all seeds succeeded
print(f"\n[STATS] MobileNetV2 Results ({len(mobilenetv2_accuracies)} seeds):")
print(f"   Mean ± Std: {mobilenetv2_mean_acc:.2f}% ± {mobilenetv2_std_acc:.2f}%")
print(f"   Range: [{np.min(mobilenetv2_accuracies):.2f}%, {np.max(mobilenetv2_accuracies):.2f}%]")
print(f"   Median: {np.median(mobilenetv2_accuracies):.2f}%")

# Save results
mobilenetv2_df = pd.DataFrame(mobilenetv2_results)
mobilenetv2_results_path = RESULTS_DIR / "mobilenetv2_results.csv"
mobilenetv2_df.to_csv(mobilenetv2_results_path, index=False)
print(f"\n[OK] Results saved to {mobilenetv2_results_path}")
print("\n" + "="*70)
print(mobilenetv2_df.to_string(index=False))
print("="*70)

---
# Summary Comparison
---

In [None]:
# Combined summary
# One formatted row per model, built from the statistics computed above,
# printed as a table and written to a CSV in RESULTS_DIR.
_rule = '=' * 70
print(f"\n{_rule}")
print("FINAL SUMMARY: VGG-16 vs MobileNetV2")
print(f"{_rule}\n")

_summary_inputs = (
    ('VGG-16', vgg16_accuracies, vgg16_mean_acc, vgg16_std_acc),
    ('MobileNetV2', mobilenetv2_accuracies, mobilenetv2_mean_acc, mobilenetv2_std_acc),
)

summary_data = [
    {
        'Model': model_label,
        'Mean Acc': f"{mean_acc:.2f}%",
        'Std': f"{std_acc:.2f}%",
        'Min': f"{np.min(accs):.2f}%",
        'Max': f"{np.max(accs):.2f}%",
        'Median': f"{np.median(accs):.2f}%",
    }
    for model_label, accs, mean_acc, std_acc in _summary_inputs
]

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))
print(f"\n{_rule}")

# Save combined summary
summary_path = RESULTS_DIR / "weak_baselines_summary.csv"
summary_df.to_csv(summary_path, index=False)
print(f"[OK] Summary saved to {summary_path}")

In [None]:
# Visualization: Box plot comparison
# One box per model over its per-seed test accuracies, with the mean overlaid.
fig, ax = plt.subplots(figsize=(10, 6))

data_to_plot = [vgg16_accuracies, mobilenetv2_accuracies]
labels = ['VGG-16', 'MobileNetV2']

# Tick labels are set explicitly instead of through boxplot(labels=...):
# that keyword is deprecated in newer Matplotlib releases (renamed to
# tick_labels), while set_xticks/set_xticklabels work on old and new versions.
bp = ax.boxplot(data_to_plot, patch_artist=True)
box_positions = list(range(1, len(labels) + 1))  # boxplot's default positions are 1..N
ax.set_xticks(box_positions)
ax.set_xticklabels(labels)

# Color the boxes
colors = ['lightblue', 'lightgreen']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)

ax.set_ylabel('Test Accuracy (%)', fontsize=12)
ax.set_title('Weak Baselines Comparison (5 Seeds)', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Add mean markers (drawn above the boxes via zorder)
means = [vgg16_mean_acc, mobilenetv2_mean_acc]
ax.scatter(box_positions, means, marker='D', s=100, color='red', zorder=3, label='Mean')
ax.legend()

plt.tight_layout()
boxplot_path = RESULTS_DIR / "weak_baselines_boxplot.png"
plt.savefig(boxplot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"[OK] Box plot saved to {boxplot_path}")

---
# Training Complete!
---

**Models saved to:** `experiments/phase2_models/`
- vgg16_seed42.pth, vgg16_seed123.pth, vgg16_seed456.pth, vgg16_seed789.pth, vgg16_seed101112.pth
- mobilenetv2_seed42.pth, mobilenetv2_seed123.pth, mobilenetv2_seed456.pth, mobilenetv2_seed789.pth, mobilenetv2_seed101112.pth

**Results saved to:** `experiments/phase2_results/`
- vgg16_results.csv
- mobilenetv2_results.csv
- weak_baselines_summary.csv
- weak_baselines_boxplot.png

**Next steps:**
1. Compare these results with other baselines (ResNet-50, DenseNet-121, etc.)
2. Proceed to Phase 3: Statistical validation (12_statistical_validation.ipynb)
3. Calculate 95% confidence intervals
4. Perform hypothesis testing (CrossViT vs all baselines)