<a href="https://colab.research.google.com/github/Lookieman/home_projects/blob/main/ThinkOnwardsComp%5CPhase2d_Unet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Phase 2d** focuses on improving performance by:



*   Tweaking the parameters for the Cosine Annealing LR
*   Adding Spectral Norm for Stabilized Unet










In [None]:
#Phase2a
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import time
from tqdm import tqdm
import gc
from scipy import ndimage
from math import e

#phase 2b
import pandas as pd
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')

#phase 2c
import random

#phase 2d
from torch.nn.utils import spectral_norm

Mounted at /content/drive


In [None]:
# Configuration
# Every tunable used by the training pipeline below lives in this cell.

# True  -> SeismicDataset strides/pads each receiver gather and the model
#          prepends its `input_processor` stem and resizes its output.
# False -> SeismicDataset resizes each gather to the target grid instead.
USE_ADAPTIVE_MODEL = True  # Set to True for adaptive model
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32  # batch size for both the training and validation DataLoader
LEARNING_RATE = 1e-4  # initial learning rate handed to AdamW
NUM_EPOCHS = 35  # upper bound on epochs per model; early stopping may end sooner
ENSEMBLE_SEEDS = [42, 123, 456]  # 3 seeds per fold
K_FOLDS = 5  # number of cross-validation folds
EARLY_STOPPING_PATIENCE = 8  # epochs without validation improvement before stopping
GRADIENT_CLIP_NORM = 0.5  # max_norm passed to clip_grad_norm_
DROPOUT=0.05  # dropout probability passed to StabilizedUNet



# Google Drive locations (Drive is mounted in the import cell).
# main() re-declares these same two paths locally rather than reading these globals.
data_dir = Path('/content/drive/MyDrive/ThinkOnward/Data/Train')
# NOTE(review): this notebook is titled Phase 2d but results are written to the
# Phase2c folder, which presumably overwrites earlier checkpoints - confirm intended.
result_dir = Path('/content/drive/MyDrive/ThinkOnward/Result/Phase2c')

print(f"Using device: {DEVICE}")
#print(f"Model type: {'Adaptive' if USE_ADAPTIVE_MODEL else 'Resize'}")

Using device: cuda


In [None]:
def setup_deterministic_training(seed):
    """Seed every RNG used by the pipeline and request deterministic kernels.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy, and PyTorch
            (CPU generator plus the current and all CUDA devices).

    Side effects:
        Switches cuDNN to deterministic mode with autotuning disabled and
        enables PyTorch's deterministic-algorithms mode in warn-only form, so
        operations without a deterministic implementation warn instead of
        raising. Prints a confirmation line.
    """

    # Set all random seeds
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ensure deterministic behavior
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Enable deterministic algorithms where possible. Only the two failures an
    # older PyTorch can produce are tolerated (function missing, or no
    # `warn_only` keyword); anything else is a real error and must surface.
    try:
        torch.use_deterministic_algorithms(True, warn_only=True)
    except (AttributeError, TypeError):
        pass  # Not all PyTorch versions support this

    print(f"✅ Deterministic training setup complete with seed {seed}")

In [None]:
class SeismicDataset(Dataset):
    """Dataset of seismic samples stored as ``TrainingData_<index>`` folders.

    Each folder is expected to hold five receiver gathers
    (``receiver_data_src_{1,75,150,225,300}.npy``) and one ``vp_model.npy``
    target. Arrays are loaded as float32.

    Args:
        data_dir: Directory containing the ``TrainingData_<index>`` folders.
        sample_indices: Sequence of sample indices this dataset exposes.
        use_adaptive: If True, every gather is strided by 32 along its first
            axis, zero-padded from 31 to 32 columns and transposed. If False,
            every gather is resized with linear interpolation using the fixed
            zoom factors (300/10001, 1259/31).
    """

    def __init__(self, data_dir, sample_indices, use_adaptive=False):
        self.data_dir = data_dir
        self.sample_indices = sample_indices
        self.use_adaptive = use_adaptive
        # One file per source position
        self.receiver_files = [
            f'receiver_data_src_{source}.npy' for source in (1, 75, 150, 225, 300)
        ]

    def __len__(self):
        return len(self.sample_indices)

    def __getitem__(self, idx):
        """Return ``(input_tensor, target)`` for the idx-th configured sample."""
        sample_dir = os.path.join(self.data_dir, f'TrainingData_{self.sample_indices[idx]}')

        # Receiver gathers first, then the velocity model
        # (inline comments in the original give shapes (10001, 31) and (300, 1259)).
        gathers = [
            np.load(os.path.join(sample_dir, name)).astype(np.float32)
            for name in self.receiver_files
        ]
        velocity = np.load(os.path.join(sample_dir, 'vp_model.npy')).astype(np.float32)

        if self.use_adaptive:
            channels = [self._stride_pad_transpose(gather) for gather in gathers]
        else:
            # Linear-interpolation resize with the fixed target-grid zoom factors
            channels = [
                ndimage.zoom(gather, (300/10001, 1259/31), order=1)
                for gather in gathers
            ]

        # One channel per source position
        stacked = np.stack(channels, axis=0)
        return torch.from_numpy(stacked), torch.from_numpy(velocity)

    @staticmethod
    def _stride_pad_transpose(gather):
        """Keep every 32nd row, pad 31 columns up to 32, and transpose."""
        strided = gather[::32, :]
        if strided.shape[1] == 31:
            # Append a single zero column on the right
            strided = np.pad(strided, ((0, 0), (0, 1)), mode='constant')
        return strided.T


In [None]:
# Helper function to match tensor sizes for skip connections
def match_tensor_size(tensor1, tensor2):
    """Match the spatial dimensions of tensor1 to tensor2 by cropping or padding.

    Height and width are handled independently, so a tensor that is too short
    on one axis and too long on the other is both padded and cropped. Padding
    (zeros) and cropping are applied at the bottom/right edge only.

    Args:
        tensor1: 4-D tensor (N, C, H1, W1) to be resized.
        tensor2: 4-D tensor (N, C, H2, W2) supplying the target H2 x W2.

    Returns:
        tensor1 with spatial size (H2, W2). The input object itself is
        returned when the sizes already agree.
    """
    _, _, h1, w1 = tensor1.shape
    _, _, h2, w2 = tensor2.shape

    # Crop any axis on which tensor1 is larger than the target
    if h1 > h2 or w1 > w2:
        tensor1 = tensor1[:, :, :min(h1, h2), :min(w1, w2)]

    # Zero-pad any axis on which tensor1 is smaller than the target
    pad_h = max(0, h2 - h1)
    pad_w = max(0, w2 - w1)
    if pad_h > 0 or pad_w > 0:
        tensor1 = F.pad(tensor1, (0, pad_w, 0, pad_h))

    return tensor1

# MAPE Loss Function
def mape_loss(predictions, targets, epsilon=1e-8):
    """Mean absolute percentage error between predictions and targets.

    Args:
        predictions: Tensor of predicted values.
        targets: Tensor of reference values, broadcastable with predictions.
        epsilon: Lower bound applied to |targets| so the division never hits zero.

    Returns:
        Scalar tensor: mean of |targets - predictions| / max(|targets|, epsilon),
        multiplied by 100.
    """
    denominator = targets.abs().clamp(min=epsilon)
    relative_error = ((targets - predictions) / denominator).abs()
    return relative_error.mean() * 100


In [None]:
# Attention Block
class AttentionBlock(nn.Module):
    """Self-attention over all spatial positions with a learnable residual gate.

    Query and key are 1x1 convolutions to ``in_channels // 8`` channels (at
    least one); value is a 1x1 convolution keeping ``in_channels``. The
    attention output is scaled by ``gamma`` (initialised to zero, so the block
    starts as an identity mapping) and added back to the input.

    Args:
        in_channels: Number of channels of the input feature map.
        dropout: Channel-dropout probability applied to the attention branch
            before the residual sum. Previously this argument was accepted but
            never used in ``forward``.
    """

    def __init__(self, in_channels, dropout=0.05):
        super(AttentionBlock, self).__init__()
        # Keep at least one query/key channel when in_channels < 8
        reduced_channels = max(in_channels // 8, 1)
        self.conv1 = nn.Conv2d(in_channels, reduced_channels, 1)  # query projection
        self.conv2 = nn.Conv2d(in_channels, reduced_channels, 1)  # key projection
        self.conv3 = nn.Conv2d(in_channels, in_channels, 1)       # value projection
        self.gamma = nn.Parameter(torch.zeros(1))
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout2d(dropout)

    def forward(self, x):
        """Return ``gamma * dropout(attention(x)) + x`` with the same shape as x."""
        batch_size, channels, height, width = x.size()
        num_positions = width * height

        query = self.conv1(x).view(batch_size, -1, num_positions).permute(0, 2, 1)  # (B, N, C')
        key = self.conv2(x).view(batch_size, -1, num_positions)                     # (B, C', N)
        value = self.conv3(x).view(batch_size, -1, num_positions)                   # (B, C, N)

        # (B, N, N): each row is a distribution over all positions
        attention = torch.bmm(query, key)
        attention = self.softmax(attention)

        out = torch.bmm(value, attention.permute(0, 2, 1))
        out = out.view(batch_size, channels, height, width)

        # Regularise only the attention branch; the skip path stays untouched
        out = self.dropout(out)

        return self.gamma * out + x

# Double Convolution Block
class DoubleConv(nn.Module):
    """Two 3x3 conv -> batch-norm -> ReLU stages with channel dropout in between.

    Spatial size is preserved (padding=1).

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by both convolutions.
        dropout: Probability for the ``Dropout2d`` placed after the first stage.
    """

    def __init__(self, in_channels, out_channels, dropout=0.05):
        super().__init__()
        first_stage = [
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]
        regulariser = [nn.Dropout2d(dropout)]
        second_stage = [
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]
        # Layer order (and thus state-dict indices 0..6) is unchanged
        self.conv = nn.Sequential(*first_stage, *regulariser, *second_stage)

    def forward(self, x):
        """Run the input through the stacked layers."""
        return self.conv(x)

# U-Net Model
class UNet(nn.Module):
    """Four-level U-Net with a self-attention bottleneck.

    Args:
        in_channels: Channels of the input tensor. In adaptive mode this is
            the input to the convolutional stem (previously hard-coded to 5,
            which silently ignored this argument).
        out_channels: Channels produced by the final 1x1 convolution.
        use_adaptive: If True, a two-layer convolutional stem lifts the input
            to 32 channels before the encoder, and the output is bilinearly
            resized to ``output_size``.
        output_size: (height, width) the adaptive output is resized to.
            Defaults to the previously hard-coded (300, 1259).
    """

    def __init__(self, in_channels=5, out_channels=1, use_adaptive=False, output_size=(300, 1259)):
        super(UNet, self).__init__()
        self.use_adaptive = use_adaptive
        self.output_size = tuple(output_size)

        if use_adaptive:
            # First process the irregular input
            self.input_processor = nn.Sequential(
                nn.Conv2d(in_channels, 16, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
                nn.Conv2d(16, 32, kernel_size=3, padding=1),
                nn.ReLU(inplace=True)
            )
            # The stem emits 32 channels, which become the encoder input
            in_channels = 32

        # Encoder
        self.enc1 = DoubleConv(in_channels, 64)
        self.pool1 = nn.MaxPool2d(2)
        self.enc2 = DoubleConv(64, 128)
        self.pool2 = nn.MaxPool2d(2)
        self.enc3 = DoubleConv(128, 256)
        self.pool3 = nn.MaxPool2d(2)
        self.enc4 = DoubleConv(256, 512)
        self.pool4 = nn.MaxPool2d(2)

        # Bottleneck with attention
        self.bottleneck = DoubleConv(512, 1024)
        self.attention = AttentionBlock(1024)

        # Decoder (each DoubleConv sees upsampled + skip channels concatenated)
        self.up4 = nn.ConvTranspose2d(1024, 512, 2, stride=2)
        self.dec4 = DoubleConv(1024, 512)
        self.up3 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec3 = DoubleConv(512, 256)
        self.up2 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec2 = DoubleConv(256, 128)
        self.up1 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec1 = DoubleConv(128, 64)

        # Final output
        self.final_conv = nn.Conv2d(64, out_channels, 1)

        if use_adaptive:
            # NOTE: forward() never calls this module (it resizes with
            # F.interpolate instead). It is kept so the parameter set and
            # state-dict keys stay the same as before.
            self.final_upsample = nn.Sequential(
                nn.ConvTranspose2d(1, 1, kernel_size=4, stride=2, padding=1),
                nn.ReLU(inplace=True)
            )

    def forward(self, x):
        """Run the network on ``x`` of shape (batch, in_channels, H, W)."""
        if self.use_adaptive:
            # Process irregular input first
            x = self.input_processor(x)

        # Encoder
        e1 = self.enc1(x)
        e2 = self.enc2(self.pool1(e1))
        e3 = self.enc3(self.pool2(e2))
        e4 = self.enc4(self.pool3(e3))

        # Bottleneck
        b = self.bottleneck(self.pool4(e4))
        b = self.attention(b)

        # Decoder with skip connections; sizes are reconciled before each
        # concat because pooling floors odd dimensions
        d4 = self.up4(b)
        d4 = match_tensor_size(d4, e4)
        d4 = torch.cat([d4, e4], dim=1)
        d4 = self.dec4(d4)

        d3 = self.up3(d4)
        d3 = match_tensor_size(d3, e3)
        d3 = torch.cat([d3, e3], dim=1)
        d3 = self.dec3(d3)

        d2 = self.up2(d3)
        d2 = match_tensor_size(d2, e2)
        d2 = torch.cat([d2, e2], dim=1)
        d2 = self.dec2(d2)

        d1 = self.up1(d2)
        d1 = match_tensor_size(d1, e1)
        d1 = torch.cat([d1, e1], dim=1)
        d1 = self.dec1(d1)

        # Final output
        output = self.final_conv(d1)

        if self.use_adaptive:
            # Resize to the configured target grid
            output = F.interpolate(output, size=self.output_size, mode='bilinear', align_corners=False)

        return output

In [None]:
class StabilizedUNet(nn.Module):
    """U-Net variant with batch-normalised stem, dropout, and Kaiming initialisation.

    Args:
        in_channels: Channels of the input tensor. In adaptive mode this is
            the input to the convolutional stem (previously hard-coded to 5,
            which silently ignored this argument).
        out_channels: Channels produced by the final 1x1 convolution.
        use_adaptive: If True, a conv/batch-norm stem lifts the input to 32
            channels before the encoder, and the output is bilinearly resized
            to ``output_size``.
        dropout: Dropout probability forwarded to the stem, every
            ``DoubleConv`` and the ``AttentionBlock``.
        output_size: (height, width) the adaptive output is resized to.
            Defaults to the previously hard-coded (300, 1259).
    """

    def __init__(self, in_channels=5, out_channels=1, use_adaptive=True, dropout=0.05,
                 output_size=(300, 1259)):
        super(StabilizedUNet, self).__init__()
        self.use_adaptive = use_adaptive
        self.output_size = tuple(output_size)

        if use_adaptive:
            self.input_processor = nn.Sequential(
                nn.Conv2d(in_channels, 16, kernel_size=3, padding=1),
                nn.BatchNorm2d(16),
                nn.ReLU(inplace=True),
                nn.Dropout2d(dropout),
                nn.Conv2d(16, 32, kernel_size=3, padding=1),
                nn.BatchNorm2d(32),
                nn.ReLU(inplace=True)
            )
            # The stem emits 32 channels, which become the encoder input
            in_channels = 32

        # Encoder with dropout
        self.enc1 = DoubleConv(in_channels, 64, dropout)
        self.pool1 = nn.MaxPool2d(2)
        self.enc2 = DoubleConv(64, 128, dropout)
        self.pool2 = nn.MaxPool2d(2)
        self.enc3 = DoubleConv(128, 256, dropout)
        self.pool3 = nn.MaxPool2d(2)
        self.enc4 = DoubleConv(256, 512, dropout)
        self.pool4 = nn.MaxPool2d(2)

        # Bottleneck with attention
        self.bottleneck = DoubleConv(512, 1024, dropout)
        self.attention = AttentionBlock(1024, dropout)

        # Decoder (each DoubleConv sees upsampled + skip channels concatenated)
        self.up4 = nn.ConvTranspose2d(1024, 512, 2, stride=2)
        self.dec4 = DoubleConv(1024, 512, dropout)
        self.up3 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec3 = DoubleConv(512, 256, dropout)
        self.up2 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec2 = DoubleConv(256, 128, dropout)
        self.up1 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec1 = DoubleConv(128, 64, dropout)

        # Final output
        self.final_conv = nn.Conv2d(64, out_channels, 1)

        # Initialize weights for stability
        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal conv weights, zero conv biases, unit/zero batch-norm affine."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                # Only the weight is re-initialised; the bias keeps its default init
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')

    def forward(self, x):
        """Run the network on ``x`` of shape (batch, in_channels, H, W)."""
        if self.use_adaptive:
            x = self.input_processor(x)

        # Encoder
        e1 = self.enc1(x)
        e2 = self.enc2(self.pool1(e1))
        e3 = self.enc3(self.pool2(e2))
        e4 = self.enc4(self.pool3(e3))

        # Bottleneck
        b = self.bottleneck(self.pool4(e4))
        b = self.attention(b)

        # Decoder with skip connections; sizes are reconciled before each
        # concat because pooling floors odd dimensions
        d4 = self.up4(b)
        d4 = self._match_tensor_size(d4, e4)
        d4 = torch.cat([d4, e4], dim=1)
        d4 = self.dec4(d4)

        d3 = self.up3(d4)
        d3 = self._match_tensor_size(d3, e3)
        d3 = torch.cat([d3, e3], dim=1)
        d3 = self.dec3(d3)

        d2 = self.up2(d3)
        d2 = self._match_tensor_size(d2, e2)
        d2 = torch.cat([d2, e2], dim=1)
        d2 = self.dec2(d2)

        d1 = self.up1(d2)
        d1 = self._match_tensor_size(d1, e1)
        d1 = torch.cat([d1, e1], dim=1)
        d1 = self.dec1(d1)

        # Final output
        output = self.final_conv(d1)

        if self.use_adaptive:
            # Resize to the configured target grid
            output = F.interpolate(output, size=self.output_size, mode='bilinear', align_corners=False)

        return output

    def _match_tensor_size(self, tensor1, tensor2):
        """Crop and/or zero-pad tensor1 (bottom/right) to tensor2's H x W.

        Height and width are handled independently, so a tensor that is too
        short on one axis and too long on the other is both padded and cropped.
        """
        _, _, h1, w1 = tensor1.shape
        _, _, h2, w2 = tensor2.shape

        # Crop any axis on which tensor1 is larger than the target
        if h1 > h2 or w1 > w2:
            tensor1 = tensor1[:, :, :min(h1, h2), :min(w1, w2)]

        # Zero-pad any axis on which tensor1 is smaller than the target
        pad_h = max(0, h2 - h1)
        pad_w = max(0, w2 - w1)
        if pad_h > 0 or pad_w > 0:
            tensor1 = F.pad(tensor1, (0, pad_w, 0, pad_h))

        return tensor1

In [None]:
def train_epoch_fast(model, train_loader, optimizer):
    """Run one optimisation pass over ``train_loader`` with no progress output.

    Args:
        model: Network to train; switched to training mode here.
        train_loader: Iterable of ``(inputs, targets)`` batches.
        optimizer: Optimizer stepping ``model``'s parameters.

    Returns:
        Mean of the per-batch MAPE losses.
    """
    model.train()
    running_loss, batches_seen = 0.0, 0

    for step, (inputs, targets) in enumerate(train_loader):
        inputs = inputs.to(DEVICE, non_blocking=True)
        # Targets gain a channel axis so they line up with the model output
        targets = targets.to(DEVICE, non_blocking=True).unsqueeze(1)

        optimizer.zero_grad()
        loss = mape_loss(model(inputs), targets)
        loss.backward()

        # Clip the global gradient norm before the update for stability
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=GRADIENT_CLIP_NORM)
        optimizer.step()

        running_loss += loss.item()
        batches_seen += 1

        # Release cached GPU blocks every 20th batch
        if step % 20 == 0:
            torch.cuda.empty_cache()

    return running_loss / batches_seen

def validate_epoch_fast(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` without gradients or progress output.

    Args:
        model: Network to evaluate; switched to eval mode here.
        val_loader: Iterable of ``(inputs, targets)`` batches.

    Returns:
        Mean of the per-batch MAPE losses.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for step, (inputs, targets) in enumerate(val_loader):
            inputs = inputs.to(DEVICE, non_blocking=True)
            # Targets gain a channel axis so they line up with the model output
            targets = targets.to(DEVICE, non_blocking=True).unsqueeze(1)

            batch_losses.append(mape_loss(model(inputs), targets).item())

            # Release cached GPU blocks every 20th batch
            if step % 20 == 0:
                torch.cuda.empty_cache()

    return sum(batch_losses) / len(batch_losses)

def fast_train_single_fold(data_dir, fold_info, seed, result_dir):
    """Train one StabilizedUNet on a single fold with a fixed seed.

    Args:
        data_dir: Directory holding the ``TrainingData_<index>`` sample folders.
        fold_info: Dict with keys ``'fold'``, ``'train_indices'`` and
            ``'val_indices'`` (as produced by ``create_kfold_splits``).
        seed: Seed passed to ``setup_deterministic_training``.
        result_dir: ``Path`` where the best checkpoint
            ``model_fold_<fold>_seed_<seed>.pth`` is written.

    Returns:
        Dict with ``fold``, ``seed``, ``best_val_loss``, ``final_epoch``,
        ``total_time``, ``train_losses``, ``val_losses`` and ``avg_epoch_time``.
    """

    # AGGRESSIVE MEMORY CLEANUP AT START
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    gc.collect()

    fold_num = fold_info['fold']

    print(f"🚀 Training Fold {fold_num} with Seed {seed}")
    start_time = time.time()

    # Setup deterministic training
    setup_deterministic_training(seed)

    # Create model
    model = StabilizedUNet(in_channels=5, out_channels=1, use_adaptive=USE_ADAPTIVE_MODEL,dropout=DROPOUT)
    model.to(DEVICE)

    # Optimized DataLoader settings for A100
    train_dataset = SeismicDataset(data_dir, fold_info['train_indices'], use_adaptive=USE_ADAPTIVE_MODEL)
    val_dataset = SeismicDataset(data_dir, fold_info['val_indices'], use_adaptive=USE_ADAPTIVE_MODEL)

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,  # Optimized for stability
        pin_memory=True,
        persistent_workers=False,  # More stable
        prefetch_factor=1
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        persistent_workers=False,
        prefetch_factor=1
    )

    # Optimizer and scheduler
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.005)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-6)

    # Training tracking
    best_val_loss = float('inf')
    patience_counter = 0
    epoch_times = []
    train_losses = []
    val_losses = []

    # Training loop
    for epoch in range(NUM_EPOCHS):
        epoch_start = time.time()

        # Train and validate
        train_loss = train_epoch_fast(model, train_loader, optimizer)
        val_loss = validate_epoch_fast(model, val_loader)

        # Update scheduler. CosineAnnealingWarmRestarts.step() takes an optional
        # *epoch* position, not a metric: passing val_loss made the scheduler
        # jump to "epoch = <loss value>", so the LR bounced around the cosine
        # curve. Calling it with no argument advances one epoch.
        old_lr = optimizer.param_groups[0]['lr']
        scheduler.step()
        current_lr = optimizer.param_groups[0]['lr']

        # Track metrics
        epoch_time = time.time() - epoch_start
        epoch_times.append(epoch_time)
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # Progress reporting (minimal)
        if epoch % 5 == 0 or val_loss < best_val_loss:
            print(f"  Epoch {epoch+1:2d}: Train={train_loss:.4f}%, Val={val_loss:.4f}%, LR={current_lr:.6f}, Time={epoch_time:.1f}s")

        # With cosine annealing the LR changes every epoch, so only the warm
        # restart (the jump back up) is announced.
        if current_lr > old_lr:
            print(f"    🔄 Warm restart: learning rate {old_lr:.2e} → {current_lr:.2e}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0

            # Save model checkpoint
            model_path = result_dir / f'model_fold_{fold_num}_seed_{seed}.pth'
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'epoch': epoch,
                'val_loss': best_val_loss,
                'fold': fold_num,
                'seed': seed,
                'train_losses': train_losses,
                'val_losses': val_losses
            }, model_path)

        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= EARLY_STOPPING_PATIENCE:
            print(f"    ⏹️  Early stopping at epoch {epoch+1}")
            break

        if current_lr < 1e-7:
            print(f"    ⏹️  Learning rate too low: {current_lr:.2e}")
            break

    total_time = time.time() - start_time
    print(f"✅ Fold {fold_num} Seed {seed} complete: Best Val MAPE = {best_val_loss:.4f}% in {total_time/60:.1f}min")

    # Cleanup
    del model, optimizer, scheduler, train_loader, val_loader
    torch.cuda.empty_cache()
    gc.collect()

    return {
        'fold': fold_num,
        'seed': seed,
        'best_val_loss': best_val_loss,
        'final_epoch': epoch + 1,
        'total_time': total_time,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'avg_epoch_time': np.mean(epoch_times)
    }

In [None]:
class OptimizedMultiSeedTrainer:
    """Trains one model per (fold, seed) pair and reports/saves the outcomes.

    Args:
        data_dir: Directory holding the training samples.
        result_dir: Directory for checkpoints and result files (created if missing).
        k_folds: Number of cross-validation folds.
        seeds: Seeds to train per fold; falls back to ``ENSEMBLE_SEEDS`` when
            omitted or empty.
    """

    def __init__(self, data_dir, result_dir, k_folds=5, seeds=None):
        self.data_dir = Path(data_dir)
        self.result_dir = Path(result_dir)
        self.k_folds = k_folds
        self.seeds = seeds or ENSEMBLE_SEEDS
        # Filled by run_full_training: {fold number: [result dict per seed]}
        self.results = {}

        # Create results directory
        self.result_dir.mkdir(parents=True, exist_ok=True)

        print(f"🎯 Multi-Seed Trainer initialized")
        print(f"   Data: {self.data_dir}")
        print(f"   Results: {self.result_dir}")
        print(f"   Seeds: {self.seeds}")
        print(f"   Total models to train: {k_folds * len(self.seeds)}")

    def create_kfold_splits(self, dataset_size=2000):
        """Create K-fold splits (same as Phase 2b)

        Sample indices run from 1 to ``dataset_size`` inclusive and are
        shuffled with a fixed ``random_state`` so every seed sees the same folds.

        Returns:
            List of dicts with ``fold``, ``train_indices``, ``val_indices``,
            ``train_size`` and ``val_size``.
        """
        indices = np.arange(1, dataset_size + 1)
        kfold = KFold(n_splits=self.k_folds, shuffle=True, random_state=42)

        splits = []
        for fold, (train_idx, val_idx) in enumerate(kfold.split(indices)):
            train_samples = indices[train_idx]
            val_samples = indices[val_idx]

            splits.append({
                'fold': fold,
                'train_indices': train_samples,
                'val_indices': val_samples,
                'train_size': len(train_samples),
                'val_size': len(val_samples)
            })

        print(f"📊 Created {self.k_folds}-fold splits")
        return splits

    def train_fold_ensemble(self, fold_info):
        """Train multiple seeds for single fold

        A failure for one seed is recorded as a result dict carrying an
        ``'error'`` key and does not abort the remaining seeds.

        Returns:
            List with one result dict per seed.
        """
        fold_num = fold_info['fold']
        fold_results = []

        print(f"\n🎯 Training Fold {fold_num} with {len(self.seeds)} seeds")
        fold_start_time = time.time()

        for seed in self.seeds:
            try:
                result = fast_train_single_fold(self.data_dir, fold_info, seed, self.result_dir)
                fold_results.append(result)

                # Brief pause between models
                time.sleep(2)

            except Exception as e:
                print(f"❌ Error training Fold {fold_num} Seed {seed}: {str(e)}")
                fold_results.append({
                    'fold': fold_num,
                    'seed': seed,
                    'best_val_loss': float('inf'),
                    'error': str(e)
                })

        fold_time = time.time() - fold_start_time

        # Analyze fold ensemble
        valid_results = [r for r in fold_results if 'error' not in r]
        if valid_results:
            performances = [r['best_val_loss'] for r in valid_results]
            mean_perf = np.mean(performances)
            std_perf = np.std(performances)
            best_perf = np.min(performances)

            print(f"📈 Fold {fold_num} ensemble results:")
            print(f"   Mean MAPE: {mean_perf:.4f}% ± {std_perf:.4f}%")
            print(f"   Best MAPE: {best_perf:.4f}%")
            print(f"   Seeds: {[r['seed'] for r in valid_results]}")
            print(f"   Time: {fold_time/60:.1f} minutes")

        return fold_results

    def run_full_training(self):
        """Run complete multi-seed ensemble training

        Results are written to disk before the analysis is printed, so an
        error while reporting cannot discard a finished training run.

        Returns:
            Dict mapping fold number to that fold's list of result dicts.
        """
        print(f"\n{'='*60}")
        print(f"🚀 STARTING OPTIMIZED MULTI-SEED ENSEMBLE TRAINING")
        print(f"{'='*60}")

        start_time = time.time()

        # Create fold splits
        splits = self.create_kfold_splits()

        # Train all folds
        all_results = {}
        for fold_info in splits:
            fold_results = self.train_fold_ensemble(fold_info)
            all_results[fold_info['fold']] = fold_results

        total_time = time.time() - start_time
        self.results = all_results

        # Save results first: they represent hours of compute
        self.save_results(all_results)

        # Comprehensive analysis
        self.analyze_results(all_results, total_time)

        return all_results

    def analyze_results(self, all_results, total_time):
        """Comprehensive results analysis

        Prints a per-fold table and overall statistics. Folds without any
        successful model are shown as FAILED and are excluded from the
        cross-fold means (previously their ``inf`` placeholder turned those
        means into ``inf``).

        Args:
            all_results: Dict mapping fold number to list of result dicts.
            total_time: Wall-clock training time in seconds.
        """
        print(f"\n{'='*60}")
        print(f"📊 COMPREHENSIVE ENSEMBLE ANALYSIS")
        print(f"{'='*60}")

        # Extract all valid results
        all_performances = []
        fold_best = []
        fold_ensembles = []

        print(f"\n📋 Individual Fold Analysis:")
        print(f"{'Fold':<6} {'Best':<8} {'Mean':<8} {'Std':<8} {'Seeds':<12}")
        print("-" * 50)

        for fold_num in range(self.k_folds):
            fold_results = all_results[fold_num]
            valid_results = [r for r in fold_results if 'error' not in r]

            if valid_results:
                performances = [r['best_val_loss'] for r in valid_results]
                all_performances.extend(performances)

                fold_mean = np.mean(performances)
                fold_std = np.std(performances)
                fold_min = np.min(performances)

                fold_best.append(fold_min)
                fold_ensembles.append(fold_mean)

                print(f"{fold_num:<6} {fold_min:<8.4f} {fold_mean:<8.4f} {fold_std:<8.4f} {len(valid_results):<12}")
            else:
                print(f"{fold_num:<6} {'FAILED':<8} {'FAILED':<8} {'FAILED':<8} {'0':<12}")
                fold_best.append(float('inf'))
                fold_ensembles.append(float('inf'))

        # Overall statistics
        if all_performances:
            # Only folds that produced a finite score contribute to the means
            finite_ensembles = [v for v in fold_ensembles if np.isfinite(v)]
            finite_best = [v for v in fold_best if np.isfinite(v)]

            overall_mean = np.mean(all_performances)
            overall_std = np.std(all_performances)
            overall_best = np.min(all_performances)
            ensemble_mean = np.mean(finite_ensembles) if finite_ensembles else float('inf')

            print(f"\n🎯 Overall Performance Summary:")
            print(f"   Total models trained: {len(all_performances)}")
            print(f"   Overall mean MAPE: {overall_mean:.4f}% ± {overall_std:.4f}%")
            print(f"   Best single model: {overall_best:.4f}%")
            print(f"   Expected ensemble MAPE: {ensemble_mean:.4f}%")
            print(f"   Total training time: {total_time/3600:.2f} hours")
            print(f"   Average time per model: {total_time/len(all_performances)/60:.1f} minutes")

            # Performance improvements
            best_fold_mean = np.mean(finite_best) if finite_best else float('inf')
            print(f"\n📈 Performance Analysis:")
            print(f"   Best single fold performance: {np.min(fold_best):.4f}%")
            print(f"   Mean of best fold performances: {best_fold_mean:.4f}%")
            print(f"   Stability improvement: {overall_std:.4f}% standard deviation")

            # Competition readiness
            target_performance = 3.0
            models_below_target = sum(1 for p in all_performances if p < target_performance)
            print(f"\n🏆 Competition Analysis:")
            print(f"   Models below 3.0% target: {models_below_target}/{len(all_performances)}")
            print(f"   Success rate: {models_below_target/len(all_performances)*100:.1f}%")

            if overall_best < target_performance:
                print(f"   🎉 TARGET ACHIEVED! Best model: {overall_best:.4f}%")
            else:
                gap = overall_best - target_performance
                print(f"   🎯 Gap to target: {gap:.4f}%")

    def save_results(self, all_results):
        """Save comprehensive results

        Writes ``ensemble_results.json`` (per-model results without the loss
        curves) and ``training_summary.txt`` (per-fold best/mean/std) into
        ``self.result_dir``.
        """
        import json

        # Save detailed results
        results_file = self.result_dir / 'ensemble_results.json'

        # Convert to JSON-serializable format (loss curves are dropped)
        json_results = {}
        for fold_num, fold_results in all_results.items():
            json_results[f'fold_{fold_num}'] = []
            for result in fold_results:
                json_result = {k: v for k, v in result.items() if k not in ['train_losses', 'val_losses']}
                json_results[f'fold_{fold_num}'].append(json_result)

        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(json_results, f, indent=2)

        # Create summary file. The text contains "±", so the encoding is
        # pinned instead of depending on the platform default.
        summary_file = self.result_dir / 'training_summary.txt'
        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write("Phase 2c Multi-Seed Ensemble Training Summary\n")
            f.write("=" * 50 + "\n\n")

            for fold_num in range(self.k_folds):
                fold_results = all_results[fold_num]
                valid_results = [r for r in fold_results if 'error' not in r]

                if valid_results:
                    performances = [r['best_val_loss'] for r in valid_results]
                    f.write(f"Fold {fold_num}:\n")
                    f.write(f"  Best: {np.min(performances):.4f}%\n")
                    f.write(f"  Mean: {np.mean(performances):.4f}% ± {np.std(performances):.4f}%\n")
                    f.write(f"  Seeds: {[r['seed'] for r in valid_results]}\n\n")

        print(f"💾 Results saved to:")
        print(f"   {results_file}")
        print(f"   {summary_file}")

In [None]:
def main():
    """Main execution function

    Configures the CUDA allocator, clears GPU/Python memory, builds an
    ``OptimizedMultiSeedTrainer`` from the notebook-level configuration and
    runs the full K-fold x multi-seed training.

    Returns:
        The dict returned by ``run_full_training`` (fold number -> list of
        per-seed result dicts).
    """

    # Set memory management environment variable before any CUDA call in this
    # function. NOTE(review): the allocator presumably reads this setting only
    # once, when it is first initialised, so it may still have no effect if
    # CUDA was already used earlier in the kernel session - confirm.
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    gc.collect()

    # Setup paths
    data_dir = Path('/content/drive/MyDrive/ThinkOnward/Data/Train')
    result_dir = Path('/content/drive/MyDrive/ThinkOnward/Result/Phase2c')

    print(f"🔧 Phase 2c Optimized Training Pipeline")
    print(f"   Target: Sub-3.0% MAPE with stable ensemble")
    print(f"   Strategy: {K_FOLDS} folds × {len(ENSEMBLE_SEEDS)} seeds = {K_FOLDS * len(ENSEMBLE_SEEDS)} models")
    print(f"   Hardware: A100 GPU with batch_size={BATCH_SIZE}")

    # Initialize trainer
    trainer = OptimizedMultiSeedTrainer(
        data_dir=data_dir,
        result_dir=result_dir,
        k_folds=K_FOLDS,
        seeds=ENSEMBLE_SEEDS
    )

    # Run training
    results = trainer.run_full_training()

    print(f"\n✅ Phase 2c Training Complete!")
    print(f"Ready for diffusion model development in Phase 3")

    return results

In [None]:
# Run the optimized training for original Unet
# NOTE(review): fast_train_single_fold as defined in this notebook always builds
# StabilizedUNet, so the output recorded under this cell presumably comes from an
# earlier version of that function that used UNet - confirm before comparing runs.
if __name__ == "__main__":
    results = main()

🔧 Phase 2c Optimized Training Pipeline
   Target: Sub-3.0% MAPE with stable ensemble
   Strategy: 5 folds × 3 seeds = 15 models
   Hardware: A100 GPU with batch_size=32
🎯 Multi-Seed Trainer initialized
   Data: /content/drive/MyDrive/ThinkOnward/Data/Train
   Results: /content/drive/MyDrive/ThinkOnward/Result/Phase2c
   Seeds: [42, 123, 456]
   Total models to train: 15

🚀 STARTING OPTIMIZED MULTI-SEED ENSEMBLE TRAINING
📊 Created 5-fold splits

🎯 Training Fold 0 with 3 seeds
🚀 Training Fold 0 with Seed 42
✅ Deterministic training setup complete with seed 42


  attention = torch.bmm(query, key)
  out = torch.bmm(value, attention.permute(0, 2, 1))
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


  Epoch  1: Train=77.3063%, Val=95.8216%, LR=0.000050, Time=1680.4s
  Epoch  2: Train=57.2865%, Val=54.2104%, LR=0.000050, Time=34.5s
  Epoch  3: Train=50.6860%, Val=48.6783%, LR=0.000050, Time=41.7s
  Epoch  5: Train=45.3088%, Val=48.0459%, LR=0.000050, Time=33.0s
  Epoch  6: Train=43.3685%, Val=43.2842%, LR=0.000050, Time=41.8s
  Epoch  7: Train=41.6620%, Val=41.8366%, LR=0.000050, Time=41.7s
  Epoch  9: Train=38.3403%, Val=39.6044%, LR=0.000050, Time=32.6s
  Epoch 11: Train=35.1564%, Val=29.5073%, LR=0.000050, Time=33.1s
  Epoch 16: Train=28.5392%, Val=38.6225%, LR=0.000050, Time=33.0s
    📉 Learning rate reduced: 5.00e-05 → 2.50e-05
  Epoch 19: Train=25.5515%, Val=29.3514%, LR=0.000025, Time=33.0s
  Epoch 21: Train=24.1497%, Val=24.8773%, LR=0.000025, Time=33.1s
  Epoch 26: Train=21.2963%, Val=24.2036%, LR=0.000025, Time=33.7s
✅ Fold 0 Seed 42 complete: Best Val MAPE = 24.2036% in 45.7min
🚀 Training Fold 0 with Seed 123
✅ Deterministic training setup complete with seed 123
  Epoch 

In [None]:
# Run the optimized training for StabilizedUnet
# Same call as the other run cells: main() trains K_FOLDS x len(ENSEMBLE_SEEDS)
# models and returns the per-fold result dicts.
if __name__ == "__main__":
    results = main()

🔧 Phase 2c Optimized Training Pipeline
   Target: Sub-3.0% MAPE with stable ensemble
   Strategy: 5 folds × 3 seeds = 15 models
   Hardware: A100 GPU with batch_size=32
🎯 Multi-Seed Trainer initialized
   Data: /content/drive/MyDrive/ThinkOnward/Data/Train
   Results: /content/drive/MyDrive/ThinkOnward/Result/Phase2c
   Seeds: [42, 123, 456]
   Total models to train: 15

🚀 STARTING OPTIMIZED MULTI-SEED ENSEMBLE TRAINING
📊 Created 5-fold splits

🎯 Training Fold 0 with 3 seeds
🚀 Training Fold 0 with Seed 42
✅ Deterministic training setup complete with seed 42


  attention = torch.bmm(query, key)
  out = torch.bmm(value, attention.permute(0, 2, 1))
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


  Epoch  1: Train=136.7605%, Val=38.4531%, LR=0.000050, Time=5513.2s
  Epoch  2: Train=41.3112%, Val=20.9334%, LR=0.000050, Time=36.0s
  Epoch  3: Train=23.9276%, Val=12.8215%, LR=0.000050, Time=43.9s
  Epoch  4: Train=18.3436%, Val=9.6654%, LR=0.000050, Time=44.7s
  Epoch  5: Train=15.4221%, Val=8.7806%, LR=0.000050, Time=43.6s
  Epoch  6: Train=13.5812%, Val=7.9705%, LR=0.000050, Time=43.6s
  Epoch  7: Train=12.3909%, Val=7.6224%, LR=0.000050, Time=44.9s
  Epoch  8: Train=11.5377%, Val=6.9995%, LR=0.000050, Time=44.1s
  Epoch  9: Train=10.8307%, Val=6.8610%, LR=0.000050, Time=44.4s
  Epoch 10: Train=10.1804%, Val=6.3582%, LR=0.000050, Time=43.9s
  Epoch 11: Train=9.8002%, Val=6.1080%, LR=0.000050, Time=44.4s
  Epoch 12: Train=9.3976%, Val=5.6898%, LR=0.000050, Time=44.7s
  Epoch 13: Train=8.8665%, Val=5.5334%, LR=0.000050, Time=44.2s
  Epoch 14: Train=8.4973%, Val=5.4660%, LR=0.000050, Time=44.5s
  Epoch 16: Train=8.0429%, Val=4.9705%, LR=0.000050, Time=34.9s
  Epoch 18: Train=7.5495

In [None]:
# Run the optimized training for StabilizedUnet and tweaked parameters for performance improvement
# NOTE(review): the tweaked values are not set in this cell; they presumably refer
# to the scheduler/optimizer settings inside fast_train_single_fold and the
# configuration cell - confirm which values differ from the previous run.
if __name__ == "__main__":
    results = main()

🔧 Phase 2c Optimized Training Pipeline
   Target: Sub-3.0% MAPE with stable ensemble
   Strategy: 5 folds × 3 seeds = 15 models
   Hardware: A100 GPU with batch_size=32
🎯 Multi-Seed Trainer initialized
   Data: /content/drive/MyDrive/ThinkOnward/Data/Train
   Results: /content/drive/MyDrive/ThinkOnward/Result/Phase2c
   Seeds: [42, 123, 456]
   Total models to train: 15

🚀 STARTING OPTIMIZED MULTI-SEED ENSEMBLE TRAINING
📊 Created 5-fold splits

🎯 Training Fold 0 with 3 seeds
🚀 Training Fold 0 with Seed 42
✅ Deterministic training setup complete with seed 42


  attention = torch.bmm(query, key)
  out = torch.bmm(value, attention.permute(0, 2, 1))
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


  Epoch  1: Train=81.8903%, Val=26.2064%, LR=0.000010, Time=3347.3s
    📉 Learning rate reduced: 1.00e-04 → 9.53e-06
  Epoch  2: Train=26.2609%, Val=13.4480%, LR=0.000093, Time=35.1s
    📉 Learning rate reduced: 9.53e-06 → 9.29e-05
  Epoch  3: Train=19.5851%, Val=10.2425%, LR=0.000100, Time=42.5s
    📉 Learning rate reduced: 9.29e-05 → 1.00e-04
  Epoch  4: Train=14.3343%, Val=8.7828%, LR=0.000005, Time=42.3s
    📉 Learning rate reduced: 1.00e-04 → 4.58e-06
  Epoch  5: Train=11.9677%, Val=6.8735%, LR=0.000023, Time=42.2s
    📉 Learning rate reduced: 4.58e-06 → 2.30e-05
  Epoch  6: Train=11.5396%, Val=6.6761%, LR=0.000026, Time=42.5s
    📉 Learning rate reduced: 2.30e-05 → 2.56e-05
  Epoch  7: Train=11.1392%, Val=6.5268%, LR=0.000028, Time=42.5s
    📉 Learning rate reduced: 2.56e-05 → 2.77e-05
  Epoch  8: Train=10.5482%, Val=6.2672%, LR=0.000031, Time=42.4s
    📉 Learning rate reduced: 2.77e-05 → 3.13e-05
  Epoch  9: Train=10.1990%, Val=6.0243%, LR=0.000035, Time=42.2s
    📉 Learning rat