In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
import numpy as np
import sys
import os
import time
import copy  # To store model state if needed

# Make the project's `src` directory importable from this notebook.
# The path is resolved relative to the current working directory (one level up,
# then into `src`); change the first component if the notebook lives elsewhere.
module_path = os.path.abspath(os.path.join(os.pardir, 'src'))
if sys.path.count(module_path) == 0:
    # Only register the directory once, so re-running the cell is harmless.
    sys.path.append(module_path)

# Import modules from src
from utils import load_processed_data
from models import Model_1, Model_2, Model_3

In [16]:
# --- Configuration ---
# Tunable constants for the architecture comparison; the cells below read these names.
BATCH_SIZE = 128 # Training mini-batch size (can try 64, 256); the validation loader uses twice this
LEARNING_RATE = 1e-3 # A common default starting LR for Adam
N_MINIBATCHES = 15 # Number of training mini-batches (optimizer steps) run per architecture
EVAL_INTERVAL = 5 # Evaluate on validation set every X mini-batches (the final mini-batch is always evaluated too)
SEED = 42 # For reproducibility: seeds torch, numpy and the training DataLoader's shuffle generator

In [None]:
# --- Set Seed ---
# Seed every random number generator this notebook relies on.
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
# Note: MPS backend reproducibility might have limitations, but setting CPU/CUDA seeds is good practice.

# Pick the compute device in order of preference: CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

print(f"Using device: {DEVICE}")
print(f"Reproducibility seed set to: {SEED}")

In [None]:
# --- Load Data ---
# load_processed_data is imported from the project's utils module (not shown here).
# It must return exactly six values: the first four are kept as the training and
# validation features/labels, the last two are discarded
# (presumably the held-out test split — TODO confirm against utils).
print("Loading data...")
X_train, y_train, X_val, y_val, _, _ = load_processed_data()
print("Data loaded.")

In [19]:
# Create datasets
# TensorDataset pairs features and labels by index, so each item is an (x, y) tuple.
# Assumes X_* / y_* are already tensors whose first dimensions match — TODO confirm in load_processed_data.
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

In [20]:
# Create dataloaders
# shuffle=True draws a new random sample order every time the loader is iterated.
# The dedicated, seeded generator makes that order reproducible across notebook runs;
# its state advances with each new iterator, so successive passes over the loader see
# different orders unless the generator is re-seeded.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, generator=torch.Generator().manual_seed(SEED))
# No need to shuffle validation data: the metrics are sums over all samples, so order is irrelevant
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2) # Larger batch size for faster validation

In [21]:
# --- Define Training and Evaluation Functions ---

def train_one_step(model, batch, criterion, optimizer, device):
    """Run one optimisation step on a single mini-batch and report its loss.

    Args:
        model: Network to update; it is switched to training mode first.
        batch: An ``(inputs, targets)`` pair of tensors.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the update.
        device: Device both tensors are moved to before the forward pass.

    Returns:
        The mini-batch loss as a Python float. It is computed in the forward
        pass, i.e. before the parameter update of this step is applied.
    """
    model.train()

    features, labels = batch
    features, labels = features.to(device), labels.to(device)

    # Clear gradients left over from the previous step, then run
    # forward -> loss -> backward -> parameter update.
    optimizer.zero_grad()
    batch_loss = criterion(model(features), labels)
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

In [22]:
def evaluate(model, loader, criterion, device):
    """Compute the sample-weighted mean loss and top-1 accuracy over ``loader``.

    The model is switched to evaluation mode and left in that mode; gradient
    tracking is disabled for the whole pass.

    Args:
        model: Network to evaluate; called as ``model(inputs)`` and expected to
            return one score per class along dimension 1.
        loader: Iterable yielding ``(inputs, targets)`` tensor pairs.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
            The per-batch value is multiplied by the batch size before being
            summed, so the result is a true per-sample mean only if the
            criterion returns the batch mean (the ``nn.CrossEntropyLoss``
            default).
        device: Device the tensors are moved to before the forward pass.

    Returns:
        Tuple ``(avg_loss, accuracy)`` of Python floats, where ``accuracy`` is
        the fraction of samples whose highest-scoring class equals the target.

    Raises:
        ValueError: If ``loader`` yields no samples, since both averages would
            be undefined.
    """
    model.eval() # Set model to evaluation mode
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad(): # Disable gradient calculations during evaluation
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Weight by batch size so a smaller final batch does not skew the mean
            total_loss += loss.item() * inputs.size(0)

            # Predicted class = index of the largest score in each row
            predicted = outputs.argmax(dim=1)
            total_samples += targets.size(0)
            correct_predictions += (predicted == targets).sum().item()

    if total_samples == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError below
        raise ValueError("evaluate() received a loader that yielded no samples")

    avg_loss = total_loss / total_samples
    accuracy = correct_predictions / total_samples
    return avg_loss, accuracy

In [23]:
# --- Experiment Setup ---
# Maps a display label (used in log lines, plot legends and the summary table)
# to the model class to instantiate. Each class is constructed with no arguments.
# The sizes in the labels presumably describe the hidden-layer widths — TODO confirm in models.
model_architectures = {
    "Model_1 (128x128)": Model_1,
    "Model_2 (256x256)": Model_2,
    "Model_3 (256x128x64)": Model_3
}

# Filled by the experiment loop: label -> dict of recorded metrics for that architecture
results = {} # To store detailed results

# One loss object shared by training and validation of every architecture
criterion = nn.CrossEntropyLoss()

In [None]:
# --- Run Experiments ---
# For each candidate architecture: build a fresh model, train it for
# N_MINIBATCHES optimizer steps, evaluate on the validation set every
# EVAL_INTERVAL steps (and always after the last step), then store the
# collected metrics under results[name].

for name, ModelClass in model_architectures.items():
    print(f"\n--- Running Experiment for: {name} ---")
    # Rewind the loader's shuffling generator so that every architecture is
    # trained on the same sequence of mini-batches. Without this, differences
    # between models are confounded with the particular batches each one drew.
    if train_loader.generator is not None:
        train_loader.generator.manual_seed(SEED)
    train_iter = iter(train_loader) # Create a fresh iterator

    # Instantiate model and move to device
    # Note: Parameter initialization depends on the global torch RNG state, which
    # is not re-seeded inside this loop.
    model = ModelClass().to(DEVICE)
    print(model) # Print architecture details

    # Use Adam optimizer with a default learning rate
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # Store performance metrics
    minibatch_losses = []
    eval_batches = [] # Batch numbers where evaluation was performed
    val_losses = []
    val_accuracies = []

    total_eval_time = 0.0 # Seconds spent in validation, excluded from the training time
    start_time = time.time()

    # Training loop for N_MINIBATCHES
    batch_count = 0

    while batch_count < N_MINIBATCHES:
        try:
            # Fetch the next batch
            batch = next(train_iter)
        except StopIteration:
            # Training set exhausted before N_MINIBATCHES steps: start another pass
            print("Resetting train_loader iterator...")
            train_iter = iter(train_loader)
            batch = next(train_iter)

        # Perform one training step
        loss = train_one_step(model, batch, criterion, optimizer, DEVICE)
        minibatch_losses.append(loss)
        batch_count += 1

        # Intermediate Evaluation (always includes the final mini-batch)
        if batch_count % EVAL_INTERVAL == 0 or batch_count == N_MINIBATCHES:
            eval_start_time = time.time()
            val_loss, val_accuracy = evaluate(model, val_loader, criterion, DEVICE)
            eval_end_time = time.time()
            total_eval_time += eval_end_time - eval_start_time
            eval_batches.append(batch_count)
            val_losses.append(val_loss)
            val_accuracies.append(val_accuracy)
            print(f"  Batch {batch_count}/{N_MINIBATCHES} -> Train Loss (last batch): {loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f} (Eval took {eval_end_time - eval_start_time:.2f}s)")

    end_time = time.time()
    # A full validation pass is far larger than a handful of mini-batches, so
    # leaving it in would make the "training time" mostly measure evaluation.
    total_training_time = (end_time - start_time) - total_eval_time

    print(f"Finished {name}.")
    print(f"  Total Training Time for {N_MINIBATCHES} batches: {total_training_time:.2f} seconds")
    print(f"  Validation time (not included above): {total_eval_time:.2f} seconds")

    # Store results
    results[name] = {
        'minibatch_losses': minibatch_losses,
        'eval_batches': eval_batches, # e.g., [5, 10, 15]
        'val_losses': val_losses,     # List of val losses at eval points
        'val_accuracies': val_accuracies, # List of val accuracies at eval points
        # No evaluation happens when N_MINIBATCHES is 0; report NaN rather than crash
        'final_val_loss': val_losses[-1] if val_losses else float('nan'),
        'final_val_accuracy': val_accuracies[-1] if val_accuracies else float('nan'),
        'training_time': total_training_time # Seconds, validation excluded
    }

In [None]:
# Plot Training Loss (per mini-batch)
# The figure is created, drawn and shown entirely inside this cell: the inline
# backend closes figures when a cell finishes, so a figure cannot be continued
# from a later cell.
fig, ax = plt.subplots(figsize=(14, 4))
max_steps = 0
for name, data in results.items():
    losses = data['minibatch_losses']
    # Derive the x-axis from the recorded losses so the plot still works if
    # N_MINIBATCHES was edited after the experiments ran.
    ax.plot(range(1, len(losses) + 1), losses, label=f"{name}", alpha=0.8)
    max_steps = max(max_steps, len(losses))

ax.set_xlabel("Mini-batch Number")
ax.set_ylabel("Training Loss")
ax.set_title("Training Loss per Mini-batch")
if results:
    ax.legend() # Skipped when nothing was plotted to avoid an empty-legend warning
ax.grid(True)
ax.set_xticks(list(range(1, max_steps + 1)))

fig.tight_layout()
plt.show()

In [None]:
# Plot Validation Performance (at evaluation intervals)
# Uses its own figure: the inline backend closes figures at the end of each
# cell, so there is no earlier figure left to add a second subplot to.
fig, ax = plt.subplots(figsize=(14, 4))
eval_ticks = set()
for name, data in results.items():
    # Plot validation accuracy
    ax.plot(data['eval_batches'], data['val_accuracies'], label=f"{name} Val Acc", marker='o', linestyle='-')
    eval_ticks.update(data['eval_batches'])
    # Optionally plot validation loss on a secondary y-axis if scales differ too much
    # (Let's keep it simple for now and focus on accuracy)

ax.set_xlabel("Mini-batch Number (Evaluation Points)")
ax.set_ylabel("Validation Accuracy")
ax.set_title(f"Validation Accuracy at Evaluation Intervals (every {EVAL_INTERVAL} batches)")
if results:
    ax.legend() # Skipped when nothing was plotted to avoid an empty-legend warning
ax.grid(True)
# Tick at every step where any run was evaluated; also safe when results is empty
ax.set_xticks(sorted(eval_ticks))

fig.tight_layout() # Adjust layout to prevent overlap
plt.show()

In [None]:
# Print summary table (using the final validation metrics of each run)
# Every value is formatted with the same width as its header cell so the columns
# line up, and the rule is sized to the header instead of a fixed length.
print(f"\n--- Summary of Initial Runs (Performance after {N_MINIBATCHES} Mini-batches) ---")
header = f"{'Architecture':<25} | {'Final Val Loss':<15} | {'Final Val Accuracy':<18} | {'Training Time (s)'}"
print(header)
print("-" * len(header))
for name, data in results.items():
    print(f"{name:<25} | {data['final_val_loss']:<15.4f} | {data['final_val_accuracy']:<18.4f} | {data['training_time']:.2f}")

In [None]:
# --- Select Best Performing Architecture ---
# The winner is the run with the highest final validation accuracy. The
# comparison is strict, so on a tie the architecture seen first keeps the lead.
best_model_name, best_val_accuracy = "", -1.0

for name, data in results.items():
    if not data['final_val_accuracy'] > best_val_accuracy:
        continue  # Not strictly better than the current leader
    best_model_name, best_val_accuracy = name, data['final_val_accuracy']
    # Could add tie-breaking logic using final_val_loss if needed

print(
    f"\nBased on final validation accuracy after {N_MINIBATCHES} batches, the best performing architecture appears to be: {best_model_name}",
    f"(Achieved {best_val_accuracy:.4f} accuracy)",
    "(Note: This is based on very limited training. The validation trajectory plot provides more context.)",
    sep="\n",
)