In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as dset
import torchvision.transforms as T
from torch.utils.data import DataLoader, SubsetRandomSampler
from torch.amp import autocast, GradScaler
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from itertools import product
import copy
import math
import numpy as np
import optuna
import time
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

import optuna

In [None]:
# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Default floating-point precision used by the notebook.
dtype = torch.float32

print("Using device:", device)

In [None]:
# --- Experiment configuration -------------------------------------------------
NUM_TRAIN = 49000   # samples used for training; the remaining indices form the validation split
BATCH_SIZE = 256    # mini-batch size shared by the train and validation loaders
NUM_WORKERS = 12    # DataLoader worker processes
# MAX_ROUNDS = 3
NUM_TRIALS = 20     # number of Optuna trials to run

# Number of mini-batches per epoch. This is a count, so keep it a plain int
# (np.ceil would yield a NumPy float that prints as e.g. "192.0").
iters_per_epoch = math.ceil(NUM_TRAIN / BATCH_SIZE)

# Seed the global PyTorch and NumPy RNGs so the data split and weight
# initialisation are repeatable across kernel restarts.
RANDOM_SEED = 0
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [None]:
# Per-channel normalization statistics applied by both transform pipelines.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: random horizontal flip and padded random crop as augmentation.
transform_train = T.Compose([
    T.RandomHorizontalFlip(),
    T.RandomCrop(32, padding=4),
    T.ToTensor(),
    T.Normalize(CIFAR10_MEAN, CIFAR10_STD)
])

# Evaluation pipeline: no augmentation, same normalization.
transform_val_test = T.Compose([
    T.ToTensor(),
    T.Normalize(CIFAR10_MEAN, CIFAR10_STD)
])

# The training split is opened twice so that training samples are augmented
# while validation samples (drawn from the same split) are not.
cifar10_train = dset.CIFAR10('datasets', train=True, download=True, transform=transform_train)
cifar10_val_test = dset.CIFAR10('datasets', train=True, download=True, transform=transform_val_test)
cifar10_test = dset.CIFAR10('datasets', train=False, download=True, transform=transform_val_test)

# Shuffle all indices once (using the seeded global NumPy RNG), then carve out
# disjoint train / validation index sets.
indices = np.arange(len(cifar10_train))
np.random.shuffle(indices)
train_idx = indices[:NUM_TRAIN]
val_idx = indices[NUM_TRAIN:]

train_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(val_idx)

# Options shared by both loaders.
# - persistent_workers=True is rejected by DataLoader when num_workers == 0,
#   so only enable it when worker processes are actually used.
# - Pinned host memory only helps host-to-GPU copies; skip it without CUDA.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=torch.cuda.is_available(),
    persistent_workers=NUM_WORKERS > 0,
)

loader_train = DataLoader(cifar10_train, sampler=train_sampler, **loader_kwargs)
loader_val = DataLoader(cifar10_val_test, sampler=val_sampler, **loader_kwargs)

print(f"Train batches: {len(loader_train)}, Val batches: {len(loader_val)}")
print(f"Results in {iters_per_epoch} iterations per epoch.")

In [None]:
# One summary record per hyperparameter-search trial; `objective` appends to it.
results = list()

def train_model(
    model,
    train_loader,
    val_loader,
    optimizer,
    num_epochs=10,
    criterion=None,
    device=None,
    print_every=20,
    patience=3,
    accumulation_steps=1,
    scheduler=None
):
    """
    Train `model` with mixed precision (on CUDA), gradient accumulation,
    early stopping on validation accuracy and an optional LR scheduler.

    Args:
        model: The network to train; it is expected to already live on `device`.
        train_loader: Iterable yielding `(inputs, targets)` training batches.
        val_loader: Iterable yielding `(inputs, targets)` validation batches.
        optimizer: Optimizer bound to `model`'s parameters.
        num_epochs: Maximum number of epochs to run.
        criterion: Loss callable `criterion(outputs, targets)`; defaults to
            `F.cross_entropy`.
        device: Device the batches are moved to. Defaults to CUDA when
            available, otherwise CPU.
        print_every: Print the running average loss every this many
            iterations. A value of 0 (or less) disables the periodic print.
        patience: Stop once validation accuracy has failed to improve for
            this many consecutive epochs.
        accumulation_steps: Number of mini-batches whose gradients are
            accumulated before each optimizer step (must be >= 1).
        scheduler: Optional LR scheduler, stepped once per epoch.

    Returns:
        dict with keys:
            "model": deep copy of the model at its best validation accuracy
                (or `model` itself if no epoch ran),
            "train_accs" / "val_accs": per-epoch accuracies,
            "lrs": learning rate used in each epoch (only recorded when a
                scheduler is given),
            "epoch_times": seconds spent in the training pass of each epoch,
            "total_time": total wall-clock seconds, including evaluation.

    Raises:
        ValueError: If `accumulation_steps` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    else:
        device = torch.device(device)

    # Mixed precision and loss scaling only make sense for CUDA training;
    # disabling them elsewhere avoids warnings and keeps CPU runs working.
    use_amp = device.type == "cuda"

    if criterion is None:
        criterion = F.cross_entropy
    scaler = GradScaler("cuda", enabled=use_amp)

    best_val_acc = -float('inf')
    best_model = None
    epochs_no_improve = 0

    train_accs, val_accs, lrs, epoch_times = [], [], [], []
    start_time = time.time()

    def _apply_optimizer_step():
        # Unscale + step through the GradScaler, then clear the gradients.
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        start_epoch_time = time.time()

        # Start every epoch from clean gradients so nothing leaks across epochs.
        optimizer.zero_grad()
        batches_since_step = 0

        for i, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)

            with autocast("cuda", enabled=use_amp):
                outputs = model(x)
                loss = criterion(outputs, y) / accumulation_steps  # scale loss for accumulation

            scaler.scale(loss).backward()
            batches_since_step += 1

            if batches_since_step == accumulation_steps:
                _apply_optimizer_step()
                batches_since_step = 0

            # Undo the accumulation scaling so the logged loss is per-batch.
            running_loss += loss.item() * accumulation_steps
            if print_every and print_every > 0 and (i + 1) % print_every == 0:
                print(f"Epoch {epoch+1}, Iter {i+1}, Avg Loss: {running_loss/print_every:.4f}")
                running_loss = 0.0

        # Apply gradients left over when the number of batches is not a
        # multiple of accumulation_steps; otherwise they would be discarded.
        if batches_since_step > 0:
            _apply_optimizer_step()

        epoch_times.append(time.time() - start_epoch_time)

        # Step scheduler at epoch end (record the LR that was used this epoch first)
        if scheduler is not None:
            lrs.append(scheduler.get_last_lr()[0])
            scheduler.step()

        # Validation check
        val_acc = get_accuracy(val_loader, model, f"Validation Epoch {epoch+1}", device)
        val_accs.append(val_acc)

        train_acc = get_accuracy(train_loader, model, f"Training Epoch {epoch+1}", device=device)
        train_accs.append(train_acc)

        # Early stopping: snapshot the model on improvement, otherwise count
        # the stale epoch and stop once `patience` is exhausted.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = copy.deepcopy(model)
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs.")
                break

    total_time = time.time() - start_time

    return {
        # Explicit None check: container modules such as nn.Sequential define
        # __len__, so plain truthiness is not a reliable "was it set" test.
        "model": best_model if best_model is not None else model,
        "train_accs": train_accs,
        "val_accs": val_accs,
        "lrs": lrs,
        "epoch_times": epoch_times,
        "total_time": total_time
    }


def get_accuracy(loader, model, name="Validation", device=None, debug=True):
    """
    Compute the classification accuracy of `model` over a DataLoader.

    The model is switched to eval mode and is left in eval mode afterwards.

    Args:
        loader: Iterable yielding `(inputs, targets)` batches.
        model: Module mapping a batch of inputs to per-class scores; the
            prediction is the argmax over dimension 1.
        name: Label used in the printed summary line.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a parameter-free
            model), so inputs always land where the model lives.
        debug: When True, print the accuracy as a percentage.

    Returns:
        float: Fraction of correctly classified samples in [0, 1];
        0.0 when the loader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    device = torch.device(device)

    # Autocast is only useful (and only warning-free) on CUDA.
    use_amp = device.type == "cuda"

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            with autocast("cuda", enabled=use_amp):
                scores = model(x)
            preds = scores.argmax(dim=1)
            # Keep the running count on the device; converting once at the end
            # avoids a host/device synchronisation on every batch.
            correct += (preds == y).sum()
            total += y.size(0)

    # Guard against an empty loader instead of dividing by zero.
    acc = int(correct) / total if total > 0 else 0.0
    if debug:
        print(f"{name} Accuracy: {100*acc:.2f}%")
    return acc

In [None]:
# Best validation accuracy seen across all trials, and the model that
# achieved it; both are updated from inside `objective`.
best_val_acc = float("-inf")
best_model = None

def objective(trial):
    """
    Optuna objective: train a small CNN and return its validation accuracy.

    The trial samples the learning rate (log-uniform in [1e-4, 1e-2]) and the
    number of epochs (5 to 20); architecture widths and weight decay are fixed.
    Side effects: updates the module-level `best_model` / `best_val_acc` when
    this trial beats the best so far, and appends a summary dict to `results`.

    Args:
        trial: The `optuna` trial used to sample hyperparameters.

    Returns:
        float: Validation accuracy of the trial's best checkpoint.
    """
    global best_model, best_val_acc

    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    n_epochs = trial.suggest_int("n_epochs", 5, 20)

    # Build model (fixed architecture; only lr / n_epochs are searched)
    ch1 = 16
    ch2 = 64
    ch3 = 32
    weight_decay = 2.075612963823467e-06
    model = nn.Sequential(
        nn.Conv2d(3, ch1, 5, padding=2),
        nn.BatchNorm2d(ch1),
        nn.ReLU(),

        nn.Conv2d(ch1, ch2, 3, padding=1),
        nn.BatchNorm2d(ch2),
        nn.ReLU(),
        nn.MaxPool2d(2,2),

        nn.Conv2d(ch2, ch3, 3, padding=1),
        nn.BatchNorm2d(ch3),
        nn.ReLU(),
        nn.MaxPool2d(2,2),

        nn.Flatten(),
        # Two 2x2 poolings reduce 32x32 inputs to 8x8 feature maps.
        nn.Linear(ch3 * 8 * 8, 128),
        nn.ReLU(),
        nn.Dropout(0.25),
        nn.Linear(128, 10)
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # Cosine schedule spanning the trial's full epoch budget.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)

    training_result = train_model(
        model=model,
        train_loader=loader_train,
        val_loader=loader_val,
        num_epochs=n_epochs,
        device=device,
        # Log roughly four times per epoch; never pass 0, which would break
        # the modulo-based logging check for loaders with fewer than 4 batches.
        print_every=max(1, len(loader_train) // 4),
        patience=2,
        optimizer=optimizer,
        scheduler=scheduler,
    )
    trained_model = training_result["model"]

    # Score the returned (best) checkpoint rather than the last epoch's weights.
    val_acc = get_accuracy(loader_val, trained_model, device=device, debug=False)

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model = copy.deepcopy(trained_model)
        print(f"✅ Found new best model: {100*val_acc:.2f}")

    results.append({
        "trial": trial.number,
        "ch1": ch1, "ch2": ch2, "ch3": ch3,
        "lr": lr, "weight_decay": weight_decay,
        # Record every searched hyperparameter so trials can be compared later.
        "n_epochs": n_epochs,
        "val_acc": val_acc,
        "train_accs": training_result["train_accs"],
        "val_accs": training_result["val_accs"],
        "lrs": training_result["lrs"],
        "runtime": training_result["total_time"]
    })

    return val_acc

In [None]:
study_name = "bayesian_optimization"
storage_name = f"sqlite:///{study_name}.db"

# This cell always starts a fresh study, so also reset the notebook-level
# bookkeeping that `objective` mutates. Otherwise re-running the cell could
# save a model from a previous study and duplicate rows in `results`.
best_model = None
best_val_acc = -float('inf')
results.clear()

# Drop any study of the same name left in the SQLite storage by an earlier run.
summaries = optuna.study.get_all_study_summaries(storage=storage_name)
existing_studies = [s.study_name for s in summaries]

if study_name in existing_studies:
    optuna.delete_study(study_name=study_name, storage=storage_name)
    print(f"Deleted existing study: {study_name}")

# Create the study. The sampler is seeded so the sequence of suggested
# hyperparameters is repeatable, matching the seeding of torch / numpy.
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    # load_if_exists=True
)

# Run optimization
study.optimize(objective, n_trials=NUM_TRIALS)

best_trial = study.best_trial
print("Best hyperparameters:", best_trial.params)
print(f"Best validation accuracy: {100*best_trial.value:.2f}%")

# Persist the best model only if a trial actually produced one.
# NOTE(review): this pickles the whole module object; saving
# `best_model.state_dict()` is the more portable format - confirm what
# downstream consumers of "best model" expect before changing it.
if best_model is not None:
    torch.save(best_model, "best model")

In [None]:
# TODO: add grid search 

In [None]:
# TODO: add random search