In [None]:
import os
import time
import itertools
from pathlib import Path
import csv
import json

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

In [None]:
import torch
# Report CUDA availability and the name of device 0. The name is only queried
# when CUDA is available, because get_device_name(0) raises on a CPU-only
# machine and would break a fresh "Run All"; None is shown in that case.
torch.cuda.is_available(), (torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)

(True, 'Tesla T4')

In [None]:
# ---- Experiment configuration: every tunable value lives in this cell ----
EPOCHS = 25                                            # training epochs for each run of the grid
BATCH_SIZE = 128                                       # mini-batch size for the main grid runs
BATCH_SIZE_SWEEPS = [32, 64, 128, 256]                 # candidate sizes for batch-size experiments
LR_DEFAULT = dict(sgd=0.01, adam=0.001, adamw=0.001)   # default learning rate keyed by optimizer name
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
RESULTS_ROOT = Path("results")                         # root directory for all saved outputs
NUM_WORKERS = 2 if DEVICE.type == "cuda" else 0        # DataLoader worker processes (none on CPU)
PIN_MEMORY = DEVICE.type == "cuda"                     # DataLoader pin_memory flag (only with CUDA)
SEED = 42                                              # seed for the torch / NumPy global RNGs

In [None]:
# Seed the NumPy and PyTorch global random number generators with the shared SEED.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Make sure the output directory tree exists before anything is written to it.
RESULTS_ROOT.mkdir(parents=True, exist_ok=True)
RESULTS_ROOT.joinpath("ann").mkdir(exist_ok=True)
RESULTS_ROOT.joinpath("cnn").mkdir(exist_ok=True)
RESULTS_ROOT.joinpath("summary").mkdir(exist_ok=True)

In [None]:
def save_plot(fig, out_path):
    """Write ``fig`` to ``out_path`` with a tight bounding box, then close it.

    The figure is closed even when saving fails, so a failed write inside the
    experiment loop does not leave open figures accumulating in pyplot. Any
    exception raised by ``fig.savefig`` still propagates to the caller.

    Args:
        fig: Object exposing ``savefig`` (a matplotlib figure in this file).
        out_path: Destination passed straight to ``fig.savefig``.
    """
    try:
        fig.savefig(out_path, bbox_inches="tight")
    finally:
        plt.close(fig)

def save_metrics_csv(rows, out_path):
    """Write ``rows`` to ``out_path`` as a CSV file without an index column.

    ``rows`` is handed directly to the ``pandas.DataFrame`` constructor; the
    callers in this file pass a list of dicts, one dict per CSV row.
    """
    # Imported locally: pandas is not part of the notebook's top import cell.
    from pandas import DataFrame

    DataFrame(rows).to_csv(out_path, index=False)

In [None]:
def get_dataloaders(task="mnist", batch_size=128):
    """Build the train/test ``DataLoader`` pair for one task.

    Args:
        task: ``"mnist"`` (torchvision MNIST, ``ToTensor`` only) or ``"cifar"``
            (torchvision CIFAR10; the train split gets random horizontal flip
            and a padded random crop, both splits are normalized per channel).
        batch_size: Batch size used for both loaders.

    Returns:
        ``(train_loader, test_loader, input_shape)`` where ``input_shape`` is
        ``(1, 28, 28)`` for MNIST and ``(3, 32, 32)`` for CIFAR.

    Raises:
        ValueError: If ``task`` is neither ``"mnist"`` nor ``"cifar"``.
    """
    if task == "mnist":
        dataset_cls = torchvision.datasets.MNIST
        input_shape = (1, 28, 28)
        # The same transform object serves both splits.
        train_tf = test_tf = transforms.Compose([transforms.ToTensor()])
    elif task == "cifar":
        dataset_cls = torchvision.datasets.CIFAR10
        input_shape = (3, 32, 32)
        channel_mean = (0.4914, 0.4822, 0.4465)
        channel_std = (0.2023, 0.1994, 0.2010)
        # Augmentation is applied to the training split only.
        train_tf = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, padding=4),
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ])
        test_tf = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ])
    else:
        raise ValueError("Unknown task")

    train_ds = dataset_cls(root="./data", train=True, download=True, transform=train_tf)
    test_ds = dataset_cls(root="./data", train=False, download=True, transform=test_tf)

    # Settings common to both loaders; only the training loader shuffles.
    shared_kwargs = dict(batch_size=batch_size, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
    train_loader = DataLoader(train_ds, shuffle=True, **shared_kwargs)
    test_loader = DataLoader(test_ds, shuffle=False, **shared_kwargs)
    return train_loader, test_loader, input_shape

In [None]:
class SimpleANN(nn.Module):
    """Fully connected classifier: flatten, then 784 -> 512 -> 256 -> 10.

    ``activation`` is matched case-insensitively against ``"sigmoid"``,
    ``"relu"``, ``"leakyrelu"`` (negative slope 0.01) and ``"gelu"``; any other
    name raises ``KeyError``.
    """

    def __init__(self, activation="relu"):
        super().__init__()
        activation_factories = {
            "sigmoid": nn.Sigmoid,
            "relu": nn.ReLU,
            "leakyrelu": lambda: nn.LeakyReLU(0.01),
            "gelu": nn.GELU,
        }
        # A single activation module is built and reused after both hidden layers.
        nonlinearity = activation_factories[activation.lower()]()
        layers = [
            nn.Flatten(),
            nn.Linear(28 * 28, 512),
            nonlinearity,
            nn.Linear(512, 256),
            nonlinearity,
            nn.Linear(256, 10),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the 10 output scores per sample."""
        return self.net(x)

class SmallCNN(nn.Module):
    """Three-conv-layer CNN with a two-layer classifier head and 10 outputs.

    The head's first linear layer expects ``128*8*8`` features, i.e. the two
    2x2 max-pools reduce a 3-channel 32x32 input to 8x8 (the 3x3 convolutions
    use ``padding=1``). ``activation`` is matched case-insensitively against
    ``"sigmoid"``, ``"relu"``, ``"leakyrelu"`` (negative slope 0.01) and
    ``"gelu"``; any other name raises ``KeyError``.
    """

    def __init__(self, activation="relu"):
        super().__init__()
        activation_factories = {
            "sigmoid": nn.Sigmoid,
            "relu": nn.ReLU,
            "leakyrelu": lambda: nn.LeakyReLU(0.01),
            "gelu": nn.GELU,
        }
        # A single activation module is built and reused after every conv/linear layer.
        nonlinearity = activation_factories[activation.lower()]()

        feature_layers = [
            nn.Conv2d(3, 32, 3, padding=1),
            nonlinearity,
            nn.Conv2d(32, 64, 3, padding=1),
            nonlinearity,
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, padding=1),
            nonlinearity,
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
        ]
        self.features = nn.Sequential(*feature_layers)

        head_layers = [
            nn.Linear(128 * 8 * 8, 256),
            nonlinearity,
            nn.Linear(256, 10),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Extract flattened conv features from ``x`` and return the 10 output scores per sample."""
        flat_features = self.features(x)
        return self.classifier(flat_features)


In [None]:
def average_grad_norm(model):
    """Return the global L2 norm of the gradients currently stored on ``model``.

    Despite the name, nothing is averaged here: the L2 norm of each parameter's
    ``.grad`` is squared, the squares are summed over all parameters that have
    a gradient, and the square root of that sum is returned. Parameters whose
    ``.grad`` is ``None`` are skipped; if no parameter has a gradient the
    result is ``0.0``.
    """
    per_param_norms = [
        param.grad.detach().float().norm(2).item()
        for param in model.parameters()
        if param.grad is not None
    ]
    if not per_param_norms:
        return 0.0

    squared_total = 0.0
    for value in per_param_norms:
        squared_total += value**2
    return float(squared_total ** 0.5)

def simple_dead_neurons_count(model):
    """Count near-zero weight rows in the model's 2-D weight matrices.

    This is a weight-based proxy, not an activation-based measurement: for
    every parameter whose name contains ``"weight"`` and which is 2-D (the
    ``(out_features, in_features)`` layout of linear layers), each row whose
    L2 norm is below ``1e-6`` counts as "dead". Parameters with any other
    number of dimensions (e.g. convolution kernels) are ignored.

    Returns:
        ``(dead, total)`` as Python ints: number of rows under the threshold
        and number of rows inspected.
    """
    norm_threshold = 1e-6
    n_dead, n_rows = 0, 0
    for param_name, param in model.named_parameters():
        is_matrix_weight = "weight" in param_name and param.dim() == 2
        if not is_matrix_weight:
            continue
        row_norms = np.linalg.norm(param.detach().cpu().numpy(), axis=1)
        n_dead += np.sum(row_norms < norm_threshold)
        n_rows += row_norms.shape[0]
    return int(n_dead), int(n_rows)


In [None]:
from torch.cuda.amp import autocast, GradScaler

def train_one_epoch(model, loader, optimizer, loss_fn, scaler=None):
    """Train ``model`` for one pass over ``loader`` and return epoch statistics.

    When ``DEVICE`` is CUDA and a ``scaler`` is given, the forward pass runs
    under ``autocast`` and the backward/step go through the gradient scaler;
    otherwise a plain full-precision step is taken.

    Args:
        model: Module to train (switched to train mode here).
        loader: Iterable of ``(inputs, targets)`` batches exposing ``.dataset``.
        optimizer: Optimizer updating ``model``'s parameters.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        scaler: Optional ``GradScaler``; ``None`` disables the mixed-precision path.

    Returns:
        ``(avg_loss, avg_acc, avg_grad)``: the batch-size-weighted loss sum and
        the number of correct predictions, each divided by
        ``len(loader.dataset)``, and the mean of the recorded per-batch
        gradient norms (``0.0`` if none were recorded).
    """
    model.train()
    running_loss = 0.0
    correct = 0
    grad_norms = []
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        if DEVICE.type == "cuda" and scaler is not None:
            with autocast():
                out = model(xb)
                loss = loss_fn(out, yb)
            scaler.scale(loss).backward()
            # Unscale first so the norm is measured on the true gradients.
            scaler.unscale_(optimizer)
            total_norm = average_grad_norm(model)
            # If the scaled backward overflowed, the gradients hold inf/NaN and
            # scaler.step() skips this batch. Leave such batches out of the
            # statistics, otherwise a single one makes the epoch mean inf/NaN.
            if np.isfinite(total_norm):
                grad_norms.append(total_norm)
            scaler.step(optimizer)
            scaler.update()
        else:
            out = model(xb)
            loss = loss_fn(out, yb)
            loss.backward()
            total_norm = average_grad_norm(model)
            grad_norms.append(total_norm)
            optimizer.step()
        # loss is a batch mean; weight by batch size so the epoch value is per-sample.
        running_loss += loss.item() * xb.size(0)
        correct += (out.argmax(1) == yb).sum().item()
    avg_loss = running_loss / len(loader.dataset)
    avg_acc = correct / len(loader.dataset)
    avg_grad = float(np.mean(grad_norms)) if len(grad_norms)>0 else 0.0
    return avg_loss, avg_acc, avg_grad

def evaluate(model, loader, loss_fn):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    The model is put in eval mode. Each batch is moved to ``DEVICE``, and the
    batch loss (weighted by batch size) and the number of correct arg-max
    predictions are accumulated.

    Returns:
        ``(loss, accuracy)``, both divided by ``len(loader.dataset)``.
    """
    model.eval()
    loss_sum, n_correct = 0.0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)
            outputs = model(inputs)
            batch_loss = loss_fn(outputs, targets)
            loss_sum += batch_loss.item() * inputs.size(0)
            predictions = outputs.argmax(1)
            n_correct += (predictions == targets).sum().item()
    n_samples = len(loader.dataset)
    return loss_sum / n_samples, n_correct / n_samples


In [None]:
def get_optimizer(name, params, lr):
    """Create the optimizer called ``name`` for ``params`` with learning rate ``lr``.

    Supported names: ``"sgd"`` (SGD with momentum 0.9), ``"adam"`` and
    ``"adamw"``. Only the selected optimizer is constructed.

    Raises:
        ValueError: If ``name`` is not one of the supported names.
    """
    builders = (
        ("sgd", lambda: optim.SGD(params, lr=lr, momentum=0.9)),
        ("adam", lambda: optim.Adam(params, lr=lr)),
        ("adamw", lambda: optim.AdamW(params, lr=lr)),
    )
    for key, build in builders:
        if name == key:
            return build()
    raise ValueError("Unknown optimizer")

def _save_curve_plot(x_values, curves, ylabel, title, out_path, marker=None):
    """Draw one or more per-epoch curves on a 6x4 figure and save it via ``save_plot``.

    ``curves`` is a sequence of ``(label, values)`` pairs. A legend is drawn
    only when at least one label is not ``None``.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    for label, values in curves:
        ax.plot(x_values, values, label=label, marker=marker)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    if any(label is not None for label, _ in curves):
        ax.legend()
    ax.grid(True)
    save_plot(fig, out_path)


def run_grid(tasks=("mnist","cifar"), activations=("sigmoid","relu","leakyrelu","gelu"), optimizers_list=("sgd","adam","adamw"),
             epochs=EPOCHS, batch_size=BATCH_SIZE, results_root=RESULTS_ROOT):
    """Train and evaluate every (task, activation, optimizer) combination.

    For each combination a fresh model is trained for ``epochs`` epochs
    (``SimpleANN`` for ``"mnist"``, ``SmallCNN`` otherwise) and the following
    are written to ``<results_root>/<task>/<task>_<act>_<opt>/``: the final
    ``state_dict``, a per-epoch metrics CSV, a JSON summary and four PNG plots
    (accuracy, loss, gradient norm, dead-neuron count). After all runs, a
    combined CSV is written to ``<results_root>/summary/all_runs_summary.csv``.

    Args:
        tasks: Task names understood by ``get_dataloaders``.
        activations: Activation names understood by the model classes.
        optimizers_list: Optimizer names understood by ``get_optimizer``.
        epochs: Number of training epochs per run; must be at least 1.
        batch_size: Batch size passed to ``get_dataloaders``.
        results_root: Root directory for all outputs.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        # The "final epoch" values below would not exist; fail before any work is done.
        raise ValueError("epochs must be >= 1")

    results_root = Path(results_root)
    # Create the summary folder up front so the final write cannot fail after
    # all training has finished (e.g. when a custom results_root is passed).
    summary_dir = results_root / "summary"
    summary_dir.mkdir(parents=True, exist_ok=True)

    summary = []
    epoch_axis = range(1, epochs + 1)
    for task in tasks:
        print(f"\n==== Starting TASK: {task.upper()} ====")
        train_loader, test_loader, _ = get_dataloaders(task=task, batch_size=batch_size)
        for act, opt_name in itertools.product(activations, optimizers_list):
            print(f"\n--- RUN: {task} | act={act} | opt={opt_name} ---")
            # Fresh model / optimizer / scaler for every run.
            model = SimpleANN(act).to(DEVICE) if task=="mnist" else SmallCNN(act).to(DEVICE)
            loss_fn = nn.CrossEntropyLoss()
            lr = LR_DEFAULT.get(opt_name, 0.001)
            optimizer = get_optimizer(opt_name, model.parameters(), lr)
            scaler = GradScaler() if DEVICE.type=="cuda" else None

            run_name = f"{task}_{act}_{opt_name}"
            out_dir = results_root / task / run_name
            out_dir.mkdir(parents=True, exist_ok=True)

            # One dict per epoch; also the source for the plots below.
            rows = []
            start_time = time.time()
            for epoch in epoch_axis:
                tl, ta, gn = train_one_epoch(model, train_loader, optimizer, loss_fn, scaler)
                vl, va = evaluate(model, test_loader, loss_fn)
                dn, _ = simple_dead_neurons_count(model)

                rows.append({
                    "epoch": epoch,
                    "train_loss": tl, "val_loss": vl,
                    "train_acc": ta, "val_acc": va,
                    "grad_norm": gn, "dead_neurons": dn
                })
                print(f"Epoch {epoch}/{epochs} | train_acc {ta:.4f} val_acc {va:.4f} grad_norm {gn:.4f} dead {dn}")
            elapsed = time.time() - start_time

            # Column-wise view of the per-epoch metrics.
            history = {key: [row[key] for row in rows] for key in rows[0] if key != "epoch"}

            # Save model state dict
            model_path = out_dir / f"{run_name}_final.pth"
            torch.save(model.state_dict(), model_path)

            # Save per-epoch metrics CSV
            csv_path = out_dir / f"{run_name}_metrics.csv"
            save_metrics_csv(rows, csv_path)

            # Build the summary row for this run
            summary_row = {
                "task": task, "activation": act, "optimizer": opt_name,
                "final_train_acc": history["train_acc"][-1], "final_val_acc": history["val_acc"][-1],
                "avg_grad_norm": float(np.mean(history["grad_norm"])),
                "dead_neurons_final": int(history["dead_neurons"][-1]),
                "run_time_sec": int(elapsed), "epochs": epochs, "batch_size": batch_size, "lr": lr
            }
            summary.append(summary_row)
            # Save the per-run summary as JSON
            with open(out_dir / f"{run_name}_summary.json","w") as f:
                json.dump(summary_row, f, indent=2)

            # Save plots: accuracy, loss, gradnorm, dead neurons
            _save_curve_plot(
                epoch_axis,
                [("train_acc", history["train_acc"]), ("val_acc", history["val_acc"])],
                "Accuracy", f"{run_name} Accuracy", out_dir / f"{run_name}_accuracy.png",
            )
            _save_curve_plot(
                epoch_axis,
                [("train_loss", history["train_loss"]), ("val_loss", history["val_loss"])],
                "Loss", f"{run_name} Loss", out_dir / f"{run_name}_loss.png",
            )
            _save_curve_plot(
                epoch_axis, [(None, history["grad_norm"])],
                "Avg Grad Norm", f"{run_name} GradNorm", out_dir / f"{run_name}_gradnorm.png",
                marker='o',
            )
            _save_curve_plot(
                epoch_axis, [(None, history["dead_neurons"])],
                "Dead Neuron Count", f"{run_name} DeadNeurons", out_dir / f"{run_name}_deadneurons.png",
                marker='o',
            )

            print(f"Saved run artifacts to: {out_dir.resolve()}")

    # Save global summary CSV (one row per run) through the shared CSV helper.
    summary_csv = summary_dir / "all_runs_summary.csv"
    save_metrics_csv(summary, summary_csv)
    print("All experiments complete. Summary at:", summary_csv)

# -------------------------
# Optional helpers: LR sweep & batch-size sweep
# -------------------------
def lr_sweep(task, activation, optimizer_name, lrs=(1e-1,1e-2,1e-3,1e-4), epochs=12, batch_size=128):
    """Train a fresh model once per learning rate and record the final validation accuracy.

    Args:
        task: Task name understood by ``get_dataloaders``.
        activation: Activation name understood by the model classes.
        optimizer_name: Optimizer name understood by ``get_optimizer``.
        lrs: Learning rates to try, in order.
        epochs: Training epochs per learning rate; must be at least 1.
        batch_size: Batch size used for every run.

    Returns:
        Path of the CSV written to
        ``RESULTS_ROOT/summary/lr_sweep_<task>_<activation>_<optimizer_name>.csv``
        (one row per learning rate).

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        # Without a completed epoch there is no final validation accuracy to record.
        raise ValueError("epochs must be >= 1")

    # The loaders do not depend on the learning rate, so build them once.
    train_loader, test_loader, _ = get_dataloaders(task=task, batch_size=batch_size)

    rows = []
    for lr in lrs:
        print(f"LR sweep: {task} {activation} {optimizer_name} lr={lr}")
        model = SimpleANN(activation).to(DEVICE) if task=="mnist" else SmallCNN(activation).to(DEVICE)
        loss_fn = nn.CrossEntropyLoss()
        optimizer = get_optimizer(optimizer_name, model.parameters(), lr)
        scaler = GradScaler() if DEVICE.type=="cuda" else None
        for epoch in range(1, epochs+1):
            _, ta, _ = train_one_epoch(model, train_loader, optimizer, loss_fn, scaler)
            _, va = evaluate(model, test_loader, loss_fn)
            print(f"  epoch {epoch}/{epochs} | train_acc {ta:.3f} val_acc {va:.3f}")
        # va holds the validation accuracy of the last epoch.
        rows.append({"task":task, "activation":activation, "optimizer":optimizer_name, "lr":lr, "final_val_acc":va})
    out = Path(RESULTS_ROOT) / "summary" / f"lr_sweep_{task}_{activation}_{optimizer_name}.csv"
    # Make sure the target folder exists even if the setup cell was not run.
    out.parent.mkdir(parents=True, exist_ok=True)
    save_metrics_csv(rows, out)
    return out

def batch_size_sweep(task, activation, optimizer_name, batch_sizes=(32,64,128,256), epochs=12, lr=None):
    """Train a fresh model once per batch size and record the final validation accuracy.

    Args:
        task: Task name understood by ``get_dataloaders``.
        activation: Activation name understood by the model classes.
        optimizer_name: Optimizer name understood by ``get_optimizer``.
        batch_sizes: Batch sizes to try, in order.
        epochs: Training epochs per batch size; must be at least 1.
        lr: Learning rate for every run; ``None`` selects the optimizer's entry
            in ``LR_DEFAULT`` (falling back to 0.001).

    Returns:
        Path of the CSV written to
        ``RESULTS_ROOT/summary/batch_sweep_<task>_<activation>_<optimizer_name>.csv``
        (one row per batch size).

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        # Without a completed epoch there is no final validation accuracy to record.
        raise ValueError("epochs must be >= 1")

    # Resolve the learning rate once; it is the same for every batch size.
    run_lr = lr if lr is not None else LR_DEFAULT.get(optimizer_name, 0.001)

    rows = []
    for bs in batch_sizes:
        print(f"Batch sweep: {task} {activation} {optimizer_name} bs={bs}")
        train_loader, test_loader, _ = get_dataloaders(task=task, batch_size=bs)
        model = SimpleANN(activation).to(DEVICE) if task=="mnist" else SmallCNN(activation).to(DEVICE)
        loss_fn = nn.CrossEntropyLoss()
        optimizer = get_optimizer(optimizer_name, model.parameters(), run_lr)
        scaler = GradScaler() if DEVICE.type=="cuda" else None
        for epoch in range(1, epochs+1):
            _, ta, _ = train_one_epoch(model, train_loader, optimizer, loss_fn, scaler)
            _, va = evaluate(model, test_loader, loss_fn)
            print(f"  epoch {epoch}/{epochs} | train_acc {ta:.3f} val_acc {va:.3f}")
        # va holds the validation accuracy of the last epoch.
        rows.append({"task":task, "activation":activation, "optimizer":optimizer_name, "batch_size":bs, "final_val_acc":va})
    out = Path(RESULTS_ROOT) / "summary" / f"batch_sweep_{task}_{activation}_{optimizer_name}.csv"
    # Make sure the target folder exists even if the setup cell was not run.
    out.parent.mkdir(parents=True, exist_ok=True)
    save_metrics_csv(rows, out)
    return out


In [None]:
# Entry point: run the full task x activation x optimizer grid with the
# settings from the configuration cell.
if __name__ == "__main__":
    print("DEVICE:", DEVICE)
    print(f"Running grid: epochs={EPOCHS}, batch_size={BATCH_SIZE}")
    # Positional order: tasks, activations, optimizers_list, epochs, batch_size, results_root.
    run_grid(
        ("mnist", "cifar"),
        ("sigmoid", "relu", "leakyrelu", "gelu"),
        ("sgd", "adam", "adamw"),
        EPOCHS,
        BATCH_SIZE,
        RESULTS_ROOT,
    )
    print("Done. Check the results/ folder for outputs and CSVs.")

'\nif __name__ == "__main__":\n    print("DEVICE:", DEVICE)\n    print(f"Running grid: epochs={EPOCHS}, batch_size={BATCH_SIZE}")\n    run_grid(tasks=("mnist","cifar"),\n             activations=("sigmoid","relu","leakyrelu","gelu"),\n             optimizers_list=("sgd","adam","adamw"),\n             epochs=EPOCHS,\n             batch_size=BATCH_SIZE,\n             results_root=RESULTS_ROOT)\n    print("Done. Check the results/ folder for outputs and CSVs.")\n    '