# V3: Architecture Universality (VGG11)

**Purpose**: Confirm hysteresis is not ResNet-specific

**Key Question**: Is the phenomenon architecture-universal?

In [None]:
# Mount Google Drive at /content/drive; BASE_DIR below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import os, glob, json, time
from datetime import datetime

# Identity of this experiment and the folder that collects all of its runs.
EXP_NAME = 'exp_V3_vgg11'
NOTEBOOK_ID = 'V3'
BASE_DIR = '/content/drive/MyDrive/dual-gradient-learning/Paper-A'

# Run directories are named '<EXP_NAME>_<timestamp>'. If any already exist,
# continue in the one that sorts last; otherwise create a new timestamped one.
existing = glob.glob(f'{BASE_DIR}/{EXP_NAME}_*')
if not existing:
    TIMESTAMP = datetime.now().strftime('%Y%m%d_%H%M%S')
    SAVE_DIR = f'{BASE_DIR}/{EXP_NAME}_{TIMESTAMP}'
    os.makedirs(SAVE_DIR, exist_ok=True)
    print(f'New: {SAVE_DIR}')
else:
    SAVE_DIR = max(existing)
    print(f'Resuming: {SAVE_DIR}')

# Output sub-folders used by the rest of the notebook.
for subdir in ('checkpoints', 'figures'):
    os.makedirs(f'{SAVE_DIR}/{subdir}', exist_ok=True)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils import parameters_to_vector
import torchvision
import torchvision.transforms as transforms
import numpy as np

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

In [None]:
# --- Data loading / optimisation ---
BATCH_SIZE = 128
NUM_WORKERS = 4
LR = 0.05
K = 16             # train_epoch refreshes the cached clean-label gradient every K steps
NOISE_RATE = 0.4   # fraction of training labels replaced by inject_noise

# --- "Ordered" branch: mixing weight, epochs, and the test error it must stay below ---
ORDERED_LAMBDA = 0.35
ORDERED_EPOCHS = 50
ORDERED_THRESHOLD = 0.25

# --- "Collapse" branch: mixing weight, epochs, and the test error it must reach ---
COLLAPSE_LAMBDA = 0.60
COLLAPSE_EPOCHS = 80
COLLAPSE_THRESHOLD = 0.45

# --- Lambda sweep: inclusive grid, traversed upwards and downwards ---
LAMBDA_START = 0.30
LAMBDA_END = 0.70
LAMBDA_STEP = 0.05
EPOCHS_PER_LAMBDA = 3

# Half a step is added past the end point so that np.arange includes it;
# rounding removes the floating-point drift accumulated by arange.
LAMBDA_GRID_UP = np.round(
    np.arange(LAMBDA_START, LAMBDA_END + LAMBDA_STEP/2, LAMBDA_STEP), 2
)
LAMBDA_GRID_DOWN = np.round(
    np.arange(LAMBDA_END, LAMBDA_START - LAMBDA_STEP/2, -LAMBDA_STEP), 2
)

N_SEEDS = 3
print(f'VGG11-BN, λ points: {len(LAMBDA_GRID_UP)}')

In [None]:
# Feature-extractor layout: an integer is the output width of a 3x3 conv stage
# (conv + batch-norm + ReLU), 'M' is a 2x2 max-pool with stride 2.
cfg = [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M']


class VGG11BN(nn.Module):
    """VGG-11 style network with batch normalisation and a small classifier head.

    The convolutional stack is generated from the module-level ``cfg`` list.
    The head's first linear layer takes 512 features, so the flattened feature
    map must contain 512 values per sample; that holds for 3x32x32 inputs,
    which the five pooling layers reduce to 1x1 spatially.

    Args:
        num_classes: Size of the final linear layer's output.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        stages = []
        channels = 3
        for spec in cfg:
            if spec != 'M':
                stages.extend((
                    nn.Conv2d(channels, spec, 3, padding=1),
                    nn.BatchNorm2d(spec),
                    nn.ReLU(True),
                ))
                channels = spec
            else:
                stages.append(nn.MaxPool2d(2, 2))
        self.features = nn.Sequential(*stages)

        head = [
            nn.Linear(512, 512),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits for a batch ``x``."""
        feats = self.features(x)
        flat = feats.view(x.size(0), -1)
        return self.classifier(flat)

def get_vgg11():
    """Build and return a new ``VGG11BN`` with its default arguments."""
    net = VGG11BN()
    return net

class IndexedDataset(Dataset):
    """Wrap a dataset of ``(image, label)`` pairs so items also carry their index.

    Each item becomes ``(image, label, index)``, where ``index`` is the
    position that was requested from the wrapped dataset.
    """

    def __init__(self, ds):
        self.ds = ds

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, i):
        image, label = self.ds[i]
        return image, label, i

def set_seed(s):
    """Seed the torch (CPU and all CUDA devices) and NumPy RNGs with ``s``.

    Also switches cuDNN to its deterministic mode.
    """
    torch.manual_seed(s)
    torch.cuda.manual_seed_all(s)
    np.random.seed(s)
    torch.backends.cudnn.deterministic = True

def inject_noise(labels, rate, seed, num_classes=10):
    """Return a copy of ``labels`` with a fraction replaced by wrong classes.

    ``int(rate * len(labels))`` distinct positions are drawn without
    replacement, and each is reassigned to a class drawn uniformly from the
    other ``num_classes - 1`` classes, so every selected label really changes.

    A private ``RandomState`` is used instead of reseeding the global NumPy
    generator, so calling this function no longer disturbs other code that
    relies on ``np.random``. ``RandomState(seed)`` produces the same stream as
    ``np.random.seed(seed)``, and the draws are made in the same order, so the
    noisy labels are identical to those of the previous implementation.

    Args:
        labels: Integer class labels; must support ``len``, indexing and
            ``.copy()`` (e.g. a NumPy array). Not modified.
        rate: Fraction of labels to corrupt.
        seed: Seed for the private random generator.
        num_classes: Number of classes; labels are assumed to lie in
            ``range(num_classes)``.

    Returns:
        A copy of ``labels`` with the selected entries replaced.
    """
    rng = np.random.RandomState(seed)
    noisy = labels.copy()
    n_flip = int(rate * len(labels))
    flip_idx = rng.choice(len(labels), n_flip, replace=False)
    # One draw per flipped label, in index order: keeps the random stream (and
    # therefore the resulting labels) identical to the original per-item loop.
    for i in flip_idx:
        alternatives = [c for c in range(num_classes) if c != labels[i]]
        noisy[i] = rng.choice(alternatives)
    return noisy

def load_cifar10():
    """Download (if needed) CIFAR-10 into ``./data`` and return ``(train, test)``.

    The training split is augmented with a random 32x32 crop (padding 4) and a
    random horizontal flip; both splits are converted to tensors and
    normalised with the same per-channel mean and standard deviation.
    """
    mean = (0.4914, 0.4822, 0.4465)
    std = (0.2023, 0.1994, 0.2010)

    train_tf = transforms.Compose([
        transforms.RandomCrop(32, 4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    test_tf = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    train_set = torchvision.datasets.CIFAR10('./data', True, train_tf, download=True)
    test_set = torchvision.datasets.CIFAR10('./data', False, test_tf, download=True)
    return train_set, test_set

def evaluate(m, loader):
    """Return the top-1 accuracy of ``m`` over ``loader`` as a float in [0, 1].

    The model is put into eval mode (and left there); gradients are disabled
    while iterating. ``loader`` must yield ``(inputs, targets)`` batches.
    """
    m.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            predictions = m(inputs).argmax(1)
            correct += (predictions == targets).sum().item()
            seen += targets.size(0)
    return correct / seen

In [None]:
def train_epoch(model, loader, opt, clean_t, noisy_t, lam, state):
    """Train ``model`` for one pass over ``loader`` with a mixed gradient.

    For every batch the gradient of the cross-entropy loss on the noisy labels
    is computed. Every ``K`` steps (and on the first step, when nothing is
    cached) the gradient on the clean labels of the current batch is
    recomputed and cached. Both gradients are flattened, scaled to unit L2
    norm and blended as ``(1 - lam) * noisy + lam * clean``; the blend is
    written back to ``p.grad`` before ``opt.step()``.

    Args:
        model: Network to train; switched to train mode.
        loader: Yields ``(inputs, _, sample_indices)`` batches.
        opt: Optimizer over ``model.parameters()``.
        clean_t: Clean labels, indexed by sample index. Assumed to live on
            ``device``, since it is indexed with indices moved there.
        noisy_t: Noisy labels, laid out like ``clean_t``.
        lam: Weight of the clean-label gradient in the blend.
        state: Dict with keys ``'step'`` (global step counter) and ``'gv'``
            (cached flattened clean gradient or ``None``). Updated in place
            so the counter and cache carry over between epochs.
    """
    # The lambda sweeps pass NumPy scalars; use a plain float for tensor math.
    lam = float(lam)
    crit = nn.CrossEntropyLoss()
    model.train()
    step, cached_gv = state['step'], state['gv']

    for x, _, idx in loader:
        x, idx = x.to(device), idx.to(device)
        batch_noisy, batch_clean = noisy_t[idx], clean_t[idx]

        # Gradient on the noisy labels. The graph is backpropagated only once
        # (the clean pass below runs its own forward), so it is not retained.
        opt.zero_grad()
        crit(model(x), batch_noisy).backward()
        # parameters_to_vector concatenates into a new tensor, so the result
        # is already independent of the .grad buffers.
        gs = parameters_to_vector([p.grad for p in model.parameters()])

        # Refresh the cached clean-label gradient every K steps.
        if step % K == 0 or cached_gv is None:
            opt.zero_grad()
            crit(model(x), batch_clean).backward()
            cached_gv = parameters_to_vector([p.grad for p in model.parameters()])

        # Blend the unit-norm directions; the epsilon guards a zero gradient.
        gs_n = gs / (gs.norm() + 1e-12)
        gv_n = cached_gv / (cached_gv.norm() + 1e-12)
        g_mix = (1 - lam) * gs_n + lam * gv_n

        # Scatter the flat blended gradient back onto the parameters. Each
        # slice is cloned so no .grad aliases the shared g_mix storage.
        opt.zero_grad()
        offset = 0
        for p in model.parameters():
            n = p.numel()
            p.grad = g_mix[offset:offset + n].view(p.shape).clone()
            offset += n

        opt.step()
        step += 1

    state['step'], state['gv'] = step, cached_gv

In [None]:
# Datasets and loaders are built once and reused by every run below.
trainset, testset = load_cifar10()
clean_labels = np.array(trainset.targets)

# The training loader yields (image, label, index) so labels can be looked up
# by sample index; the test loader yields plain (image, label) batches.
train_loader = DataLoader(
    IndexedDataset(trainset),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)
test_loader = DataLoader(
    testset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

# Throwaway model: report its size, run a few forward passes on random
# input, then release it before the real experiments start.
m = get_vgg11().to(device)
print(f'Params: {sum(p.numel() for p in m.parameters()):,}')
for _ in range(3):
    _ = m(torch.randn(BATCH_SIZE, 3, 32, 32, device=device))
del m
torch.cuda.empty_cache()
print('Ready')

In [None]:
# Resume support: results of finished seeds are stored as a JSON list of
# per-seed dicts. The previous code indexed that list with ['seed'], which
# raised TypeError whenever a non-empty checkpoint existed; the seeds are now
# read from the individual entries, and files are closed deterministically.
results = []
ckpt_file = f'{SAVE_DIR}/{NOTEBOOK_ID}_checkpoint.json'
if os.path.exists(ckpt_file):
    with open(ckpt_file) as fh:
        results = json.load(fh)
done_seeds = {r['seed'] for r in results}

for seed in range(N_SEEDS):
    if seed in done_seeds:
        continue
    print(f'\n{"="*50}\nSEED {seed}\n{"="*50}')
    noisy_labels = inject_noise(clean_labels, NOISE_RATE, seed)
    clean_t = torch.tensor(clean_labels, device=device)
    noisy_t = torch.tensor(noisy_labels, device=device)

    # Ordered: train from scratch at ORDERED_LAMBDA; the seed is skipped
    # unless the test error ends up below ORDERED_THRESHOLD.
    print(f'Ordered...')
    set_seed(seed)
    model = get_vgg11().to(device)
    opt = optim.SGD(model.parameters(), lr=LR, momentum=0.9, weight_decay=5e-4)
    sched = optim.lr_scheduler.MultiStepLR(opt, [25, 40], 0.1)
    state = {'step': 0, 'gv': None}
    for ep in range(ORDERED_EPOCHS):
        train_epoch(model, train_loader, opt, clean_t, noisy_t, ORDERED_LAMBDA, state)
        sched.step()
    ordered_error = 1 - evaluate(model, test_loader)
    print(f'  {ordered_error:.4f}')
    if ordered_error >= ORDERED_THRESHOLD:
        print('  Failed')
        continue
    ordered_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
    del model
    torch.cuda.empty_cache()

    # Collapse: train from scratch at COLLAPSE_LAMBDA; the seed is skipped
    # unless the test error reaches COLLAPSE_THRESHOLD.
    print(f'Collapse...')
    set_seed(seed + 100)
    model = get_vgg11().to(device)
    opt = optim.SGD(model.parameters(), lr=LR, momentum=0.9, weight_decay=5e-4)
    sched = optim.lr_scheduler.MultiStepLR(opt, [40, 60], 0.1)
    state = {'step': 0, 'gv': None}
    for ep in range(COLLAPSE_EPOCHS):
        train_epoch(model, train_loader, opt, clean_t, noisy_t, COLLAPSE_LAMBDA, state)
        sched.step()
    collapse_error = 1 - evaluate(model, test_loader)
    print(f'  {collapse_error:.4f}')
    if collapse_error < COLLAPSE_THRESHOLD:
        print('  Failed')
        continue
    collapse_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
    del model
    torch.cuda.empty_cache()

    # Sweep ordered: start from the ordered weights and walk lambda upwards
    # at a fixed, reduced learning rate, evaluating after each lambda value.
    print(f'Sweep ordered...')
    set_seed(seed + 200)
    model = get_vgg11().to(device)
    model.load_state_dict({k: v.to(device) for k, v in ordered_state.items()})
    opt = optim.SGD(model.parameters(), lr=LR * 0.01, momentum=0.9, weight_decay=5e-4)
    state = {'step': 0, 'gv': None}
    ordered_traj = []
    for lam in LAMBDA_GRID_UP:
        for _ in range(EPOCHS_PER_LAMBDA):
            train_epoch(model, train_loader, opt, clean_t, noisy_t, lam, state)
        err = 1 - evaluate(model, test_loader)
        ordered_traj.append({'lambda': float(lam), 'error': err})
        print(f'  {lam:.2f}: {err:.4f}')
    del model
    torch.cuda.empty_cache()

    # Sweep collapse: start from the collapsed weights and walk lambda downwards.
    print(f'Sweep collapse...')
    set_seed(seed + 300)
    model = get_vgg11().to(device)
    model.load_state_dict({k: v.to(device) for k, v in collapse_state.items()})
    opt = optim.SGD(model.parameters(), lr=LR * 0.01, momentum=0.9, weight_decay=5e-4)
    state = {'step': 0, 'gv': None}
    collapse_traj = []
    for lam in LAMBDA_GRID_DOWN:
        for _ in range(EPOCHS_PER_LAMBDA):
            train_epoch(model, train_loader, opt, clean_t, noisy_t, lam, state)
        err = 1 - evaluate(model, test_loader)
        collapse_traj.append({'lambda': float(lam), 'error': err})
        print(f'  {lam:.2f}: {err:.4f}')

    results.append({'seed': seed, 'arch': 'VGG11-BN', 'ordered_init': ordered_error, 'collapse_init': collapse_error,
                    'ordered_trajectory': ordered_traj, 'collapse_trajectory': collapse_traj, 'experiment_id': f'{NOTEBOOK_ID}-s{seed}'})
    # Checkpoint after every finished seed; the with-block guarantees the
    # file is flushed and closed before the next seed starts.
    with open(ckpt_file, 'w') as fh:
        json.dump(results, fh, indent=2)
    del model
    torch.cuda.empty_cache()

print(f'\n{NOTEBOOK_ID} COMPLETE')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Persist the raw per-seed results; the with-block closes the file reliably.
with open(f'{SAVE_DIR}/{NOTEBOOK_ID}_results.json', 'w') as fh:
    json.dump(results, fh, indent=2)

# Flatten both trajectories of every seed into one long-format table.
all_data = []
for r in results:
    for branch in ('ordered', 'collapse'):
        for t in r[f'{branch}_trajectory']:
            all_data.append({'seed': r['seed'], 'branch': branch, 'lambda': t['lambda'], 'error': t['error']})
# Explicit columns keep the frame usable (and the CSV header intact) even when
# no seed produced results; otherwise df['branch'] below would raise KeyError.
df = pd.DataFrame(all_data, columns=['seed', 'branch', 'lambda', 'error'])
df.to_csv(f'{SAVE_DIR}/{NOTEBOOK_ID}_results.csv', index=False)

# Mean test error per lambda for each branch, with a +/- one-std band over seeds.
fig, ax = plt.subplots(figsize=(10, 6))
for branch, color, marker in [('ordered', 'blue', 'o'), ('collapse', 'red', 's')]:
    dfb = df[df['branch'] == branch]
    if len(dfb) > 0:
        stats = dfb.groupby('lambda')['error'].agg(['mean', 'std']).reset_index()
        ax.fill_between(stats['lambda'], stats['mean']-stats['std'], stats['mean']+stats['std'], alpha=0.3, color=color)
        # color[0] is the one-letter matplotlib colour code ('b' / 'r').
        ax.plot(stats['lambda'], stats['mean'], f'{color[0]}-{marker}', linewidth=2, label=branch.capitalize())

ax.axhline(0.40, color='orange', linestyle='--', alpha=0.5)
ax.set_xlabel('λ'); ax.set_ylabel('Test Error'); ax.set_title('VGG11: Hysteresis', fontweight='bold')
# Only draw a legend when at least one branch was plotted.
if ax.get_legend_handles_labels()[0]:
    ax.legend()
ax.grid(True, alpha=0.3); ax.set_xlim(0.28, 0.72)
plt.tight_layout(); plt.savefig(f'{SAVE_DIR}/figures/{NOTEBOOK_ID}_hysteresis.png', dpi=150); plt.show()

print('\nSUMMARY')
if len(df[df['branch']=='ordered']) > 0 and len(df[df['branch']=='collapse']) > 0:
    # Tolerant comparison instead of exact float equality on the lambda grid.
    at_mid = np.isclose(df['lambda'].astype(float), 0.50)
    ord_err = df[(df['branch']=='ordered') & at_mid]['error'].mean()
    col_err = df[(df['branch']=='collapse') & at_mid]['error'].mean()
    gap = col_err - ord_err
    print(f'Gap at λ=0.50: {gap*100:.1f}%\nTwo-branch: {"YES" if gap > 0.10 else "NO"}')
else:
    print('No seed completed both branches; nothing to summarise.')