# Tβ2: Hysteresis at η=0.8 (High Noise)

**Purpose**: Confirm that the two-branch structure also exists at a higher noise rate.

**Protocol**:
- Train from scratch at η=0.8 (no checkpoints reused from η=0.4)
- Create ordered & collapse checkpoints
- Run sweep comparison

**Key Question**: Is hysteresis specific to a particular η, or is it a general phenomenon?

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os, glob, json, time
from datetime import datetime

# Experiment identity and the Drive folder under which run directories live.
EXP_NAME = 'exp_Tb2_eta08'
NOTEBOOK_ID = 'Tb2'
BASE_DIR = '/content/drive/MyDrive/dual-gradient-learning/Paper-A'

# Resume into the latest run directory when one exists, otherwise start a new
# timestamped one. Only directories are considered: a stray *file* matching the
# pattern would otherwise be picked as SAVE_DIR and make the makedirs calls
# below fail. Directories created here end in a %Y%m%d_%H%M%S timestamp, so
# lexicographic order is chronological order.
existing = sorted(
    path for path in glob.glob(f'{BASE_DIR}/{EXP_NAME}_*') if os.path.isdir(path)
)
if existing:
    SAVE_DIR = existing[-1]
    print(f'üîÑ Resuming: {SAVE_DIR}')
else:
    TIMESTAMP = datetime.now().strftime('%Y%m%d_%H%M%S')
    SAVE_DIR = f'{BASE_DIR}/{EXP_NAME}_{TIMESTAMP}'
    os.makedirs(SAVE_DIR, exist_ok=True)
    print(f'üÜï New: {SAVE_DIR}')

# Sub-folders used by the later cells (checkpoints and saved figures).
os.makedirs(f'{SAVE_DIR}/checkpoints', exist_ok=True)
os.makedirs(f'{SAVE_DIR}/figures', exist_ok=True)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils import parameters_to_vector
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18
import numpy as np

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

In [None]:
# Key parameter: high label-noise rate (η) used throughout this notebook.
NOISE_RATE = 0.8  # Different from η=0.4

# Data-loading and optimisation settings.
BATCH_SIZE = 256
NUM_WORKERS = 4
LR = 0.1  # base learning rate for checkpoint training; the sweep uses LR * 0.01
K = 16  # train_one_epoch refreshes its cached clean-label gradient every K steps

# Checkpoint creation
# "Ordered" checkpoints: trained at a low λ and kept only if test error is
# below ORDERED_THRESHOLD.
ORDERED_LAMBDA = 0.35
ORDERED_EPOCHS = 50
ORDERED_THRESHOLD = 0.30  # Relaxed for high noise

# "Collapse" checkpoints: trained at a high λ and kept only if test error is
# at least COLLAPSE_THRESHOLD.
COLLAPSE_LAMBDA = 0.65  # Adjusted for η=0.8
COLLAPSE_EPOCHS = 100
COLLAPSE_THRESHOLD = 0.50

# Sweep settings (reduced version)
LAMBDA_START = 0.30
LAMBDA_END = 0.70
LAMBDA_STEP = 0.04  # Coarser for speed
EPOCHS_PER_LAMBDA = 3

# Ascending and descending λ grids. The half-step added to the stop value keeps
# the end point inside np.arange's half-open range; rounding to two decimals
# removes floating-point drift so both grids contain the same λ values.
LAMBDA_GRID_UP = np.round(np.arange(LAMBDA_START, LAMBDA_END + LAMBDA_STEP/2, LAMBDA_STEP), 2)
LAMBDA_GRID_DOWN = np.round(np.arange(LAMBDA_END, LAMBDA_START - LAMBDA_STEP/2, -LAMBDA_STEP), 2)

N_SEEDS = 3  # Reduced for speed

print(f'Œ∑ = {NOISE_RATE}')
print(f'Œª points: {len(LAMBDA_GRID_UP)}')

In [None]:
def get_resnet18():
    """Build an untrained 10-class ResNet-18 with a modified stem.

    The first convolution is replaced by a 3x3, stride-1, padding-1 layer
    without bias, and the stem max-pool is replaced by an identity.
    """
    net = resnet18(weights=None, num_classes=10)
    stem_conv = nn.Conv2d(
        in_channels=3,
        out_channels=64,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=False,
    )
    net.conv1 = stem_conv
    net.maxpool = nn.Identity()
    return net

class IndexedDataset(Dataset):
    """Dataset wrapper whose items also carry the index they were fetched with.

    Each item of the wrapped dataset must be an ``(img, label)`` pair; the
    wrapper returns ``(img, label, idx)``.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample, target = self.dataset[idx]
        return sample, target, idx

def set_seed(seed):
    """Seed NumPy and PyTorch (CPU and all CUDA devices) and make cuDNN deterministic."""
    np.random.seed(seed)
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    # Disable cuDNN auto-tuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def inject_label_noise(labels, noise_rate, seed, num_classes=10):
    """Return a copy of *labels* with a fixed fraction flipped to a wrong class.

    ``int(noise_rate * len(labels))`` distinct positions are chosen without
    replacement, and each chosen label is replaced by a uniformly drawn class
    different from the original one.

    Randomness comes from a private ``np.random.RandomState(seed)``, so the
    global NumPy RNG is left untouched. The draw sequence is the same as
    seeding the global RNG with ``seed`` and calling ``np.random.choice``, so
    labels produced for a given seed are unchanged.

    Args:
        labels: Label array (anything with ``.copy()``, ``len`` and integer
            item assignment; the notebook passes a NumPy array).
        noise_rate: Fraction of labels to corrupt.
        seed: Seed for the private random generator.
        num_classes: Number of classes; replacement labels are drawn from
            ``range(num_classes)``.

    Returns:
        The noisy copy; *labels* itself is not modified.
    """
    rng = np.random.RandomState(seed)
    noisy = labels.copy()
    n_noisy = int(noise_rate * len(labels))
    flip_positions = rng.choice(len(labels), n_noisy, replace=False)
    for pos in flip_positions:
        wrong_classes = [c for c in range(num_classes) if c != labels[pos]]
        noisy[pos] = rng.choice(wrong_classes)
    return noisy

def load_cifar10():
    """Return the CIFAR-10 ``(train, test)`` datasets stored under ``./data``.

    The training split uses random 32x32 crops with 4-pixel padding and random
    horizontal flips; both splits are converted to tensors and normalised with
    the same per-channel mean and standard deviation. Data is downloaded when
    missing.
    """
    normalize = transforms.Normalize((0.4914,0.4822,0.4465),(0.2023,0.1994,0.2010))
    train_tf = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    test_tf = transforms.Compose([transforms.ToTensor(), normalize])

    train_set = torchvision.datasets.CIFAR10('./data', train=True, transform=train_tf, download=True)
    test_set = torchvision.datasets.CIFAR10('./data', train=False, transform=test_tf, download=True)
    return train_set, test_set

def evaluate(model, loader):
    """Return the model's top-1 accuracy over *loader*.

    The model is switched to eval mode (and left there) and gradients are
    disabled. *loader* must yield ``(inputs, targets)`` batches.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            predictions = model(inputs).argmax(1)
            n_correct += (predictions == targets).sum().item()
            n_seen += targets.size(0)
    return n_correct / n_seen

In [None]:
def train_one_epoch(model, train_loader, opt, clean_t, noisy_t, lam, cached_gv_ref):
    """Train for one pass over *train_loader* with a mixed, normalised gradient.

    For every batch the gradient of the cross-entropy loss on the noisy labels
    (``gs``) is combined with a cached gradient of the loss on the clean labels
    (``gv``). Both are scaled to unit L2 norm and mixed as
    ``(1 - lam) * gs + lam * gv``; the result is written into ``p.grad`` and
    applied with ``opt.step()``.

    The clean-label gradient is recomputed only every ``K`` steps (or when no
    cached value exists) from the batch at hand, and reused in between.

    Args:
        model: Network to train; put into train mode here.
        train_loader: Yields ``(x, _, idx)`` batches, where ``idx`` indexes
            into ``clean_t`` / ``noisy_t``.
        opt: Optimizer over ``model.parameters()``.
        clean_t: Clean labels, indexed by ``idx``.
        noisy_t: Noisy labels, indexed by ``idx``.
        lam: Mixing weight given to the clean-label gradient.
        cached_gv_ref: Dict with keys ``'step'`` (global step counter) and
            ``'gv'`` (cached clean gradient or ``None``). Updated in place on
            return so both carry over to the next call.
    """
    crit = nn.CrossEntropyLoss()
    model.train()
    params = list(model.parameters())
    step = cached_gv_ref['step']
    cached_gv = cached_gv_ref['gv']

    for x, _, idx in train_loader:
        x, idx = x.to(device), idx.to(device)
        bn, bc = noisy_t[idx], clean_t[idx]

        # Gradient of the noisy-label loss. The graph is not retained: the
        # clean-label pass below runs its own forward, so keeping this graph
        # alive would only raise peak memory.
        opt.zero_grad()
        loss_s = crit(model(x), bn)
        loss_s.backward()
        gs = parameters_to_vector([p.grad for p in params]).clone()

        # Refresh the cached clean-label gradient every K steps. This is a
        # second forward pass in train mode on the same batch.
        if step % K == 0 or cached_gv is None:
            opt.zero_grad()
            loss_v = crit(model(x), bc)
            loss_v.backward()
            cached_gv = parameters_to_vector([p.grad for p in params]).clone()

        # Unit-normalise both directions; the epsilon guards against a zero norm.
        gs_n = gs / (gs.norm() + 1e-12)
        gv_n = cached_gv / (cached_gv.norm() + 1e-12)

        g_mix = (1 - lam) * gs_n + lam * gv_n

        # Scatter the flat mixed gradient back into each parameter's .grad.
        opt.zero_grad()
        i = 0
        for p in params:
            n = p.numel()
            p.grad = g_mix[i:i+n].view(p.shape).clone()
            i += n
        opt.step()
        step += 1

    cached_gv_ref['step'] = step
    cached_gv_ref['gv'] = cached_gv

In [None]:
# Datasets and loaders shared by every phase. The training set is wrapped in
# IndexedDataset so each batch also carries the sample indices used to look up
# clean / noisy labels.
trainset, testset = load_cifar10()
clean_labels = np.array(trainset.targets)
train_loader = DataLoader(IndexedDataset(trainset), BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
test_loader = DataLoader(testset, BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

# Ten forward passes on a throwaway model (presumably a GPU warm-up).
# They run under no_grad and the outputs are discarded: keeping an output that
# carries an autograd graph would keep the model's parameters and activations
# alive after `del m`, so empty_cache() could not release them.
m = get_resnet18().to(device)
with torch.no_grad():
    for _ in range(10):
        m(torch.randn(BATCH_SIZE, 3, 32, 32, device=device))
del m
torch.cuda.empty_cache()
print('Ready')

In [None]:
# Phase 1: Create Ordered Checkpoints for η=0.8
# For each seed, train a fresh model at ORDERED_LAMBDA and keep the checkpoint
# only if its final test error is below ORDERED_THRESHOLD.
print('='*60)
print('PHASE 1: Creating Ordered Checkpoints (Œ∑=0.8)')
print('='*60)

# Paths of the usable ordered checkpoints (already on disk or newly saved).
ordered_ckpts = []

for seed in range(N_SEEDS):
    ckpt_path = f'{SAVE_DIR}/checkpoints/ordered_eta08_seed{seed:02d}.pth'
    # Resume support: a checkpoint saved by an earlier run is reused as-is.
    if os.path.exists(ckpt_path):
        print(f'Seed {seed}: Already exists')
        ordered_ckpts.append(ckpt_path)
        continue
    
    print(f'\nSeed {seed}: Training at Œª={ORDERED_LAMBDA}...')
    
    # Seed first, then draw the noisy labels with the same seed; the model is
    # created afterwards.
    set_seed(seed)
    noisy_labels = inject_label_noise(clean_labels, NOISE_RATE, seed)
    clean_t = torch.tensor(clean_labels, device=device)
    noisy_t = torch.tensor(noisy_labels, device=device)
    
    model = get_resnet18().to(device)
    opt = optim.SGD(model.parameters(), lr=LR, momentum=0.9, weight_decay=5e-4)
    # Learning rate is multiplied by 0.1 at epochs 30 and 40.
    sched = optim.lr_scheduler.MultiStepLR(opt, [30, 40], gamma=0.1)
    # Step counter and cached clean gradient shared across epochs.
    cached_gv_ref = {'step': 0, 'gv': None}
    
    for ep in range(ORDERED_EPOCHS):
        train_one_epoch(model, train_loader, opt, clean_t, noisy_t, ORDERED_LAMBDA, cached_gv_ref)
        sched.step()
        # Progress report every 10 epochs.
        if (ep + 1) % 10 == 0:
            err = 1 - evaluate(model, test_loader)
            print(f'  Epoch {ep+1}: error={err:.4f}')
    
    final_error = 1 - evaluate(model, test_loader)
    
    # Only runs that reached the ordered (low-error) regime are saved; a failed
    # seed leaves no file, so it is retrained on the next resume.
    if final_error < ORDERED_THRESHOLD:
        torch.save({
            'seed': seed,
            'eta': NOISE_RATE,
            'lambda': ORDERED_LAMBDA,
            'final_error': final_error,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': opt.state_dict()
        }, ckpt_path)
        ordered_ckpts.append(ckpt_path)
        print(f'  ‚úÖ SAVED: error={final_error:.4f}')
    else:
        print(f'  ‚ùå Failed threshold: error={final_error:.4f}')
    
    torch.cuda.empty_cache()

print(f'\nOrdered checkpoints: {len(ordered_ckpts)}')

In [None]:
# Phase 2: Create Collapse Checkpoints for η=0.8
# For each seed, train a fresh model at COLLAPSE_LAMBDA and keep the checkpoint
# only if its final test error is at least COLLAPSE_THRESHOLD.
print('='*60)
print('PHASE 2: Creating Collapse Checkpoints (Œ∑=0.8)')
print('='*60)

# Paths of the usable collapse checkpoints (already on disk or newly saved).
collapse_ckpts = []

for seed in range(N_SEEDS):
    ckpt_path = f'{SAVE_DIR}/checkpoints/collapse_eta08_seed{seed:02d}.pth'
    # Resume support: a checkpoint saved by an earlier run is reused as-is.
    if os.path.exists(ckpt_path):
        print(f'Seed {seed}: Already exists')
        collapse_ckpts.append(ckpt_path)
        continue
    
    print(f'\nSeed {seed}: Training at Œª={COLLAPSE_LAMBDA}...')
    
    # Seeds are offset by 100 relative to the loop index. The same offset seed
    # is stored in the checkpoint so run_sweep regenerates identical noisy labels.
    set_seed(seed + 100)
    noisy_labels = inject_label_noise(clean_labels, NOISE_RATE, seed + 100)
    clean_t = torch.tensor(clean_labels, device=device)
    noisy_t = torch.tensor(noisy_labels, device=device)
    
    model = get_resnet18().to(device)
    opt = optim.SGD(model.parameters(), lr=LR, momentum=0.9, weight_decay=5e-4)
    # Learning rate is multiplied by 0.1 at epochs 50 and 80.
    sched = optim.lr_scheduler.MultiStepLR(opt, [50, 80], gamma=0.1)
    # Step counter and cached clean gradient shared across epochs.
    cached_gv_ref = {'step': 0, 'gv': None}
    
    for ep in range(COLLAPSE_EPOCHS):
        train_one_epoch(model, train_loader, opt, clean_t, noisy_t, COLLAPSE_LAMBDA, cached_gv_ref)
        sched.step()
        # Progress report every 20 epochs.
        if (ep + 1) % 20 == 0:
            err = 1 - evaluate(model, test_loader)
            print(f'  Epoch {ep+1}: error={err:.4f}')
    
    final_error = 1 - evaluate(model, test_loader)
    
    # Only runs that ended in the collapsed (high-error) regime are saved; a
    # seed that did not collapse leaves no file and is retrained on resume.
    if final_error >= COLLAPSE_THRESHOLD:
        torch.save({
            'seed': seed + 100,
            'eta': NOISE_RATE,
            'lambda': COLLAPSE_LAMBDA,
            'final_error': final_error,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': opt.state_dict()
        }, ckpt_path)
        collapse_ckpts.append(ckpt_path)
        print(f'  üíÄ SAVED: error={final_error:.4f}')
    else:
        print(f'  ‚ö†Ô∏è Not collapsed enough: error={final_error:.4f}')
    
    torch.cuda.empty_cache()

print(f'\nCollapse checkpoints: {len(collapse_ckpts)}')

In [None]:
# Phase 3: Run Sweep Comparison
print('='*60)
print('PHASE 3: Sweep Comparison')
print('='*60)

# One summary dict per swept checkpoint, as returned by run_sweep.
results = []

def run_sweep(ckpt_path, direction):
    """Continue training a saved checkpoint while stepping λ along a grid.

    The checkpoint's model weights are loaded into a fresh network. Training
    runs for ``EPOCHS_PER_LAMBDA`` epochs at each λ of the grid, with a new SGD
    optimizer at ``LR * 0.01``, and test error is recorded after each λ.

    Args:
        ckpt_path: Path to a checkpoint holding ``'seed'``, ``'final_error'``
            and ``'model_state_dict'``.
        direction: ``'up'`` sweeps ``LAMBDA_GRID_UP`` (branch ``'ordered_up'``);
            any other value sweeps ``LAMBDA_GRID_DOWN`` (branch
            ``'collapse_down'``).

    Returns:
        Dict with the checkpoint seed, its stored error, the error at the last
        λ, the branch name, the per-λ trajectory and the noise rate.
    """
    state = torch.load(ckpt_path, map_location=device)
    ckpt_seed = state['seed']
    start_error = state['final_error']

    if direction == 'up':
        grid, branch_name = LAMBDA_GRID_UP, 'ordered_up'
    else:
        grid, branch_name = LAMBDA_GRID_DOWN, 'collapse_down'

    # Regenerate the noisy labels from the seed stored in the checkpoint.
    noisy_t = torch.tensor(
        inject_label_noise(clean_labels, NOISE_RATE, ckpt_seed), device=device
    )
    clean_t = torch.tensor(clean_labels, device=device)

    # Seed before building the model so everything downstream is reproducible.
    set_seed(ckpt_seed + 3000)
    model = get_resnet18().to(device)
    model.load_state_dict(state['model_state_dict'])

    opt = optim.SGD(model.parameters(), lr=LR * 0.01, momentum=0.9, weight_decay=5e-4)
    grad_cache = {'step': 0, 'gv': None}
    trajectory = []

    for lam in grid:
        epochs_done = 0
        while epochs_done < EPOCHS_PER_LAMBDA:
            train_one_epoch(model, train_loader, opt, clean_t, noisy_t, lam, grad_cache)
            epochs_done += 1
        err = 1 - evaluate(model, test_loader)
        trajectory.append({'lambda': float(lam), 'error': err})
        print(f'    Œª={lam:.2f}: err={err:.4f}')

    summary = {
        'seed': ckpt_seed,
        'init_error': start_error,
        'final_error': trajectory[-1]['error'],
        'branch': branch_name,
        'trajectory': trajectory,
        'eta': NOISE_RATE,
    }
    return summary

# Ordered branch: start from each ordered checkpoint and sweep the λ grid upward.
print('\n--- Ordered Branch (Œª ‚Üë) ---')
for ckpt in ordered_ckpts:
    print(f'\n{os.path.basename(ckpt)}')
    result = run_sweep(ckpt, 'up')
    results.append(result)
    torch.cuda.empty_cache()

# Collapse branch: start from each collapse checkpoint and sweep the λ grid downward.
print('\n--- Collapse Branch (Œª ‚Üì) ---')
for ckpt in collapse_ckpts:
    print(f'\n{os.path.basename(ckpt)}')
    result = run_sweep(ckpt, 'down')
    results.append(result)
    torch.cuda.empty_cache()

# Persist all sweep summaries. The context manager closes (and thereby flushes)
# the file deterministically instead of leaving the handle to the garbage
# collector.
with open(f'{SAVE_DIR}/{NOTEBOOK_ID}_results.json', 'w') as results_file:
    json.dump(results, results_file, indent=2, default=str)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Visualization
# Flatten every sweep trajectory into one long-format table. The columns are
# given explicitly so that an empty `results` list still produces a frame with
# a 'branch' column instead of raising KeyError in the filters below.
all_data = [
    {'seed': r['seed'], 'lambda': t['lambda'], 'error': t['error'], 'branch': r['branch']}
    for r in results
    for t in r['trajectory']
]
df = pd.DataFrame(all_data, columns=['seed', 'lambda', 'error', 'branch'])
df.to_csv(f'{SAVE_DIR}/{NOTEBOOK_ID}_results.csv', index=False)

fig, ax = plt.subplots(figsize=(10, 6))

df_ord = df[df['branch'] == 'ordered_up']
df_col = df[df['branch'] == 'collapse_down']

# NOTE(review): the legend / axis / title strings below look mis-encoded
# (probably 'λ', 'η' and arrows) - confirm the file encoding before publishing.

# Per-λ mean with a ±1 std band across seeds, one curve per branch.
if not df_ord.empty:
    df_ord_mean = df_ord.groupby('lambda')['error'].agg(['mean', 'std']).reset_index()
    ax.fill_between(df_ord_mean['lambda'], df_ord_mean['mean'] - df_ord_mean['std'],
                    df_ord_mean['mean'] + df_ord_mean['std'], alpha=0.3, color='blue')
    ax.plot(df_ord_mean['lambda'], df_ord_mean['mean'], 'b-o', linewidth=2, markersize=6, label='Ordered (Œª‚Üë)')

if not df_col.empty:
    df_col_mean = df_col.groupby('lambda')['error'].agg(['mean', 'std']).reset_index()
    ax.fill_between(df_col_mean['lambda'], df_col_mean['mean'] - df_col_mean['std'],
                    df_col_mean['mean'] + df_col_mean['std'], alpha=0.3, color='red')
    ax.plot(df_col_mean['lambda'], df_col_mean['mean'], 'r-s', linewidth=2, markersize=6, label='Collapse (Œª‚Üì)')

# Reference lines use the same thresholds that gated checkpoint creation, so
# the figure cannot drift out of sync with the configuration.
ax.axhline(COLLAPSE_THRESHOLD, color='red', linestyle='--', linewidth=1.5, alpha=0.7, label='Collapse threshold')
ax.axhline(ORDERED_THRESHOLD, color='green', linestyle='--', linewidth=1.5, alpha=0.7, label='Ordered threshold')

ax.set_xlabel('Œª', fontsize=12)
ax.set_ylabel('Test Error', fontsize=12)
ax.set_title(f'Hysteresis at Œ∑={NOISE_RATE} (High Noise)', fontsize=14, fontweight='bold')
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
# Small margin around the swept λ range.
ax.set_xlim(LAMBDA_START - 0.02, LAMBDA_END + 0.02)

plt.tight_layout()
plt.savefig(f'{SAVE_DIR}/figures/{NOTEBOOK_ID}_hysteresis_eta08.png', dpi=150)
plt.show()

# Summary
print('\n' + '='*60)
print(f'{NOTEBOOK_ID} SUMMARY: Œ∑={NOISE_RATE}')
print('='*60)

# Compare the two branches at λ=0.50 when it is on the grid, otherwise at the
# middle grid point; a gap above 0.10 is reported as a two-branch structure.
if not df_ord.empty and not df_col.empty:
    ord_lambdas = df_ord_mean['lambda'].values
    mid_lam = 0.50 if 0.50 in ord_lambdas else ord_lambdas[len(df_ord_mean)//2]
    ord_at_mid = df_ord_mean[df_ord_mean['lambda']==mid_lam]['mean'].values[0]
    col_at_mid = df_col_mean[df_col_mean['lambda']==mid_lam]['mean'].values[0]
    gap = col_at_mid - ord_at_mid
    print(f'\nüìä Gap at Œª={mid_lam}: {gap*100:.1f}%')
    print(f'   Two-branch structure: {"YES" if gap > 0.10 else "WEAK or NO"}')
else:
    # At least one branch produced no sweep results, so no gap can be computed.
    print('\nNo gap computed: sweep results are missing for at least one branch.')