# Exp Q3: Learning Rate - Lambda Interaction

## ÁõÆÁöÑ
„ÄåÂÆü„ÅØlr„ÅåÂäπ„ÅÑ„Å¶„Çã„Å†„Åë„Äç„Å®„ÅÑ„ÅÜÊüªË™≠„ÉÑ„ÉÉ„Ç≥„Éü„ÇíÂÖà„Å´ÊΩ∞„Åô„ÄÇ
Áõ∏Â¢ÉÁïå„Åålr„Åß„Å©„Çå„Å†„ÅëÂãï„Åè„Åã„ÇíÁ≥ªÁµ±ÁöÑ„Å´Ê§úË®º„ÄÇ

## 実験設計
- **Model**: ResNet18
- **η**: 0.4, 0.8
- **lr**: [0.05, 0.1, 0.2]
- **λ**: [0.30, 0.35, 0.45, 0.50, 0.60]
- **Seeds**: 5
- **Total**: 2 × 3 × 5 × 5 = **150 runs**

## 推定時間
~150 runs × 9 min ≈ **22.5 時間**（3並列で7.5時間）

In [None]:
# ===== Setup =====
from google.colab import drive
drive.mount('/content/drive')

import os
import glob
from datetime import datetime

EXP_NAME = 'exp_Q3_lr_lambda_interaction'
BASE_DIR = '/content/drive/MyDrive/dual-gradient-learning/Paper-A'

# Look for output directories left by earlier sessions of this experiment.
# If any exist, continue in the one that sorts last by name; otherwise
# create a fresh timestamped directory.
existing = glob.glob(f'{BASE_DIR}/{EXP_NAME}_*')
if not existing:
    TIMESTAMP = datetime.now().strftime('%Y%m%d_%H%M%S')
    SAVE_DIR = f'{BASE_DIR}/{EXP_NAME}_{TIMESTAMP}'
    os.makedirs(SAVE_DIR, exist_ok=True)
    print(f'üÜï New experiment: {SAVE_DIR}')
else:
    SAVE_DIR = max(existing)
    print(f'üîÑ Resuming from: {SAVE_DIR}')

# Figures are written to a sub-directory of the run directory.
os.makedirs(f'{SAVE_DIR}/figures', exist_ok=True)
print(f'Save directory: {SAVE_DIR}')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils import parameters_to_vector
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18
import numpy as np
import json
import time

# Run on CUDA when PyTorch reports it as available, otherwise on the CPU.
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')
print(f'Device: {device}')
if device.type == 'cuda':
    print(f'GPU: {torch.cuda.get_device_name()}')

In [None]:
# ===== ResNet18 for CIFAR-10 =====
def get_resnet18():
    """Build an untrained 10-class torchvision ResNet18 with a modified stem.

    The first convolution is replaced by a 3x3, stride-1, padding-1 convolution
    without bias, and the max-pool layer is replaced by an identity module.
    """
    net = resnet18(weights=None, num_classes=10)
    net.maxpool = nn.Identity()
    net.conv1 = nn.Conv2d(
        in_channels=3,
        out_channels=64,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=False,
    )
    return net

# ===== IndexedDataset =====
class IndexedDataset(Dataset):
    """Wrap a dataset of ``(image, label)`` pairs so items also carry their index.

    ``ds[i]`` returns ``(image, label, i)``, where ``i`` is the index exactly as
    it was passed in.
    """

    def __init__(self, dataset):
        # The wrapped dataset; it must support len() and integer indexing.
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        image, target = sample
        return image, target, idx

In [None]:
# ===== Experiment parameters =====
BATCH_SIZE = 256
NUM_WORKERS = 4
EPOCHS = 100
K = 16

# Swept variables
LEARNING_RATES = [0.05, 0.1, 0.2]
LAMBDAS = [0.30, 0.35, 0.45, 0.50, 0.60]
NOISE_RATES = [0.4, 0.8]
SEEDS = list(range(5))

# Full grid of runs. Learning rate varies slowest and seed fastest.
experiments = [
    {'lr': lr, 'noise_rate': eta, 'lambda': lam, 'seed': seed}
    for lr in LEARNING_RATES
    for eta in NOISE_RATES
    for lam in LAMBDAS
    for seed in SEEDS
]

print(f'Learning rates: {LEARNING_RATES}')
print(f'Lambdas: {LAMBDAS}')
print(f'Noise rates: {NOISE_RATES}')
print(f'Total experiments: {len(experiments)}')
# Rough estimate that assumes 9 minutes per run.
print(f'Estimated time: {len(experiments) * 9 / 60:.1f} hours')

# Save config next to the results so the sweep definition travels with them.
config = {
    'experiment': EXP_NAME,
    'description': 'Learning rate - lambda interaction: does lr shift phase boundary?',
    'parameters': dict(
        learning_rates=LEARNING_RATES,
        lambdas=LAMBDAS,
        noise_rates=NOISE_RATES,
        seeds=SEEDS,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        K=K,
    ),
    'total_runs': len(experiments),
}
with open(f'{SAVE_DIR}/{EXP_NAME}_config.json', 'w') as f:
    json.dump(config, f, indent=2)

In [None]:
# ===== Utility functions =====
def set_seed(seed):
    """Seed NumPy's global RNG and PyTorch's CPU and CUDA RNGs with ``seed``."""
    np.random.seed(seed)
    for seeder in (torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

def load_cifar10():
    """Download (if needed) CIFAR-10 into ``./data`` and return ``(trainset, testset)``.

    The training set uses random 32x32 crops with 4-pixel padding and random
    horizontal flips; both sets are converted to tensors and normalized with
    the same per-channel mean and standard deviation.
    """
    channel_mean = (0.4914, 0.4822, 0.4465)
    channel_std = (0.2470, 0.2435, 0.2616)

    def to_normalized_tensor():
        # Fresh transform instances per pipeline, as in a literal definition.
        return [transforms.ToTensor(), transforms.Normalize(channel_mean, channel_std)]

    augmentation = [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
    ]
    transform_train = transforms.Compose(augmentation + to_normalized_tensor())
    transform_test = transforms.Compose(to_normalized_tensor())

    trainset = torchvision.datasets.CIFAR10(
        './data', train=True, download=True, transform=transform_train)
    testset = torchvision.datasets.CIFAR10(
        './data', train=False, download=True, transform=transform_test)
    return trainset, testset

def inject_label_noise(labels, noise_rate, seed):
    """Return a copy of ``labels`` with a fixed fraction replaced by wrong classes.

    NumPy's global RNG is re-seeded with ``seed``. Then
    ``int(noise_rate * len(labels))`` distinct positions are drawn, and each
    one is assigned a class in ``range(10)`` that differs from its original
    label. The input is not modified.

    Args:
        labels: Sequence of integer class labels in ``range(10)``.
        noise_rate: Fraction of positions to corrupt.
        seed: Seed applied to ``np.random`` before sampling.

    Returns:
        A NumPy array of the same length holding the corrupted labels.
    """
    np.random.seed(seed)
    original = np.array(labels)
    total = len(original)
    flip_count = int(noise_rate * total)
    flip_positions = np.random.choice(total, flip_count, replace=False)

    corrupted = original.copy()
    for pos in flip_positions:
        true_label = original[pos]
        # Every class except the true one is a candidate replacement.
        alternatives = [cls for cls in range(10) if cls != true_label]
        corrupted[pos] = np.random.choice(alternatives)
    return corrupted

def get_data_loaders(trainset, testset):
    """Create the training and test ``DataLoader`` objects.

    The training set is wrapped in ``IndexedDataset`` and shuffled; the test
    set is iterated in order. Both loaders use the module-level ``BATCH_SIZE``
    and ``NUM_WORKERS`` and pinned memory.

    Returns:
        ``(train_loader, test_loader)``.
    """
    shared_options = dict(
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=True,
    )
    train_loader = DataLoader(IndexedDataset(trainset), shuffle=True, **shared_options)
    test_loader = DataLoader(testset, shuffle=False, **shared_options)
    return train_loader, test_loader

def evaluate(model, test_loader):
    """Return the top-1 accuracy of ``model`` over ``test_loader`` as a float.

    The model is switched to eval mode and left in that mode. Batches of
    ``(inputs, targets)`` are moved to the module-level ``device`` and scored
    without gradient tracking.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            # Index of the largest logit along the class dimension.
            predictions = model(batch_x).max(1)[1]
            n_seen += batch_y.size(0)
            n_correct += (predictions == batch_y).sum().item()
    return n_correct / n_seen

In [None]:
# ===== Dual-Gradient Learning with variable LR =====
def _flat_grad(model):
    """Return a copy of every parameter gradient of ``model`` as one flat vector."""
    return parameters_to_vector([p.grad for p in model.parameters()]).clone()


def train_dual_gradient(model, train_loader, test_loader, clean_labels, noisy_labels, lam, lr):
    """Train ``model`` with a convex mix of two unit-normalized gradients.

    Each step computes the "structure" gradient (cross-entropy against the
    noisy labels of the current batch). Every ``K`` steps it also recomputes
    the "value" gradient (cross-entropy against the clean labels of that
    batch), which is cached and reused in between. Both gradients are scaled
    to unit norm and combined as ``(1 - lam) * g_struct + lam * g_value``
    before the SGD update.

    Reads the module-level ``EPOCHS``, ``K`` and ``device``.

    Args:
        model: Network to train; every parameter must receive a gradient.
        train_loader: Yields ``(inputs, labels, indices)``; the loader's labels
            are ignored and labels are looked up by ``indices``.
        test_loader: Passed to ``evaluate`` for test accuracy.
        clean_labels: Per-sample clean labels, indexable by dataset index.
        noisy_labels: Per-sample noisy labels, indexable by dataset index.
        lam: Mixing weight of the value gradient.
        lr: Base learning rate for SGD.

    Returns:
        ``(final_acc, best_acc, avg_cos, history)``. ``best_acc`` is the
        maximum over the 10-epoch evaluations and the final one. ``avg_cos``
        is the mean over epochs of the per-epoch mean cosine similarity
        between the two gradients. ``history`` holds the 10-epoch evaluations.
    """
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    # Same decay epochs (x0.1 at 50 and 75) for every base learning rate.
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 75], gamma=0.1)
    criterion = nn.CrossEntropyLoss()

    clean_labels_tensor = torch.tensor(clean_labels, device=device)
    noisy_labels_tensor = torch.tensor(noisy_labels, device=device)

    cached_g_value = None
    global_step = 0

    best_acc = 0
    cos_history = []
    history = {'epoch': [], 'test_acc': [], 'test_error': []}

    for epoch in range(EPOCHS):
        model.train()
        epoch_cos = []

        for inputs, _, indices in train_loader:
            inputs = inputs.to(device, non_blocking=True)
            indices = indices.to(device, non_blocking=True)
            batch_noisy = noisy_labels_tensor[indices]
            batch_clean = clean_labels_tensor[indices]

            # Structure gradient (noisy labels). The graph is not retained:
            # the value gradient below always runs its own forward pass, so
            # keeping this graph alive would only raise peak memory.
            optimizer.zero_grad()
            outputs = model(inputs)
            loss_struct = criterion(outputs, batch_noisy)
            loss_struct.backward()
            g_struct = _flat_grad(model)

            # Value gradient (clean labels), refreshed every K steps and
            # reused from the cache otherwise.
            if global_step % K == 0 or cached_g_value is None:
                optimizer.zero_grad()
                outputs = model(inputs)
                loss_value = criterion(outputs, batch_clean)
                loss_value.backward()
                cached_g_value = _flat_grad(model)

            # Mix gradients after scaling each to unit norm; the epsilon
            # guards against division by zero.
            g_struct_norm = g_struct / (g_struct.norm() + 1e-12)
            g_value_norm = cached_g_value / (cached_g_value.norm() + 1e-12)

            cos_sim = (g_struct_norm @ g_value_norm).item()
            epoch_cos.append(cos_sim)

            g_mix = (1 - lam) * g_struct_norm + lam * g_value_norm

            # Apply mixed gradient: scatter the flat vector back onto the
            # parameters in the same order it was flattened.
            optimizer.zero_grad()
            offset = 0
            for p in model.parameters():
                numel = p.numel()
                p.grad = g_mix[offset:offset + numel].view(p.shape).clone()
                offset += numel
            optimizer.step()
            global_step += 1

        scheduler.step()
        cos_history.append(np.mean(epoch_cos))

        # Test accuracy is sampled every 10 epochs only.
        if (epoch + 1) % 10 == 0:
            acc = evaluate(model, test_loader)
            best_acc = max(best_acc, acc)
            history['epoch'].append(epoch + 1)
            history['test_acc'].append(acc)
            history['test_error'].append(1 - acc)

    final_acc = evaluate(model, test_loader)
    avg_cos = np.mean(cos_history)

    return final_acc, max(best_acc, final_acc), avg_cos, history

In [None]:
# ===== Data preparation =====
trainset, testset = load_cifar10()
# Clean labels in dataset order; noisy labels are derived from these per run.
clean_labels = np.array(trainset.targets)
train_loader, test_loader = get_data_loaders(trainset, testset)

print('Data prepared')

# GPU warmup: a few throwaway forward passes on random input. They run under
# no_grad because the outputs are discarded, so no autograd graph is needed.
warmup_model = get_resnet18().to(device)
with torch.no_grad():
    for _ in range(20):
        _ = warmup_model(torch.randn(BATCH_SIZE, 3, 32, 32, device=device))
del warmup_model
torch.cuda.empty_cache()
print('GPU warmed up')

In [None]:
# ===== Main experiment loop =====
def _write_checkpoint(path, payload):
    """Write ``payload`` as JSON to ``path`` via a temp file plus rename.

    The data goes to ``<path>.tmp`` first and is then moved over ``path`` with
    ``os.replace``, so an interruption while writing cannot truncate the
    existing checkpoint.
    """
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(payload, f, indent=2)
    os.replace(tmp_path, path)


results = []
checkpoint_file = f'{SAVE_DIR}/{EXP_NAME}_checkpoint.json'
completed = set()

# Resume: reload earlier results and remember which grid points are done.
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'r') as f:
        results = json.load(f)
    for r in results:
        completed.add((r['lr'], r['noise_rate'], r['lambda'], r['seed']))
    print(f'Checkpoint loaded: {len(results)} runs completed')

total = len(experiments)

for exp in experiments:
    lr = exp['lr']
    eta = exp['noise_rate']
    lam = exp['lambda']
    seed = exp['seed']

    if (lr, eta, lam, seed) in completed:
        continue

    run_num = len(completed) + 1
    print(f'\n[{run_num}/{total}] lr={lr} Œ∑={eta} Œª={lam} seed={seed}')

    # Seed before both noise injection and model construction.
    set_seed(seed)
    noisy_labels = inject_label_noise(clean_labels, eta, seed)

    model = get_resnet18().to(device)

    start_time = time.time()
    final_acc, best_acc, avg_cos, history = train_dual_gradient(
        model, train_loader, test_loader, clean_labels, noisy_labels, lam, lr
    )
    elapsed = time.time() - start_time

    result = {
        'experiment_id': f'Q3-{run_num:03d}',
        'experiment': EXP_NAME,
        'lr': lr,
        'noise_rate': eta,
        'lambda': lam,
        'seed': seed,
        'test_acc': final_acc,
        'test_error': 1 - final_acc,
        'best_acc': best_acc,
        'best_error': 1 - best_acc,
        'avg_cos_struct_value': avg_cos,
        'time_seconds': elapsed,
        'history': history
    }
    results.append(result)
    completed.add((lr, eta, lam, seed))

    status = '‚úÖ' if best_acc > 0.85 else ('‚ö†Ô∏è COLLAPSE' if best_acc < 0.5 else '')
    print(f'  Error: {1-best_acc:.4f} | cos: {avg_cos:.4f} | Time: {elapsed/60:.1f} min {status}')

    # Persist after every run so a disconnected session loses at most one run.
    _write_checkpoint(checkpoint_file, results)

    # ETA extrapolates from the duration of the run that just finished.
    remaining = total - run_num
    eta_hours = remaining * elapsed / 3600
    print(f'  Progress: {run_num}/{total} | ETA: {eta_hours:.1f} hours')

    del model
    torch.cuda.empty_cache()

print('\n' + '='*70)
print('ALL EXPERIMENTS COMPLETED!')
print('='*70)

In [None]:
# ===== Save results =====
import pandas as pd

# Full results, including per-run history, as JSON.
with open(f'{SAVE_DIR}/{EXP_NAME}_results.json', 'w') as f:
    json.dump(results, f, indent=2)

# Flat table for the CSV: every field except the nested 'history'.
results_flat = []
for record in results:
    row = dict(record)
    row.pop('history', None)
    results_flat.append(row)

df = pd.DataFrame(results_flat)
df.to_csv(f'{SAVE_DIR}/{EXP_NAME}_results.csv', index=False)

print(f'Results saved to {SAVE_DIR}')

In [None]:
# ===== lr-lambda interaction analysis =====
import matplotlib.pyplot as plt

# A lambda counts as past the phase boundary once its mean error exceeds this.
CRITICAL_ERROR = 0.20
# Colour used for any learning rate that has no entry in lr_colors.
FALLBACK_COLOR = '#7f8c8d'


def _curve(eta, lr):
    """Return the aggregated rows for one (noise rate, lr) pair, sorted by lambda."""
    subset = df_agg[(df_agg['noise_rate'] == eta) & (df_agg['lr'] == lr)]
    return subset.sort_values('lambda')


def _critical_lambda(curve, threshold=CRITICAL_ERROR):
    """Return the smallest lambda whose mean error exceeds ``threshold``, or None."""
    above = curve.loc[curve['mean_error'] > threshold, 'lambda']
    return None if above.empty else float(above.min())


print('='*70)
print('LR-LAMBDA INTERACTION ANALYSIS')
print('='*70)

# Aggregate over seeds
df_agg = df.groupby(['lr', 'noise_rate', 'lambda']).agg({
    'best_error': ['mean', 'std']
}).reset_index()
df_agg.columns = ['lr', 'noise_rate', 'lambda', 'mean_error', 'std_error']

lr_colors = {0.05: '#3498db', 0.1: '#2ecc71', 0.2: '#e74c3c'}

# One row of panels per noise rate; squeeze=False keeps axes 2-D even when
# there is only a single noise rate.
fig, axes = plt.subplots(len(NOISE_RATES), 2,
                         figsize=(14, 5 * len(NOISE_RATES)), squeeze=False)

for i, eta in enumerate(NOISE_RATES):
    # Left: Error curves by lr
    ax = axes[i, 0]
    for lr in LEARNING_RATES:
        subset = _curve(eta, lr)
        ax.errorbar(subset['lambda'], subset['mean_error'], yerr=subset['std_error'],
                    marker='o', color=lr_colors.get(lr, FALLBACK_COLOR), capsize=4,
                    label=f'lr={lr}', linewidth=2)

    ax.axhline(y=0.15, color='green', linestyle='--', alpha=0.5)
    ax.axhline(y=0.25, color='red', linestyle='--', alpha=0.5)
    ax.set_xlabel('Œª', fontsize=12)
    ax.set_ylabel('Test Error', fontsize=12)
    ax.set_title(f'Œ∑={eta}: Error vs Œª (different learning rates)', fontsize=12)
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, 1)

    # Right: Phase boundary estimation
    ax = axes[i, 1]

    # Critical point per lr: the first lambda whose mean error exceeds the
    # threshold. NaN (not None) marks "never exceeded" so the column stays
    # numeric and the bar chart simply leaves that bar out.
    critical_points = []
    for lr in LEARNING_RATES:
        lambda_c = _critical_lambda(_curve(eta, lr))
        critical_points.append({
            'lr': lr,
            'lambda_c': float('nan') if lambda_c is None else lambda_c,
        })

    df_crit = pd.DataFrame(critical_points)
    ax.bar(range(len(df_crit)), df_crit['lambda_c'],
           color=[lr_colors.get(lr, FALLBACK_COLOR) for lr in df_crit['lr']])
    ax.set_xticks(range(len(df_crit)))
    ax.set_xticklabels([f'lr={lr}' for lr in df_crit['lr']])
    ax.set_ylabel('Critical Œª (Œª_c)', fontsize=12)
    ax.set_title(f'Œ∑={eta}: Phase Boundary vs Learning Rate', fontsize=12)
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(f'{SAVE_DIR}/figures/{EXP_NAME}_interaction.png', dpi=150, bbox_inches='tight')
plt.show()

# Numeric summary
print('\n--- Phase Boundary Analysis ---')
for eta in NOISE_RATES:
    print(f'\nŒ∑={eta}:')
    for lr in LEARNING_RATES:
        subset = _curve(eta, lr)
        if subset.empty:
            # Happens when the sweep was only partially run.
            print(f'  lr={lr}: no results')
            continue

        # Best performance
        best_row = subset.loc[subset['mean_error'].idxmin()]

        # Critical point
        critical = _critical_lambda(subset)
        lambda_c = 'N/A' if critical is None else f'{critical:.2f}'

        print(f'  lr={lr}: best error={best_row["mean_error"]:.4f} at Œª={best_row["lambda"]:.2f}, Œª_c‚âà{lambda_c}')

# Conclusion
print('\n--- Conclusion ---')
print('If Œª_c varies significantly with lr: "Phase boundary depends on (Œª, Œ∑, lr)"')
print('If Œª_c is stable: "Phase structure is robust to lr variation"')