# Homework 6


## 1 Hessian

In [1]:
train_batch_size = 100
test_batch_size = 10000
epoch_num = 10
lr = 0.2
seed = 50

In [2]:
import torch
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

torch.manual_seed(seed)

device = torch.device('cuda')
print("Using device:", device)
print(torch.__version__)
print(torch.version.cuda)              # Should not be None
print(torch.cuda.is_available())       # Should be True if everything is correct

transformation = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

train_dataset = CIFAR10(root='./data', train=True, download=True,
    transform=transforms.Compose([
        transforms.RandomCrop(size=32, padding=4),
        transforms.RandomHorizontalFlip(),
        transformation
    ])
)

train_dataset_fast = CIFAR10(root='./data', train=True, download=True,
    transform=transformation
)

test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transformation)

train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True, num_workers=4)

train_loader_fast = DataLoader(train_dataset_fast, batch_size=1000, shuffle=True,
    num_workers=4)

test_loader = DataLoader(test_dataset, batch_size=10000, shuffle=False, num_workers=4)

Using device: cuda
2.5.1
12.4
True
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [19]:
import torch.nn as nn

model = nn.Sequential(
    nn.Conv2d(3, 5, 5),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(5, 5, 5),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    nn.Flatten(),
    nn.Linear(125, 30),
    nn.ReLU(),
    nn.Linear(30, 10),
)


In [20]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

num_epochs = 2
device = torch.device('cuda')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch {epoch + 1}/{num_epochs} | Train Loss: {running_loss/len(train_loader):.4f}')


Epoch 1/2 | Train Loss: 1.9268
Epoch 2/2 | Train Loss: 1.6882


In [21]:
def compute_hessian(model, loss):
    grad = torch.autograd.grad(loss, model.parameters(), create_graph=True)
    flattened_grad = torch.cat([g.reshape(-1) for g in grad])

    hessian = []

    for g in flattened_grad:
        hessian_g = torch.autograd.grad(g, model.parameters(), retain_graph=True)
        flattened_hessian_g = torch.cat([h.reshape(-1) for h in hessian_g])
        hessian.append(flattened_hessian_g)

    return torch.stack(hessian)




### Try torch.autograd.functional.hessian

Doesn't work

In [15]:
# model.eval()
model.train()
images, labels = next(iter(train_loader_fast))
images, labels = images.to(device), labels.to(device)

outputs = model(images)

def loss_fn(params):
    # Reconstruct the model.parameters() to a 1D array) for the loss
    param_idx = 0  # Track position in the 1D params tensor
    
    for p in model.parameters():
        numel = p.numel()  # Get number of elements in the current parameter
        p.data = params[param_idx : param_idx + numel].view_as(p).data  # Reshape and assign
        param_idx += numel  # Move index forward

    outputs = model(images)  # Forward pass
    return criterion(outputs, labels)  # Compute loss

params = torch.cat([p.view(-1) for p in model.parameters()])
print('Calculating Hessian Start [via torch.autograd.functional.hessian]')
hessian = torch.autograd.functional.hessian(loss_fn, params)
print('Calculating Hessian End [via torch.autograd.functional.hessian]')

eigval = torch.linalg.eigvalsh(hessian)
small_eigval = eigval[:5]
big_eigval = eigval[-5:]
smoothness = torch.max(torch.abs(eigval))

print("Smallest 5 eigenvalues: ", small_eigval)
print("Largest 5 eigenvalues: ", big_eigval)
print("Smoothness (largest absolute value of eigenvalues): ", smoothness)

Calculating Hessian Start [via torch.autograd.functional.hessian]
Calculating Hessian End [via torch.autograd.functional.hessian]
Smallest 5 eigenvalues:  tensor([0., 0., 0., 0., 0.], device='cuda:0')
Largest 5 eigenvalues:  tensor([0., 0., 0., 0., 0.], device='cuda:0')
Smoothness (largest absolute value of eigenvalues):  tensor(0., device='cuda:0')


### Try DIY Hessain Calculation

In [26]:
model.eval()
images, labels = next(iter(train_loader_fast))
images, labels = images.to(device), labels.to(device)

outputs = model(images)
loss = criterion(outputs, labels)

print('Calculating Hessian Start [via DIY function]')
hessian = compute_hessian(model, loss)
print('Calculating Hessian End [via DIY function]')

eigval = torch.linalg.eigvalsh(hessian)
small_eigval = eigval[:5]
big_eigval = eigval[-5:]
smoothness = torch.max(torch.abs(eigval))

print("Smallest 5 eigenvalues: ", small_eigval)
print("Largest 5 eigenvalues: ", big_eigval)
print("Smoothness (largest absolute value of eigenvalues): ", smoothness)


Calculating Hessian Start [via DIY function]
Calculating Hessian End [via DIY function]
Smallest 5 eigenvalues:  tensor([-1.3055, -1.1555, -0.9420, -0.8477, -0.8134], device='cuda:0')
Largest 5 eigenvalues:  tensor([17.5356, 18.0376, 22.0060, 29.3952, 43.8828], device='cuda:0')
Smoothness (largest absolute value of eigenvalues):  tensor(43.8828, device='cuda:0')


## 2 HVP

In [22]:
def hessian_vector_product(model, criterion, images, labels, v):
    outputs = model(images)
    loss = criterion(outputs, labels)
    grad = torch.autograd.grad(loss, model.parameters(), create_graph=True)
    flattened_grad = torch.cat([g.view(-1) for g in grad])

    # gv = flattened_grad @ v

    hv = torch.autograd.grad(flattened_grad, model.parameters(), grad_outputs=v, retain_graph=False)
    
    return torch.cat([h.reshape(-1) for h in hv])

    

In [23]:
import torch.nn.functional as F


model.eval()

d = sum(p.numel() for p in model.parameters())

v = torch.randn(d, device=device)
v = F.normalize(v, p=2, dim=0)

max_iters = 30

for i, (images, labels) in enumerate(train_loader_fast):
    if i >= max_iters:
        break 
    # images, labels = next(iter(train_loader_fast))
    images, labels = images.to(device), labels.to(device)
    hv = hessian_vector_product(model, criterion, images, labels, v)
    lambda_k = v @ hv
    v = F.normalize(hv, p=2, dim=0)
    print(f'lamda_{i} = {lambda_k}')



lamda_0 = 0.04943562671542168
lamda_1 = 17.987499237060547
lamda_2 = 34.403419494628906
lamda_3 = 43.2529411315918
lamda_4 = 41.97267150878906
lamda_5 = 47.71524429321289
lamda_6 = 44.2011604309082
lamda_7 = 43.06145477294922
lamda_8 = 44.172874450683594
lamda_9 = 44.66304016113281
lamda_10 = 44.983062744140625
lamda_11 = 51.16176223754883
lamda_12 = 48.02653884887695
lamda_13 = 46.00290298461914
lamda_14 = 42.938636779785156
lamda_15 = 47.85685729980469
lamda_16 = 44.14205551147461
lamda_17 = 43.55768585205078
lamda_18 = 46.36376190185547
lamda_19 = 43.89331817626953
lamda_20 = 44.943477630615234
lamda_21 = 49.79100799560547
lamda_22 = 41.97179412841797
lamda_23 = 48.928672790527344
lamda_24 = 44.84471893310547
lamda_25 = 41.98912048339844
lamda_26 = 44.790489196777344
lamda_27 = 44.08843231201172
lamda_28 = 47.47641372680664
lamda_29 = 47.21642303466797


The Result above is not stable since we are using different minibatch. The Hessian Computed directly above is using data from one minibatch. Here we are trying to approximate the actual hessian across different minibatches.

#### With acceleration


In [25]:
import torch.nn.functional as F

beta = 0.9

model.eval()

d = sum(p.numel() for p in model.parameters())


v = torch.randn(d, device=device)
v = F.normalize(v, p=2, dim=0)

v_prev = v.clone()

max_iters = 100

for i, (images, labels) in enumerate(train_loader_fast):
    if i >= max_iters:
        break 

    # images, labels = next(iter(train_loader_fast))
    images, labels = images.to(device), labels.to(device)
    hv = hessian_vector_product(model, criterion, images, labels, v)
    lambda_k = v @ hv
    w = F.normalize(hv, p=2, dim=0)
    v = beta * v_prev + (1 - beta) * w
    v = F.normalize(v, p=2, dim=0)
    v_prev = v.clone()
    print(f'lamda_{i} = {lambda_k}')

lamda_0 = 0.04608114808797836
lamda_1 = 0.6441468000411987
lamda_2 = 2.113363265991211
lamda_3 = 4.504873752593994
lamda_4 = 7.472621917724609
lamda_5 = 10.638004302978516
lamda_6 = 14.062822341918945
lamda_7 = 18.413776397705078
lamda_8 = 22.378620147705078
lamda_9 = 23.377376556396484
lamda_10 = 29.914936065673828
lamda_11 = 29.294254302978516
lamda_12 = 32.178897857666016
lamda_13 = 33.120391845703125
lamda_14 = 35.00140380859375
lamda_15 = 42.59052658081055
lamda_16 = 38.71431350708008
lamda_17 = 40.72813415527344
lamda_18 = 43.86983871459961
lamda_19 = 42.07209014892578
lamda_20 = 40.213172912597656
lamda_21 = 45.33381652832031
lamda_22 = 43.199344635009766
lamda_23 = 43.30857849121094
lamda_24 = 45.44932556152344
lamda_25 = 41.69606018066406
lamda_26 = 46.18921661376953
lamda_27 = 42.7132568359375
lamda_28 = 43.35588455200195
lamda_29 = 41.27893829345703
lamda_30 = 44.70640563964844
lamda_31 = 48.74162292480469
lamda_32 = 41.39077377319336
lamda_33 = 43.6795654296875
lamda_34 = 4

After adding the acceleration, it is more stable. But still shaking around 45. I think the true max eigenvalue should be set around 45.

## 3 RESNET 50


In [39]:
import torchvision.models as models
from torchvision.models import ResNet50_Weights

res50 = models.resnet50(weights=ResNet50_Weights.DEFAULT)
res50.fc = nn.Linear(2048, 10)
res50 = res50.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(res50.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) 


Before Training, run 50 iter HPV to find current largest eigenvalue.

In [40]:
import torch.nn.functional as F

beta = 0.7

res50.eval()

d = sum(p.numel() for p in res50.parameters())


v = torch.randn(d, device=device)
v = F.normalize(v, p=2, dim=0)

v_prev = v.clone()

max_iters = 100

for i, (images, labels) in enumerate(train_loader_fast):
    if i >= max_iters:
        break 

    # images, labels = next(iter(train_loader_fast))
    images, labels = images.to(device), labels.to(device)
    hv = hessian_vector_product(res50, criterion, images, labels, v)
    lambda_k = v @ hv
    w = F.normalize(hv, p=2, dim=0)
    v = beta * v_prev + (1 - beta) * w
    v = F.normalize(v, p=2, dim=0)
    v_prev = v.clone()
    print(f'lamda_{i} = {lambda_k}')

lamda_0 = 0.000440148520283401
lamda_1 = 3.352783441543579
lamda_2 = 17.269290924072266
lamda_3 = 40.06093215942383
lamda_4 = 56.9256477355957
lamda_5 = 75.51548767089844
lamda_6 = 89.07769775390625
lamda_7 = 91.07374572753906
lamda_8 = 102.65342712402344
lamda_9 = 105.86251831054688
lamda_10 = 110.94886779785156
lamda_11 = 119.92045593261719
lamda_12 = 110.92315673828125
lamda_13 = 120.11367797851562
lamda_14 = 117.76911163330078
lamda_15 = 130.5001678466797
lamda_16 = 125.642578125
lamda_17 = 133.51815795898438
lamda_18 = 139.22418212890625
lamda_19 = 138.70111083984375
lamda_20 = 128.70687866210938
lamda_21 = 140.86239624023438
lamda_22 = 142.3839111328125
lamda_23 = 134.17550659179688
lamda_24 = 138.90469360351562
lamda_25 = 129.22598266601562
lamda_26 = 131.9697265625
lamda_27 = 135.31312561035156
lamda_28 = 128.69544982910156
lamda_29 = 134.53318786621094
lamda_30 = 129.28634643554688
lamda_31 = 145.08905029296875
lamda_32 = 142.05050659179688
lamda_33 = 137.40682983398438
lamda_

Before Training, the max eigenvalue is about 140-150

#### Train for 10 Epochs (with SGD)

In [41]:
num_epochs = 10

res50.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    correct = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = res50(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        correct += predicted.eq(labels).sum().item()
        

    print(f'Epoch {epoch + 1}/{num_epochs} | Train Loss: {running_loss/len(train_loader):.4f} | Train ACC {correct/len(train_loader):.4f}')

Epoch 1/10 | Train Loss: 1.7064 | Train ACC 37.4520
Epoch 2/10 | Train Loss: 1.2523 | Train ACC 56.1740
Epoch 3/10 | Train Loss: 1.0304 | Train ACC 64.7760
Epoch 4/10 | Train Loss: 0.9372 | Train ACC 68.1960
Epoch 5/10 | Train Loss: 0.8725 | Train ACC 70.4100
Epoch 6/10 | Train Loss: 0.8524 | Train ACC 71.1720
Epoch 7/10 | Train Loss: 0.8318 | Train ACC 71.9320
Epoch 8/10 | Train Loss: 0.8180 | Train ACC 72.5040
Epoch 9/10 | Train Loss: 0.8120 | Train ACC 72.6020
Epoch 10/10 | Train Loss: 0.8105 | Train ACC 72.8640


In [42]:
import torch.nn.functional as F

beta = 0.7

res50.eval()

d = sum(p.numel() for p in res50.parameters())


v = torch.randn(d, device=device)
v = F.normalize(v, p=2, dim=0)

v_prev = v.clone()

max_iters = 100

for i, (images, labels) in enumerate(train_loader_fast):
    if i >= max_iters:
        break 

    # images, labels = next(iter(train_loader_fast))
    images, labels = images.to(device), labels.to(device)
    hv = hessian_vector_product(res50, criterion, images, labels, v)
    lambda_k = v @ hv
    w = F.normalize(hv, p=2, dim=0)
    v = beta * v_prev + (1 - beta) * w
    v = F.normalize(v, p=2, dim=0)
    v_prev = v.clone()
    print(f'lamda_{i} = {lambda_k}')

lamda_0 = 5.1203918701503426e-05
lamda_1 = 10.533380508422852
lamda_2 = 33.889408111572266
lamda_3 = 57.487884521484375
lamda_4 = 78.51637268066406
lamda_5 = 78.56939697265625
lamda_6 = 82.51615905761719
lamda_7 = 89.28289794921875
lamda_8 = 78.9213638305664
lamda_9 = 90.4259262084961
lamda_10 = 93.57878112792969
lamda_11 = 88.91712951660156
lamda_12 = 85.32963562011719
lamda_13 = 90.96769714355469
lamda_14 = 89.72052001953125
lamda_15 = 83.48465728759766
lamda_16 = 90.65206146240234
lamda_17 = 86.23033142089844
lamda_18 = 93.91207122802734
lamda_19 = 92.53163146972656
lamda_20 = 93.11539459228516
lamda_21 = 89.27662658691406
lamda_22 = 84.85992431640625
lamda_23 = 84.87395477294922
lamda_24 = 91.73284912109375
lamda_25 = 88.80448150634766
lamda_26 = 84.22303009033203
lamda_27 = 86.10324096679688
lamda_28 = 94.76228332519531
lamda_29 = 92.35308074951172
lamda_30 = 90.03672790527344
lamda_31 = 101.22908782958984
lamda_32 = 88.31964874267578
lamda_33 = 89.49870300292969
lamda_34 = 92.961

Here the largest eigenvalue is about 85 - 90

#### Train for Another 90 Epoches

In [43]:
num_epochs = 90

res50.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    correct = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = res50(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        correct += predicted.eq(labels).sum().item()
        

    print(f'Epoch {epoch + 1}/{num_epochs} | Train Loss: {running_loss/len(train_loader):.4f} | Train ACC {correct/len(train_loader):.4f}')

Epoch 1/90 | Train Loss: 0.8119 | Train ACC 72.7020
Epoch 2/90 | Train Loss: 0.8039 | Train ACC 72.7060
Epoch 3/90 | Train Loss: 0.7974 | Train ACC 72.9300
Epoch 4/90 | Train Loss: 0.8000 | Train ACC 72.8840
Epoch 5/90 | Train Loss: 0.7996 | Train ACC 73.0760
Epoch 6/90 | Train Loss: 0.7895 | Train ACC 73.4100
Epoch 7/90 | Train Loss: 0.7857 | Train ACC 73.5040
Epoch 8/90 | Train Loss: 0.7874 | Train ACC 73.6060
Epoch 9/90 | Train Loss: 0.7841 | Train ACC 73.3660
Epoch 10/90 | Train Loss: 0.7761 | Train ACC 73.8440
Epoch 11/90 | Train Loss: 0.7653 | Train ACC 74.2760
Epoch 12/90 | Train Loss: 0.7743 | Train ACC 73.8640
Epoch 13/90 | Train Loss: 0.7706 | Train ACC 74.0900
Epoch 14/90 | Train Loss: 0.7728 | Train ACC 74.0140
Epoch 15/90 | Train Loss: 0.7684 | Train ACC 74.1700
Epoch 16/90 | Train Loss: 0.7644 | Train ACC 74.3360
Epoch 17/90 | Train Loss: 0.7599 | Train ACC 74.4760
Epoch 18/90 | Train Loss: 0.7609 | Train ACC 74.3200
Epoch 19/90 | Train Loss: 0.7590 | Train ACC 74.4960
Ep

In [44]:
import torch.nn.functional as F

beta = 0.7

res50.eval()

d = sum(p.numel() for p in res50.parameters())


v = torch.randn(d, device=device)
v = F.normalize(v, p=2, dim=0)

v_prev = v.clone()

max_iters = 100

for i, (images, labels) in enumerate(train_loader_fast):
    if i >= max_iters:
        break 

    # images, labels = next(iter(train_loader_fast))
    images, labels = images.to(device), labels.to(device)
    hv = hessian_vector_product(res50, criterion, images, labels, v)
    lambda_k = v @ hv
    w = F.normalize(hv, p=2, dim=0)
    v = beta * v_prev + (1 - beta) * w
    v = F.normalize(v, p=2, dim=0)
    v_prev = v.clone()
    print(f'lamda_{i} = {lambda_k}')

lamda_0 = 1.2004034033452626e-05
lamda_1 = 2.223109722137451
lamda_2 = 14.219112396240234
lamda_3 = 29.89647102355957
lamda_4 = 42.67970657348633
lamda_5 = 49.205081939697266
lamda_6 = 53.42639923095703
lamda_7 = 55.85221862792969
lamda_8 = 57.554874420166016
lamda_9 = 47.59864044189453
lamda_10 = 60.894447326660156
lamda_11 = 56.13178253173828
lamda_12 = 55.43880081176758
lamda_13 = 63.29621124267578
lamda_14 = 56.31103515625
lamda_15 = 57.8975830078125
lamda_16 = 57.65886306762695
lamda_17 = 53.80331802368164
lamda_18 = 57.59252166748047
lamda_19 = 56.668182373046875
lamda_20 = 60.59246826171875
lamda_21 = 62.371795654296875
lamda_22 = 60.13347244262695
lamda_23 = 51.59734344482422
lamda_24 = 55.82365417480469
lamda_25 = 55.86796951293945
lamda_26 = 59.20222091674805
lamda_27 = 58.959434509277344
lamda_28 = 59.363983154296875
lamda_29 = 54.76301574707031
lamda_30 = 56.89745330810547
lamda_31 = 59.04413986206055
lamda_32 = 55.65510177612305
lamda_33 = 63.013431549072266
lamda_34 = 58.

The Largest Eigenvalue is about 60 after 100 epoch training.

Note the loss function improved only slightly from 0.81 to 0.71 during the last 90 epoches using SGD. 