In [1]:
import sys
sys.path.insert(0, "../..")

In [2]:
import torch
from torch import nn
from torchvision import models
from copy import deepcopy
import os

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [3]:
# Shared trainer used by every experiment in this notebook.
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on a machine without a GPU instead of failing.
# Training loss is averaged per batch, validation loss is summed per batch
# (NOTE(review): presumably Trainer normalises the sum itself — confirm in nn_utils).
trainer = Trainer(
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    loss_fn=nn.NLLLoss(reduction="mean"),
    val_loss_fn=nn.NLLLoss(reduction="sum"),
)

In [4]:
# CIFAR-10 loaders keyed by split name ("train" / "valid" are used below), with 20% of
# the data held out for validation.
# The download cache lives in a per-user directory under /tmp. USER is read with a
# fallback because it is not guaranteed to be set (e.g. minimal containers), and a
# bare os.environ["USER"] would abort the notebook with a KeyError.
dl = load_dataset(
    dataset="CIFAR10",
    root=os.path.join("/tmp", os.environ.get("USER", "unknown"), "CIFAR"),
    validation_split=0.2,
    batch_size=128,
    num_workers=10,
)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
# ResNet-18 without pretrained weights. The stock classifier is replaced by a 10-way
# linear layer followed by log-softmax, so the network emits log-probabilities as
# expected by the NLLLoss criteria configured on the trainer.
model = models.resnet18(pretrained=False)
model.fc = nn.Sequential(nn.Linear(in_features=512, out_features=10), nn.LogSoftmax(dim=-1))
# Last expression of the cell: moves the model to the trainer's device and echoes it.
model.to(trainer.device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [7]:
# Baseline on the validation split before any training. The result is an
# (accuracy, loss) pair; with 10 classes an untrained model should sit near 10%.
trainer.validation(model, dl["valid"])

(0.0994, 2.3962655769348142)

## No momentum

In [8]:
# Optimizer for the run without momentum. The second positional argument (0.1) is what
# the scheduler reports as the initial learning rate of group 0.
# NOTE(review): k=5 is presumably the number of stored iterates used by accelerate() — confirm.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-1,
    k=5,
    momentum=0,
    weight_decay=1e-5,
)
logger = Logger("resnet_log_no_momentum.txt.no_resizing")
# Multiply the learning rate by 0.1 once 5 and again once 10 scheduler steps have elapsed.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[5, 10],
    gamma=0.1,
    verbose=True,
)

Adjusting learning rate of group 0 to 1.0000e-01.


In [9]:
epochs = 18

# Per epoch: one training pass, advance the LR schedule, let the optimizer close out
# the epoch, then validate and log a one-line summary.
# NOTE(review): finish_epoch() is specific to AcceleratedSGD; presumably it records the
# end-of-epoch weights for the later accelerate() call — confirm in nn_extrapolation.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    scheduler.step()
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}",
    )

100%|██████████| 313/313 [00:15<00:00, 20.14it/s, loss=1.6812]
Adjusting learning rate of group 0 to 1.0000e-01.
Epoch 1 | Training loss: 1.6812, validation accuracy: 0.5009, validation loss: 1.4852
100%|██████████| 313/313 [00:15<00:00, 19.75it/s, loss=1.1737]
Adjusting learning rate of group 0 to 1.0000e-01.
Epoch 2 | Training loss: 1.1737, validation accuracy: 0.5765, validation loss: 1.2064
100%|██████████| 313/313 [00:15<00:00, 19.94it/s, loss=0.9536]
Adjusting learning rate of group 0 to 1.0000e-01.
Epoch 3 | Training loss: 0.9536, validation accuracy: 0.6255, validation loss: 1.0741
100%|██████████| 313/313 [00:15<00:00, 19.65it/s, loss=0.7974]
Adjusting learning rate of group 0 to 1.0000e-01.
Epoch 4 | Training loss: 0.7974, validation accuracy: 0.6062, validation loss: 1.1609
100%|██████████| 313/313 [00:15<00:00, 19.78it/s, loss=0.6590]
Adjusting learning rate of group 0 to 1.0000e-02.
Epoch 5 | Training loss: 0.6590, validation accuracy: 0.6603, validation loss: 1.0318
100%|

In [9]:
# Score the trained weights on both splits (train first, then valid); each score is
# the (accuracy, loss) pair returned by trainer.validation.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.999, 0.01205787510573864)
Valid: (0.7166, 1.3761080833435058)


In [10]:
# NOTE(review): AcceleratedSGD-specific step; presumably builds extrapolated weights from
# the iterates collected during training — confirm against nn_extrapolation.
optimizer.accelerate()

In [11]:
# NOTE(review): presumably writes the accelerated weights into the model's parameters;
# the model is then moved to the trainer's device again before evaluation — confirm
# whether store_parameters() can leave tensors on another device.
optimizer.store_parameters()
# The trailing semicolon keeps the notebook from echoing the full module repr.
model.to(trainer.device);

In [12]:
# Re-score both splits after accelerate() / store_parameters() so the numbers can be
# compared with the scores logged right after training.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.998975, 0.01300976274833083)
Valid: (0.7164, 1.36950100440979)


## Momentum

In [12]:
# Fresh ResNet-18 (no pretrained weights) for the momentum run, with the same 10-way
# linear + log-softmax head so the outputs are log-probabilities for NLLLoss.
model = models.resnet18(pretrained=False)
model.fc = nn.Sequential(nn.Linear(in_features=512, out_features=10), nn.LogSoftmax(dim=-1))
# Last expression of the cell: moves the model to the trainer's device and echoes it.
model.to(trainer.device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [13]:
# Optimizer for the momentum run (momentum=0.9). The second positional argument (0.1)
# is what the scheduler reports as the initial learning rate of group 0.
# NOTE(review): k=5 is presumably the number of stored iterates used by accelerate() — confirm.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-1,
    k=5,
    momentum=0.9,
    weight_decay=1e-5,
)
logger = Logger("resnet_log_momentum.txt.no_resizing")
# Multiply the learning rate by 0.1 once 5 and again once 10 scheduler steps have elapsed.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[5, 10],
    gamma=0.1,
    verbose=True,
)

Adjusting learning rate of group 0 to 1.0000e-01.


In [14]:
epochs = 18

# Per epoch: one training pass, advance the LR schedule, let the optimizer close out
# the epoch, then validate and log a one-line summary.
# NOTE(review): finish_epoch() is specific to AcceleratedSGD; presumably it records the
# end-of-epoch weights for the later accelerate() call — confirm in nn_extrapolation.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    scheduler.step()
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}",
    )

100%|██████████| 313/313 [00:18<00:00, 17.27it/s, loss=2.2050]
Adjusting learning rate of group 0 to 1.0000e-01.
Epoch 1 | Training loss: 2.2050, validation accuracy: 0.4242, validation loss: 1.5878
100%|██████████| 313/313 [00:18<00:00, 16.91it/s, loss=1.4366]
Adjusting learning rate of group 0 to 1.0000e-01.
Epoch 2 | Training loss: 1.4366, validation accuracy: 0.5195, validation loss: 1.3524
100%|██████████| 313/313 [00:18<00:00, 17.25it/s, loss=1.2166]
Adjusting learning rate of group 0 to 1.0000e-01.
Epoch 3 | Training loss: 1.2166, validation accuracy: 0.5997, validation loss: 1.1192
100%|██████████| 313/313 [00:18<00:00, 16.97it/s, loss=1.0520]
Adjusting learning rate of group 0 to 1.0000e-01.
Epoch 4 | Training loss: 1.0520, validation accuracy: 0.6385, validation loss: 1.0259
100%|██████████| 313/313 [00:18<00:00, 17.06it/s, loss=0.9105]
Adjusting learning rate of group 0 to 1.0000e-02.
Epoch 5 | Training loss: 0.9105, validation accuracy: 0.6580, validation loss: 0.9670
100%|

In [15]:
# Score the trained weights on both splits (train first, then valid); each score is
# the (accuracy, loss) pair returned by trainer.validation.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.93575, 0.2349222775220871)
Valid: (0.721, 1.06977393951416)


In [16]:
# NOTE(review): AcceleratedSGD-specific step; presumably builds extrapolated weights from
# the iterates collected during training — confirm against nn_extrapolation.
optimizer.accelerate()

In [17]:
# NOTE(review): presumably writes the accelerated weights into the model's parameters —
# confirm against nn_extrapolation.
optimizer.store_parameters()
# Move the model to the trainer's device again before evaluation; as the last
# expression of the cell this also echoes the module repr.
model.to(trainer.device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [18]:
# Re-score both splits after accelerate() / store_parameters() so the numbers can be
# compared with the scores logged right after training.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9302, 0.2552979892492294)
Valid: (0.7206, 1.079448751449585)
