In [1]:
import sys
# Put the directory two levels above the working directory at the front of the
# module search path; presumably that is where the local `nn_extrapolation`
# and `nn_utils` modules imported below live.
sys.path[:0] = ["../.."]

In [2]:
import torch
from torch import nn
from torchvision import models
from copy import deepcopy
import os

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [3]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs (slowly) on machines without CUDA instead of failing.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The network ends in LogSoftmax, so negative log-likelihood is the matching
# loss.  Training uses the batch mean; validation uses the batch sum
# (presumably normalised by the dataset size inside Trainer -- confirm there).
trainer = Trainer(
    device=device,
    loss_fn=nn.NLLLoss(reduction="mean"),
    val_loss_fn=nn.NLLLoss(reduction="sum"),
)

In [4]:
# Keep the CIFAR-10 download in a per-user directory under /tmp.  `USER` is
# not defined in every environment (e.g. some containers or Windows shells),
# so fall back to a fixed directory name instead of raising KeyError.
cifar_root = os.path.join("/tmp", os.environ.get("USER", "unknown"), "CIFAR")

# `dl` is indexed by split name ("train" / "valid") in the cells below.
# `transforms` is not imported explicitly; presumably it is re-exported by
# `from nn_utils import *` -- confirm.
dl = load_dataset(
    dataset="CIFAR10",
    root=cifar_root,
    # Affine augmentation: rotation range 10, scale 0.9-1.1, translation up to 0.2.
    augmentation=transforms.RandomAffine(10, scale=(0.9, 1.1), translate=(0.2, 0.2)),
    validation_split=0.2,
    batch_size=128,
    num_workers=10,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /tmp/michal/CIFAR/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting /tmp/michal/CIFAR/cifar-10-python.tar.gz to /tmp/michal/CIFAR
Files already downloaded and verified


In [5]:
# Randomly initialised VGG-19.  The bare call builds the network without
# pretrained weights on both old and new torchvision releases, whereas the
# `pretrained=` keyword is deprecated since torchvision 0.13.
model = models.vgg19()
# Replace the last fully connected layer with a 10-output head.
model.classifier[6] = nn.Linear(4096, 10)
# Append log-softmax so the outputs are log-probabilities, as nn.NLLLoss expects.
model.classifier.add_module("7", nn.LogSoftmax(-1))
# Snapshot the starting weights before the model is moved to the training
# device, so later experiments can restart from the same point.
initial_state = deepcopy(model.state_dict())
model.to(trainer.device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [6]:
# Total number of scalar entries across all parameter tensors of the model.
parameter_sizes = (weights.numel() for weights in model.parameters())
sum(parameter_sizes)

139611210

In [7]:
# Share one set of initial weights between runs: the first run writes them to
# disk, every later run reloads them into the model.
state_path = "vgg19_initial_state.pth"
if not os.path.exists(state_path):
    # No checkpoint yet: persist the freshly initialised weights.
    torch.save(initial_state, state_path)
else:
    # Checkpoint found: overwrite the in-memory snapshot and the model with it.
    initial_state = torch.load(state_path)
    model.load_state_dict(initial_state)
    model.to(trainer.device)

In [8]:
# Score the untrained network on the validation split; the result is unpacked
# as (accuracy, loss) by the training loops below.
untrained_scores = trainer.validation(model, dl["valid"])
untrained_scores

(0.1043, 2.302760050201416)

## Momentum

In [9]:
# Optimizer for the momentum experiment.  The second positional argument is
# presumably the learning rate (a later cell overwrites the group's "lr" entry).
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-2,
    k=10,
    momentum=0.9,
    weight_decay=1e-5,
    lambda_=1e-16,
)
# Log sink for this experiment.
logger = Logger("vgg19_log_augmentation-early_stopping.txt.no_resizing")

In [10]:
# First training phase: seed torch, then train for at most `max_epochs`
# epochs, leaving the loop as soon as the early-stopping helper says so.
torch.manual_seed(2020)
max_epochs = 300
early_stopping = EarlyStopping(10)

for epoch_number in range(1, max_epochs + 1):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Tell the optimizer that an epoch boundary has been reached.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = (
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}"
    )
    logger.log("Epoch", epoch_number, "|", summary)
    # Early stopping is driven by the validation loss.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████████████████████████████████████████████████████| 313/313 [00:23<00:00, 13.24it/s, loss=2.1611]
Epoch 1 | Training loss: 2.1611, validation accuracy: 0.2535, validation loss: 1.9201
100%|██████████████████████████████████████████████████████████| 313/313 [00:23<00:00, 13.18it/s, loss=1.8354]
Epoch 2 | Training loss: 1.8354, validation accuracy: 0.3438, validation loss: 1.6752
100%|██████████████████████████████████████████████████████████| 313/313 [00:23<00:00, 13.13it/s, loss=1.6278]
Epoch 3 | Training loss: 1.6278, validation accuracy: 0.4493, validation loss: 1.4668
100%|██████████████████████████████████████████████████████████| 313/313 [00:23<00:00, 13.07it/s, loss=1.4521]
Epoch 4 | Training loss: 1.4521, validation accuracy: 0.5288, validation loss: 1.2978
100%|██████████████████████████████████████████████████████████| 313/313 [00:23<00:00, 13.05it/s, loss=1.3332]
Epoch 5 | Training loss: 1.3332, validation accuracy: 0.5430, validation loss: 1.2779
100%|██████████

In [11]:
# Evaluate the trained model on both splits (train first, then valid) and log
# the resulting score tuples.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.909975, 0.26392972049713137)
Valid: (0.8467, 0.5307384500980378)


In [12]:
# Build a separate copy of the network to hold the accelerated weights, so the
# model being trained is kept as its own object.  Statement order is kept
# as-is because the optimizer calls are opaque from this notebook.
model_acc = deepcopy(model)
optimizer.accelerate()
# NOTE(review): presumably writes the accelerated parameters into `model_acc`;
# confirm in AcceleratedSGD.
accelerated_groups = [model_acc.parameters()]
optimizer.store_parameters(accelerated_groups)
model_acc.to(trainer.device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [13]:
# Score the accelerated copy on both splits (train first, then valid) and log
# the results for comparison with the plain model.
train_score, valid_score = (
    trainer.validation(model_acc, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9393, 0.18326329290866852)
Valid: (0.8705, 0.4170723099708557)


In [14]:
# Switch the first parameter group to a learning rate of 1e-3 for the second
# training phase.
optimizer.param_groups[0].update(lr=1e-3)

In [15]:
# Second training phase with the reduced learning rate: same loop as before,
# with a fresh early-stopping helper and no re-seeding.
max_epochs = 300
early_stopping = EarlyStopping(10)

for epoch_number in range(1, max_epochs + 1):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Tell the optimizer that an epoch boundary has been reached.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = (
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}"
    )
    logger.log("Epoch", epoch_number, "|", summary)
    # Early stopping is driven by the validation loss.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████████████████████████████████████████████████████| 313/313 [00:23<00:00, 13.05it/s, loss=0.1859]
Epoch 1 | Training loss: 0.1859, validation accuracy: 0.8698, validation loss: 0.4737
100%|██████████████████████████████████████████████████████████| 313/313 [00:24<00:00, 13.03it/s, loss=0.1586]
Epoch 2 | Training loss: 0.1586, validation accuracy: 0.8713, validation loss: 0.4825
100%|██████████████████████████████████████████████████████████| 313/313 [00:24<00:00, 13.04it/s, loss=0.1485]
Epoch 3 | Training loss: 0.1485, validation accuracy: 0.8713, validation loss: 0.4949
100%|██████████████████████████████████████████████████████████| 313/313 [00:24<00:00, 13.04it/s, loss=0.1405]
Epoch 4 | Training loss: 0.1405, validation accuracy: 0.8723, validation loss: 0.4902
100%|██████████████████████████████████████████████████████████| 313/313 [00:24<00:00, 13.04it/s, loss=0.1363]
Epoch 5 | Training loss: 0.1363, validation accuracy: 0.8736, validation loss: 0.4927
100%|██████████

In [16]:
# Score the model after the low-learning-rate phase on both splits (train
# first, then valid) and log the results.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.963725, 0.1060615211367607)
Valid: (0.8733, 0.5109335117340088)


In [17]:
# Rebuild the accelerated copy from the current model after the second phase.
# Statement order is kept as-is because the optimizer calls are opaque here.
model_acc = deepcopy(model)
optimizer.accelerate()
# NOTE(review): presumably writes the accelerated parameters into `model_acc`;
# confirm in AcceleratedSGD.
accelerated_groups = [model_acc.parameters()]
optimizer.store_parameters(accelerated_groups)
model_acc.to(trainer.device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [18]:
# Score the second accelerated copy on both splits (train first, then valid)
# and log the results.
train_score, valid_score = (
    trainer.validation(model_acc, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.961975, 0.1120340293765068)
Valid: (0.875, 0.4917816843032837)


## Epoch average, span = 10

In [22]:
# Rewind the network to the saved initial weights so this experiment starts
# from the same point as the previous one.
model.load_state_dict(initial_state)
target_device = trainer.device
model.to(target_device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [23]:
# Sanity check after the reset: score the restored network on the validation
# split and display the result.
reset_scores = trainer.validation(model, dl["valid"])
reset_scores

(0.1043, 2.302760050201416)

In [24]:
# Optimizer for the epoch-averaging experiment.  The averaging coefficient is
# derived from a span of 10 as 2 / (span + 1).
span = 10
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-2,
    k=10,
    momentum=0.9,
    weight_decay=1e-5,
    lambda_=1e-8,
    mode="epoch_avg",
    avg_alpha=2 / (span + 1),
)
# Log sink for this experiment.
logger = Logger("vgg19_log_augmentation_averaging_span10-early_stopping.txt")

In [25]:
# First training phase of the epoch-averaging run: same seed and loop as the
# momentum experiment, using the optimizer and logger defined just above.
torch.manual_seed(2020)
max_epochs = 300
early_stopping = EarlyStopping(10)

for epoch_number in range(1, max_epochs + 1):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Tell the optimizer that an epoch boundary has been reached.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = (
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}"
    )
    logger.log("Epoch", epoch_number, "|", summary)
    # Early stopping is driven by the validation loss.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████████████████████████████████████████████████████| 313/313 [00:27<00:00, 11.44it/s, loss=2.1627]
Epoch 1 | Training loss: 2.1627, validation accuracy: 0.2518, validation loss: 1.9156
100%|██████████████████████████████████████████████████████████| 313/313 [00:27<00:00, 11.42it/s, loss=1.8265]
Epoch 2 | Training loss: 1.8265, validation accuracy: 0.3027, validation loss: 1.7506
100%|██████████████████████████████████████████████████████████| 313/313 [00:27<00:00, 11.37it/s, loss=1.6314]
Epoch 3 | Training loss: 1.6314, validation accuracy: 0.4537, validation loss: 1.4516
100%|██████████████████████████████████████████████████████████| 313/313 [00:27<00:00, 11.31it/s, loss=1.4606]
Epoch 4 | Training loss: 1.4606, validation accuracy: 0.5385, validation loss: 1.2868
100%|██████████████████████████████████████████████████████████| 313/313 [00:27<00:00, 11.32it/s, loss=1.3261]
Epoch 5 | Training loss: 1.3261, validation accuracy: 0.5249, validation loss: 1.3137
100%|██████████

In [26]:
# Score the model trained in the epoch-averaging run on both splits (train
# first, then valid) and log the results.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.911825, 0.2559004005908966)
Valid: (0.8466, 0.5284058502197265)


In [27]:
# Build the accelerated copy for the epoch-averaging run.  Statement order is
# kept as-is because the optimizer calls are opaque from this notebook.
model_acc = deepcopy(model)
optimizer.accelerate()
# NOTE(review): presumably writes the accelerated parameters into `model_acc`;
# confirm in AcceleratedSGD.
accelerated_groups = [model_acc.parameters()]
optimizer.store_parameters(accelerated_groups)
model_acc.to(trainer.device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [28]:
# Score the accelerated copy from the epoch-averaging run on both splits
# (train first, then valid) and log the results.
train_score, valid_score = (
    trainer.validation(model_acc, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.937825, 0.18623180618286134)
Valid: (0.873, 0.40214862213134767)


In [29]:
# Switch the first parameter group to a learning rate of 1e-3 for the second
# training phase.
first_group = optimizer.param_groups[0]
first_group["lr"] = 1e-3

In [30]:
# Second training phase of the epoch-averaging run with the reduced learning
# rate; fresh early-stopping helper, no re-seeding.
max_epochs = 300
early_stopping = EarlyStopping(10)

for epoch_number in range(1, max_epochs + 1):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Tell the optimizer that an epoch boundary has been reached.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = (
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}"
    )
    logger.log("Epoch", epoch_number, "|", summary)
    # Early stopping is driven by the validation loss.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████████████████████████████████████████████████████| 313/313 [00:27<00:00, 11.27it/s, loss=0.1900]
Epoch 1 | Training loss: 0.1900, validation accuracy: 0.8698, validation loss: 0.4631
100%|██████████████████████████████████████████████████████████| 313/313 [00:27<00:00, 11.28it/s, loss=0.1589]
Epoch 2 | Training loss: 0.1589, validation accuracy: 0.8725, validation loss: 0.4685
100%|██████████████████████████████████████████████████████████| 313/313 [00:27<00:00, 11.28it/s, loss=0.1512]
Epoch 3 | Training loss: 0.1512, validation accuracy: 0.8742, validation loss: 0.4785
100%|██████████████████████████████████████████████████████████| 313/313 [00:27<00:00, 11.28it/s, loss=0.1425]
Epoch 4 | Training loss: 0.1425, validation accuracy: 0.8717, validation loss: 0.4819
100%|██████████████████████████████████████████████████████████| 313/313 [00:27<00:00, 11.27it/s, loss=0.1406]
Epoch 5 | Training loss: 0.1406, validation accuracy: 0.8715, validation loss: 0.4883
100%|██████████

In [31]:
# Score the model after the second phase of the epoch-averaging run on both
# splits (train first, then valid) and log the results.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.96395, 0.10519717630147934)
Valid: (0.872, 0.5027237926483155)


In [32]:
# Rebuild the accelerated copy after the second phase of the epoch-averaging
# run.  Statement order is kept as-is because the optimizer calls are opaque.
model_acc = deepcopy(model)
optimizer.accelerate()
# NOTE(review): presumably writes the accelerated parameters into `model_acc`;
# confirm in AcceleratedSGD.
accelerated_groups = [model_acc.parameters()]
optimizer.store_parameters(accelerated_groups)
model_acc.to(trainer.device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padd

In [33]:
# Score the final accelerated copy of the epoch-averaging run on both splits
# (train first, then valid) and log the results.
train_score, valid_score = (
    trainer.validation(model_acc, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.96125, 0.1131628319323063)
Valid: (0.8744, 0.48371202144622805)
