In [1]:
import sys
sys.path.insert(0, "../..")

In [2]:
import torch
from torch import nn
from torchvision import models
from copy import deepcopy
import os

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [3]:
torch.cuda.is_available()

True

In [4]:
trainer = Trainer(
    device="cuda",
    loss_fn=nn.NLLLoss(reduction="mean"),
    val_loss_fn=nn.NLLLoss(reduction="sum"),
)

In [None]:
dl = load_dataset(
    dataset="CIFAR10",
    root=os.path.join("/tmp", os.environ["USER"], "CIFAR"),
    augmentation=transforms.RandomAffine(10, scale=(0.9, 1.1), translate=(0.2, 0.2)),
    validation_split=0.2,
    batch_size=128,
    num_workers=10,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /tmp/michal/CIFAR/cifar-10-python.tar.gz


In [6]:
model = models.vgg16(pretrained=False)
model.classifier[6] = nn.Linear(4096, 10)
model.classifier.add_module("7", nn.LogSoftmax(-1))
initial_state = deepcopy(model.state_dict())
model.to(trainer.device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

## Momentum

In [7]:
ks = [20, 10, 5]
optimizer = AcceleratedSGD(model.parameters(), 1e-2, k=max(ks), momentum=0.9, weight_decay=1e-5, lambda_=1e-8)
loss_fn = nn.NLLLoss()
k_loggers = {k: Logger(f"vgg_log_augmentation-early_stopping-10-k={k}.txt.no_resizing") for k in ks}
logger = Logger("vgg_log_augmentation-early_stopping-10.txt.no_resizing")

In [8]:
torch.manual_seed(2020)
max_epochs = 300
early_stopping = EarlyStopping(5)

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    if early_stopping.should_stop(val_loss):
        break

100%|██████████████████████████████████████████████████████████| 313/313 [00:21<00:00, 14.27it/s, loss=2.0769]
Epoch 1 | Training loss: 2.0769, validation accuracy: 0.2676, validation loss: 1.9881
100%|██████████████████████████████████████████████████████████| 313/313 [00:21<00:00, 14.44it/s, loss=1.7477]
Epoch 2 | Training loss: 1.7477, validation accuracy: 0.4170, validation loss: 1.5209
100%|██████████████████████████████████████████████████████████| 313/313 [00:21<00:00, 14.43it/s, loss=1.5488]
Epoch 3 | Training loss: 1.5488, validation accuracy: 0.4623, validation loss: 1.4217
100%|██████████████████████████████████████████████████████████| 313/313 [00:21<00:00, 14.41it/s, loss=1.3787]
Epoch 4 | Training loss: 1.3787, validation accuracy: 0.5573, validation loss: 1.2151
100%|██████████████████████████████████████████████████████████| 313/313 [00:21<00:00, 14.38it/s, loss=1.2382]
Epoch 5 | Training loss: 1.2382, validation accuracy: 0.5784, validation loss: 1.1616
100%|██████████

In [9]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.883575, 0.333403667473793)
Valid: (0.8289, 0.5285954370498657)


In [10]:
for k in ks:
    model_acc = deepcopy(model)
    optimizer.param_groups[0]["k"] = k
    optimizer.accelerate()
    optimizer.store_parameters([model_acc.parameters()])
    model_acc.to(trainer.device)
    train_score = trainer.validation(model_acc, dl["train"])
    valid_score = trainer.validation(model_acc, dl["valid"])
    k_loggers[k].log("Train:", train_score)
    k_loggers[k].log("Valid:", valid_score)
optimizer.param_groups[0]["k"] = max(ks)

Train: (0.8967, 0.31249006004333496)
Valid: (0.853, 0.4314487959861755)
Train: (0.921475, 0.23516985907554627)
Valid: (0.8622, 0.4163259819984436)
Train: (0.9254, 0.2260812639117241)
Valid: (0.855, 0.44542790412902833)


In [11]:
optimizer.param_groups[0]["lr"] = 1e-3

In [12]:
max_epochs = 300
early_stopping = EarlyStopping(10)

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    if early_stopping.should_stop(val_loss):
        break

100%|██████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 13.96it/s, loss=0.2294]
Epoch 1 | Training loss: 0.2294, validation accuracy: 0.8595, validation loss: 0.4620
100%|██████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 14.08it/s, loss=0.1955]
Epoch 2 | Training loss: 0.1955, validation accuracy: 0.8632, validation loss: 0.4569
100%|██████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 14.08it/s, loss=0.1843]
Epoch 3 | Training loss: 0.1843, validation accuracy: 0.8646, validation loss: 0.4608
100%|██████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 14.09it/s, loss=0.1815]
Epoch 4 | Training loss: 0.1815, validation accuracy: 0.8681, validation loss: 0.4535
100%|██████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 14.10it/s, loss=0.1706]
Epoch 5 | Training loss: 0.1706, validation accuracy: 0.8696, validation loss: 0.4594
100%|██████████

In [13]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.955975, 0.12859068150520325)
Valid: (0.8641, 0.4905152772903442)


In [14]:
for k in ks:
    model_acc = deepcopy(model)
    optimizer.param_groups[0]["k"] = k
    optimizer.accelerate()
    optimizer.store_parameters([model_acc.parameters()])
    model_acc.to(trainer.device)
    train_score = trainer.validation(model_acc, dl["train"])
    valid_score = trainer.validation(model_acc, dl["valid"])
    k_loggers[k].log("Train:", train_score)
    k_loggers[k].log("Valid:", valid_score)
optimizer.param_groups[0]["k"] = max(ks)

Train: (0.952175, 0.13988058605194092)
Valid: (0.87, 0.4663050844192505)
Train: (0.953775, 0.13276294705867767)
Valid: (0.8694, 0.47475941944122313)
Train: (0.955825, 0.12781404001712798)
Valid: (0.8689, 0.48250939292907713)


In [15]:
optimizer.param_groups[0]["lr"] = 1e-4

In [16]:
max_epochs = 300
early_stopping = EarlyStopping(10)

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    if early_stopping.should_stop(val_loss):
        break

100%|██████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 13.93it/s, loss=0.1292]
Epoch 1 | Training loss: 0.1292, validation accuracy: 0.8669, validation loss: 0.4865
100%|██████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 13.94it/s, loss=0.1201]
Epoch 2 | Training loss: 0.1201, validation accuracy: 0.8668, validation loss: 0.4887
100%|██████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 13.94it/s, loss=0.1206]
Epoch 3 | Training loss: 0.1206, validation accuracy: 0.8687, validation loss: 0.4892
100%|██████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 13.91it/s, loss=0.1228]
Epoch 4 | Training loss: 0.1228, validation accuracy: 0.8683, validation loss: 0.4929
100%|██████████████████████████████████████████████████████████| 313/313 [00:22<00:00, 13.91it/s, loss=0.1245]
Epoch 5 | Training loss: 0.1245, validation accuracy: 0.8688, validation loss: 0.4906
100%|██████████

In [17]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.961125, 0.11509836530685424)
Valid: (0.8688, 0.4952331829071045)


In [18]:
for k in ks:
    model_acc = deepcopy(model)
    optimizer.param_groups[0]["k"] = k
    optimizer.accelerate()
    optimizer.store_parameters([model_acc.parameters()])
    model_acc.to(trainer.device)
    train_score = trainer.validation(model_acc, dl["train"])
    valid_score = trainer.validation(model_acc, dl["valid"])
    k_loggers[k].log("Train:", train_score)
    k_loggers[k].log("Valid:", valid_score)
optimizer.param_groups[0]["k"] = max(ks)

Train: (0.96155, 0.11272089142203331)
Valid: (0.8683, 0.49355858478546144)
Train: (0.960275, 0.11707456923723221)
Valid: (0.8679, 0.49414962043762206)
Train: (0.960575, 0.11554838743209839)
Valid: (0.8682, 0.4965355113983154)


## Epoch average, span = 10

In [7]:
model.load_state_dict(initial_state)
model.to(trainer.device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [8]:
trainer.validation(model, dl["valid"])

(0.0875, 2.3028329822540283)

In [9]:
optimizer = AcceleratedSGD(model.parameters(), 1e-2, k=10, momentum=0.9, weight_decay=1e-5, lambda_=1e-8, mode="epoch_avg", avg_alpha=2 / (10 + 1))
logger = Logger("vgg_log_augmentation_averaging_span10-early_stopping.txt")

In [10]:
torch.manual_seed(2020)
max_epochs = 300
early_stopping = EarlyStopping(10)

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    if early_stopping.should_stop(val_loss):
        break

100%|██████████████████████████████████████████████████████████| 313/313 [00:25<00:00, 12.43it/s, loss=2.1162]
Epoch 1 | Training loss: 2.1162, validation accuracy: 0.2941, validation loss: 1.8020
100%|██████████████████████████████████████████████████████████| 313/313 [00:25<00:00, 12.37it/s, loss=1.7647]
Epoch 2 | Training loss: 1.7647, validation accuracy: 0.4005, validation loss: 1.5425
100%|██████████████████████████████████████████████████████████| 313/313 [00:25<00:00, 12.36it/s, loss=1.5520]
Epoch 3 | Training loss: 1.5520, validation accuracy: 0.4661, validation loss: 1.4122
100%|██████████████████████████████████████████████████████████| 313/313 [00:25<00:00, 12.37it/s, loss=1.3848]
Epoch 4 | Training loss: 1.3848, validation accuracy: 0.5636, validation loss: 1.1928
100%|██████████████████████████████████████████████████████████| 313/313 [00:25<00:00, 12.37it/s, loss=1.2572]
Epoch 5 | Training loss: 1.2572, validation accuracy: 0.5833, validation loss: 1.1647
100%|██████████

In [11]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.949775, 0.14794875841140748)
Valid: (0.854, 0.5653819366455078)


In [12]:
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])
model_acc.to(trainer.device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [13]:
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.98125, 0.06213511547744274)
Valid: (0.8814, 0.4210780303955078)


In [14]:
optimizer.param_groups[0]["lr"] = 1e-3

In [15]:
max_epochs = 300
early_stopping = EarlyStopping(10)

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    if early_stopping.should_stop(val_loss):
        break

100%|██████████████████████████████████████████████████████████| 313/313 [00:25<00:00, 12.36it/s, loss=0.0898]
Epoch 1 | Training loss: 0.0898, validation accuracy: 0.8730, validation loss: 0.5054
100%|██████████████████████████████████████████████████████████| 313/313 [00:25<00:00, 12.34it/s, loss=0.0676]
Epoch 2 | Training loss: 0.0676, validation accuracy: 0.8766, validation loss: 0.5170
100%|██████████████████████████████████████████████████████████| 313/313 [00:25<00:00, 12.35it/s, loss=0.0610]
Epoch 3 | Training loss: 0.0610, validation accuracy: 0.8759, validation loss: 0.5363
100%|██████████████████████████████████████████████████████████| 313/313 [00:25<00:00, 12.36it/s, loss=0.0588]
Epoch 4 | Training loss: 0.0588, validation accuracy: 0.8767, validation loss: 0.5291
100%|██████████████████████████████████████████████████████████| 313/313 [00:25<00:00, 12.36it/s, loss=0.0544]
Epoch 5 | Training loss: 0.0544, validation accuracy: 0.8798, validation loss: 0.5334
100%|██████████

In [16]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.985775, 0.04270667671114206)
Valid: (0.8789, 0.5515179138183594)


In [17]:
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])
model_acc.to(trainer.device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [18]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.98585, 0.04149313524737954)
Valid: (0.8789, 0.5515179138183594)
