In [1]:
import torch
from torch import nn
from torchvision import models
from copy import deepcopy
import os

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [2]:
# Check that PyTorch can see a CUDA device before the trainer is configured for one.
torch.cuda.is_available()

True

In [3]:
# Prefer the GPU this experiment was originally run on (index 2), but fall back to
# the first GPU, or to the CPU, on machines that do not have three CUDA devices.
# With a hard-coded "cuda:2" the notebook fails on such hosts as soon as the model
# is moved to trainer.device.
if torch.cuda.device_count() > 2:
    DEVICE = "cuda:2"
elif torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

# Training uses the batch-mean NLL; validation uses the summed NLL.
# NOTE(review): the validation losses reported further down are on a per-sample
# scale, so Trainer presumably divides the summed loss by the dataset size —
# confirm in nn_utils.
trainer = Trainer(
    device=DEVICE,
    loss_fn=nn.NLLLoss(reduction="mean"),
    val_loss_fn=nn.NLLLoss(reduction="sum"),
)

In [4]:
# Build the CIFAR-10 data loaders; `dl` is indexed by "train" and "valid" in the
# cells that follow.
# The per-user cache directory is looked up defensively: os.environ["USER"] raises
# KeyError when USER is unset (e.g. some containers, Windows), so fall back to
# USERNAME and finally to a fixed folder name. When USER is set the path is
# exactly what it was before.
# NOTE(review): `transforms` is not imported explicitly in this notebook; it
# presumably arrives through `from nn_utils import *` — confirm.
dl = load_dataset(
    dataset="CIFAR10",
    root=os.path.join(
        "/tmp",
        os.environ.get("USER", os.environ.get("USERNAME", "anonymous")),
        "CIFAR",
    ),
    augmentation=transforms.RandomAffine(10, scale=(0.9, 1.1), translate=(0.2, 0.2)),
    validation_split=0.2,
    batch_size=128,
    num_workers=10,
)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
# Randomly initialised DenseNet-121. Calling the factory without arguments gives
# untrained weights on every torchvision release, whereas the explicit
# `pretrained=False` keyword is deprecated in newer releases (replaced by `weights=`).
model = models.densenet121()
# Swap the stock classification head for a 10-class head that emits
# log-probabilities, as required by the NLLLoss used by the trainer. The input
# width is read from the head being replaced instead of hard-coding 1024.
model.classifier = nn.Sequential(
    nn.Linear(model.classifier.in_features, 10),
    nn.LogSoftmax(dim=-1),
)
model.to(trainer.device)

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [6]:
# Sanity check before training: score the freshly initialised model on the validation split.
# The displayed pair is what the training cells unpack as (val_acc, val_loss).
trainer.validation(model, dl["valid"])

(0.1064, 2.334063491821289)

## No momentum

In [7]:
# Plain-SGD configuration for the "no momentum" run: momentum and weight decay are
# both switched off.
# NOTE(review): the second positional argument is presumably the initial learning
# rate (later cells lower param_groups[0]["lr"] from it); the meaning of `k` and
# `lambda_` is defined in nn_extrapolation — confirm there.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-1,
    k=10,
    momentum=0,
    weight_decay=0,
    lambda_=1e-8,
)
# Everything logged during this run goes through this Logger, created with the
# run-specific file name below.
logger = Logger(
    "densenet_log_augmentation_no_momentum-early_stopping.txt.no_resizing"
)

In [8]:
# Phase 1 (initial learning rate): train for at most 300 epochs, stopping early
# based on the validation loss.
max_epochs = 300
early_stopping = EarlyStopping(15)  # NOTE(review): presumably a patience of 15 epochs — confirm in nn_utils

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer.
    # NOTE(review): presumably this records the weights that accelerate() later
    # extrapolates from — confirm in nn_extrapolation.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    # The stop criterion watches validation loss, not accuracy. Nothing is restored
    # here, so `model` keeps the weights of the last epoch that ran.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 313/313 [00:47<00:00,  6.65it/s, loss=1.8889]
Epoch 1 | Training loss: 1.8889, validation accuracy: 0.3869, validation loss: 1.7467
100%|██████████| 313/313 [00:46<00:00,  6.76it/s, loss=1.5460]
Epoch 2 | Training loss: 1.5460, validation accuracy: 0.3508, validation loss: 2.1740
100%|██████████| 313/313 [00:46<00:00,  6.79it/s, loss=1.3910]
Epoch 3 | Training loss: 1.3910, validation accuracy: 0.4894, validation loss: 1.5510
100%|██████████| 313/313 [00:46<00:00,  6.68it/s, loss=1.2770]
Epoch 4 | Training loss: 1.2770, validation accuracy: 0.5515, validation loss: 1.2931
100%|██████████| 313/313 [00:46<00:00,  6.66it/s, loss=1.1930]
Epoch 5 | Training loss: 1.1930, validation accuracy: 0.5891, validation loss: 1.1630
100%|██████████| 313/313 [00:45<00:00,  6.83it/s, loss=1.1159]
Epoch 6 | Training loss: 1.1159, validation accuracy: 0.5985, validation loss: 1.2002
100%|██████████| 313/313 [00:45<00:00,  6.82it/s, loss=1.0514]
Epoch 7 | Training loss: 1.0514, validation

In [9]:
# Score the current weights on both splits (train first, then validation) and
# write the resulting tuples to the run log.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.88335, 0.3303005283355713)
Valid: (0.7818, 0.7461417873382569)


In [10]:
# Work on a deep copy so the accelerated weights can be scored separately from `model`.
model_acc = deepcopy(model)
# NOTE(review): accelerate() presumably computes the extrapolated parameters from the
# history collected via finish_epoch() — confirm in nn_extrapolation.
optimizer.accelerate()
# NOTE(review): store_parameters() presumably writes those parameters into the given
# parameter iterables (here the copy's) — confirm in nn_extrapolation.
optimizer.store_parameters([model_acc.parameters()])

In [11]:
# Make sure the accelerated copy sits on the trainer's device, then score it on
# both splits (train first, then validation). This overwrites train_score and
# valid_score with the accelerated model's results.
model_acc.to(trainer.device)
train_score, valid_score = [
    trainer.validation(model_acc, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.880225, 0.336710541677475)
Valid: (0.7906, 0.6641518604278565)


In [12]:
# Lower the learning rate to 1e-2 for the next training phase. The rate is applied
# to every parameter group: indexing only param_groups[0] would silently leave any
# further groups at the old rate if the optimizer were built with more than one.
for param_group in optimizer.param_groups:
    param_group["lr"] = 1e-2

In [13]:
# Phase 2: continue training the same model with the same optimizer object, using
# a fresh early-stopping tracker and a fresh 300-epoch budget.
max_epochs = 300
early_stopping = EarlyStopping(15)  # NOTE(review): presumably a patience of 15 epochs — confirm in nn_utils

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer.
    # NOTE(review): presumably this records the weights used by accelerate() — confirm.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    # Early stopping watches validation loss; no earlier weights are restored here.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 313/313 [00:46<00:00,  6.70it/s, loss=0.2596]
Epoch 1 | Training loss: 0.2596, validation accuracy: 0.8104, validation loss: 0.6576
100%|██████████| 313/313 [00:46<00:00,  6.79it/s, loss=0.2336]
Epoch 2 | Training loss: 0.2336, validation accuracy: 0.8108, validation loss: 0.6673
100%|██████████| 313/313 [00:46<00:00,  6.73it/s, loss=0.2254]
Epoch 3 | Training loss: 0.2254, validation accuracy: 0.8092, validation loss: 0.6774
100%|██████████| 313/313 [00:45<00:00,  6.82it/s, loss=0.2148]
Epoch 4 | Training loss: 0.2148, validation accuracy: 0.8119, validation loss: 0.6827
100%|██████████| 313/313 [00:45<00:00,  6.81it/s, loss=0.2119]
Epoch 5 | Training loss: 0.2119, validation accuracy: 0.8096, validation loss: 0.6891
100%|██████████| 313/313 [00:46<00:00,  6.73it/s, loss=0.2045]
Epoch 6 | Training loss: 0.2045, validation accuracy: 0.8106, validation loss: 0.7010
100%|██████████| 313/313 [00:46<00:00,  6.73it/s, loss=0.2012]
Epoch 7 | Training loss: 0.2012, validation

In [14]:
# Evaluate the model as it stands after this phase: train split first, then the
# validation split, with both result tuples written to the run log.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.95455, 0.13593391649723052)
Valid: (0.8103, 0.7438725898742676)


In [15]:
# Replace the previous accelerated copy with a new deep copy of the current model.
model_acc = deepcopy(model)
# NOTE(review): accelerate() presumably computes extrapolated parameters from the
# optimizer's recorded history — confirm in nn_extrapolation.
optimizer.accelerate()
# NOTE(review): presumably copies the extrapolated parameters into model_acc — confirm.
optimizer.store_parameters([model_acc.parameters()])

In [16]:
# Place the accelerated copy on the trainer's device and score it on the train and
# validation splits, in that order, logging both result tuples.
model_acc.to(trainer.device)
train_score, valid_score = [
    trainer.validation(model_acc, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9473, 0.15239884653091432)
Valid: (0.8104, 0.7434439922332764)


In [17]:
# Lower the learning rate to 1e-3 for the final training phase. Every parameter
# group is updated, so no group is left at the previous rate if the optimizer is
# ever constructed with more than one.
for param_group in optimizer.param_groups:
    param_group["lr"] = 1e-3

In [18]:
# Phase 3: final training phase on the same model and optimizer, again capped at
# 300 epochs and cut short by early stopping.
max_epochs = 300
early_stopping = EarlyStopping(15)  # NOTE(review): presumably a patience of 15 epochs — confirm in nn_utils

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer.
    # NOTE(review): presumably this records the weights used by accelerate() — confirm.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    # Early stopping watches validation loss; no earlier weights are restored here.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 313/313 [00:46<00:00,  6.77it/s, loss=0.1616]
Epoch 1 | Training loss: 0.1616, validation accuracy: 0.8099, validation loss: 0.7462
100%|██████████| 313/313 [00:46<00:00,  6.69it/s, loss=0.1602]
Epoch 2 | Training loss: 0.1602, validation accuracy: 0.8119, validation loss: 0.7406
100%|██████████| 313/313 [00:46<00:00,  6.77it/s, loss=0.1562]
Epoch 3 | Training loss: 0.1562, validation accuracy: 0.8129, validation loss: 0.7414
100%|██████████| 313/313 [00:46<00:00,  6.80it/s, loss=0.1636]
Epoch 4 | Training loss: 0.1636, validation accuracy: 0.8125, validation loss: 0.7448
100%|██████████| 313/313 [00:46<00:00,  6.80it/s, loss=0.1591]
Epoch 5 | Training loss: 0.1591, validation accuracy: 0.8130, validation loss: 0.7432
100%|██████████| 313/313 [00:46<00:00,  6.74it/s, loss=0.1573]
Epoch 6 | Training loss: 0.1573, validation accuracy: 0.8113, validation loss: 0.7495
100%|██████████| 313/313 [00:46<00:00,  6.78it/s, loss=0.1578]
Epoch 7 | Training loss: 0.1578, validation

In [19]:
# Final scores of the plain model for this run: train split, then validation
# split, each logged as the tuple returned by the trainer.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9569, 0.12778740136623382)
Valid: (0.8133, 0.7551136692047119)


In [20]:
# Take a new deep copy of the final model to hold the accelerated weights.
model_acc = deepcopy(model)
# NOTE(review): accelerate() presumably computes extrapolated parameters from the
# optimizer's recorded history — confirm in nn_extrapolation.
optimizer.accelerate()
# NOTE(review): presumably copies the extrapolated parameters into model_acc — confirm.
optimizer.store_parameters([model_acc.parameters()])

In [21]:
# Score the final accelerated copy on both splits (train, then validation) after
# making sure it is on the trainer's device.
model_acc.to(trainer.device)
train_score, valid_score = [
    trainer.validation(model_acc, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.95665, 0.12491433500051498)
Valid: (0.8115, 0.7564371599197388)


## Momentum

In [5]:
# Start the momentum experiment from a new, randomly initialised DenseNet-121.
# Calling the factory without arguments gives untrained weights on every
# torchvision release, whereas the explicit `pretrained=False` keyword is
# deprecated in newer releases (replaced by `weights=`).
model = models.densenet121()
# 10-class head emitting log-probabilities (matches the NLLLoss used by the
# trainer). The input width is taken from the head being replaced rather than
# hard-coded as 1024.
model.classifier = nn.Sequential(
    nn.Linear(model.classifier.in_features, 10),
    nn.LogSoftmax(dim=-1),
)
model.to(trainer.device)

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [6]:
# Configuration for the momentum run: momentum 0.9 with a small weight decay.
# NOTE(review): the second positional argument is presumably the initial learning
# rate (later cells lower param_groups[0]["lr"] from it); the meaning of `k` and
# `lambda_` is defined in nn_extrapolation — confirm there.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-1,
    k=10,
    momentum=0.9,
    weight_decay=1e-5,
    lambda_=1e-8,
)
# A separate Logger, with its own file name, keeps this run apart from the
# no-momentum one.
logger = Logger(
    "densenet_log_augmentation-early_stopping.txt.no_resizing"
)

In [7]:
# Sanity check before training: score the re-initialised model on the validation split.
# The displayed pair is what the training cells unpack as (val_acc, val_loss).
trainer.validation(model, dl["valid"])

(0.0978, 2.33697576713562)

In [8]:
# Phase 1 of the momentum run (initial learning rate): at most 300 epochs, with
# early stopping driven by the validation loss.
max_epochs = 300
early_stopping = EarlyStopping(15)  # NOTE(review): presumably a patience of 15 epochs — confirm in nn_utils

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer.
    # NOTE(review): presumably this records the weights that accelerate() later
    # extrapolates from — confirm in nn_extrapolation.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    # The stop criterion watches validation loss, not accuracy. Nothing is restored
    # here, so `model` keeps the weights of the last epoch that ran.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 313/313 [00:55<00:00,  5.60it/s, loss=2.1956]
Epoch 1 | Training loss: 2.1956, validation accuracy: 0.3569, validation loss: 2.1444
100%|██████████| 313/313 [00:56<00:00,  5.58it/s, loss=1.6407]
Epoch 2 | Training loss: 1.6407, validation accuracy: 0.4271, validation loss: 2.3865
100%|██████████| 313/313 [00:55<00:00,  5.61it/s, loss=1.4696]
Epoch 3 | Training loss: 1.4696, validation accuracy: 0.4715, validation loss: 1.4993
100%|██████████| 313/313 [00:55<00:00,  5.68it/s, loss=1.3852]
Epoch 4 | Training loss: 1.3852, validation accuracy: 0.4686, validation loss: 1.5938
100%|██████████| 313/313 [00:55<00:00,  5.62it/s, loss=1.3631]
Epoch 5 | Training loss: 1.3631, validation accuracy: 0.5431, validation loss: 1.2848
100%|██████████| 313/313 [00:55<00:00,  5.60it/s, loss=1.3176]
Epoch 6 | Training loss: 1.3176, validation accuracy: 0.5352, validation loss: 1.6345
100%|██████████| 313/313 [00:55<00:00,  5.65it/s, loss=1.2341]
Epoch 7 | Training loss: 1.2341, validation

In [9]:
# Score the current weights on both splits (train first, then validation) and
# write the resulting tuples to the run log.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.899975, 0.2813507518291473)
Valid: (0.8062, 0.6687633808135987)


In [10]:
# Work on a deep copy so the accelerated weights can be scored separately from `model`.
model_acc = deepcopy(model)
# NOTE(review): accelerate() presumably computes the extrapolated parameters from the
# history collected via finish_epoch() — confirm in nn_extrapolation.
optimizer.accelerate()
# NOTE(review): store_parameters() presumably writes those parameters into the given
# parameter iterables (here the copy's) — confirm in nn_extrapolation.
optimizer.store_parameters([model_acc.parameters()])

In [11]:
# Make sure the accelerated copy sits on the trainer's device, then score it on
# both splits (train first, then validation). This overwrites train_score and
# valid_score with the accelerated model's results.
model_acc.to(trainer.device)
train_score, valid_score = [
    trainer.validation(model_acc, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.875525, 0.34509133644104006)
Valid: (0.7968, 0.7246498273849488)


In [12]:
# Lower the learning rate to 1e-2 for the next training phase. The rate is applied
# to every parameter group: indexing only param_groups[0] would silently leave any
# further groups at the old rate if the optimizer were built with more than one.
for param_group in optimizer.param_groups:
    param_group["lr"] = 1e-2

In [13]:
# Phase 2 of the momentum run: continue training the same model with the same
# optimizer object, using a fresh early-stopping tracker and a fresh 300-epoch budget.
max_epochs = 300
early_stopping = EarlyStopping(15)  # NOTE(review): presumably a patience of 15 epochs — confirm in nn_utils

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer.
    # NOTE(review): presumably this records the weights used by accelerate() — confirm.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    # Early stopping watches validation loss; no earlier weights are restored here.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 313/313 [00:56<00:00,  5.57it/s, loss=0.2314]
Epoch 1 | Training loss: 0.2314, validation accuracy: 0.8244, validation loss: 0.6301
100%|██████████| 313/313 [00:55<00:00,  5.61it/s, loss=0.1967]
Epoch 2 | Training loss: 0.1967, validation accuracy: 0.8246, validation loss: 0.6360
100%|██████████| 313/313 [00:55<00:00,  5.63it/s, loss=0.1874]
Epoch 3 | Training loss: 0.1874, validation accuracy: 0.8258, validation loss: 0.6725
100%|██████████| 313/313 [00:55<00:00,  5.67it/s, loss=0.1776]
Epoch 4 | Training loss: 0.1776, validation accuracy: 0.8304, validation loss: 0.6694
100%|██████████| 313/313 [00:55<00:00,  5.59it/s, loss=0.1679]
Epoch 5 | Training loss: 0.1679, validation accuracy: 0.8278, validation loss: 0.6805
100%|██████████| 313/313 [00:56<00:00,  5.52it/s, loss=0.1613]
Epoch 6 | Training loss: 0.1613, validation accuracy: 0.8277, validation loss: 0.6905
100%|██████████| 313/313 [00:56<00:00,  5.50it/s, loss=0.1606]
Epoch 7 | Training loss: 0.1606, validation

In [14]:
# Evaluate the model as it stands after this phase: train split first, then the
# validation split, with both result tuples written to the run log.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.966875, 0.09502119271755219)
Valid: (0.8273, 0.7496071128845215)


In [15]:
# Replace the previous accelerated copy with a new deep copy of the current model.
model_acc = deepcopy(model)
# NOTE(review): accelerate() presumably computes extrapolated parameters from the
# optimizer's recorded history — confirm in nn_extrapolation.
optimizer.accelerate()
# NOTE(review): presumably copies the extrapolated parameters into model_acc — confirm.
optimizer.store_parameters([model_acc.parameters()])

In [16]:
# Place the accelerated copy on the trainer's device and score it on the train and
# validation splits, in that order, logging both result tuples.
model_acc.to(trainer.device)
train_score, valid_score = [
    trainer.validation(model_acc, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.960075, 0.11413726024627685)
Valid: (0.8233, 0.774401446723938)


In [17]:
# Lower the learning rate to 1e-3 for the final training phase. Every parameter
# group is updated, so no group is left at the previous rate if the optimizer is
# ever constructed with more than one.
for param_group in optimizer.param_groups:
    param_group["lr"] = 1e-3

In [18]:
# Phase 3 of the momentum run: final training phase on the same model and
# optimizer, again capped at 300 epochs and cut short by early stopping.
max_epochs = 300
early_stopping = EarlyStopping(15)  # NOTE(review): presumably a patience of 15 epochs — confirm in nn_utils

for epoch in range(max_epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer.
    # NOTE(review): presumably this records the weights used by accelerate() — confirm.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    # Early stopping watches validation loss; no earlier weights are restored here.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 313/313 [00:55<00:00,  5.61it/s, loss=0.1181]
Epoch 1 | Training loss: 0.1181, validation accuracy: 0.8286, validation loss: 0.7534
100%|██████████| 313/313 [00:56<00:00,  5.56it/s, loss=0.1179]
Epoch 2 | Training loss: 0.1179, validation accuracy: 0.8295, validation loss: 0.7543
100%|██████████| 313/313 [00:55<00:00,  5.63it/s, loss=0.1174]
Epoch 3 | Training loss: 0.1174, validation accuracy: 0.8291, validation loss: 0.7536
100%|██████████| 313/313 [00:55<00:00,  5.62it/s, loss=0.1134]
Epoch 4 | Training loss: 0.1134, validation accuracy: 0.8290, validation loss: 0.7627
100%|██████████| 313/313 [00:55<00:00,  5.61it/s, loss=0.1154]
Epoch 5 | Training loss: 0.1154, validation accuracy: 0.8311, validation loss: 0.7493
100%|██████████| 313/313 [00:55<00:00,  5.65it/s, loss=0.1087]
Epoch 6 | Training loss: 0.1087, validation accuracy: 0.8296, validation loss: 0.7627
100%|██████████| 313/313 [00:56<00:00,  5.58it/s, loss=0.1113]
Epoch 7 | Training loss: 0.1113, validation

In [19]:
# Final scores of the plain model for this run: train split, then validation
# split, each logged as the tuple returned by the trainer.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.972325, 0.08151594339609146)
Valid: (0.8282, 0.7831070388793945)


In [20]:
# Take a new deep copy of the final model to hold the accelerated weights.
model_acc = deepcopy(model)
# NOTE(review): accelerate() presumably computes extrapolated parameters from the
# optimizer's recorded history — confirm in nn_extrapolation.
optimizer.accelerate()
# NOTE(review): presumably copies the extrapolated parameters into model_acc — confirm.
optimizer.store_parameters([model_acc.parameters()])

In [21]:
# Score the final accelerated copy on both splits (train, then validation) after
# making sure it is on the trainer's device.
model_acc.to(trainer.device)
train_score, valid_score = [
    trainer.validation(model_acc, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.97135, 0.08327368819117546)
Valid: (0.8292, 0.7835927715301514)
