In [1]:
import sys
# Prepend the directory two levels up to the import search path.
# NOTE(review): presumably this is where nn_extrapolation and nn_utils live — confirm.
sys.path.insert(0, "../..")

In [2]:
import torch
from torch import nn
import numpy as np
from copy import deepcopy

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [3]:
# Sanity check: report whether PyTorch can see a CUDA device (the trainer below is configured for "cuda").
torch.cuda.is_available()

True

In [4]:
# Shared training / evaluation harness used by every experiment in this notebook.
# Use the GPU when one is visible and fall back to the CPU otherwise, so the
# notebook does not depend on CUDA being present.
# NOTE(review): the training loss is mean-reduced while the validation loss is
# sum-reduced; presumably Trainer.validation normalises the sum itself — confirm in nn_utils.
trainer = Trainer(
    device="cuda" if torch.cuda.is_available() else "cpu",
    loss_fn=nn.NLLLoss(reduction="mean"),
    val_loss_fn=nn.NLLLoss(reduction="sum"),
)

In [5]:
# Load MNIST from a local copy (download disabled) and keep the returned loaders in `dl`;
# the cells below index it with "train" and "valid".
# With validation_split=0.2 and batch_size=64 the recorded progress bars show 750 training batches per epoch.
dl = load_dataset(
    dataset="mnist", 
    root="../../../MNIST", 
    download=False, 
    validation_split=0.2,
    batch_size=64, 
    num_workers=2,
)

In [6]:
def make_model():
    """Build the CNN classifier shared by every experiment in this notebook.

    Architecture: two conv-conv-pool stages (32 then 64 channels, 3x3 kernels,
    no padding) followed by Flatten, a 128-unit hidden linear layer and a
    10-way linear layer with LogSoftmax over the last dimension.  For 28x28
    single-channel inputs the feature map entering Flatten is 64 x 4 x 4.

    Returns:
        nn.Sequential: a freshly initialised model (on the default device)
        whose outputs are log-probabilities, suitable for nn.NLLLoss.
    """
    def conv_relu(in_channels, out_channels):
        # One unpadded 3x3 convolution followed by its activation.
        return [nn.Conv2d(in_channels, out_channels, 3), nn.ReLU()]

    layers = [
        *conv_relu(1, 32),
        *conv_relu(32, 32),
        nn.MaxPool2d(2),
        *conv_relu(32, 64),
        *conv_relu(64, 64),
        nn.MaxPool2d(2),
        nn.Flatten(),
        nn.Linear(4*4*64, 128),
        nn.ReLU(),
        nn.Linear(128, 10),
        nn.LogSoftmax(-1),
    ]
    return nn.Sequential(*layers)

## Epoch

In [7]:
# Fresh, randomly initialised network on the trainer's device (Module.to returns the module itself).
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

### Lambda = 1e-10

In [8]:
# AcceleratedSGD over all model parameters as one group, in "epoch" mode with k=10.
# NOTE(review): the positional 1e-3 is presumably the learning rate — confirm in nn_extrapolation.
# NOTE(review): lambda_ is not passed, so the optimizer's default applies; the section heading says 1e-10 — confirm the default matches.
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch")
# Logger for this run, constructed with the name "SGD_momentum2.txt" (presumably a log-file path — confirm in nn_utils).
logger = Logger("SGD_momentum2.txt")

In [9]:
epochs = 30

for epoch in range(epochs):
    # One pass over the training split, then the optimizer's end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Metrics on the validation split for the epoch that just ended.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:13<00:00, 54.24it/s, loss=2.2990]
Epoch 1 | Training loss: 2.2990, validation accuracy: 0.1552, validation loss: 2.2878
100%|██████████| 750/750 [00:13<00:00, 56.67it/s, loss=2.1467]
Epoch 2 | Training loss: 2.1467, validation accuracy: 0.6899, validation loss: 1.3216
100%|██████████| 750/750 [00:12<00:00, 58.27it/s, loss=0.5990]
Epoch 3 | Training loss: 0.5990, validation accuracy: 0.8728, validation loss: 0.4100
100%|██████████| 750/750 [00:12<00:00, 58.49it/s, loss=0.3524]
Epoch 4 | Training loss: 0.3524, validation accuracy: 0.9179, validation loss: 0.2779
100%|██████████| 750/750 [00:12<00:00, 58.88it/s, loss=0.2594]
Epoch 5 | Training loss: 0.2594, validation accuracy: 0.9377, validation loss: 0.2105
100%|██████████| 750/750 [00:12<00:00, 58.10it/s, loss=0.2019]
Epoch 6 | Training loss: 0.2019, validation accuracy: 0.9484, validation loss: 0.1698
100%|██████████| 750/750 [00:13<00:00, 57.58it/s, loss=0.1649]
Epoch 7 | Training loss: 0.1649, validation

In [10]:
# Baseline before acceleration: score both splits (each result is an (accuracy, loss) pair), then log them.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.99, 0.0328793765764373)
Valid: (0.9811666666666666, 0.06281401680440953)


In [11]:
# Run the optimizer's acceleration step now that training has finished.
# NOTE(review): presumably extrapolates from the iterates recorded by finish_epoch() — confirm in nn_extrapolation.
optimizer.accelerate()

In [12]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Move the model to the trainer's device instead of hard-coding CUDA, so this cell
# follows whatever device the Trainer was configured with; the trailing ';' hides the repr.
model.to(trainer.device);

In [13]:
# Re-score both splits after acceleration so the numbers can be compared with the pre-acceleration scores.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9908958333333333, 0.03196341497482111)
Valid: (0.9836666666666667, 0.058639409329742195)


### Lambda = 1e-5

In [14]:
# Same optimizer settings as the previous run, but with lambda_=1e-5 passed explicitly.
# NOTE(review): no new model is created in this section, so this optimizer continues from the
# weights left by the previous run (the recorded log below starts at training loss 0.0384).
# Re-create the model before this line if a from-scratch comparison between lambda values is intended.
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch", lambda_=1e-5)
# Separate logger for this run.
logger = Logger("SGD_momentum_lambda=1e-5_2.txt")

In [15]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch with the lambda_=1e-5 optimizer, then call its end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:12<00:00, 58.61it/s, loss=0.0384]
Epoch 1 | Training loss: 0.0384, validation accuracy: 0.9807, validation loss: 0.0683
100%|██████████| 750/750 [00:12<00:00, 58.58it/s, loss=0.0376]
Epoch 2 | Training loss: 0.0376, validation accuracy: 0.9808, validation loss: 0.0675
100%|██████████| 750/750 [00:13<00:00, 54.35it/s, loss=0.0365]
Epoch 3 | Training loss: 0.0365, validation accuracy: 0.9799, validation loss: 0.0690
100%|██████████| 750/750 [00:12<00:00, 58.01it/s, loss=0.0344]
Epoch 4 | Training loss: 0.0344, validation accuracy: 0.9802, validation loss: 0.0685
100%|██████████| 750/750 [00:12<00:00, 57.84it/s, loss=0.0332]
Epoch 5 | Training loss: 0.0332, validation accuracy: 0.9832, validation loss: 0.0596
100%|██████████| 750/750 [00:13<00:00, 57.48it/s, loss=0.0326]
Epoch 6 | Training loss: 0.0326, validation accuracy: 0.9826, validation loss: 0.0591
100%|██████████| 750/750 [00:13<00:00, 56.52it/s, loss=0.0311]
Epoch 7 | Training loss: 0.0311, validation

In [16]:
# Pre-acceleration scores for the lambda_=1e-5 run; each result is an (accuracy, loss) pair.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9966666666666667, 0.011765114218501063)
Valid: (0.9848333333333333, 0.05939000487828162)


In [17]:
# Acceleration step for the lambda_=1e-5 run.
# NOTE(review): presumably this is where lambda_ takes effect — confirm in nn_extrapolation.
optimizer.accelerate()

In [18]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Use the trainer's configured device rather than a hard-coded .cuda(); ';' suppresses the repr.
model.to(trainer.device);

In [19]:
# Post-acceleration scores for the lambda_=1e-5 run.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9976875, 0.010387330086668953)
Valid: (0.9855, 0.05503412218927406)


### Lambda = 1e-2

In [22]:
# Start this run from freshly initialised weights. The recorded log below begins at an
# untrained loss (about 2.29), but no cell in this section created a model, so on a clean
# top-to-bottom run the optimizer would otherwise keep training the previous section's weights.
model = make_model()
model.to(trainer.device)
# Same optimizer settings as the other "epoch"-mode runs, with lambda_=1e-2.
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch", lambda_=1e-2)
# Separate logger for this run.
logger = Logger("SGD_momentum_lambda=1e-2.txt")

In [23]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch with the lambda_=1e-2 optimizer, then call its end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:13<00:00, 57.18it/s, loss=2.2916]
Epoch 1 | Training loss: 2.2916, validation accuracy: 0.2213, validation loss: 2.2673
100%|██████████| 750/750 [00:13<00:00, 56.96it/s, loss=1.6294]
Epoch 2 | Training loss: 1.6294, validation accuracy: 0.8552, validation loss: 0.5024
100%|██████████| 750/750 [00:13<00:00, 56.92it/s, loss=0.4307]
Epoch 3 | Training loss: 0.4307, validation accuracy: 0.8996, validation loss: 0.3240
100%|██████████| 750/750 [00:12<00:00, 57.84it/s, loss=0.3069]
Epoch 4 | Training loss: 0.3069, validation accuracy: 0.9252, validation loss: 0.2443
100%|██████████| 750/750 [00:12<00:00, 58.69it/s, loss=0.2397]
Epoch 5 | Training loss: 0.2397, validation accuracy: 0.9426, validation loss: 0.1965
100%|██████████| 750/750 [00:12<00:00, 59.01it/s, loss=0.1923]
Epoch 6 | Training loss: 0.1923, validation accuracy: 0.9517, validation loss: 0.1626
100%|██████████| 750/750 [00:12<00:00, 58.58it/s, loss=0.1597]
Epoch 7 | Training loss: 0.1597, validation

In [24]:
# Pre-acceleration scores for the lambda_=1e-2 run; each result is an (accuracy, loss) pair.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9895833333333334, 0.03415543323522434)
Valid: (0.981, 0.06364911372043813)


In [25]:
# Acceleration step for the lambda_=1e-2 run.
# NOTE(review): presumably this is where lambda_ takes effect — confirm in nn_extrapolation.
optimizer.accelerate()

In [26]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Use the trainer's configured device rather than a hard-coded .cuda(); ';' suppresses the repr.
model.to(trainer.device);

In [27]:
# Post-acceleration scores for the lambda_=1e-2 run.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9904583333333333, 0.032220454464045666)
Valid: (0.9824166666666667, 0.05974859910442804)


## Epoch average

In [20]:
# New randomly initialised network for the "epoch_avg" experiment, placed on the trainer's device.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [24]:
# Same hyper-parameters as the "epoch" runs, but in "epoch_avg" mode.
# avg_alpha is not passed here, so the optimizer's default applies.
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg")
# Separate logger for this run.
logger = Logger("SGD_momentum-avg.txt")

In [25]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch in "epoch_avg" mode, then call the optimizer's end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:12<00:00, 58.54it/s, loss=2.2948]
Epoch 1 | Training loss: 2.2948, validation accuracy: 0.3392, validation loss: 2.2834
100%|██████████| 750/750 [00:12<00:00, 58.87it/s, loss=2.1638]
Epoch 2 | Training loss: 2.1638, validation accuracy: 0.7480, validation loss: 1.4245
100%|██████████| 750/750 [00:12<00:00, 58.95it/s, loss=0.5741]
Epoch 3 | Training loss: 0.5741, validation accuracy: 0.8956, validation loss: 0.3469
100%|██████████| 750/750 [00:12<00:00, 58.82it/s, loss=0.3348]
Epoch 4 | Training loss: 0.3348, validation accuracy: 0.9178, validation loss: 0.2648
100%|██████████| 750/750 [00:12<00:00, 58.42it/s, loss=0.2508]
Epoch 5 | Training loss: 0.2508, validation accuracy: 0.9355, validation loss: 0.2086
100%|██████████| 750/750 [00:12<00:00, 58.91it/s, loss=0.1987]
Epoch 6 | Training loss: 0.1987, validation accuracy: 0.9499, validation loss: 0.1704
100%|██████████| 750/750 [00:12<00:00, 58.85it/s, loss=0.1641]
Epoch 7 | Training loss: 0.1641, validation

In [26]:
# Pre-acceleration scores for the "epoch_avg" run; each result is an (accuracy, loss) pair.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9895625, 0.03502268844128897)
Valid: (0.9824166666666667, 0.0605216880949835)


In [27]:
# Acceleration step for the "epoch_avg" run.
# NOTE(review): presumably operates on the per-epoch averaged iterates in this mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [28]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Use the trainer's configured device rather than a hard-coded .cuda(); ';' suppresses the repr.
model.to(trainer.device);

In [29]:
# Post-acceleration scores for the "epoch_avg" run.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9853333333333333, 0.050679075373957556)
Valid: (0.98, 0.06859579290946324)


## Epoch average, with span = 100

In [7]:
# New randomly initialised network for the span = 100 experiment, placed on the trainer's device.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [8]:
# "epoch_avg" mode with avg_alpha = 2 / (span + 1) for span = 100 (about 0.0198).
# NOTE(review): avg_alpha is presumably the smoothing factor of an exponential moving average — confirm in nn_extrapolation.
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha=(2 / (100 + 1)))
# Separate logger for this run.
logger = Logger("SGD_momentum-avg_span_100.txt")

In [9]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch (span = 100 averaging), then call the optimizer's end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:14<00:00, 51.22it/s, loss=2.2602]
Epoch 1 | Training loss: 2.2602, validation accuracy: 0.5413, validation loss: 2.0833
100%|██████████| 750/750 [00:13<00:00, 53.90it/s, loss=0.8352]
Epoch 2 | Training loss: 0.8352, validation accuracy: 0.8910, validation loss: 0.3745
100%|██████████| 750/750 [00:13<00:00, 57.67it/s, loss=0.3561]
Epoch 3 | Training loss: 0.3561, validation accuracy: 0.9173, validation loss: 0.2778
100%|██████████| 750/750 [00:13<00:00, 57.62it/s, loss=0.2592]
Epoch 4 | Training loss: 0.2592, validation accuracy: 0.9348, validation loss: 0.2249
100%|██████████| 750/750 [00:13<00:00, 56.78it/s, loss=0.2007]
Epoch 5 | Training loss: 0.2007, validation accuracy: 0.9523, validation loss: 0.1656
100%|██████████| 750/750 [00:12<00:00, 58.19it/s, loss=0.1642]
Epoch 6 | Training loss: 0.1642, validation accuracy: 0.9587, validation loss: 0.1441
100%|██████████| 750/750 [00:12<00:00, 58.40it/s, loss=0.1401]
Epoch 7 | Training loss: 0.1401, validation

In [10]:
# Pre-acceleration scores for the span = 100 run; each result is an (accuracy, loss) pair.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9880833333333333, 0.03772656109680732)
Valid: (0.9805833333333334, 0.06853191727896532)


In [11]:
# Acceleration step for the span = 100 run.
# NOTE(review): presumably operates on the averaged iterates in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [12]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Use the trainer's configured device rather than a hard-coded .cuda(); ';' suppresses the repr.
model.to(trainer.device);

In [13]:
# Post-acceleration scores for the span = 100 run.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9909166666666667, 0.030257200048925976)
Valid: (0.9831666666666666, 0.06084559714452674)


## Epoch average, with span = 50

In [14]:
# New randomly initialised network for the span = 50 experiment, placed on the trainer's device.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [15]:
# "epoch_avg" mode with avg_alpha = 2 / (span + 1) for span = 50 (about 0.0392).
# NOTE(review): avg_alpha is presumably the smoothing factor of an exponential moving average — confirm in nn_extrapolation.
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha=(2 / (50 + 1)))
# Separate logger for this run.
logger = Logger("SGD_momentum-avg_span_50.txt")

In [16]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch (span = 50 averaging), then call the optimizer's end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:12<00:00, 57.83it/s, loss=2.2969]
Epoch 1 | Training loss: 2.2969, validation accuracy: 0.3403, validation loss: 2.2866
100%|██████████| 750/750 [00:12<00:00, 59.03it/s, loss=2.2358]
Epoch 2 | Training loss: 2.2358, validation accuracy: 0.6383, validation loss: 2.0291
100%|██████████| 750/750 [00:12<00:00, 58.70it/s, loss=0.8192]
Epoch 3 | Training loss: 0.8192, validation accuracy: 0.8860, validation loss: 0.3838
100%|██████████| 750/750 [00:12<00:00, 58.56it/s, loss=0.3641]
Epoch 4 | Training loss: 0.3641, validation accuracy: 0.9111, validation loss: 0.2941
100%|██████████| 750/750 [00:12<00:00, 58.87it/s, loss=0.2729]
Epoch 5 | Training loss: 0.2729, validation accuracy: 0.9348, validation loss: 0.2198
100%|██████████| 750/750 [00:12<00:00, 58.90it/s, loss=0.2172]
Epoch 6 | Training loss: 0.2172, validation accuracy: 0.9488, validation loss: 0.1779
100%|██████████| 750/750 [00:12<00:00, 59.08it/s, loss=0.1766]
Epoch 7 | Training loss: 0.1766, validation

In [17]:
# Pre-acceleration scores for the span = 50 run; each result is an (accuracy, loss) pair.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9885625, 0.03738672524054224)
Valid: (0.97925, 0.06491781446182479)


In [18]:
# Acceleration step for the span = 50 run.
# NOTE(review): presumably operates on the averaged iterates in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [19]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Use the trainer's configured device rather than a hard-coded .cuda(); ';' suppresses the repr.
model.to(trainer.device);

In [20]:
# Post-acceleration scores for the span = 50 run.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9910416666666667, 0.03172329428357382)
Valid: (0.9831666666666666, 0.059001391724838564)


## Epoch average, with span = 20

In [7]:
# New randomly initialised network for the span = 20 experiment, placed on the trainer's device.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [8]:
# "epoch_avg" mode with avg_alpha = 2 / (span + 1) for span = 20 (about 0.0952).
# NOTE(review): avg_alpha is presumably the smoothing factor of an exponential moving average — confirm in nn_extrapolation.
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha=(2 / (20 + 1)))
# Separate logger for this run.
logger = Logger("SGD_momentum-avg_span_20.txt")

In [9]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch (span = 20 averaging), then call the optimizer's end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:13<00:00, 53.69it/s, loss=2.2796]
Epoch 1 | Training loss: 2.2796, validation accuracy: 0.3551, validation loss: 2.2134
100%|██████████| 750/750 [00:13<00:00, 56.22it/s, loss=1.1575]
Epoch 2 | Training loss: 1.1575, validation accuracy: 0.8743, validation loss: 0.4219
100%|██████████| 750/750 [00:13<00:00, 53.75it/s, loss=0.3831]
Epoch 3 | Training loss: 0.3831, validation accuracy: 0.9120, validation loss: 0.2998
100%|██████████| 750/750 [00:13<00:00, 55.53it/s, loss=0.2788]
Epoch 4 | Training loss: 0.2788, validation accuracy: 0.9347, validation loss: 0.2218
100%|██████████| 750/750 [00:13<00:00, 56.44it/s, loss=0.2145]
Epoch 5 | Training loss: 0.2145, validation accuracy: 0.9466, validation loss: 0.1835
100%|██████████| 750/750 [00:13<00:00, 56.95it/s, loss=0.1745]
Epoch 6 | Training loss: 0.1745, validation accuracy: 0.9567, validation loss: 0.1473
100%|██████████| 750/750 [00:13<00:00, 55.57it/s, loss=0.1464]
Epoch 7 | Training loss: 0.1464, validation

In [10]:
# Pre-acceleration scores for the span = 20 run; each result is an (accuracy, loss) pair.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.98875, 0.03643931998917833)
Valid: (0.9816666666666667, 0.06241245872341097)


In [11]:
# Acceleration step for the span = 20 run.
# NOTE(review): presumably operates on the averaged iterates in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [12]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Use the trainer's configured device rather than a hard-coded .cuda(); ';' suppresses the repr.
model.to(trainer.device);

In [13]:
# Post-acceleration scores for the span = 20 run.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9903541666666666, 0.033172125154950965)
Valid: (0.98325, 0.05807564333003635)


## Epoch average, with span = 15

In [37]:
# New randomly initialised network for the span = 15 experiment, placed on the trainer's device.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [38]:
# "epoch_avg" mode with avg_alpha = 2 / (span + 1) for span = 15 (0.125).
# NOTE(review): avg_alpha is presumably the smoothing factor of an exponential moving average — confirm in nn_extrapolation.
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha=(2 / (15 + 1)))
# Separate logger for this run.
logger = Logger("SGD_momentum-avg_span_15.txt")

In [39]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch (span = 15 averaging), then call the optimizer's end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:12<00:00, 57.82it/s, loss=2.2893]
Epoch 1 | Training loss: 2.2893, validation accuracy: 0.2376, validation loss: 2.2610
100%|██████████| 750/750 [00:12<00:00, 57.93it/s, loss=1.5748]
Epoch 2 | Training loss: 1.5748, validation accuracy: 0.8377, validation loss: 0.5393
100%|██████████| 750/750 [00:12<00:00, 58.60it/s, loss=0.4447]
Epoch 3 | Training loss: 0.4447, validation accuracy: 0.9035, validation loss: 0.3362
100%|██████████| 750/750 [00:13<00:00, 57.58it/s, loss=0.3109]
Epoch 4 | Training loss: 0.3109, validation accuracy: 0.9275, validation loss: 0.2392
100%|██████████| 750/750 [00:12<00:00, 57.79it/s, loss=0.2386]
Epoch 5 | Training loss: 0.2386, validation accuracy: 0.9422, validation loss: 0.1944
100%|██████████| 750/750 [00:12<00:00, 57.71it/s, loss=0.1906]
Epoch 6 | Training loss: 0.1906, validation accuracy: 0.9534, validation loss: 0.1629
100%|██████████| 750/750 [00:12<00:00, 57.71it/s, loss=0.1597]
Epoch 7 | Training loss: 0.1597, validation

In [40]:
# Pre-acceleration scores for the span = 15 run; each result is an (accuracy, loss) pair.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9896875, 0.0338580307077306)
Valid: (0.9830833333333333, 0.05742358929710463)


In [41]:
# Acceleration step for the span = 15 run.
# NOTE(review): presumably operates on the averaged iterates in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [42]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Use the trainer's configured device rather than a hard-coded .cuda(); ';' suppresses the repr.
model.to(trainer.device);

In [43]:
# Post-acceleration scores for the span = 15 run.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.990375, 0.03216092803166248)
Valid: (0.98375, 0.056568290886934845)


## Epoch average, with span = 10

In [44]:
# New randomly initialised network for the span = 10 experiment, placed on the trainer's device.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [45]:
# "epoch_avg" mode with avg_alpha = 2 / (span + 1) for span = 10 (about 0.182).
# NOTE(review): avg_alpha is presumably the smoothing factor of an exponential moving average — confirm in nn_extrapolation.
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha=(2 / (10 + 1)))
# Separate logger for this run.
logger = Logger("SGD_momentum-avg_span_10.txt")

In [46]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch (span = 10 averaging), then call the optimizer's end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:12<00:00, 58.53it/s, loss=2.3012]
Epoch 1 | Training loss: 2.3012, validation accuracy: 0.2513, validation loss: 2.2979
100%|██████████| 750/750 [00:12<00:00, 58.65it/s, loss=2.2930]
Epoch 2 | Training loss: 2.2930, validation accuracy: 0.2069, validation loss: 2.2850
100%|██████████| 750/750 [00:13<00:00, 56.10it/s, loss=2.2420]
Epoch 3 | Training loss: 2.2420, validation accuracy: 0.5530, validation loss: 2.0761
100%|██████████| 750/750 [00:14<00:00, 52.61it/s, loss=0.8874]
Epoch 4 | Training loss: 0.8874, validation accuracy: 0.8747, validation loss: 0.4132
100%|██████████| 750/750 [00:13<00:00, 55.95it/s, loss=0.3751]
Epoch 5 | Training loss: 0.3751, validation accuracy: 0.9149, validation loss: 0.2929
100%|██████████| 750/750 [00:12<00:00, 58.64it/s, loss=0.2752]
Epoch 6 | Training loss: 0.2752, validation accuracy: 0.9349, validation loss: 0.2226
100%|██████████| 750/750 [00:12<00:00, 58.65it/s, loss=0.2136]
Epoch 7 | Training loss: 0.2136, validation

In [47]:
# Pre-acceleration scores for the span = 10 run; each result is an (accuracy, loss) pair.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9874791666666667, 0.040641075562064845)
Valid: (0.9809166666666667, 0.06505878606423114)


In [48]:
# Acceleration step for the span = 10 run.
# NOTE(review): presumably operates on the averaged iterates in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [49]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Use the trainer's configured device rather than a hard-coded .cuda(); ';' suppresses the repr.
model.to(trainer.device);

In [50]:
# Post-acceleration scores for the span = 10 run.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9892083333333334, 0.03670764370728284)
Valid: (0.982, 0.06022620059705029)


## Epoch average, with span = 5

In [10]:
# New randomly initialised network for the span = 5 experiment, placed on the trainer's device.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [12]:
# "epoch_avg" mode with avg_alpha = 2 / (span + 1) for span = 5 (about 0.333).
# NOTE(review): avg_alpha is presumably the smoothing factor of an exponential moving average — confirm in nn_extrapolation.
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha = (2 / (5 + 1)))
# Separate logger for this run.
logger = Logger("SGD_momentum-avg_span_5.txt")

In [13]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch (span = 5 averaging), then call the optimizer's end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:14<00:00, 52.12it/s, loss=2.2817]
Epoch 1 | Training loss: 2.2817, validation accuracy: 0.4597, validation loss: 2.2244
100%|██████████| 750/750 [00:14<00:00, 51.81it/s, loss=1.1693]
Epoch 2 | Training loss: 1.1693, validation accuracy: 0.8622, validation loss: 0.4646
100%|██████████| 750/750 [00:13<00:00, 55.07it/s, loss=0.4020]
Epoch 3 | Training loss: 0.4020, validation accuracy: 0.8876, validation loss: 0.3594
100%|██████████| 750/750 [00:13<00:00, 55.52it/s, loss=0.2927]
Epoch 4 | Training loss: 0.2927, validation accuracy: 0.9314, validation loss: 0.2345
100%|██████████| 750/750 [00:13<00:00, 55.42it/s, loss=0.2260]
Epoch 5 | Training loss: 0.2260, validation accuracy: 0.9414, validation loss: 0.1945
100%|██████████| 750/750 [00:13<00:00, 54.91it/s, loss=0.1805]
Epoch 6 | Training loss: 0.1805, validation accuracy: 0.9548, validation loss: 0.1524
100%|██████████| 750/750 [00:13<00:00, 54.71it/s, loss=0.1515]
Epoch 7 | Training loss: 0.1515, validation

In [14]:
# Pre-acceleration scores for the span = 5 run; each result is an (accuracy, loss) pair.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9882708333333333, 0.03823670375036697)
Valid: (0.9831666666666666, 0.06175728642366206)


In [15]:
# Acceleration step for the span = 5 run.
# NOTE(review): presumably operates on the averaged iterates in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [17]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Keep the model on the trainer's device; the trailing ';' suppresses the repr.
model.to(trainer.device);

In [18]:
# Post-acceleration scores for the span = 5 run.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9910208333333334, 0.03151546738048395)
Valid: (0.9838333333333333, 0.056970573332936814)


## Split + epoch

In [38]:
# New randomly initialised network for the per-parameter-group experiment.
# Module.to returns the module itself, so the chained assignment leaves nothing to display.
model = make_model().to(trainer.device)

In [39]:
# One parameter group per tensor, in model.parameters() order.
# NOTE(review): presumably this makes the optimizer accelerate each tensor independently — confirm in nn_extrapolation.
groups = [dict(params=[tensor]) for tensor in model.parameters()]
optimizer = AcceleratedSGD(
    groups, 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch",
)
# Separate logger for this run.
logger = Logger("SGD_momentum-split.txt")

In [40]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch with per-tensor parameter groups, then call the optimizer's end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:14<00:00, 53.44it/s, loss=2.2987]
Epoch 1 | Training loss: 2.2987, validation accuracy: 0.1991, validation loss: 2.2909
100%|██████████| 750/750 [00:14<00:00, 52.09it/s, loss=2.2648]
Epoch 2 | Training loss: 2.2648, validation accuracy: 0.5484, validation loss: 2.1851
100%|██████████| 750/750 [00:14<00:00, 51.88it/s, loss=1.0827]
Epoch 3 | Training loss: 1.0827, validation accuracy: 0.8698, validation loss: 0.4168
100%|██████████| 750/750 [00:13<00:00, 54.02it/s, loss=0.3671]
Epoch 4 | Training loss: 0.3671, validation accuracy: 0.9094, validation loss: 0.2927
100%|██████████| 750/750 [00:14<00:00, 53.24it/s, loss=0.2689]
Epoch 5 | Training loss: 0.2689, validation accuracy: 0.9228, validation loss: 0.2406
100%|██████████| 750/750 [00:13<00:00, 54.68it/s, loss=0.2100]
Epoch 6 | Training loss: 0.2100, validation accuracy: 0.9413, validation loss: 0.1904
100%|██████████| 750/750 [00:14<00:00, 52.38it/s, loss=0.1691]
Epoch 7 | Training loss: 0.1691, validation

In [41]:
# Pre-acceleration scores for the per-parameter-group run; each result is an (accuracy, loss) pair.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9896458333333333, 0.034043262171170985)
Valid: (0.982, 0.059607748738334826)


In [42]:
# Acceleration step for the per-parameter-group run.
# NOTE(review): presumably applied to each parameter group separately — confirm in nn_extrapolation.
optimizer.accelerate()

In [43]:
# NOTE(review): presumably copies the accelerated parameters into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
# Use the trainer's configured device rather than a hard-coded .cuda(); ';' suppresses the repr.
model.to(trainer.device);

In [44]:
# Post-acceleration scores for the per-parameter-group run.
train_score, valid_score = [trainer.validation(model, dl[split]) for split in ("train", "valid")]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.98975, 0.03377577416237909)
Valid: (0.9814166666666667, 0.06653467167946898)


## Linear only

In [17]:
# New randomly initialised network for the "linear only" experiment.
# Module.to returns the module itself, so the chained assignment leaves nothing to display.
model = make_model().to(trainer.device)

In [18]:
# Split the network's children at index 10 (the Flatten layer): children 0-9 are the
# conv / ReLU / pooling stack, children 10+ are Flatten and the fully connected head.
layers = list(model.children())

# NOTE(review): "method": None presumably switches acceleration off for the convolutional
# group (the section is titled "Linear only") — confirm in nn_extrapolation.
conv_group = {
    "params": [weights for layer in layers[:10] for weights in layer.parameters()],
    "method": None,
}
fc_group = {
    "params": [weights for layer in layers[10:] for weights in layer.parameters()],
}
groups = [conv_group, fc_group]

optimizer = AcceleratedSGD(
    groups, 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch",
)
# Separate logger for this run.
logger = Logger("SGD_momentum-linear_only.txt")

In [19]:
epochs = 30

for epoch in range(epochs):
    # Train for one epoch with the conv / fully connected groups, then call the optimizer's end-of-epoch hook.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()

    # Validation metrics after this epoch.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
    )

100%|██████████| 750/750 [00:13<00:00, 56.64it/s, loss=2.2948]
Epoch 1 | Training loss: 2.2948, validation accuracy: 0.1727, validation loss: 2.2792
100%|██████████| 750/750 [00:13<00:00, 53.93it/s, loss=2.0173]
Epoch 2 | Training loss: 2.0173, validation accuracy: 0.7997, validation loss: 0.8372
100%|██████████| 750/750 [00:13<00:00, 54.55it/s, loss=0.5167]
Epoch 3 | Training loss: 0.5167, validation accuracy: 0.8948, validation loss: 0.3565
100%|██████████| 750/750 [00:13<00:00, 53.93it/s, loss=0.3349]
Epoch 4 | Training loss: 0.3349, validation accuracy: 0.9243, validation loss: 0.2652
100%|██████████| 750/750 [00:13<00:00, 53.76it/s, loss=0.2482]
Epoch 5 | Training loss: 0.2482, validation accuracy: 0.9375, validation loss: 0.2130
100%|██████████| 750/750 [00:13<00:00, 55.21it/s, loss=0.1947]
Epoch 6 | Training loss: 0.1947, validation accuracy: 0.9482, validation loss: 0.1759
100%|██████████| 750/750 [00:13<00:00, 55.04it/s, loss=0.1591]
Epoch 7 | Training loss: 0.1591, validation

In [20]:
# Baseline scores before `optimizer.accelerate()` is applied.
# Each score is the (accuracy, loss) pair returned by `trainer.validation`;
# the train split is evaluated first, then the validation split.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9894791666666667, 0.035931128314696255)
Valid: (0.9816666666666667, 0.060276718021680914)


In [21]:
# Trigger the optimizer's acceleration step for the trained model.
# NOTE(review): presumably extrapolates from the parameter history recorded by
# `finish_epoch()` — confirm against nn_extrapolation.AcceleratedSGD.
optimizer.accelerate()

In [22]:
# NOTE(review): `store_parameters()` presumably writes the parameters produced by
# `accelerate()` back into the model — confirm against AcceleratedSGD.
optimizer.store_parameters()
# Move the model to the trainer's configured device instead of hard-coding CUDA,
# consistent with how the model is placed right after `make_model()`.
# The trailing `;` suppresses the module repr that the cell would otherwise print.
model.to(trainer.device);

In [23]:
# Scores after `optimizer.accelerate()` / `optimizer.store_parameters()`.
# Each score is the (accuracy, loss) pair returned by `trainer.validation`;
# the train split is evaluated first, then the validation split.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9907291666666667, 0.03270646293833852)
Valid: (0.9835, 0.05615483507156993)


## Linear + conv separately

In [24]:
# Build a fresh network for this experiment and place it on the trainer's device.
# `nn.Module.to` moves the module in place and returns it, so chaining keeps the
# same object bound to `model`; the assignment leaves no cell output to suppress.
model = make_model().to(trainer.device)

In [25]:
# Split the network at child index 10, which is the Flatten layer in `make_model`:
# children 0-9 form the conv/ReLU/pool stack, children 10+ the flatten + linear head.
# Neither group overrides any option here, so both use the optimizer's defaults.
layers = list(model.children())
conv_group = {
    "params": [weight for layer in layers[:10] for weight in layer.parameters()],
}
fc_group = {
    "params": [weight for layer in layers[10:] for weight in layer.parameters()],
}
groups = [conv_group, fc_group]

# Same optimizer settings as the other experiments; only the grouping differs.
optimizer = AcceleratedSGD(groups, 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch")
logger = Logger("SGD_momentum-linear_conv.txt")

In [26]:
epochs = 30

for epoch in range(epochs):
    # One pass over the training split; the returned value is logged as the training loss.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer (it was created with mode="epoch").
    optimizer.finish_epoch()
    # `trainer.validation` returns an (accuracy, loss) pair for the given split.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}"
    # Epochs are reported 1-based.
    logger.log("Epoch", epoch + 1, "|", summary)

100%|██████████| 750/750 [00:13<00:00, 56.57it/s, loss=2.2902]
Epoch 1 | Training loss: 2.2902, validation accuracy: 0.2859, validation loss: 2.2658
100%|██████████| 750/750 [00:13<00:00, 53.59it/s, loss=1.6940]
Epoch 2 | Training loss: 1.6940, validation accuracy: 0.8273, validation loss: 0.5655
100%|██████████| 750/750 [00:13<00:00, 54.41it/s, loss=0.4346]
Epoch 3 | Training loss: 0.4346, validation accuracy: 0.9075, validation loss: 0.3109
100%|██████████| 750/750 [00:15<00:00, 48.84it/s, loss=0.3043]
Epoch 4 | Training loss: 0.3043, validation accuracy: 0.9292, validation loss: 0.2416
100%|██████████| 750/750 [00:15<00:00, 47.18it/s, loss=0.2335]
Epoch 5 | Training loss: 0.2335, validation accuracy: 0.9460, validation loss: 0.1903
100%|██████████| 750/750 [00:16<00:00, 46.71it/s, loss=0.1865]
Epoch 6 | Training loss: 0.1865, validation accuracy: 0.9505, validation loss: 0.1660
100%|██████████| 750/750 [00:13<00:00, 56.90it/s, loss=0.1543]
Epoch 7 | Training loss: 0.1543, validation

In [27]:
# Baseline scores before `optimizer.accelerate()` is applied.
# Each score is the (accuracy, loss) pair returned by `trainer.validation`;
# the train split is evaluated first, then the validation split.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.99025, 0.033030950621236115)
Valid: (0.9830833333333333, 0.061157136692665516)


In [28]:
# Trigger the optimizer's acceleration step for the trained model.
# NOTE(review): presumably extrapolates from the parameter history recorded by
# `finish_epoch()` — confirm against nn_extrapolation.AcceleratedSGD.
optimizer.accelerate()

In [29]:
# NOTE(review): `store_parameters()` presumably writes the parameters produced by
# `accelerate()` back into the model — confirm against AcceleratedSGD.
optimizer.store_parameters()
# Move the model to the trainer's configured device instead of hard-coding CUDA,
# consistent with how the model is placed right after `make_model()`.
# The trailing `;` suppresses the module repr that the cell would otherwise print.
model.to(trainer.device);

In [30]:
# Scores after `optimizer.accelerate()` / `optimizer.store_parameters()`.
# Each score is the (accuracy, loss) pair returned by `trainer.validation`;
# the train split is evaluated first, then the validation split.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9900208333333333, 0.03382842983398587)
Valid: (0.98275, 0.06026328789287557)
