In [1]:
import sys
sys.path.insert(0, "../..")

In [2]:
import torch
from torch import nn
import numpy as np
from copy import deepcopy

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [3]:
torch.cuda.is_available()

True

In [4]:
trainer = Trainer(
    device="cuda:1",
    loss_fn=nn.NLLLoss(reduction="mean"),
    val_loss_fn=nn.NLLLoss(reduction="sum"),
)

In [5]:
dl = load_dataset(
    dataset="mnist", 
    root="../../../MNIST", 
    download=False, 
    validation_split=0.2,
    batch_size=64, 
    num_workers=2,
)

In [6]:
def make_model():
    return nn.Sequential(
        nn.Conv2d(1, 32, 3),
        nn.ReLU(),
        nn.Conv2d(32, 32, 3),
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Conv2d(32, 64, 3),
        nn.ReLU(),
        nn.Conv2d(64, 64, 3),
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Flatten(),
        nn.Linear(4*4*64, 128),
        nn.ReLU(),
        nn.Linear(128, 10),
        nn.LogSoftmax(-1),
    )

## Epoch

In [7]:
model = make_model()
initial_state = deepcopy(model.state_dict())
model.to(trainer.device)

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

### Lambda = 1e-10

In [8]:
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch")
logger = Logger("SGD_momentum2.txt")

In [9]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:09<00:00, 81.12it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2307, validation loss: 2.2780
100%|██████████| 750/750 [00:09<00:00, 79.99it/s, loss=1.9824]
Epoch 2 | Training loss: 1.9824, validation accuracy: 0.7867, validation loss: 0.7826
100%|██████████| 750/750 [00:09<00:00, 81.48it/s, loss=0.4920]
Epoch 3 | Training loss: 0.4920, validation accuracy: 0.8973, validation loss: 0.3334
100%|██████████| 750/750 [00:08<00:00, 83.61it/s, loss=0.3123]
Epoch 4 | Training loss: 0.3123, validation accuracy: 0.9278, validation loss: 0.2451
100%|██████████| 750/750 [00:09<00:00, 82.10it/s, loss=0.2293]
Epoch 5 | Training loss: 0.2293, validation accuracy: 0.9437, validation loss: 0.1867
100%|██████████| 750/750 [00:09<00:00, 83.28it/s, loss=0.1787]
Epoch 6 | Training loss: 0.1787, validation accuracy: 0.9564, validation loss: 0.1499
100%|██████████| 750/750 [00:09<00:00, 82.31it/s, loss=0.1471]
Epoch 7 | Training loss: 0.1471, validation

In [10]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.98975, 0.03250145790306851)
Valid: (0.98225, 0.05921080670940379)


In [11]:
optimizer.accelerate()

In [12]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [13]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9911666666666666, 0.030389028998169428)
Valid: (0.98325, 0.05647537519518907)


### Lambda = 1e-5

In [14]:
model.load_state_dict(initial_state)
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch", lambda_=1e-5)
logger = Logger("SGD_momentum_lambda=1e-5_2.txt")

In [15]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:08<00:00, 83.75it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2307, validation loss: 2.2780
100%|██████████| 750/750 [00:09<00:00, 81.31it/s, loss=1.9824]
Epoch 2 | Training loss: 1.9824, validation accuracy: 0.7863, validation loss: 0.7826
100%|██████████| 750/750 [00:09<00:00, 81.15it/s, loss=0.4920]
Epoch 3 | Training loss: 0.4920, validation accuracy: 0.8969, validation loss: 0.3335
100%|██████████| 750/750 [00:08<00:00, 84.16it/s, loss=0.3125]
Epoch 4 | Training loss: 0.3125, validation accuracy: 0.9273, validation loss: 0.2453
100%|██████████| 750/750 [00:09<00:00, 83.26it/s, loss=0.2295]
Epoch 5 | Training loss: 0.2295, validation accuracy: 0.9436, validation loss: 0.1873
100%|██████████| 750/750 [00:09<00:00, 82.30it/s, loss=0.1789]
Epoch 6 | Training loss: 0.1789, validation accuracy: 0.9563, validation loss: 0.1499
100%|██████████| 750/750 [00:08<00:00, 84.55it/s, loss=0.1472]
Epoch 7 | Training loss: 0.1472, validation

In [16]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9898125, 0.0323003632936161)
Valid: (0.98175, 0.059265850248746574)


In [17]:
optimizer.accelerate()

In [18]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [19]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9911458333333333, 0.030479643390513955)
Valid: (0.9830833333333333, 0.05683630394128462)


### Lambda = 1e-2

In [20]:
model.load_state_dict(initial_state)
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch", lambda_=1e-2)
logger = Logger("SGD_momentum_lambda=1e-2.txt")

In [21]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:08<00:00, 83.35it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2307, validation loss: 2.2780
100%|██████████| 750/750 [00:09<00:00, 82.13it/s, loss=1.9824]
Epoch 2 | Training loss: 1.9824, validation accuracy: 0.7863, validation loss: 0.7827
100%|██████████| 750/750 [00:09<00:00, 80.62it/s, loss=0.4920]
Epoch 3 | Training loss: 0.4920, validation accuracy: 0.8968, validation loss: 0.3334
100%|██████████| 750/750 [00:08<00:00, 83.57it/s, loss=0.3125]
Epoch 4 | Training loss: 0.3125, validation accuracy: 0.9275, validation loss: 0.2456
100%|██████████| 750/750 [00:09<00:00, 82.05it/s, loss=0.2294]
Epoch 5 | Training loss: 0.2294, validation accuracy: 0.9439, validation loss: 0.1867
100%|██████████| 750/750 [00:09<00:00, 81.96it/s, loss=0.1787]
Epoch 6 | Training loss: 0.1787, validation accuracy: 0.9557, validation loss: 0.1500
100%|██████████| 750/750 [00:08<00:00, 84.78it/s, loss=0.1471]
Epoch 7 | Training loss: 0.1471, validation

In [22]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9899166666666667, 0.032230870001173265)
Valid: (0.9820833333333333, 0.05960622924364482)


In [23]:
optimizer.accelerate()

In [24]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [25]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9909166666666667, 0.030724221766150247)
Valid: (0.983, 0.057033593920214724)


## Epoch average

In [26]:
model.load_state_dict(initial_state)
model.to(trainer.device)

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [27]:
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg")
logger = Logger("SGD_momentum-avg.txt")

In [28]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:08<00:00, 87.66it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2307, validation loss: 2.2780
100%|██████████| 750/750 [00:09<00:00, 82.39it/s, loss=1.9823]
Epoch 2 | Training loss: 1.9823, validation accuracy: 0.7866, validation loss: 0.7825
100%|██████████| 750/750 [00:08<00:00, 86.31it/s, loss=0.4920]
Epoch 3 | Training loss: 0.4920, validation accuracy: 0.8964, validation loss: 0.3332
100%|██████████| 750/750 [00:08<00:00, 86.00it/s, loss=0.3126]
Epoch 4 | Training loss: 0.3126, validation accuracy: 0.9277, validation loss: 0.2452
100%|██████████| 750/750 [00:08<00:00, 87.01it/s, loss=0.2295]
Epoch 5 | Training loss: 0.2295, validation accuracy: 0.9436, validation loss: 0.1868
100%|██████████| 750/750 [00:08<00:00, 87.53it/s, loss=0.1787]
Epoch 6 | Training loss: 0.1787, validation accuracy: 0.9563, validation loss: 0.1501
100%|██████████| 750/750 [00:08<00:00, 86.62it/s, loss=0.1470]
Epoch 7 | Training loss: 0.1470, validation

In [29]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9899791666666666, 0.03208676589505437)
Valid: (0.9821666666666666, 0.05884942181020354)


In [30]:
optimizer.accelerate()

In [31]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [32]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9862916666666667, 0.045984817957816025)
Valid: (0.98, 0.06431335149084529)


## Epoch average, with span = 100

In [33]:
model.load_state_dict(initial_state)
model.to(trainer.device)

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [34]:
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha=(2 / (100 + 1)))
logger = Logger("SGD_momentum-avg_span_100.txt")

In [35]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:09<00:00, 81.32it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2306, validation loss: 2.2780
100%|██████████| 750/750 [00:09<00:00, 81.92it/s, loss=1.9824]
Epoch 2 | Training loss: 1.9824, validation accuracy: 0.7867, validation loss: 0.7825
100%|██████████| 750/750 [00:09<00:00, 82.88it/s, loss=0.4920]
Epoch 3 | Training loss: 0.4920, validation accuracy: 0.8968, validation loss: 0.3335
100%|██████████| 750/750 [00:09<00:00, 80.12it/s, loss=0.3123]
Epoch 4 | Training loss: 0.3123, validation accuracy: 0.9277, validation loss: 0.2455
100%|██████████| 750/750 [00:09<00:00, 81.24it/s, loss=0.2292]
Epoch 5 | Training loss: 0.2292, validation accuracy: 0.9433, validation loss: 0.1870
100%|██████████| 750/750 [00:09<00:00, 78.31it/s, loss=0.1785]
Epoch 6 | Training loss: 0.1785, validation accuracy: 0.9560, validation loss: 0.1500
100%|██████████| 750/750 [00:08<00:00, 83.89it/s, loss=0.1468]
Epoch 7 | Training loss: 0.1468, validation

In [36]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9899166666666667, 0.032112725746352225)
Valid: (0.982, 0.05929478842578828)


In [37]:
optimizer.accelerate()

In [38]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [39]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9913958333333334, 0.029459121460095047)
Valid: (0.9835, 0.0570049082779636)


## Epoch average, with span = 50

In [40]:
model.load_state_dict(initial_state)
model.to(trainer.device)

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [41]:
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha=(2 / (50 + 1)))
logger = Logger("SGD_momentum-avg_span_50.txt")

In [42]:
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:08<00:00, 87.47it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2288, validation loss: 2.2781
100%|██████████| 750/750 [00:08<00:00, 85.25it/s, loss=1.9861]
Epoch 2 | Training loss: 1.9861, validation accuracy: 0.8002, validation loss: 0.7786
100%|██████████| 750/750 [00:08<00:00, 88.09it/s, loss=0.4941]
Epoch 3 | Training loss: 0.4941, validation accuracy: 0.8790, validation loss: 0.3798
100%|██████████| 750/750 [00:08<00:00, 85.25it/s, loss=0.3130]
Epoch 4 | Training loss: 0.3130, validation accuracy: 0.9291, validation loss: 0.2431
100%|██████████| 750/750 [00:08<00:00, 85.55it/s, loss=0.2293]
Epoch 5 | Training loss: 0.2293, validation accuracy: 0.9424, validation loss: 0.1954
100%|██████████| 750/750 [00:08<00:00, 87.05it/s, loss=0.1794]
Epoch 6 | Training loss: 0.1794, validation accuracy: 0.9561, validation loss: 0.1502
100%|██████████| 750/750 [00:08<00:00, 86.15it/s, loss=0.1484]
Epoch 7 | Training loss: 0.1484, validation

In [43]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.99025, 0.03290733187684479)
Valid: (0.98175, 0.06210598905943334)


In [44]:
optimizer.accelerate()

In [45]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [46]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9915, 0.02952898053638637)
Valid: (0.9834166666666667, 0.057382562783857186)


## Epoch average, with span = 20

In [47]:
model.load_state_dict(initial_state)
model.to(trainer.device)

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [48]:
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha=(2 / (20 + 1)))
logger = Logger("SGD_momentum-avg_span_20.txt")

In [49]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:08<00:00, 87.05it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2306, validation loss: 2.2780
100%|██████████| 750/750 [00:08<00:00, 87.24it/s, loss=1.9824]
Epoch 2 | Training loss: 1.9824, validation accuracy: 0.7863, validation loss: 0.7827
100%|██████████| 750/750 [00:08<00:00, 87.46it/s, loss=0.4921]
Epoch 3 | Training loss: 0.4921, validation accuracy: 0.8969, validation loss: 0.3337
100%|██████████| 750/750 [00:08<00:00, 85.77it/s, loss=0.3124]
Epoch 4 | Training loss: 0.3124, validation accuracy: 0.9283, validation loss: 0.2452
100%|██████████| 750/750 [00:08<00:00, 86.93it/s, loss=0.2294]
Epoch 5 | Training loss: 0.2294, validation accuracy: 0.9432, validation loss: 0.1872
100%|██████████| 750/750 [00:08<00:00, 85.93it/s, loss=0.1788]
Epoch 6 | Training loss: 0.1788, validation accuracy: 0.9561, validation loss: 0.1502
100%|██████████| 750/750 [00:08<00:00, 86.62it/s, loss=0.1471]
Epoch 7 | Training loss: 0.1471, validation

In [50]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9897916666666666, 0.03247240138938651)
Valid: (0.9820833333333333, 0.059331939990166574)


In [51]:
optimizer.accelerate()

In [52]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [53]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9913333333333333, 0.02995359788167601)
Valid: (0.9833333333333333, 0.05707832488107185)


## Epoch average, with span = 15

In [54]:
model.load_state_dict(initial_state)
model.to(trainer.device)

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [55]:
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha=(2 / (15 + 1)))
logger = Logger("SGD_momentum-avg_span_15.txt")

In [56]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:08<00:00, 87.80it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2305, validation loss: 2.2780
100%|██████████| 750/750 [00:08<00:00, 86.13it/s, loss=1.9824]
Epoch 2 | Training loss: 1.9824, validation accuracy: 0.7864, validation loss: 0.7826
100%|██████████| 750/750 [00:08<00:00, 87.12it/s, loss=0.4921]
Epoch 3 | Training loss: 0.4921, validation accuracy: 0.8968, validation loss: 0.3334
100%|██████████| 750/750 [00:08<00:00, 87.77it/s, loss=0.3125]
Epoch 4 | Training loss: 0.3125, validation accuracy: 0.9280, validation loss: 0.2456
100%|██████████| 750/750 [00:09<00:00, 81.83it/s, loss=0.2295]
Epoch 5 | Training loss: 0.2295, validation accuracy: 0.9433, validation loss: 0.1871
100%|██████████| 750/750 [00:08<00:00, 84.72it/s, loss=0.1788]
Epoch 6 | Training loss: 0.1788, validation accuracy: 0.9563, validation loss: 0.1501
100%|██████████| 750/750 [00:08<00:00, 85.91it/s, loss=0.1471]
Epoch 7 | Training loss: 0.1471, validation

In [57]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.989875, 0.0321807465457047)
Valid: (0.9820833333333333, 0.05877002091783409)


In [58]:
optimizer.accelerate()

In [59]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [60]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9912708333333333, 0.029978158516266073)
Valid: (0.9829166666666667, 0.05700672493486976)


## Epoch average, with span = 10

In [61]:
model.load_state_dict(initial_state)
model.to(trainer.device)

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [62]:
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha=(2 / (10 + 1)))
logger = Logger("SGD_momentum-avg_span_10.txt")

In [63]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:08<00:00, 85.54it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2307, validation loss: 2.2780
100%|██████████| 750/750 [00:08<00:00, 85.12it/s, loss=1.9824]
Epoch 2 | Training loss: 1.9824, validation accuracy: 0.7866, validation loss: 0.7827
100%|██████████| 750/750 [00:08<00:00, 84.26it/s, loss=0.4920]
Epoch 3 | Training loss: 0.4920, validation accuracy: 0.8965, validation loss: 0.3331
100%|██████████| 750/750 [00:08<00:00, 86.05it/s, loss=0.3124]
Epoch 4 | Training loss: 0.3124, validation accuracy: 0.9277, validation loss: 0.2453
100%|██████████| 750/750 [00:08<00:00, 84.08it/s, loss=0.2295]
Epoch 5 | Training loss: 0.2295, validation accuracy: 0.9432, validation loss: 0.1869
100%|██████████| 750/750 [00:08<00:00, 86.83it/s, loss=0.1788]
Epoch 6 | Training loss: 0.1788, validation accuracy: 0.9561, validation loss: 0.1500
100%|██████████| 750/750 [00:09<00:00, 82.36it/s, loss=0.1471]
Epoch 7 | Training loss: 0.1471, validation

In [64]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9901041666666667, 0.032095543530924864)
Valid: (0.9820833333333333, 0.05878527865000069)


In [65]:
optimizer.accelerate()

In [66]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [67]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9912083333333334, 0.03028397676581517)
Valid: (0.9831666666666666, 0.05725179621841137)


## Epoch average, with span = 5

In [68]:
model.load_state_dict(initial_state)
model.to(trainer.device)

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [69]:
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch_avg", avg_alpha = (2 / (5 + 1)))
logger = Logger("SGD_momentum-avg_span_5.txt")

In [70]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:08<00:00, 84.64it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2306, validation loss: 2.2780
100%|██████████| 750/750 [00:08<00:00, 86.58it/s, loss=1.9825]
Epoch 2 | Training loss: 1.9825, validation accuracy: 0.7864, validation loss: 0.7827
100%|██████████| 750/750 [00:08<00:00, 85.10it/s, loss=0.4920]
Epoch 3 | Training loss: 0.4920, validation accuracy: 0.8968, validation loss: 0.3333
100%|██████████| 750/750 [00:08<00:00, 85.67it/s, loss=0.3125]
Epoch 4 | Training loss: 0.3125, validation accuracy: 0.9278, validation loss: 0.2455
100%|██████████| 750/750 [00:08<00:00, 84.59it/s, loss=0.2295]
Epoch 5 | Training loss: 0.2295, validation accuracy: 0.9434, validation loss: 0.1871
100%|██████████| 750/750 [00:08<00:00, 85.70it/s, loss=0.1790]
Epoch 6 | Training loss: 0.1790, validation accuracy: 0.9562, validation loss: 0.1503
100%|██████████| 750/750 [00:08<00:00, 86.71it/s, loss=0.1473]
Epoch 7 | Training loss: 0.1473, validation

In [71]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9900208333333333, 0.032129305653972554)
Valid: (0.9823333333333333, 0.05854459940797339)


In [72]:
optimizer.accelerate()

In [73]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [74]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9908541666666667, 0.030489669628441334)
Valid: (0.9829166666666667, 0.05682149266917259)


## Split + epoch

In [75]:
model.load_state_dict(initial_state)
model.to(trainer.device)
None

In [76]:
groups = [{"params": [param]} for param in model.parameters()]
optimizer = AcceleratedSGD(groups, 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch")
logger = Logger("SGD_momentum-split.txt")

In [77]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:08<00:00, 84.42it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2307, validation loss: 2.2780
100%|██████████| 750/750 [00:08<00:00, 86.01it/s, loss=1.9824]
Epoch 2 | Training loss: 1.9824, validation accuracy: 0.7867, validation loss: 0.7826
100%|██████████| 750/750 [00:09<00:00, 81.15it/s, loss=0.4920]
Epoch 3 | Training loss: 0.4920, validation accuracy: 0.8969, validation loss: 0.3331
100%|██████████| 750/750 [00:09<00:00, 82.42it/s, loss=0.3123]
Epoch 4 | Training loss: 0.3123, validation accuracy: 0.9281, validation loss: 0.2453
100%|██████████| 750/750 [00:08<00:00, 85.00it/s, loss=0.2293]
Epoch 5 | Training loss: 0.2293, validation accuracy: 0.9433, validation loss: 0.1869
100%|██████████| 750/750 [00:09<00:00, 80.96it/s, loss=0.1786]
Epoch 6 | Training loss: 0.1786, validation accuracy: 0.9562, validation loss: 0.1498
100%|██████████| 750/750 [00:08<00:00, 83.36it/s, loss=0.1471]
Epoch 7 | Training loss: 0.1471, validation

In [78]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9898333333333333, 0.032251678868662564)
Valid: (0.9819166666666667, 0.05914909615786746)


In [79]:
optimizer.accelerate()

In [80]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [81]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9910625, 0.029343846139633874)
Valid: (0.9828333333333333, 0.06293854748888407)


## Linear only

In [82]:
model.load_state_dict(initial_state)
model.to(trainer.device)
None

In [83]:
conv_group = {
    "params": [param for child in list(model.children())[:10] for param in child.parameters()],
    "method": None
}
fc_group = {
    "params": [param for child in list(model.children())[10:] for param in child.parameters()]
}
groups = [conv_group, fc_group]
optimizer = AcceleratedSGD(groups, 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch")
logger = Logger("SGD_momentum-linear_only.txt")

In [84]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:09<00:00, 83.16it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2307, validation loss: 2.2780
100%|██████████| 750/750 [00:09<00:00, 80.68it/s, loss=1.9824]
Epoch 2 | Training loss: 1.9824, validation accuracy: 0.7866, validation loss: 0.7826
100%|██████████| 750/750 [00:09<00:00, 82.62it/s, loss=0.4921]
Epoch 3 | Training loss: 0.4921, validation accuracy: 0.8968, validation loss: 0.3333
100%|██████████| 750/750 [00:09<00:00, 79.93it/s, loss=0.3124]
Epoch 4 | Training loss: 0.3124, validation accuracy: 0.9277, validation loss: 0.2451
100%|██████████| 750/750 [00:09<00:00, 82.36it/s, loss=0.2293]
Epoch 5 | Training loss: 0.2293, validation accuracy: 0.9433, validation loss: 0.1869
100%|██████████| 750/750 [00:09<00:00, 76.19it/s, loss=0.1788]
Epoch 6 | Training loss: 0.1788, validation accuracy: 0.9562, validation loss: 0.1500
100%|██████████| 750/750 [00:09<00:00, 82.14it/s, loss=0.1471]
Epoch 7 | Training loss: 0.1471, validation

In [85]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.989625, 0.032427263285421455)
Valid: (0.982, 0.05911266437987797)


In [86]:
optimizer.accelerate()

In [87]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [88]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9907916666666666, 0.030542890603809308)
Valid: (0.98225, 0.05680294658135002)


## Linear + conv separately

In [89]:
model.load_state_dict(initial_state)
model.to(trainer.device)
None

In [90]:
conv_group = {
    "params": [param for child in list(model.children())[:10] for param in child.parameters()],
}
fc_group = {
    "params": [param for child in list(model.children())[10:] for param in child.parameters()]
}
groups = [conv_group, fc_group]
optimizer = AcceleratedSGD(groups, 1e-3, k=10, momentum=0.5, weight_decay=1e-5, mode="epoch")
logger = Logger("SGD_momentum-linear_conv.txt")

In [91]:
torch.manual_seed(2020)
epochs = 30

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████| 750/750 [00:08<00:00, 86.02it/s, loss=2.2952]
Epoch 1 | Training loss: 2.2952, validation accuracy: 0.2306, validation loss: 2.2780
100%|██████████| 750/750 [00:08<00:00, 84.36it/s, loss=1.9824]
Epoch 2 | Training loss: 1.9824, validation accuracy: 0.7867, validation loss: 0.7826
100%|██████████| 750/750 [00:08<00:00, 85.13it/s, loss=0.4921]
Epoch 3 | Training loss: 0.4921, validation accuracy: 0.8970, validation loss: 0.3332
100%|██████████| 750/750 [00:08<00:00, 85.11it/s, loss=0.3124]
Epoch 4 | Training loss: 0.3124, validation accuracy: 0.9277, validation loss: 0.2454
100%|██████████| 750/750 [00:08<00:00, 84.30it/s, loss=0.2295]
Epoch 5 | Training loss: 0.2295, validation accuracy: 0.9433, validation loss: 0.1872
100%|██████████| 750/750 [00:08<00:00, 86.59it/s, loss=0.1789]
Epoch 6 | Training loss: 0.1789, validation accuracy: 0.9563, validation loss: 0.1505
100%|██████████| 750/750 [00:08<00:00, 84.76it/s, loss=0.1472]
Epoch 7 | Training loss: 0.1472, validation

In [92]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9900208333333333, 0.03202596840658225)
Valid: (0.9823333333333333, 0.058900529234359664)


In [93]:
optimizer.accelerate()

In [94]:
optimizer.store_parameters()
model.to(trainer.device)
None

In [95]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9909166666666667, 0.03049213180830702)
Valid: (0.9826666666666667, 0.05718373842506359)
