In [1]:
import sys
# Put the directory two levels up at the front of the import path.
# NOTE(review): presumably that is where nn_extrapolation / nn_utils live — confirm.
sys.path.insert(0, "../..")

In [2]:
import torch
from torch import nn
import numpy as np
from copy import deepcopy

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [3]:
# Sanity check before training: the Trainer in this notebook is created with device="cuda".
torch.cuda.is_available()

True

In [4]:
# Training uses the batch-mean NLL; validation uses the summed NLL.
# NLLLoss matches the LogSoftmax output produced by make_model().
# NOTE(review): presumably Trainer.validation normalises the summed loss by
# the number of samples — confirm in nn_utils.
train_criterion = nn.NLLLoss(reduction="mean")
valid_criterion = nn.NLLLoss(reduction="sum")
trainer = Trainer(device="cuda", loss_fn=train_criterion, val_loss_fn=valid_criterion)

In [5]:
# Data loaders for MNIST; `dl` is indexed by split name ("train" / "valid")
# in the cells that follow.
dataset_options = dict(
    dataset="mnist",
    root="../../../MNIST",
    download=False,
    validation_split=0.2,
    batch_size=64,
    num_workers=2,
)
dl = load_dataset(**dataset_options)

In [6]:
def make_model():
    """Build the CNN classifier used by every run in this notebook.

    The network is two stages of (3x3 conv + ReLU) x 2 followed by 2x2
    max-pooling, then a flatten and a two-layer fully connected head. The
    head's input size of 4*4*64 corresponds to 1x28x28 inputs. The final
    LogSoftmax makes the output log-probabilities over 10 classes, which is
    what nn.NLLLoss expects.

    Returns:
        nn.Sequential: a freshly initialised model; it is not moved to any
        particular device here.
    """
    def conv_stage(in_channels, out_channels):
        # conv -> ReLU -> conv -> ReLU -> 2x2 max-pool, unpadded 3x3 kernels.
        return [
            nn.Conv2d(in_channels, out_channels, 3),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, 3),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]

    # Layers are created in the same order they appear in the network.
    layers = conv_stage(1, 32) + conv_stage(32, 64)
    layers += [
        nn.Flatten(),
        nn.Linear(4 * 4 * 64, 128),
        nn.ReLU(),
        nn.Linear(128, 10),
        nn.LogSoftmax(-1),
    ]
    return nn.Sequential(*layers)

## Epoch

In [16]:
# Fresh network for the "epoch" run, moved to the trainer's device.
# Module.to() returns the module itself, so the chained call binds the same object.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [17]:
# AcceleratedSGD in "epoch" mode; this run's Logger is given SGD_momentum.txt.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch",
)
logger = Logger("SGD_momentum.txt")

In [18]:
epochs = 30

# Per epoch: train, tell the optimizer the epoch is over, validate, log one line.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}"
    logger.log("Epoch", epoch + 1, "|", summary)

100%|██████████| 750/750 [00:12<00:00, 58.79it/s, loss=2.2955]
Epoch 1 | Training loss: 2.2955, validation accuracy: 0.0997, validation loss: 2.2847
100%|██████████| 750/750 [00:12<00:00, 59.07it/s, loss=2.2005]
Epoch 2 | Training loss: 2.2005, validation accuracy: 0.6655, validation loss: 1.7379
100%|██████████| 750/750 [00:13<00:00, 57.39it/s, loss=0.6490]
Epoch 3 | Training loss: 0.6490, validation accuracy: 0.8916, validation loss: 0.3614
100%|██████████| 750/750 [00:12<00:00, 58.47it/s, loss=0.3329]
Epoch 4 | Training loss: 0.3329, validation accuracy: 0.9234, validation loss: 0.2637
100%|██████████| 750/750 [00:12<00:00, 58.58it/s, loss=0.2432]
Epoch 5 | Training loss: 0.2432, validation accuracy: 0.9413, validation loss: 0.1995
100%|██████████| 750/750 [00:12<00:00, 58.83it/s, loss=0.1885]
Epoch 6 | Training loss: 0.1885, validation accuracy: 0.9540, validation loss: 0.1644
100%|██████████| 750/750 [00:13<00:00, 56.19it/s, loss=0.1548]
Epoch 7 | Training loss: 0.1548, validation

In [19]:
# Baseline before acceleration: (accuracy, loss) on both splits for the
# weights reached by plain training.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9866875, 0.041570605262182654)
Valid: (0.9805833333333334, 0.0671343806393755)


In [20]:
# Trigger the optimizer's acceleration step now that training is done.
# NOTE(review): presumably extrapolates from the iterates recorded at each
# finish_epoch() call (k=10) — confirm in nn_extrapolation.
optimizer.accelerate()

In [21]:
# Have the optimizer store its parameters, then make sure the model sits on
# the trainer's device (the same device it was created on) rather than on a
# hard-coded CUDA device.
# NOTE(review): presumably store_parameters() writes the accelerated weights
# into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
model.to(trainer.device)
None  # last expression is None so the cell displays nothing

In [22]:
# Same (accuracy, loss) evaluation as before, now after accelerate() and
# store_parameters(); compare with the baseline logged above.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9905625, 0.03230505989926557)
Valid: (0.9838333333333333, 0.05686103586635242)


## Epoch average

In [23]:
# Fresh network for the "epoch_avg" run, moved to the trainer's device.
# Module.to() returns the module itself, so the chained call binds the same object.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [24]:
# AcceleratedSGD in "epoch_avg" mode with its default avg_alpha; this run's
# Logger is given SGD_momentum-avg.txt.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch_avg",
)
logger = Logger("SGD_momentum-avg.txt")

In [25]:
epochs = 30

# Per epoch: train, tell the optimizer the epoch is over, validate, log one line.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}"
    logger.log("Epoch", epoch + 1, "|", summary)

100%|██████████| 750/750 [00:12<00:00, 58.54it/s, loss=2.2948]
Epoch 1 | Training loss: 2.2948, validation accuracy: 0.3392, validation loss: 2.2834
100%|██████████| 750/750 [00:12<00:00, 58.87it/s, loss=2.1638]
Epoch 2 | Training loss: 2.1638, validation accuracy: 0.7480, validation loss: 1.4245
100%|██████████| 750/750 [00:12<00:00, 58.95it/s, loss=0.5741]
Epoch 3 | Training loss: 0.5741, validation accuracy: 0.8956, validation loss: 0.3469
100%|██████████| 750/750 [00:12<00:00, 58.82it/s, loss=0.3348]
Epoch 4 | Training loss: 0.3348, validation accuracy: 0.9178, validation loss: 0.2648
100%|██████████| 750/750 [00:12<00:00, 58.42it/s, loss=0.2508]
Epoch 5 | Training loss: 0.2508, validation accuracy: 0.9355, validation loss: 0.2086
100%|██████████| 750/750 [00:12<00:00, 58.91it/s, loss=0.1987]
Epoch 6 | Training loss: 0.1987, validation accuracy: 0.9499, validation loss: 0.1704
100%|██████████| 750/750 [00:12<00:00, 58.85it/s, loss=0.1641]
Epoch 7 | Training loss: 0.1641, validation

In [26]:
# Baseline before acceleration: (accuracy, loss) on both splits for the
# weights reached by plain training.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9895625, 0.03502268844128897)
Valid: (0.9824166666666667, 0.0605216880949835)


In [27]:
# Trigger the optimizer's acceleration step now that training is done.
# NOTE(review): presumably extrapolates from the per-epoch averages recorded
# in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [28]:
# Have the optimizer store its parameters, then make sure the model sits on
# the trainer's device (the same device it was created on) rather than on a
# hard-coded CUDA device.
# NOTE(review): presumably store_parameters() writes the accelerated weights
# into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
model.to(trainer.device)
None  # last expression is None so the cell displays nothing

In [29]:
# Same (accuracy, loss) evaluation as before, now after accelerate() and
# store_parameters(); compare with the baseline logged above.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9853333333333333, 0.050679075373957556)
Valid: (0.98, 0.06859579290946324)


## Epoch average, with span = 20

In [30]:
# Fresh network for the span-20 "epoch_avg" run, moved to the trainer's device.
# Module.to() returns the module itself, so the chained call binds the same object.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [31]:
# avg_alpha is derived from the span as 2 / (span + 1).
# NOTE(review): this matches the span -> alpha convention of exponential
# moving averages (e.g. pandas ewm); confirm that is how AcceleratedSGD uses it.
avg_span = 20
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch_avg",
    avg_alpha=2 / (avg_span + 1),
)
logger = Logger("SGD_momentum-avg_span_20.txt")

In [32]:
epochs = 30

# Per epoch: train, tell the optimizer the epoch is over, validate, log one line.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}"
    logger.log("Epoch", epoch + 1, "|", summary)

100%|██████████| 750/750 [00:12<00:00, 58.00it/s, loss=2.2948]
Epoch 1 | Training loss: 2.2948, validation accuracy: 0.2427, validation loss: 2.2836
100%|██████████| 750/750 [00:12<00:00, 58.30it/s, loss=2.1289]
Epoch 2 | Training loss: 2.1289, validation accuracy: 0.7519, validation loss: 1.1900
100%|██████████| 750/750 [00:12<00:00, 58.63it/s, loss=0.5330]
Epoch 3 | Training loss: 0.5330, validation accuracy: 0.8986, validation loss: 0.3425
100%|██████████| 750/750 [00:12<00:00, 58.15it/s, loss=0.3211]
Epoch 4 | Training loss: 0.3211, validation accuracy: 0.9276, validation loss: 0.2503
100%|██████████| 750/750 [00:12<00:00, 58.18it/s, loss=0.2434]
Epoch 5 | Training loss: 0.2434, validation accuracy: 0.9418, validation loss: 0.2012
100%|██████████| 750/750 [00:12<00:00, 58.39it/s, loss=0.1928]
Epoch 6 | Training loss: 0.1928, validation accuracy: 0.9499, validation loss: 0.1692
100%|██████████| 750/750 [00:12<00:00, 58.39it/s, loss=0.1602]
Epoch 7 | Training loss: 0.1602, validation

In [33]:
# Baseline before acceleration: (accuracy, loss) on both splits for the
# weights reached by plain training.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9902916666666667, 0.03297150648067084)
Valid: (0.9829166666666667, 0.060114016773644835)


In [34]:
# Trigger the optimizer's acceleration step now that training is done.
# NOTE(review): presumably extrapolates from the per-epoch averages recorded
# in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [35]:
# Have the optimizer store its parameters, then make sure the model sits on
# the trainer's device (the same device it was created on) rather than on a
# hard-coded CUDA device.
# NOTE(review): presumably store_parameters() writes the accelerated weights
# into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
model.to(trainer.device)
None  # last expression is None so the cell displays nothing

In [36]:
# Same (accuracy, loss) evaluation as before, now after accelerate() and
# store_parameters(); compare with the baseline logged above.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9907083333333333, 0.03163718872750178)
Valid: (0.9828333333333333, 0.058318879128588984)


## Epoch average, with span = 15

In [37]:
# Fresh network for the span-15 "epoch_avg" run, moved to the trainer's device.
# Module.to() returns the module itself, so the chained call binds the same object.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [38]:
# avg_alpha is derived from the span as 2 / (span + 1).
# NOTE(review): this matches the span -> alpha convention of exponential
# moving averages (e.g. pandas ewm); confirm that is how AcceleratedSGD uses it.
avg_span = 15
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch_avg",
    avg_alpha=2 / (avg_span + 1),
)
logger = Logger("SGD_momentum-avg_span_15.txt")

In [39]:
epochs = 30

# Per epoch: train, tell the optimizer the epoch is over, validate, log one line.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}"
    logger.log("Epoch", epoch + 1, "|", summary)

100%|██████████| 750/750 [00:12<00:00, 57.82it/s, loss=2.2893]
Epoch 1 | Training loss: 2.2893, validation accuracy: 0.2376, validation loss: 2.2610
100%|██████████| 750/750 [00:12<00:00, 57.93it/s, loss=1.5748]
Epoch 2 | Training loss: 1.5748, validation accuracy: 0.8377, validation loss: 0.5393
100%|██████████| 750/750 [00:12<00:00, 58.60it/s, loss=0.4447]
Epoch 3 | Training loss: 0.4447, validation accuracy: 0.9035, validation loss: 0.3362
100%|██████████| 750/750 [00:13<00:00, 57.58it/s, loss=0.3109]
Epoch 4 | Training loss: 0.3109, validation accuracy: 0.9275, validation loss: 0.2392
100%|██████████| 750/750 [00:12<00:00, 57.79it/s, loss=0.2386]
Epoch 5 | Training loss: 0.2386, validation accuracy: 0.9422, validation loss: 0.1944
100%|██████████| 750/750 [00:12<00:00, 57.71it/s, loss=0.1906]
Epoch 6 | Training loss: 0.1906, validation accuracy: 0.9534, validation loss: 0.1629
100%|██████████| 750/750 [00:12<00:00, 57.71it/s, loss=0.1597]
Epoch 7 | Training loss: 0.1597, validation

In [40]:
# Baseline before acceleration: (accuracy, loss) on both splits for the
# weights reached by plain training.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9896875, 0.0338580307077306)
Valid: (0.9830833333333333, 0.05742358929710463)


In [41]:
# Trigger the optimizer's acceleration step now that training is done.
# NOTE(review): presumably extrapolates from the per-epoch averages recorded
# in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [42]:
# Have the optimizer store its parameters, then make sure the model sits on
# the trainer's device (the same device it was created on) rather than on a
# hard-coded CUDA device.
# NOTE(review): presumably store_parameters() writes the accelerated weights
# into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
model.to(trainer.device)
None  # last expression is None so the cell displays nothing

In [43]:
# Same (accuracy, loss) evaluation as before, now after accelerate() and
# store_parameters(); compare with the baseline logged above.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.990375, 0.03216092803166248)
Valid: (0.98375, 0.056568290886934845)


## Epoch average, with span = 10

In [44]:
# Fresh network for the span-10 "epoch_avg" run, moved to the trainer's device.
# Module.to() returns the module itself, so the chained call binds the same object.
model = make_model().to(trainer.device)
model

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [45]:
# avg_alpha is derived from the span as 2 / (span + 1).
# NOTE(review): this matches the span -> alpha convention of exponential
# moving averages (e.g. pandas ewm); confirm that is how AcceleratedSGD uses it.
avg_span = 10
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch_avg",
    avg_alpha=2 / (avg_span + 1),
)
logger = Logger("SGD_momentum-avg_span_10.txt")

In [46]:
epochs = 30

# Per epoch: train, tell the optimizer the epoch is over, validate, log one line.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}"
    logger.log("Epoch", epoch + 1, "|", summary)

100%|██████████| 750/750 [00:12<00:00, 58.53it/s, loss=2.3012]
Epoch 1 | Training loss: 2.3012, validation accuracy: 0.2513, validation loss: 2.2979
100%|██████████| 750/750 [00:12<00:00, 58.65it/s, loss=2.2930]
Epoch 2 | Training loss: 2.2930, validation accuracy: 0.2069, validation loss: 2.2850
100%|██████████| 750/750 [00:13<00:00, 56.10it/s, loss=2.2420]
Epoch 3 | Training loss: 2.2420, validation accuracy: 0.5530, validation loss: 2.0761
100%|██████████| 750/750 [00:14<00:00, 52.61it/s, loss=0.8874]
Epoch 4 | Training loss: 0.8874, validation accuracy: 0.8747, validation loss: 0.4132
100%|██████████| 750/750 [00:13<00:00, 55.95it/s, loss=0.3751]
Epoch 5 | Training loss: 0.3751, validation accuracy: 0.9149, validation loss: 0.2929
100%|██████████| 750/750 [00:12<00:00, 58.64it/s, loss=0.2752]
Epoch 6 | Training loss: 0.2752, validation accuracy: 0.9349, validation loss: 0.2226
100%|██████████| 750/750 [00:12<00:00, 58.65it/s, loss=0.2136]
Epoch 7 | Training loss: 0.2136, validation

In [47]:
# Baseline before acceleration: (accuracy, loss) on both splits for the
# weights reached by plain training.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9874791666666667, 0.040641075562064845)
Valid: (0.9809166666666667, 0.06505878606423114)


In [48]:
# Trigger the optimizer's acceleration step now that training is done.
# NOTE(review): presumably extrapolates from the per-epoch averages recorded
# in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [49]:
# Have the optimizer store its parameters, then make sure the model sits on
# the trainer's device (the same device it was created on) rather than on a
# hard-coded CUDA device.
# NOTE(review): presumably store_parameters() writes the accelerated weights
# into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
model.to(trainer.device)
None  # last expression is None so the cell displays nothing

In [50]:
# Same (accuracy, loss) evaluation as before, now after accelerate() and
# store_parameters(); compare with the baseline logged above.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9892083333333334, 0.03670764370728284)
Valid: (0.982, 0.06022620059705029)


## Epoch average, with span = 5

In [51]:
# Each run logs through its own Logger. Without rebinding `logger` here, the
# span-5 results below would be written through the span-10 Logger, and a raw
# open() handle would be left unused and unclosed.
logger = Logger("SGD_momentum-avg_span_5.txt")

In [52]:
# Fresh network for the span-5 "epoch_avg" run, placed on the trainer's
# device like the other runs in this notebook (not a hard-coded .cuda()).
model = make_model()
model.to(trainer.device)
# avg_alpha is derived from the span as 2 / (span + 1).
# NOTE(review): this matches the span -> alpha convention of exponential
# moving averages (e.g. pandas ewm); confirm that is how AcceleratedSGD uses it.
avg_span = 5
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch_avg",
    avg_alpha=2 / (avg_span + 1),
)

In [53]:
epochs = 30

# Per epoch: train, tell the optimizer the epoch is over, validate, log one line.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}"
    logger.log("Epoch", epoch + 1, "|", summary)

100%|██████████| 750/750 [00:13<00:00, 56.71it/s, loss=2.2816]
Epoch 1 | Training loss: 2.2816, validation accuracy: 0.3873, validation loss: 2.2266
100%|██████████| 750/750 [00:13<00:00, 57.33it/s, loss=1.1890]
Epoch 2 | Training loss: 1.1890, validation accuracy: 0.8763, validation loss: 0.4169
100%|██████████| 750/750 [00:13<00:00, 57.62it/s, loss=0.3771]
Epoch 3 | Training loss: 0.3771, validation accuracy: 0.9032, validation loss: 0.3150
100%|██████████| 750/750 [00:13<00:00, 56.94it/s, loss=0.2757]
Epoch 4 | Training loss: 0.2757, validation accuracy: 0.9327, validation loss: 0.2234
100%|██████████| 750/750 [00:13<00:00, 55.67it/s, loss=0.2161]
Epoch 5 | Training loss: 0.2161, validation accuracy: 0.9437, validation loss: 0.1861
100%|██████████| 750/750 [00:13<00:00, 56.92it/s, loss=0.1760]
Epoch 6 | Training loss: 0.1760, validation accuracy: 0.9581, validation loss: 0.1479
100%|██████████| 750/750 [00:13<00:00, 57.16it/s, loss=0.1508]
Epoch 7 | Training loss: 0.1508, validation

In [54]:
# Baseline before acceleration: (accuracy, loss) on both splits for the
# weights reached by plain training.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9887291666666667, 0.03748172978963703)
Valid: (0.9805833333333334, 0.06541612339609613)


In [55]:
# Trigger the optimizer's acceleration step now that training is done.
# NOTE(review): presumably extrapolates from the per-epoch averages recorded
# in "epoch_avg" mode — confirm in nn_extrapolation.
optimizer.accelerate()

In [56]:
# Have the optimizer store its parameters, then make sure the model sits on
# the trainer's device rather than on a hard-coded CUDA device.
# NOTE(review): presumably store_parameters() writes the accelerated weights
# into the model — confirm in nn_extrapolation.
optimizer.store_parameters()
model.to(trainer.device)
None  # last expression is None so the cell displays nothing

In [57]:
# Same (accuracy, loss) evaluation as before, now after accelerate() and
# store_parameters(); compare with the baseline logged above.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9899375, 0.03544833152741194)
Valid: (0.9823333333333333, 0.061589378842928755)
