In [1]:
import sys
sys.path.insert(0, "../..")

In [2]:
import torch
from torch import nn
import numpy as np
from copy import deepcopy

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [3]:
# Seed PyTorch's global RNG once, up front, so that a Restart & Run All starts
# from the same random state every time (nothing else in this notebook seeds it).
# NOTE(review): GPU kernels and DataLoader workers can still introduce small
# run-to-run differences; see the PyTorch reproducibility notes.
torch.manual_seed(0)

# Report whether a CUDA device is visible to PyTorch (shown as the cell output).
torch.cuda.is_available()

True

In [4]:
# Shared Trainer used by every experiment in this notebook.
trainer = Trainer(
    # Prefer the GPU, but fall back to the CPU instead of failing later on a
    # hard-coded "cuda" when no device is visible.
    device="cuda" if torch.cuda.is_available() else "cpu",
    # Mean-reduced NLL for training; make_model() ends in LogSoftmax, which is
    # the log-probability input NLLLoss expects.
    loss_fn=nn.NLLLoss(reduction="mean"),
    # Sum-reduced NLL for validation.
    # NOTE(review): the logged validation losses are on a per-sample scale, so
    # Trainer.validation presumably divides the sum by the sample count - confirm in nn_utils.
    val_loss_fn=nn.NLLLoss(reduction="sum"),
)

In [5]:
# Build the MNIST data loaders from a local copy (download=False).
# The result is indexed with "train" and "valid" by the cells below.
# NOTE(review): validation_split=0.2 presumably holds out 20% of the training
# set; the progress bars show 750 training batches of 64 per epoch - confirm
# against nn_utils.load_dataset.
dl = load_dataset(
    dataset="mnist", 
    root="../../../MNIST", 
    download=False, 
    validation_split=0.2,
    batch_size=64, 
    num_workers=2,
)

In [6]:
def make_model(in_channels: int = 1, num_classes: int = 10) -> nn.Sequential:
    """Build a fresh, randomly initialised CNN classifier.

    The network is two blocks of (3x3 conv -> ReLU -> 3x3 conv -> ReLU ->
    2x2 max-pool) with 32 and 64 channels, followed by a 128-unit hidden
    layer and a log-softmax output. Because the output is log-probabilities,
    the model is meant to be paired with ``nn.NLLLoss``.

    Args:
        in_channels: Number of channels of the input images. The default of 1
            reproduces the original grayscale model.
        num_classes: Number of output classes. The default of 10 reproduces
            the original model.

    Returns:
        An ``nn.Sequential`` with 15 layers, left on the default device;
        the caller moves it to the target device.

    Note:
        The first linear layer expects 4*4*64 features, which matches 28x28
        inputs (28 -> 26 -> 24 -> pool 12 -> 10 -> 8 -> pool 4). Other input
        sizes need a different flatten size.
    """
    return nn.Sequential(
        # Block 1: 32 feature maps.
        nn.Conv2d(in_channels, 32, 3),
        nn.ReLU(),
        nn.Conv2d(32, 32, 3),
        nn.ReLU(),
        nn.MaxPool2d(2),
        # Block 2: 64 feature maps.
        nn.Conv2d(32, 64, 3),
        nn.ReLU(),
        nn.Conv2d(64, 64, 3),
        nn.ReLU(),
        nn.MaxPool2d(2),
        # Classifier head.
        nn.Flatten(),
        nn.Linear(4 * 4 * 64, 128),
        nn.ReLU(),
        nn.Linear(128, num_classes),
        nn.LogSoftmax(dim=-1),
    )

## Epoch mode (`mode="epoch"`)

In [38]:
# Fresh network placed on the trainer's device. Module.to() returns the module
# itself, so the chained call binds the same object as a separate .to() would.
model = make_model().to(trainer.device)
model  # display the architecture as the cell output

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [39]:
# Optimizer for the mode="epoch" experiment and the log file for this run.
# NOTE(review): 1e-3 is passed positionally - presumably the learning rate;
# confirm against AcceleratedSGD's signature.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch",
)
logger = Logger("SGD_momentum-early_stopping.txt")

In [40]:
# Train until early stopping triggers or the 1000-epoch cap is reached.
epochs = 1000
early_stopping = EarlyStopping(5)  # presumably the patience, in epochs - confirm in nn_utils

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer before validating.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}",
    )
    # Early stopping is driven by the validation loss.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 750/750 [00:12<00:00, 58.20it/s, loss=2.2992]
Epoch 1 | Training loss: 2.2992, validation accuracy: 0.1511, validation loss: 2.2926
100%|██████████| 750/750 [00:13<00:00, 57.63it/s, loss=2.2726]
Epoch 2 | Training loss: 2.2726, validation accuracy: 0.3871, validation loss: 2.2102
100%|██████████| 750/750 [00:12<00:00, 57.92it/s, loss=1.1880]
Epoch 3 | Training loss: 1.1880, validation accuracy: 0.8709, validation loss: 0.4423
100%|██████████| 750/750 [00:13<00:00, 56.29it/s, loss=0.3963]
Epoch 4 | Training loss: 0.3963, validation accuracy: 0.9091, validation loss: 0.3042
100%|██████████| 750/750 [00:13<00:00, 54.59it/s, loss=0.2894]
Epoch 5 | Training loss: 0.2894, validation accuracy: 0.9340, validation loss: 0.2288
100%|██████████| 750/750 [00:14<00:00, 53.19it/s, loss=0.2237]
Epoch 6 | Training loss: 0.2237, validation accuracy: 0.9453, validation loss: 0.1848
100%|██████████| 750/750 [00:13<00:00, 57.33it/s, loss=0.1815]
Epoch 7 | Training loss: 0.1815, validation

In [41]:
# Scores of the trained model before acceleration: evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9901458333333333, 0.031363423331175)
Valid: (0.982, 0.06327754400360087)


In [42]:
# Invoke AcceleratedSGD.accelerate() now that training has finished.
# NOTE(review): presumably this extrapolates from the stored epoch iterates
# (k=10 was passed to the optimizer) - confirm in nn_extrapolation.
optimizer.accelerate()

In [43]:
# NOTE(review): presumably writes the parameters produced by accelerate() into
# the model - confirm in nn_extrapolation.
optimizer.store_parameters()
# Put the model on the trainer's device rather than a hard-coded .cuda(), in line
# with how the first model cell places it. The trailing semicolon suppresses the
# module repr, replacing the bare `None` expression.
model.to(trainer.device);

In [44]:
# Scores after accelerate() / store_parameters(): evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9927708333333334, 0.02555174517771229)
Valid: (0.9851666666666666, 0.0553959670245337)


## Epoch average (`mode="epoch_avg"`, default `avg_alpha`)

In [45]:
# mode="epoch_avg" experiment with the optimizer's default avg_alpha:
# fresh model, optimizer and log file.
# The model goes to the trainer's device instead of a hard-coded .cuda(), in line
# with how the first experiment places its model.
model = make_model().to(trainer.device)
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch_avg",
)
logger = Logger("SGD_momentum-avg-early_stopping.txt")

In [46]:
# Train until early stopping triggers or the 1000-epoch cap is reached.
epochs = 1000
early_stopping = EarlyStopping(5)  # presumably the patience, in epochs - confirm in nn_utils

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer before validating.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}",
    )
    # Early stopping is driven by the validation loss.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 750/750 [00:13<00:00, 56.97it/s, loss=2.2981]
Epoch 1 | Training loss: 2.2981, validation accuracy: 0.0998, validation loss: 2.2918
100%|██████████| 750/750 [00:13<00:00, 57.19it/s, loss=2.2707]
Epoch 2 | Training loss: 2.2707, validation accuracy: 0.4677, validation loss: 2.2159
100%|██████████| 750/750 [00:13<00:00, 57.28it/s, loss=1.2826]
Epoch 3 | Training loss: 1.2826, validation accuracy: 0.8683, validation loss: 0.4352
100%|██████████| 750/750 [00:13<00:00, 57.47it/s, loss=0.4021]
Epoch 4 | Training loss: 0.4021, validation accuracy: 0.9037, validation loss: 0.3251
100%|██████████| 750/750 [00:13<00:00, 57.27it/s, loss=0.2987]
Epoch 5 | Training loss: 0.2987, validation accuracy: 0.9243, validation loss: 0.2460
100%|██████████| 750/750 [00:13<00:00, 57.19it/s, loss=0.2280]
Epoch 6 | Training loss: 0.2280, validation accuracy: 0.9435, validation loss: 0.1911
100%|██████████| 750/750 [00:13<00:00, 55.58it/s, loss=0.1819]
Epoch 7 | Training loss: 0.1819, validation

In [47]:
# Scores of the trained model before acceleration: evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9924583333333333, 0.024017436567383508)
Valid: (0.9825, 0.063618650574858)


In [48]:
# Invoke AcceleratedSGD.accelerate() now that training has finished.
# NOTE(review): presumably this extrapolates from the stored (averaged) epoch
# iterates - confirm in nn_extrapolation.
optimizer.accelerate()

In [49]:
# NOTE(review): presumably writes the parameters produced by accelerate() into
# the model - confirm in nn_extrapolation.
optimizer.store_parameters()
# Put the model on the trainer's device rather than a hard-coded .cuda(), in line
# with how the first model cell places it. The trailing semicolon suppresses the
# module repr, replacing the bare `None` expression.
model.to(trainer.device);

In [50]:
# Scores after accelerate() / store_parameters(): evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.99075, 0.031733908565094074)
Valid: (0.9831666666666666, 0.0573713893860889)


## Epoch average, with span = 20

In [51]:
# mode="epoch_avg" experiment with span = 20: fresh model, optimizer and log file.
# The model goes to the trainer's device instead of a hard-coded .cuda(), in line
# with how the first experiment places its model.
model = make_model().to(trainer.device)
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch_avg",
    avg_alpha=2 / (20 + 1),  # alpha = 2 / (span + 1) with span = 20
)
logger = Logger("SGD_momentum-avg_span_20-early_stopping.txt")

In [52]:
# Train until early stopping triggers or the 1000-epoch cap is reached.
epochs = 1000
early_stopping = EarlyStopping(5)  # presumably the patience, in epochs - confirm in nn_utils

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer before validating.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}",
    )
    # Early stopping is driven by the validation loss.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 750/750 [00:13<00:00, 57.41it/s, loss=2.3007]
Epoch 1 | Training loss: 2.3007, validation accuracy: 0.1477, validation loss: 2.2957
100%|██████████| 750/750 [00:13<00:00, 56.88it/s, loss=2.2873]
Epoch 2 | Training loss: 2.2873, validation accuracy: 0.3137, validation loss: 2.2699
100%|██████████| 750/750 [00:12<00:00, 58.18it/s, loss=1.9593]
Epoch 3 | Training loss: 1.9593, validation accuracy: 0.8196, validation loss: 0.7019
100%|██████████| 750/750 [00:12<00:00, 57.74it/s, loss=0.4518]
Epoch 4 | Training loss: 0.4518, validation accuracy: 0.9101, validation loss: 0.3042
100%|██████████| 750/750 [00:12<00:00, 58.43it/s, loss=0.2956]
Epoch 5 | Training loss: 0.2956, validation accuracy: 0.9192, validation loss: 0.2644
100%|██████████| 750/750 [00:12<00:00, 58.49it/s, loss=0.2250]
Epoch 6 | Training loss: 0.2250, validation accuracy: 0.9467, validation loss: 0.1848
100%|██████████| 750/750 [00:12<00:00, 58.33it/s, loss=0.1808]
Epoch 7 | Training loss: 0.1808, validation

In [53]:
# Scores of the trained model before acceleration: evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9876041666666666, 0.03878446874084572)
Valid: (0.9811666666666666, 0.06671205336274579)


In [54]:
# Invoke AcceleratedSGD.accelerate() now that training has finished.
# NOTE(review): presumably this extrapolates from the stored (averaged) epoch
# iterates - confirm in nn_extrapolation.
optimizer.accelerate()

In [55]:
# NOTE(review): presumably writes the parameters produced by accelerate() into
# the model - confirm in nn_extrapolation.
optimizer.store_parameters()
# Put the model on the trainer's device rather than a hard-coded .cuda(), in line
# with how the first model cell places it. The trailing semicolon suppresses the
# module repr, replacing the bare `None` expression.
model.to(trainer.device);

In [56]:
# Scores after accelerate() / store_parameters(): evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9906458333333333, 0.03162350018741563)
Valid: (0.9840833333333333, 0.05809748757778046)


## Epoch average, with span = 15

In [57]:
# mode="epoch_avg" experiment with span = 15: fresh model, optimizer and log file.
# The model goes to the trainer's device instead of a hard-coded .cuda(), in line
# with how the first experiment places its model.
model = make_model().to(trainer.device)
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch_avg",
    avg_alpha=2 / (15 + 1),  # alpha = 2 / (span + 1) with span = 15
)
# NOTE(review): unlike the log names of the earlier experiments, this one has no
# "-early_stopping" suffix even though the run uses EarlyStopping - confirm the
# file name is intended.
logger = Logger("SGD_momentum-avg_span_15.txt")

In [58]:
# Train until early stopping triggers or the 1000-epoch cap is reached.
epochs = 1000
early_stopping = EarlyStopping(5)  # presumably the patience, in epochs - confirm in nn_utils

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer before validating.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}",
    )
    # Early stopping is driven by the validation loss.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 750/750 [00:13<00:00, 56.71it/s, loss=2.2811]
Epoch 1 | Training loss: 2.2811, validation accuracy: 0.3981, validation loss: 2.2195
100%|██████████| 750/750 [00:13<00:00, 57.08it/s, loss=1.1816]
Epoch 2 | Training loss: 1.1816, validation accuracy: 0.8792, validation loss: 0.4092
100%|██████████| 750/750 [00:13<00:00, 57.23it/s, loss=0.3728]
Epoch 3 | Training loss: 0.3728, validation accuracy: 0.9155, validation loss: 0.2841
100%|██████████| 750/750 [00:13<00:00, 57.50it/s, loss=0.2639]
Epoch 4 | Training loss: 0.2639, validation accuracy: 0.9397, validation loss: 0.2078
100%|██████████| 750/750 [00:13<00:00, 57.26it/s, loss=0.2044]
Epoch 5 | Training loss: 0.2044, validation accuracy: 0.9468, validation loss: 0.1763
100%|██████████| 750/750 [00:13<00:00, 57.05it/s, loss=0.1663]
Epoch 6 | Training loss: 0.1663, validation accuracy: 0.9590, validation loss: 0.1432
100%|██████████| 750/750 [00:13<00:00, 57.16it/s, loss=0.1407]
Epoch 7 | Training loss: 0.1407, validation

In [59]:
# Scores of the trained model before acceleration: evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9930208333333334, 0.02317388749010085)
Valid: (0.9831666666666666, 0.06085876511565099)


In [60]:
# Invoke AcceleratedSGD.accelerate() now that training has finished.
# NOTE(review): presumably this extrapolates from the stored (averaged) epoch
# iterates - confirm in nn_extrapolation.
optimizer.accelerate()

In [61]:
# NOTE(review): presumably writes the parameters produced by accelerate() into
# the model - confirm in nn_extrapolation.
optimizer.store_parameters()
# Put the model on the trainer's device rather than a hard-coded .cuda(), in line
# with how the first model cell places it. The trailing semicolon suppresses the
# module repr, replacing the bare `None` expression.
model.to(trainer.device);

In [62]:
# Scores after accelerate() / store_parameters(): evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.995, 0.01873590029787738)
Valid: (0.9849166666666667, 0.05407277113137146)


## Epoch average, with span = 10

In [63]:
# mode="epoch_avg" experiment with span = 10: fresh model, optimizer and log file.
# The model goes to the trainer's device instead of a hard-coded .cuda(), in line
# with how the first experiment places its model.
model = make_model().to(trainer.device)
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch_avg",
    avg_alpha=2 / (10 + 1),  # alpha = 2 / (span + 1) with span = 10
)
# NOTE(review): unlike the log names of the earlier experiments, this one has no
# "-early_stopping" suffix even though the run uses EarlyStopping - confirm the
# file name is intended.
logger = Logger("SGD_momentum-avg_span_10.txt")

In [64]:
# Train until early stopping triggers or the 1000-epoch cap is reached.
epochs = 1000
early_stopping = EarlyStopping(5)  # presumably the patience, in epochs - confirm in nn_utils

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer before validating.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}",
    )
    # Early stopping is driven by the validation loss.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 750/750 [00:13<00:00, 57.26it/s, loss=2.2934]
Epoch 1 | Training loss: 2.2934, validation accuracy: 0.3521, validation loss: 2.2806
100%|██████████| 750/750 [00:12<00:00, 58.31it/s, loss=1.9816]
Epoch 2 | Training loss: 1.9816, validation accuracy: 0.7953, validation loss: 0.7369
100%|██████████| 750/750 [00:13<00:00, 56.91it/s, loss=0.5012]
Epoch 3 | Training loss: 0.5012, validation accuracy: 0.8977, validation loss: 0.3449
100%|██████████| 750/750 [00:13<00:00, 57.23it/s, loss=0.3298]
Epoch 4 | Training loss: 0.3298, validation accuracy: 0.9147, validation loss: 0.2790
100%|██████████| 750/750 [00:12<00:00, 57.76it/s, loss=0.2472]
Epoch 5 | Training loss: 0.2472, validation accuracy: 0.9376, validation loss: 0.2150
100%|██████████| 750/750 [00:13<00:00, 56.44it/s, loss=0.1948]
Epoch 6 | Training loss: 0.1948, validation accuracy: 0.9530, validation loss: 0.1617
100%|██████████| 750/750 [00:12<00:00, 58.29it/s, loss=0.1634]
Epoch 7 | Training loss: 0.1634, validation

In [65]:
# Scores of the trained model before acceleration: evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.993625, 0.02147273684075723)
Valid: (0.98425, 0.05929475574420454)


In [66]:
# Invoke AcceleratedSGD.accelerate() now that training has finished.
# NOTE(review): presumably this extrapolates from the stored (averaged) epoch
# iterates - confirm in nn_extrapolation.
optimizer.accelerate()

In [67]:
# NOTE(review): presumably writes the parameters produced by accelerate() into
# the model - confirm in nn_extrapolation.
optimizer.store_parameters()
# Put the model on the trainer's device rather than a hard-coded .cuda(), in line
# with how the first model cell places it. The trailing semicolon suppresses the
# module repr, replacing the bare `None` expression.
model.to(trainer.device);

In [68]:
# Scores after accelerate() / store_parameters(): evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9943958333333334, 0.021218339710030704)
Valid: (0.9840833333333333, 0.055943363830912855)


## Epoch average, with span = 5

In [69]:
# mode="epoch_avg" experiment with span = 5: fresh model, optimizer and log file.
# The model goes to the trainer's device instead of a hard-coded .cuda(), in line
# with how the first experiment places its model.
model = make_model().to(trainer.device)
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=10,
    momentum=0.5,
    weight_decay=1e-5,
    mode="epoch_avg",
    avg_alpha=2 / (5 + 1),  # alpha = 2 / (span + 1) with span = 5
)
# NOTE(review): unlike the log names of the earlier experiments, this one has no
# "-early_stopping" suffix even though the run uses EarlyStopping - confirm the
# file name is intended.
logger = Logger("SGD_momentum-avg_span_5.txt")

In [70]:
# Train until early stopping triggers or the 1000-epoch cap is reached.
epochs = 1000
early_stopping = EarlyStopping(5)  # presumably the patience, in epochs - confirm in nn_utils

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Signal the end of the epoch to the optimizer before validating.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log(
        "Epoch", epoch + 1, "|",
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}",
    )
    # Early stopping is driven by the validation loss.
    if early_stopping.should_stop(val_loss):
        break

100%|██████████| 750/750 [00:13<00:00, 56.20it/s, loss=2.2822]
Epoch 1 | Training loss: 2.2822, validation accuracy: 0.4733, validation loss: 2.2303
100%|██████████| 750/750 [00:12<00:00, 58.21it/s, loss=1.2210]
Epoch 2 | Training loss: 1.2210, validation accuracy: 0.8795, validation loss: 0.4153
100%|██████████| 750/750 [00:12<00:00, 58.13it/s, loss=0.3777]
Epoch 3 | Training loss: 0.3777, validation accuracy: 0.9143, validation loss: 0.2905
100%|██████████| 750/750 [00:12<00:00, 57.99it/s, loss=0.2733]
Epoch 4 | Training loss: 0.2733, validation accuracy: 0.9253, validation loss: 0.2524
100%|██████████| 750/750 [00:13<00:00, 56.13it/s, loss=0.2096]
Epoch 5 | Training loss: 0.2096, validation accuracy: 0.9432, validation loss: 0.1837
100%|██████████| 750/750 [00:12<00:00, 57.78it/s, loss=0.1695]
Epoch 6 | Training loss: 0.1695, validation accuracy: 0.9586, validation loss: 0.1419
100%|██████████| 750/750 [00:12<00:00, 57.75it/s, loss=0.1434]
Epoch 7 | Training loss: 0.1434, validation

In [71]:
# Scores of the trained model before acceleration: evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9945625, 0.01829809874645434)
Valid: (0.98525, 0.057204695962873905)


In [72]:
# Invoke AcceleratedSGD.accelerate() now that training has finished.
# NOTE(review): presumably this extrapolates from the stored (averaged) epoch
# iterates - confirm in nn_extrapolation.
optimizer.accelerate()

In [73]:
# NOTE(review): presumably writes the parameters produced by accelerate() into
# the model - confirm in nn_extrapolation.
optimizer.store_parameters()
# Put the model on the trainer's device rather than a hard-coded .cuda(), in line
# with how the first model cell places it. The trailing semicolon suppresses the
# module repr, replacing the bare `None` expression.
model.to(trainer.device);

In [74]:
# Scores after accelerate() / store_parameters(): evaluate both splits
# (train first, then valid), then log them in the same order.
train_score, valid_score = [
    trainer.validation(model, dl[split]) for split in ("train", "valid")
]
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9955416666666667, 0.016502816360560245)
Valid: (0.9854166666666667, 0.052549691832003496)
