In [1]:
import sys
# Put the directory two levels above the current working directory at the front
# of the module search path.
# NOTE(review): presumably this is where nn_extrapolation and nn_utils live — confirm.
sys.path.insert(0, "../..")

In [2]:
import torch
from torch import nn
import numpy as np
from copy import deepcopy

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [3]:
# Sanity check: the trainer is created with device="cuda" and every model is
# moved with .cuda(), so this should display True before running the rest.
torch.cuda.is_available()

True

In [4]:
# One Trainer instance is shared by all three experiments in this notebook.
# Both criteria are negative log-likelihood losses, matching the LogSoftmax
# output layer of the models defined further down.
train_criterion = nn.NLLLoss(reduction="mean")  # averaged over each batch
# Summed rather than averaged.
# NOTE(review): presumably Trainer.validation normalises the sum by the number
# of samples itself — confirm in nn_utils.
val_criterion = nn.NLLLoss(reduction="sum")

trainer = Trainer(device="cuda", loss_fn=train_criterion, val_loss_fn=val_criterion)

In [5]:
# Options for the MNIST data loaders. Later cells index the result with
# "train" and "valid".
mnist_options = dict(
    dataset="mnist",
    root="../../../MNIST",   # relative to the notebook's working directory
    download=False,          # NOTE(review): presumably the data must already exist under `root` — confirm
    validation_split=0.2,
    batch_size=64,
    num_workers=2,
)
dl = load_dataset(**mnist_options)

## Levin t-transformation

In [6]:
# Small CNN for the Levin:t run. The layers are grouped into two lists only
# for readability; they are splatted into a single nn.Sequential, so module
# indices (0..14) and construction order are the same as a flat definition.
conv_layers = [
    # Block 1: two 3x3 convolutions (1 -> 32 -> 32 channels), then 2x2 max-pool.
    nn.Conv2d(1, 32, 3),
    nn.ReLU(),
    nn.Conv2d(32, 32, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    # Block 2: two 3x3 convolutions (32 -> 64 -> 64 channels), then 2x2 max-pool.
    nn.Conv2d(32, 64, 3),
    nn.ReLU(),
    nn.Conv2d(64, 64, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
]
classifier_layers = [
    nn.Flatten(),
    # 64 channels on a 4x4 map = 1024 features (assumes 28x28 single-channel input).
    nn.Linear(64 * 4 * 4, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
    # Log-probabilities over the 10 classes, as expected by nn.NLLLoss.
    nn.LogSoftmax(-1),
]
model = nn.Sequential(*conv_layers, *classifier_layers)

# Move to the GPU; as the last expression this also echoes the architecture.
model.cuda()

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [7]:
# Optimizer for the Levin:t experiment.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,              # passed positionally — presumably the learning rate (TODO confirm)
    k=8,               # NOTE(review): presumably the number of stored iterates used for extrapolation — confirm
    mode="epoch",      # the training loop calls optimizer.finish_epoch() once per epoch
    method="Levin:t",
)
# Separate logger per experiment, identified by this name.
logger = Logger("SGD-levin:t2")

In [8]:
epochs = 30

# Train for `epochs` epochs. After each pass over the training split the
# optimizer is told the epoch has finished, then the model is scored on the
# validation split and a one-line summary is logged.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # NOTE(review): presumably records the end-of-epoch parameters for the
    # later accelerate() call — confirm in nn_extrapolation.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = (
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}"
    )
    logger.log("Epoch", epoch + 1, "|", summary)  # epochs are reported 1-based

100%|██████████| 750/750 [00:15<00:00, 49.77it/s, loss=2.3002]
Epoch 1 | Training loss: 2.3002, validation accuracy: 0.1784, validation loss: 2.2951
100%|██████████| 750/750 [00:13<00:00, 54.01it/s, loss=2.2882]
Epoch 2 | Training loss: 2.2882, validation accuracy: 0.2983, validation loss: 2.2773
100%|██████████| 750/750 [00:14<00:00, 53.05it/s, loss=2.2472]
Epoch 3 | Training loss: 2.2472, validation accuracy: 0.4923, validation loss: 2.1775
100%|██████████| 750/750 [00:14<00:00, 53.10it/s, loss=1.5423]
Epoch 4 | Training loss: 1.5423, validation accuracy: 0.8037, validation loss: 0.6540
100%|██████████| 750/750 [00:14<00:00, 50.97it/s, loss=0.5278]
Epoch 5 | Training loss: 0.5278, validation accuracy: 0.8751, validation loss: 0.4153
100%|██████████| 750/750 [00:13<00:00, 55.79it/s, loss=0.4005]
Epoch 6 | Training loss: 0.4005, validation accuracy: 0.9020, validation loss: 0.3310
100%|██████████| 750/750 [00:13<00:00, 56.47it/s, loss=0.3354]
Epoch 7 | Training loss: 0.3354, validation

In [9]:
# Baseline scores of the trained model before acceleration is applied.
# trainer.validation returns a pair that the training loop unpacks as
# (accuracy, loss); the train split is evaluated first, then the valid split.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9794583333333333, 0.06719030682742595)
Valid: (0.97525, 0.08373364659336706)


In [10]:
# NOTE(review): presumably computes the extrapolated parameters with the method
# chosen at construction (method="Levin:t") — confirm in nn_extrapolation.
optimizer.accelerate()

In [11]:
# NOTE(review): presumably writes the accelerated parameters back into the
# model's tensors — confirm in nn_extrapolation.
optimizer.store_parameters()
# Returns the module, so as the last expression it echoes the architecture.
# NOTE(review): whether the parameters really need moving back to the GPU after
# store_parameters() is not visible from this notebook — confirm.
model.cuda()

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [12]:
# Scores after accelerate()/store_parameters(), for comparison with the
# scores logged right after training. Same evaluation order: train, then valid.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.8101666666666667, 0.7286712407867114)
Valid: (0.8155, 0.7012091176509857)


## Levin u-transformation

In [13]:
# Fresh, re-initialised CNN for the Levin:u run. The layers are grouped into
# two lists only for readability; they are splatted into a single
# nn.Sequential, so module indices (0..14) and construction order are the same
# as a flat definition.
conv_layers = [
    # Block 1: two 3x3 convolutions (1 -> 32 -> 32 channels), then 2x2 max-pool.
    nn.Conv2d(1, 32, 3),
    nn.ReLU(),
    nn.Conv2d(32, 32, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    # Block 2: two 3x3 convolutions (32 -> 64 -> 64 channels), then 2x2 max-pool.
    nn.Conv2d(32, 64, 3),
    nn.ReLU(),
    nn.Conv2d(64, 64, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
]
classifier_layers = [
    nn.Flatten(),
    # 64 channels on a 4x4 map = 1024 features (assumes 28x28 single-channel input).
    nn.Linear(64 * 4 * 4, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
    # Log-probabilities over the 10 classes, as expected by nn.NLLLoss.
    nn.LogSoftmax(-1),
]
model = nn.Sequential(*conv_layers, *classifier_layers)

# Move to the GPU; as the last expression this also echoes the architecture.
model.cuda()

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [14]:
# Optimizer for the Levin:u experiment.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,              # passed positionally — presumably the learning rate (TODO confirm)
    k=8,               # NOTE(review): presumably the number of stored iterates used for extrapolation — confirm
    mode="epoch",      # the training loop calls optimizer.finish_epoch() once per epoch
    method="Levin:u",
)
# Separate logger per experiment, identified by this name.
logger = Logger("SGD-levin:u2")

In [15]:
epochs = 30

# Train for `epochs` epochs. After each pass over the training split the
# optimizer is told the epoch has finished, then the model is scored on the
# validation split and a one-line summary is logged.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # NOTE(review): presumably records the end-of-epoch parameters for the
    # later accelerate() call — confirm in nn_extrapolation.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = (
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}"
    )
    logger.log("Epoch", epoch + 1, "|", summary)  # epochs are reported 1-based

100%|██████████| 750/750 [00:13<00:00, 56.69it/s, loss=2.2992]
Epoch 1 | Training loss: 2.2992, validation accuracy: 0.2289, validation loss: 2.2914
100%|██████████| 750/750 [00:13<00:00, 57.06it/s, loss=2.2786]
Epoch 2 | Training loss: 2.2786, validation accuracy: 0.2469, validation loss: 2.2553
100%|██████████| 750/750 [00:13<00:00, 57.24it/s, loss=2.1310]
Epoch 3 | Training loss: 2.1310, validation accuracy: 0.6774, validation loss: 1.7008
100%|██████████| 750/750 [00:13<00:00, 56.88it/s, loss=0.8341]
Epoch 4 | Training loss: 0.8341, validation accuracy: 0.8553, validation loss: 0.4897
100%|██████████| 750/750 [00:13<00:00, 56.17it/s, loss=0.4617]
Epoch 5 | Training loss: 0.4617, validation accuracy: 0.8806, validation loss: 0.3882
100%|██████████| 750/750 [00:14<00:00, 53.23it/s, loss=0.3727]
Epoch 6 | Training loss: 0.3727, validation accuracy: 0.9074, validation loss: 0.3106
100%|██████████| 750/750 [00:13<00:00, 56.40it/s, loss=0.3129]
Epoch 7 | Training loss: 0.3129, validation

In [16]:
# Baseline scores of the trained model before acceleration is applied.
# trainer.validation returns a pair that the training loop unpacks as
# (accuracy, loss); the train split is evaluated first, then the valid split.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9793541666666666, 0.06807203026550511)
Valid: (0.9741666666666666, 0.08341941005301973)


In [17]:
# NOTE(review): presumably computes the extrapolated parameters with the method
# chosen at construction (method="Levin:u") — confirm in nn_extrapolation.
optimizer.accelerate()

In [18]:
# NOTE(review): presumably writes the accelerated parameters back into the
# model's tensors — confirm in nn_extrapolation.
optimizer.store_parameters()
# Returns the module, so as the last expression it echoes the architecture.
# NOTE(review): whether the parameters really need moving back to the GPU after
# store_parameters() is not visible from this notebook — confirm.
model.cuda()

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [19]:
# Scores after accelerate()/store_parameters(), for comparison with the
# scores logged right after training. Same evaluation order: train, then valid.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.7462708333333333, 0.8923707112272581)
Valid: (0.7598333333333334, 0.8656922217210133)


## Levin v-transformation

In [20]:
# Fresh, re-initialised CNN for the Levin:v run. The layers are grouped into
# two lists only for readability; they are splatted into a single
# nn.Sequential, so module indices (0..14) and construction order are the same
# as a flat definition.
conv_layers = [
    # Block 1: two 3x3 convolutions (1 -> 32 -> 32 channels), then 2x2 max-pool.
    nn.Conv2d(1, 32, 3),
    nn.ReLU(),
    nn.Conv2d(32, 32, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    # Block 2: two 3x3 convolutions (32 -> 64 -> 64 channels), then 2x2 max-pool.
    nn.Conv2d(32, 64, 3),
    nn.ReLU(),
    nn.Conv2d(64, 64, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
]
classifier_layers = [
    nn.Flatten(),
    # 64 channels on a 4x4 map = 1024 features (assumes 28x28 single-channel input).
    nn.Linear(64 * 4 * 4, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
    # Log-probabilities over the 10 classes, as expected by nn.NLLLoss.
    nn.LogSoftmax(-1),
]
model = nn.Sequential(*conv_layers, *classifier_layers)

# Move to the GPU; as the last expression this also echoes the architecture.
model.cuda()

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [21]:
# Optimizer for the Levin:v experiment.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,              # passed positionally — presumably the learning rate (TODO confirm)
    k=8,               # NOTE(review): presumably the number of stored iterates used for extrapolation — confirm
    mode="epoch",      # the training loop calls optimizer.finish_epoch() once per epoch
    method="Levin:v",
)
# Separate logger per experiment, identified by this name.
logger = Logger("SGD-levin:v2")

In [22]:
epochs = 30

# Train for `epochs` epochs. After each pass over the training split the
# optimizer is told the epoch has finished, then the model is scored on the
# validation split and a one-line summary is logged.
for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # NOTE(review): presumably records the end-of-epoch parameters for the
    # later accelerate() call — confirm in nn_extrapolation.
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    summary = (
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}"
    )
    logger.log("Epoch", epoch + 1, "|", summary)  # epochs are reported 1-based

100%|██████████| 750/750 [00:13<00:00, 56.16it/s, loss=2.3005]
Epoch 1 | Training loss: 2.3005, validation accuracy: 0.1060, validation loss: 2.2977
100%|██████████| 750/750 [00:13<00:00, 54.69it/s, loss=2.2922]
Epoch 2 | Training loss: 2.2922, validation accuracy: 0.1076, validation loss: 2.2864
100%|██████████| 750/750 [00:13<00:00, 54.68it/s, loss=2.2711]
Epoch 3 | Training loss: 2.2711, validation accuracy: 0.5406, validation loss: 2.2445
100%|██████████| 750/750 [00:13<00:00, 55.16it/s, loss=2.0400]
Epoch 4 | Training loss: 2.0400, validation accuracy: 0.7024, validation loss: 1.3331
100%|██████████| 750/750 [00:13<00:00, 55.12it/s, loss=0.7081]
Epoch 5 | Training loss: 0.7081, validation accuracy: 0.8393, validation loss: 0.4918
100%|██████████| 750/750 [00:12<00:00, 58.27it/s, loss=0.4452]
Epoch 6 | Training loss: 0.4452, validation accuracy: 0.8870, validation loss: 0.3728
100%|██████████| 750/750 [00:12<00:00, 57.74it/s, loss=0.3696]
Epoch 7 | Training loss: 0.3696, validation

In [23]:
# Baseline scores of the trained model before acceleration is applied.
# trainer.validation returns a pair that the training loop unpacks as
# (accuracy, loss); the train split is evaluated first, then the valid split.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9799375, 0.06498752380969623)
Valid: (0.9761666666666666, 0.07972234211799999)


In [24]:
# NOTE(review): presumably computes the extrapolated parameters with the method
# chosen at construction (method="Levin:v") — confirm in nn_extrapolation.
optimizer.accelerate()

In [25]:
# NOTE(review): presumably writes the accelerated parameters back into the
# model's tensors — confirm in nn_extrapolation.
optimizer.store_parameters()
# Returns the module, so as the last expression it echoes the architecture.
# NOTE(review): whether the parameters really need moving back to the GPU after
# store_parameters() is not visible from this notebook — confirm.
model.cuda()

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [26]:
# Scores after accelerate()/store_parameters(), for comparison with the
# scores logged right after training. Same evaluation order: train, then valid.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.9762083333333333, 0.07590009386464953)
Valid: (0.9741666666666666, 0.08678819672514995)
