In [1]:
# Put "../.." (resolved against the kernel's working directory) first on the
# module search path so it takes precedence over installed packages.
# NOTE(review): presumably this is where nn_extrapolation and nn_utils live — confirm.
import sys
sys.path.insert(0, "../..")

In [2]:
import torch
from torch import nn
import numpy as np
from copy import deepcopy

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [3]:
# Display whether PyTorch can see a CUDA device. The cells in this notebook
# request "cuda" explicitly, so a False here means they will not run as written.
torch.cuda.is_available()

True

In [4]:
# Shared trainer used by every experiment in this notebook.
#
# device:      CUDA when PyTorch can see a GPU, otherwise the CPU, so the notebook
#              is not tied to GPU machines (identical to the previous hard-coded
#              "cuda" whenever a GPU is present).
# loss_fn:     mean-reduced negative log-likelihood; the models below end in
#              LogSoftmax, which is the input NLLLoss expects.
# val_loss_fn: sum-reduced NLL.
#              NOTE(review): presumably Trainer.validation divides the summed loss
#              by the number of samples (the logged validation losses are on a
#              per-sample scale) — confirm in nn_utils.
trainer = Trainer(
    device="cuda" if torch.cuda.is_available() else "cpu",
    loss_fn=nn.NLLLoss(reduction="mean"),
    val_loss_fn=nn.NLLLoss(reduction="sum"),
)

In [5]:
# Options for the MNIST loaders: read the data from a local copy (no download),
# request a 0.2 validation split, and load in batches of 64 with 2 workers.
# NOTE(review): the root path is relative to the kernel's working directory.
mnist_loader_options = dict(
    dataset="mnist",
    root="../../../MNIST",
    download=False,
    validation_split=0.2,
    batch_size=64,
    num_workers=2,
)

# `dl` is indexed by split name later on (dl["train"], dl["valid"]).
dl = load_dataset(**mnist_loader_options)

## Levin t

In [6]:
# Small convolutional classifier for the Levin:t run. It ends in LogSoftmax, so it
# outputs log-probabilities over 10 classes (the input NLLLoss expects).
model = nn.Sequential(
    # Stage 1: two 3x3 convolutions (1 -> 32 -> 32 channels), then 2x2 max-pooling.
    nn.Conv2d(1, 32, 3),
    nn.ReLU(),
    nn.Conv2d(32, 32, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    # Stage 2: two 3x3 convolutions (32 -> 64 -> 64 channels), then 2x2 max-pooling.
    nn.Conv2d(32, 64, 3),
    nn.ReLU(),
    nn.Conv2d(64, 64, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    # Classifier head. 4*4*64 is the flattened feature size for 1x28x28 inputs:
    # spatial size goes 28 -> 26 -> 24 -> 12 -> 10 -> 8 -> 4 with 64 channels.
    nn.Flatten(),
    nn.Linear(4*4*64, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
    nn.LogSoftmax(-1),
)
# Move to the GPU when one is available; otherwise stay on the CPU instead of
# raising, as an unconditional .cuda() would. Module.to() returns the module, so
# the cell still displays the architecture.
model.to("cuda" if torch.cuda.is_available() else "cpu")

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [7]:
# Optimizer for the Levin:t experiment.
# NOTE(review): the positional 1e-3 is presumably the learning rate, and k / mode /
# method presumably configure the extrapolation — confirm against nn_extrapolation.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=4,
    mode="epoch",
    method="Levin:t",
)

# Logger named after the run configuration; the "ep20" and "k=4" parts are written
# by hand and must be kept in sync with the values actually used.
logger = Logger("SGD-levin:t:ep20:k=4")

In [8]:
# Number of passes over the training split.
epochs = 20

for epoch in range(epochs):
    # One pass over the training data; returns the loss reported for the epoch.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Tell the optimizer an epoch has ended.
    # NOTE(review): presumably this records the iterate used by mode="epoch" — confirm.
    optimizer.finish_epoch()
    # Validation returns an (accuracy, loss) pair.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    epoch_report = (
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}"
    )
    # Epochs are reported 1-based.
    logger.log("Epoch", epoch + 1, "|", epoch_report)

100%|██████████| 750/750 [00:12<00:00, 57.71it/s, loss=2.2994]
Epoch 1 | Training loss: 2.2994, validation accuracy: 0.0975, validation loss: 2.2951
100%|██████████| 750/750 [00:12<00:00, 58.25it/s, loss=2.2871]
Epoch 2 | Training loss: 2.2871, validation accuracy: 0.3467, validation loss: 2.2762
100%|██████████| 750/750 [00:12<00:00, 59.30it/s, loss=2.2388]
Epoch 3 | Training loss: 2.2388, validation accuracy: 0.5018, validation loss: 2.1553
100%|██████████| 750/750 [00:12<00:00, 59.48it/s, loss=1.4344]
Epoch 4 | Training loss: 1.4344, validation accuracy: 0.8310, validation loss: 0.5880
100%|██████████| 750/750 [00:12<00:00, 59.58it/s, loss=0.4990]
Epoch 5 | Training loss: 0.4990, validation accuracy: 0.8776, validation loss: 0.4064
100%|██████████| 750/750 [00:12<00:00, 59.39it/s, loss=0.3933]
Epoch 6 | Training loss: 0.3933, validation accuracy: 0.8976, validation loss: 0.3315
100%|██████████| 750/750 [00:13<00:00, 56.59it/s, loss=0.3324]
Epoch 7 | Training loss: 0.3324, validation

In [9]:
# Baseline metrics of the trained model, taken before optimizer.accelerate() is
# called. Each score is whatever trainer.validation returns for the split
# (logged as an (accuracy, loss) pair); train is evaluated first, then valid.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
for split_label, split_score in (("Train:", train_score), ("Valid:", valid_score)):
    logger.log(split_label, split_score)

Train: (0.973, 0.0901744787854453)
Valid: (0.9704166666666667, 0.09790041659027338)


In [10]:
# Run the optimizer's acceleration step for the Levin:t configuration.
# NOTE(review): presumably this extrapolates the stored parameter history with the
# configured sequence transformation — confirm against nn_extrapolation.
optimizer.accelerate()

In [11]:
# NOTE(review): presumably this writes the accelerated parameters back into the
# model's tensors — confirm against nn_extrapolation.
optimizer.store_parameters()
# Make sure the model sits on the compute device again: the GPU when available,
# otherwise the CPU (an unconditional .cuda() would raise on CPU-only machines).
# Module.to() returns the module, so the architecture is still displayed.
model.to("cuda" if torch.cuda.is_available() else "cpu")

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [12]:
# Metrics after the acceleration step, for comparison with the scores logged
# right after training. Train is evaluated first, then valid.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
for split_label, split_score in (("Train:", train_score), ("Valid:", valid_score)):
    logger.log(split_label, split_score)

Train: (0.9709166666666667, 0.0966013014105459)
Valid: (0.9701666666666666, 0.1036723493685325)


## Levin u

In [13]:
# Freshly initialised copy of the same convolutional classifier for the Levin:u
# run. It ends in LogSoftmax, so it outputs log-probabilities over 10 classes.
# NOTE(review): no RNG seed is set in the visible cells, so the starting weights
# differ between the experiments being compared.
model = nn.Sequential(
    # Stage 1: two 3x3 convolutions (1 -> 32 -> 32 channels), then 2x2 max-pooling.
    nn.Conv2d(1, 32, 3),
    nn.ReLU(),
    nn.Conv2d(32, 32, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    # Stage 2: two 3x3 convolutions (32 -> 64 -> 64 channels), then 2x2 max-pooling.
    nn.Conv2d(32, 64, 3),
    nn.ReLU(),
    nn.Conv2d(64, 64, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    # Classifier head. 4*4*64 is the flattened feature size for 1x28x28 inputs:
    # spatial size goes 28 -> 26 -> 24 -> 12 -> 10 -> 8 -> 4 with 64 channels.
    nn.Flatten(),
    nn.Linear(4*4*64, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
    nn.LogSoftmax(-1),
)
# Move to the GPU when one is available; otherwise stay on the CPU instead of
# raising, as an unconditional .cuda() would. Module.to() returns the module, so
# the cell still displays the architecture.
model.to("cuda" if torch.cuda.is_available() else "cpu")

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [14]:
# Optimizer for the Levin:u experiment; only `method` differs from the Levin:t setup.
# NOTE(review): the positional 1e-3 is presumably the learning rate, and k / mode /
# method presumably configure the extrapolation — confirm against nn_extrapolation.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=4,
    mode="epoch",
    method="Levin:u",
)

# Logger named after the run configuration; the "ep20" and "k=4" parts are written
# by hand and must be kept in sync with the values actually used.
logger = Logger("SGD-levin:u:ep20:k=4")

In [15]:
# Number of passes over the training split.
epochs = 20

for epoch in range(epochs):
    # One pass over the training data; returns the loss reported for the epoch.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Tell the optimizer an epoch has ended.
    # NOTE(review): presumably this records the iterate used by mode="epoch" — confirm.
    optimizer.finish_epoch()
    # Validation returns an (accuracy, loss) pair.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    epoch_report = (
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}"
    )
    # Epochs are reported 1-based.
    logger.log("Epoch", epoch + 1, "|", epoch_report)

100%|██████████| 750/750 [00:12<00:00, 58.11it/s, loss=2.2953]
Epoch 1 | Training loss: 2.2953, validation accuracy: 0.1350, validation loss: 2.2847
100%|██████████| 750/750 [00:12<00:00, 58.62it/s, loss=2.2561]
Epoch 2 | Training loss: 2.2561, validation accuracy: 0.5682, validation loss: 2.1973
100%|██████████| 750/750 [00:12<00:00, 58.31it/s, loss=1.6071]
Epoch 3 | Training loss: 1.6071, validation accuracy: 0.8318, validation loss: 0.6260
100%|██████████| 750/750 [00:12<00:00, 58.04it/s, loss=0.4850]
Epoch 4 | Training loss: 0.4850, validation accuracy: 0.8898, validation loss: 0.3704
100%|██████████| 750/750 [00:12<00:00, 59.03it/s, loss=0.3578]
Epoch 5 | Training loss: 0.3578, validation accuracy: 0.9116, validation loss: 0.2971
100%|██████████| 750/750 [00:12<00:00, 59.15it/s, loss=0.2947]
Epoch 6 | Training loss: 0.2947, validation accuracy: 0.9308, validation loss: 0.2448
100%|██████████| 750/750 [00:12<00:00, 58.99it/s, loss=0.2503]
Epoch 7 | Training loss: 0.2503, validation

In [16]:
# Baseline metrics of the trained model, taken before optimizer.accelerate() is
# called. Each score is whatever trainer.validation returns for the split
# (logged as an (accuracy, loss) pair); train is evaluated first, then valid.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
for split_label, split_score in (("Train:", train_score), ("Valid:", valid_score)):
    logger.log(split_label, split_score)

Train: (0.9756458333333333, 0.08024124085282286)
Valid: (0.9726666666666667, 0.09067367375890414)


In [17]:
# Run the optimizer's acceleration step for the Levin:u configuration.
# NOTE(review): presumably this extrapolates the stored parameter history with the
# configured sequence transformation — confirm against nn_extrapolation.
optimizer.accelerate()

In [18]:
# NOTE(review): presumably this writes the accelerated parameters back into the
# model's tensors — confirm against nn_extrapolation.
optimizer.store_parameters()
# Make sure the model sits on the compute device again: the GPU when available,
# otherwise the CPU (an unconditional .cuda() would raise on CPU-only machines).
# Module.to() returns the module, so the architecture is still displayed.
model.to("cuda" if torch.cuda.is_available() else "cpu")

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [19]:
# Metrics after the acceleration step, for comparison with the scores logged
# right after training. Train is evaluated first, then valid.
# NOTE: in the recorded run the Levin:u accelerated parameters are far worse than
# the trained ones (accuracy about 0.29 versus about 0.97), so treat this
# configuration's extrapolation as having failed there.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
for split_label, split_score in (("Train:", train_score), ("Valid:", valid_score)):
    logger.log(split_label, split_score)

Train: (0.2939375, 7.032645418802897)
Valid: (0.28541666666666665, 6.914684150695801)


## Levin v

In [20]:
# Freshly initialised copy of the same convolutional classifier for the Levin:v
# run. It ends in LogSoftmax, so it outputs log-probabilities over 10 classes.
# NOTE(review): no RNG seed is set in the visible cells, so the starting weights
# differ between the experiments being compared.
model = nn.Sequential(
    # Stage 1: two 3x3 convolutions (1 -> 32 -> 32 channels), then 2x2 max-pooling.
    nn.Conv2d(1, 32, 3),
    nn.ReLU(),
    nn.Conv2d(32, 32, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    # Stage 2: two 3x3 convolutions (32 -> 64 -> 64 channels), then 2x2 max-pooling.
    nn.Conv2d(32, 64, 3),
    nn.ReLU(),
    nn.Conv2d(64, 64, 3),
    nn.ReLU(),
    nn.MaxPool2d(2),
    # Classifier head. 4*4*64 is the flattened feature size for 1x28x28 inputs:
    # spatial size goes 28 -> 26 -> 24 -> 12 -> 10 -> 8 -> 4 with 64 channels.
    nn.Flatten(),
    nn.Linear(4*4*64, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
    nn.LogSoftmax(-1),
)
# Move to the GPU when one is available; otherwise stay on the CPU instead of
# raising, as an unconditional .cuda() would. Module.to() returns the module, so
# the cell still displays the architecture.
model.to("cuda" if torch.cuda.is_available() else "cpu")

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [21]:
# Optimizer for the Levin:v experiment; only `method` differs from the other setups.
# NOTE(review): the positional 1e-3 is presumably the learning rate, and k / mode /
# method presumably configure the extrapolation — confirm against nn_extrapolation.
optimizer = AcceleratedSGD(
    model.parameters(),
    1e-3,
    k=4,
    mode="epoch",
    method="Levin:v",
)

# Logger named after the run configuration; the "ep20" and "k=4" parts are written
# by hand and must be kept in sync with the values actually used.
logger = Logger("SGD-levin:v:ep20:k=4")

In [22]:
# Number of passes over the training split.
epochs = 20

for epoch in range(epochs):
    # One pass over the training data; returns the loss reported for the epoch.
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    # Tell the optimizer an epoch has ended.
    # NOTE(review): presumably this records the iterate used by mode="epoch" — confirm.
    optimizer.finish_epoch()
    # Validation returns an (accuracy, loss) pair.
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    epoch_report = (
        f"Training loss: {train_loss:.4f}, "
        f"validation accuracy: {val_acc:.4f}, "
        f"validation loss: {val_loss:.4f}"
    )
    # Epochs are reported 1-based.
    logger.log("Epoch", epoch + 1, "|", epoch_report)

100%|██████████| 750/750 [00:13<00:00, 53.95it/s, loss=2.2989]
Epoch 1 | Training loss: 2.2989, validation accuracy: 0.1081, validation loss: 2.2919
100%|██████████| 750/750 [00:13<00:00, 54.88it/s, loss=2.2819]
Epoch 2 | Training loss: 2.2819, validation accuracy: 0.3173, validation loss: 2.2632
100%|██████████| 750/750 [00:13<00:00, 55.29it/s, loss=2.1814]
Epoch 3 | Training loss: 2.1814, validation accuracy: 0.6442, validation loss: 1.9236
100%|██████████| 750/750 [00:13<00:00, 55.62it/s, loss=0.9821]
Epoch 4 | Training loss: 0.9821, validation accuracy: 0.8458, validation loss: 0.5208
100%|██████████| 750/750 [00:13<00:00, 55.39it/s, loss=0.4801]
Epoch 5 | Training loss: 0.4801, validation accuracy: 0.8835, validation loss: 0.3892
100%|██████████| 750/750 [00:13<00:00, 55.14it/s, loss=0.3821]
Epoch 6 | Training loss: 0.3821, validation accuracy: 0.8955, validation loss: 0.3357
100%|██████████| 750/750 [00:13<00:00, 55.54it/s, loss=0.3210]
Epoch 7 | Training loss: 0.3210, validation

In [23]:
# Baseline metrics of the trained model, taken before optimizer.accelerate() is
# called. Each score is whatever trainer.validation returns for the split
# (logged as an (accuracy, loss) pair); train is evaluated first, then valid.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
for split_label, split_score in (("Train:", train_score), ("Valid:", valid_score)):
    logger.log(split_label, split_score)

Train: (0.9719375, 0.09182257438451051)
Valid: (0.9694166666666667, 0.10108074158926805)


In [24]:
# Run the optimizer's acceleration step for the Levin:v configuration.
# NOTE(review): presumably this extrapolates the stored parameter history with the
# configured sequence transformation — confirm against nn_extrapolation.
optimizer.accelerate()

In [25]:
# NOTE(review): presumably this writes the accelerated parameters back into the
# model's tensors — confirm against nn_extrapolation.
optimizer.store_parameters()
# Make sure the model sits on the compute device again: the GPU when available,
# otherwise the CPU (an unconditional .cuda() would raise on CPU-only machines).
# Module.to() returns the module, so the architecture is still displayed.
model.to("cuda" if torch.cuda.is_available() else "cpu")

Sequential(
  (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1))
  (3): ReLU()
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (6): ReLU()
  (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (8): ReLU()
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Flatten(start_dim=1, end_dim=-1)
  (11): Linear(in_features=1024, out_features=128, bias=True)
  (12): ReLU()
  (13): Linear(in_features=128, out_features=10, bias=True)
  (14): LogSoftmax(dim=-1)
)

In [26]:
# Metrics after the acceleration step, for comparison with the scores logged
# right after training. Train is evaluated first, then valid.
train_score, valid_score = (
    trainer.validation(model, dl[split]) for split in ("train", "valid")
)
for split_label, split_score in (("Train:", train_score), ("Valid:", valid_score)):
    logger.log(split_label, split_score)

Train: (0.9684791666666667, 0.104595688033849)
Valid: (0.9650833333333333, 0.11421944808214902)
