In [1]:
import sys
sys.path.insert(0, "../..")

In [2]:
import torch
from torch import nn
from torchvision import models
from copy import deepcopy
import os

from nn_extrapolation import AcceleratedSGD
from nn_utils import *

In [3]:
trainer = Trainer(
    device="cuda:0",
    loss_fn=nn.NLLLoss(reduction="mean"),
    val_loss_fn=nn.NLLLoss(reduction="sum"),
)

In [4]:
dl = load_dataset(
    dataset="CIFAR10",
    root=os.path.join("/tmp", os.environ["USER"], "CIFAR"),
    augmentation=transforms.RandomAffine(10, scale=(0.9, 1.1), translate=(0.2, 0.2)),
    validation_split=0.2,
    batch_size=128,
    num_workers=10,
)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
model = models.resnet34(pretrained=False)
model.fc = nn.Sequential(
    nn.Linear(512, 10),
    nn.LogSoftmax(-1)
)
initial_state = deepcopy(model.state_dict())
model.to(trainer.device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [6]:
sum(param.numel() for param in model.parameters())

21289802

In [7]:
trainer.validation(model, dl["valid"])

(0.0936, 2.5926870376586915)

## No momentum

In [8]:
optimizer = AcceleratedSGD(model.parameters(), 1e-1, k=10, momentum=0, weight_decay=0, lambda_=1e-8)
logger = Logger("resnet34_log_augmentation_no_momentum.txt.no_resizing")
torch.manual_seed(2020)

<torch._C.Generator at 0x7f4bd03003b0>

In [9]:
epochs = 25

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████████████████████████████████████████████████████| 313/313 [00:08<00:00, 35.73it/s, loss=2.1418]
Epoch 1 | Training loss: 2.1418, validation accuracy: 0.3445, validation loss: 1.9445
100%|██████████████████████████████████████████████████████████| 313/313 [00:08<00:00, 35.77it/s, loss=1.6584]
Epoch 2 | Training loss: 1.6584, validation accuracy: 0.3831, validation loss: 2.4550
100%|██████████████████████████████████████████████████████████| 313/313 [00:08<00:00, 35.31it/s, loss=1.5359]
Epoch 3 | Training loss: 1.5359, validation accuracy: 0.4975, validation loss: 1.4326
100%|██████████████████████████████████████████████████████████| 313/313 [00:08<00:00, 35.22it/s, loss=1.4069]
Epoch 4 | Training loss: 1.4069, validation accuracy: 0.4929, validation loss: 1.6508
100%|██████████████████████████████████████████████████████████| 313/313 [00:08<00:00, 35.18it/s, loss=1.3077]
Epoch 5 | Training loss: 1.3077, validation accuracy: 0.5087, validation loss: 1.5419
100%|██████████

In [10]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.765175, 0.6567683944702148)
Valid: (0.7562, 0.7065862510681152)


In [11]:
optimizer.param_groups[0]["method"] = "RNA"
logger.log("RNA")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RNA


In [12]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.765725, 0.6632355155944825)
Valid: (0.7561, 0.7060411434173584)


In [13]:
optimizer.param_groups[0]["method"] = "RRE"
logger.log("RRE")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RRE


In [14]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.7669, 0.6606591187477112)
Valid: (0.756, 0.7061777219772339)


In [15]:
optimizer.param_groups[0]["lr"] = 1e-2

In [16]:
epochs = 25

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.35it/s, loss=0.5732]
Epoch 1 | Training loss: 0.5732, validation accuracy: 0.7905, validation loss: 0.6225
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.57it/s, loss=0.5496]
Epoch 2 | Training loss: 0.5496, validation accuracy: 0.7913, validation loss: 0.6230
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.57it/s, loss=0.5400]
Epoch 3 | Training loss: 0.5400, validation accuracy: 0.7855, validation loss: 0.6219
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.62it/s, loss=0.5319]
Epoch 4 | Training loss: 0.5319, validation accuracy: 0.7886, validation loss: 0.6225
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 33.76it/s, loss=0.5274]
Epoch 5 | Training loss: 0.5274, validation accuracy: 0.7917, validation loss: 0.6200
100%|██████████

In [17]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.847675, 0.431540719461441)
Valid: (0.7947, 0.6249418844223023)


In [18]:
optimizer.param_groups[0]["method"] = "RNA"
logger.log("RNA")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RNA


In [19]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.844925, 0.4386705246448517)
Valid: (0.7979, 0.631107096862793)


In [20]:
optimizer.param_groups[0]["method"] = "RRE"
logger.log("RRE")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RRE


In [21]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.84045, 0.4458770673751831)
Valid: (0.7979, 0.6311119369506836)


In [22]:
optimizer.param_groups[0]["lr"] = 1e-3

In [23]:
epochs = 25

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.03it/s, loss=0.4451]
Epoch 1 | Training loss: 0.4451, validation accuracy: 0.7971, validation loss: 0.6162
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 33.92it/s, loss=0.4470]
Epoch 2 | Training loss: 0.4470, validation accuracy: 0.7985, validation loss: 0.6185
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.09it/s, loss=0.4398]
Epoch 3 | Training loss: 0.4398, validation accuracy: 0.7998, validation loss: 0.6170
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 33.76it/s, loss=0.4390]
Epoch 4 | Training loss: 0.4390, validation accuracy: 0.7966, validation loss: 0.6164
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.11it/s, loss=0.4390]
Epoch 5 | Training loss: 0.4390, validation accuracy: 0.8001, validation loss: 0.6132
100%|██████████

In [24]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.854375, 0.4096952498435974)
Valid: (0.7982, 0.6191634141921997)


In [25]:
optimizer.param_groups[0]["method"] = "RNA"
logger.log("RNA")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RNA


In [26]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.854175, 0.41333082404136656)
Valid: (0.7981, 0.6218553371429444)


In [27]:
optimizer.param_groups[0]["method"] = "RRE"
logger.log("RRE")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RRE


In [28]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.8531, 0.4100926302433014)
Valid: (0.7981, 0.6218560708999634)


## Momentum

In [29]:
model.load_state_dict(initial_state)
model.to(trainer.device)
None

In [30]:
trainer.validation(model, dl["valid"])

(0.0936, 2.5926870376586915)

In [31]:
optimizer = AcceleratedSGD(model.parameters(), 1e-1, k=10, momentum=0, weight_decay=0, lambda_=1e-8)
logger = Logger("resnet34_log_augmentation.txt.no_resizing")
torch.manual_seed(2020)

<torch._C.Generator at 0x7f4bd03003b0>

In [32]:
epochs = 25

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.32it/s, loss=2.1017]
Epoch 1 | Training loss: 2.1017, validation accuracy: 0.3332, validation loss: 5.4656
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 33.95it/s, loss=1.6294]
Epoch 2 | Training loss: 1.6294, validation accuracy: 0.3185, validation loss: 2.1533
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.15it/s, loss=1.4835]
Epoch 3 | Training loss: 1.4835, validation accuracy: 0.4622, validation loss: 1.4778
100%|██████████████████████████████████████████████████████████| 313/313 [00:16<00:00, 19.52it/s, loss=1.3766]
Epoch 4 | Training loss: 1.3766, validation accuracy: 0.4389, validation loss: 2.1726
100%|██████████████████████████████████████████████████████████| 313/313 [00:12<00:00, 25.35it/s, loss=1.2893]
Epoch 5 | Training loss: 1.2893, validation accuracy: 0.4768, validation loss: 2.7873
100%|██████████

In [33]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.74695, 0.7144866751670838)
Valid: (0.7378, 0.7768762789726257)


In [34]:
optimizer.param_groups[0]["method"] = "RNA"
logger.log("RNA")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RNA


In [35]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.761075, 0.6842308477401733)
Valid: (0.7516, 0.7148696293830872)


In [36]:
optimizer.param_groups[0]["method"] = "RRE"
logger.log("RRE")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RRE


In [37]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.7597, 0.68345962266922)
Valid: (0.7517, 0.7149833367347718)


In [38]:
optimizer.param_groups[0]["lr"] = 1e-2

In [39]:
epochs = 25

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.41it/s, loss=0.5799]
Epoch 1 | Training loss: 0.5799, validation accuracy: 0.7852, validation loss: 0.6344
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.52it/s, loss=0.5554]
Epoch 2 | Training loss: 0.5554, validation accuracy: 0.7862, validation loss: 0.6317
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.61it/s, loss=0.5457]
Epoch 3 | Training loss: 0.5457, validation accuracy: 0.7877, validation loss: 0.6302
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.61it/s, loss=0.5386]
Epoch 4 | Training loss: 0.5386, validation accuracy: 0.7868, validation loss: 0.6358
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.56it/s, loss=0.5325]
Epoch 5 | Training loss: 0.5325, validation accuracy: 0.7912, validation loss: 0.6295
100%|██████████

In [40]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.841975, 0.4354125330448151)
Valid: (0.7939, 0.6406325536727905)


In [41]:
optimizer.param_groups[0]["method"] = "RNA"
logger.log("RNA")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RNA


In [42]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.8423, 0.44416335334777834)
Valid: (0.7911, 0.6476818460464477)


In [43]:
optimizer.param_groups[0]["method"] = "RRE"
logger.log("RRE")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RRE


In [44]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.8394, 0.44526529016494754)
Valid: (0.791, 0.6476817321777344)


In [45]:
optimizer.param_groups[0]["lr"] = 1e-3

In [46]:
epochs = 25

for epoch in range(epochs):
    train_loss = trainer.train_epoch(model, optimizer, dl["train"])
    optimizer.finish_epoch()
    val_acc, val_loss = trainer.validation(model, dl["valid"])
    logger.log("Epoch", epoch+1, "|", 
          f"Training loss: {train_loss:.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")

100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.45it/s, loss=0.4492]
Epoch 1 | Training loss: 0.4492, validation accuracy: 0.7948, validation loss: 0.6316
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.25it/s, loss=0.4521]
Epoch 2 | Training loss: 0.4521, validation accuracy: 0.7961, validation loss: 0.6372
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.28it/s, loss=0.4435]
Epoch 3 | Training loss: 0.4435, validation accuracy: 0.7955, validation loss: 0.6335
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.25it/s, loss=0.4472]
Epoch 4 | Training loss: 0.4472, validation accuracy: 0.7931, validation loss: 0.6351
100%|██████████████████████████████████████████████████████████| 313/313 [00:09<00:00, 34.31it/s, loss=0.4438]
Epoch 5 | Training loss: 0.4438, validation accuracy: 0.7943, validation loss: 0.6336
100%|██████████

In [47]:
train_score = trainer.validation(model, dl["train"])
valid_score = trainer.validation(model, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.8538, 0.4085409345149994)
Valid: (0.7969, 0.6360045793533325)


In [48]:
optimizer.param_groups[0]["method"] = "RNA"
logger.log("RNA")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RNA


In [49]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.850325, 0.414179242515564)
Valid: (0.7956, 0.6398091527938843)


In [50]:
optimizer.param_groups[0]["method"] = "RRE"
logger.log("RRE")
model_acc = deepcopy(model)
optimizer.accelerate()
optimizer.store_parameters([model_acc.parameters()])

RRE


In [51]:
model_acc.to(trainer.device)
train_score = trainer.validation(model_acc, dl["train"])
valid_score = trainer.validation(model_acc, dl["valid"])
logger.log("Train:", train_score)
logger.log("Valid:", valid_score)

Train: (0.852225, 0.41267845845222473)
Valid: (0.7956, 0.6398136833190918)
