In [10]:
import sys
sys.path.insert(0, "../..")

In [11]:
import torch
from torch import nn
from torch.utils import data
from torchvision import datasets, transforms
from tqdm.notebook import tqdm
import numpy as np
from copy import deepcopy

from nn_extrapolation import AcceleratedSGD

In [12]:
torch.cuda.is_available()

True

In [13]:
val_loss_fn = nn.NLLLoss(reduction="sum")

def validation(model, loader):
    ok = 0
    loss_sum = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.cuda()
            y = y.cuda()
            out = model(x)
            loss_sum += val_loss_fn(out, y)
            preds = out.argmax(1)
            ok += (y == preds).sum()
            total += len(y)
    return ok / total, loss_sum / total

def train_epoch(loss_log):
    model.train()
    for x, y in train_loader:
        x = x.cuda()
        y = y.cuda()
        optimizer.zero_grad()
        out = model(x)
        loss = loss_fn(out, y)
        loss_log += list(loss.flatten().cpu().detach().numpy())
        loss.backward()
        optimizer.step()

In [14]:
train_ds = datasets.MNIST("../../../MNIST", download=True, train=True, transform=transforms.ToTensor())
test_ds = datasets.MNIST("../../../MNIST", download=True, train=False, transform=transforms.ToTensor())
valid_size = int(0.2 * len(train_ds))
train_ds, valid_ds = data.random_split(train_ds, [len(train_ds) - valid_size, valid_size])

train_loader = data.DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=2)
valid_loader = data.DataLoader(valid_ds, batch_size=64, shuffle=True, num_workers=2)
test_loader = data.DataLoader(test_ds, batch_size=64, shuffle=False, num_workers=2)

In [15]:
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(28*28, 512),
    nn.ReLU(),
    nn.Linear(512, 10),
    nn.LogSoftmax(-1),
)
model.cuda()

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=512, bias=True)
  (2): ReLU()
  (3): Linear(in_features=512, out_features=10, bias=True)
  (4): LogSoftmax(dim=-1)
)

In [16]:
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.8, weight_decay=1e-5, mode="epoch")
loss_fn = nn.NLLLoss()

In [19]:
log_file = open("SGD-2l_momentum.txt", "w")

## Epoch

In [20]:
epochs = 20

for epoch in range(epochs):
    print("Epoch", epoch+1)
    loss_log = []
    train_epoch(loss_log)
    print(f"Training loss: {np.mean(loss_log):.4f}")
    optimizer.finish_epoch()
    val_acc, val_loss = validation(model, valid_loader)
    print(f"Validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    print("Epoch", epoch+1, 
          f"Training loss: {np.mean(loss_log):.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
          file=log_file, flush=True
         )

Epoch 1
Training loss: 0.3447
Validation accuracy: 0.9027, validation loss: 0.3513
Epoch 2
Training loss: 0.3334
Validation accuracy: 0.9060, validation loss: 0.3409
Epoch 3
Training loss: 0.3238
Validation accuracy: 0.9072, validation loss: 0.3322
Epoch 4
Training loss: 0.3151
Validation accuracy: 0.9092, validation loss: 0.3251
Epoch 5
Training loss: 0.3075
Validation accuracy: 0.9107, validation loss: 0.3179
Epoch 6
Training loss: 0.3005
Validation accuracy: 0.9131, validation loss: 0.3121
Epoch 7
Training loss: 0.2940
Validation accuracy: 0.9142, validation loss: 0.3060
Epoch 8
Training loss: 0.2880
Validation accuracy: 0.9155, validation loss: 0.3012
Epoch 9
Training loss: 0.2823
Validation accuracy: 0.9170, validation loss: 0.2960
Epoch 10
Training loss: 0.2769
Validation accuracy: 0.9187, validation loss: 0.2912
Epoch 11
Training loss: 0.2718
Validation accuracy: 0.9200, validation loss: 0.2872
Epoch 12
Training loss: 0.2670
Validation accuracy: 0.9213, validation loss: 0.2822
E

In [21]:
train_score = validation(model, train_loader)
valid_score = validation(model, valid_loader)
print("Train:", train_score)
print("Valid:", valid_score)
print("Train:", train_score, flush=True, file=log_file)
print("Valid:", valid_score, flush=True, file=log_file)

Train: (tensor(0.9344, device='cuda:0'), tensor(0.2314, device='cuda:0'))
Valid: (tensor(0.9295, device='cuda:0'), tensor(0.2525, device='cuda:0'))


In [22]:
optimizer.param_groups[0]["method"] = "RNA"
optimizer.accelerate()

In [23]:
model_acc = deepcopy(model)
optimizer.store_parameters([model_acc.parameters()])
model_acc.cuda()

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=512, bias=True)
  (2): ReLU()
  (3): Linear(in_features=512, out_features=10, bias=True)
  (4): LogSoftmax(dim=-1)
)

In [24]:
train_score = validation(model_acc, train_loader)
valid_score = validation(model_acc, valid_loader)
print("Train:", train_score)
print("Valid:", valid_score)
print("Train:", train_score, flush=True, file=log_file)
print("Valid:", valid_score, flush=True, file=log_file)

Train: (tensor(0.9406, device='cuda:0'), tensor(0.2127, device='cuda:0'))
Valid: (tensor(0.9342, device='cuda:0'), tensor(0.2350, device='cuda:0'))


In [25]:
optimizer.param_groups[0]["method"] = "RRE"
optimizer.accelerate()

In [26]:
model_acc = deepcopy(model)
optimizer.store_parameters([model_acc.parameters()])
model_acc.cuda()

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=512, bias=True)
  (2): ReLU()
  (3): Linear(in_features=512, out_features=10, bias=True)
  (4): LogSoftmax(dim=-1)
)

In [27]:
train_score = validation(model_acc, train_loader)
valid_score = validation(model_acc, valid_loader)
print("Train:", train_score)
print("Valid:", valid_score)
print("Train:", train_score, flush=True, file=log_file)
print("Valid:", valid_score, flush=True, file=log_file)

Train: (tensor(0.9406, device='cuda:0'), tensor(0.2128, device='cuda:0'))
Valid: (tensor(0.9342, device='cuda:0'), tensor(0.2350, device='cuda:0'))


## Epoch average

In [None]:
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(28*28, 512),
    nn.ReLU(),
    nn.Linear(512, 10),
    nn.LogSoftmax(-1),
)
model.cuda()

In [None]:
log_file = open("SGD-2l_momentum-avg.txt", "w")
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.8, weight_decay=1e-5, mode="epoch_avg")

In [None]:
epochs = 30

for epoch in range(epochs):
    print("Epoch", epoch+1)
    loss_log = []
    train_epoch(loss_log)
    print(f"Training loss: {np.mean(loss_log):.4f}")
    optimizer.finish_epoch()
    val_acc, val_loss = validation(model, valid_loader)
    print(f"Validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    print("Epoch", epoch+1, 
          f"Training loss: {np.mean(loss_log):.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
          file=log_file, flush=True
         )

In [None]:
train_score = validation(model, train_loader)
valid_score = validation(model, valid_loader)
print("Train:", train_score)
print("Valid:", valid_score)
print("Train:", train_score, flush=True, file=log_file)
print("Valid:", valid_score, flush=True, file=log_file)

In [None]:
optimizer.param_groups[0]["method"] = "RNA"
optimizer.accelerate()

In [None]:
model_acc = deepcopy(model)
optimizer.store_parameters([model_acc.parameters()])
model_acc.cuda()

In [None]:
train_score = validation(model_acc, train_loader)
valid_score = validation(model_acc, valid_loader)
print("Train:", train_score)
print("Valid:", valid_score)
print("Train:", train_score, flush=True, file=log_file)
print("Valid:", valid_score, flush=True, file=log_file)

In [None]:
optimizer.param_groups[0]["method"] = "RRE"
optimizer.accelerate()

In [None]:
model_acc = deepcopy(model)
optimizer.store_parameters([model_acc.parameters()])
model_acc.cuda()

In [26]:
train_score = validation(model_acc, train_loader)
valid_score = validation(model_acc, valid_loader)
print("Train:", train_score)
print("Valid:", valid_score)
print("Train:", train_score, flush=True, file=log_file)
print("Valid:", valid_score, flush=True, file=log_file)

Train: (tensor(0.9321, device='cuda:0'), tensor(0.2463, device='cuda:0'))
Valid: (tensor(0.9228, device='cuda:0'), tensor(0.2774, device='cuda:0'))


## Epoch - EMA

In [28]:
optimizer = AcceleratedSGD(model.parameters(), 1e-3, k=10, momentum=0.8, weight_decay=1e-5, mode="epoch_avg", avg_alpha=0.2)
loss_fn = nn.NLLLoss()

In [29]:
log_file = open("SGD-2l_momentum_ema.txt", "w")

In [30]:
epochs = 20

for epoch in range(epochs):
    print("Epoch", epoch+1)
    loss_log = []
    train_epoch(loss_log)
    print(f"Training loss: {np.mean(loss_log):.4f}")
    optimizer.finish_epoch()
    val_acc, val_loss = validation(model, valid_loader)
    print(f"Validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}")
    print("Epoch", epoch+1, 
          f"Training loss: {np.mean(loss_log):.4f}, validation accuracy: {val_acc:.4f}, validation loss: {val_loss:.4f}",
          file=log_file, flush=True
         )

Epoch 1
Training loss: 0.2306
Validation accuracy: 0.9308, validation loss: 0.2491
Epoch 2
Training loss: 0.2271
Validation accuracy: 0.9314, validation loss: 0.2456
Epoch 3
Training loss: 0.2236
Validation accuracy: 0.9324, validation loss: 0.2427
Epoch 4
Training loss: 0.2205
Validation accuracy: 0.9327, validation loss: 0.2396
Epoch 5
Training loss: 0.2173
Validation accuracy: 0.9341, validation loss: 0.2372
Epoch 6
Training loss: 0.2141
Validation accuracy: 0.9343, validation loss: 0.2342
Epoch 7
Training loss: 0.2110
Validation accuracy: 0.9352, validation loss: 0.2312
Epoch 8
Training loss: 0.2081
Validation accuracy: 0.9359, validation loss: 0.2290
Epoch 9
Training loss: 0.2052
Validation accuracy: 0.9377, validation loss: 0.2261
Epoch 10
Training loss: 0.2023
Validation accuracy: 0.9381, validation loss: 0.2233
Epoch 11
Training loss: 0.1996
Validation accuracy: 0.9390, validation loss: 0.2205
Epoch 12
Training loss: 0.1969
Validation accuracy: 0.9392, validation loss: 0.2181
E

In [31]:
train_score = validation(model, train_loader)
valid_score = validation(model, valid_loader)
print("Train:", train_score)
print("Valid:", valid_score)
print("Train:", train_score, flush=True, file=log_file)
print("Valid:", valid_score, flush=True, file=log_file)

Train: (tensor(0.9515, device='cuda:0'), tensor(0.1756, device='cuda:0'))
Valid: (tensor(0.9430, device='cuda:0'), tensor(0.2007, device='cuda:0'))


In [32]:
optimizer.param_groups[0]["method"] = "RNA"
optimizer.accelerate()

In [33]:
model_acc = deepcopy(model)
optimizer.store_parameters([model_acc.parameters()])
model_acc.cuda()

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=512, bias=True)
  (2): ReLU()
  (3): Linear(in_features=512, out_features=10, bias=True)
  (4): LogSoftmax(dim=-1)
)

In [34]:
train_score = validation(model_acc, train_loader)
valid_score = validation(model_acc, valid_loader)
print("Train:", train_score)
print("Valid:", valid_score)
print("Train:", train_score, flush=True, file=log_file)
print("Valid:", valid_score, flush=True, file=log_file)

Train: (tensor(0.9562, device='cuda:0'), tensor(0.1594, device='cuda:0'))
Valid: (tensor(0.9465, device='cuda:0'), tensor(0.1852, device='cuda:0'))


In [35]:
optimizer.param_groups[0]["method"] = "RRE"
optimizer.accelerate()

In [36]:
model_acc = deepcopy(model)
optimizer.store_parameters([model_acc.parameters()])
model_acc.cuda()

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=512, bias=True)
  (2): ReLU()
  (3): Linear(in_features=512, out_features=10, bias=True)
  (4): LogSoftmax(dim=-1)
)

In [37]:
train_score = validation(model_acc, train_loader)
valid_score = validation(model_acc, valid_loader)
print("Train:", train_score)
print("Valid:", valid_score)
print("Train:", train_score, flush=True, file=log_file)
print("Valid:", valid_score, flush=True, file=log_file)

Train: (tensor(0.9562, device='cuda:0'), tensor(0.1594, device='cuda:0'))
Valid: (tensor(0.9465, device='cuda:0'), tensor(0.1852, device='cuda:0'))
