In [1]:
import math
import gc
from pathlib import Path
import numpy as np

import torch
import torch.nn as nn
import torchvision
from torch.optim.optimizer import Optimizer
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder

In [2]:
class AdamW(Optimizer):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, bias_correct=False, norm_weight=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defualts = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correct=bias_correct, norm_weight=norm_weight)
        super().__init__(params, defualts)
    
    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()
            
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                
                state = self.state[p]
                
                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                
                state['step'] += 1
                
                if group['weight_decay'] != 0:
                    grad.add_(group['weight_decay'], p.data)
                    
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                
                if group['bias_correct']:
                    exp_avg.div_(bias_correction1)
                    exp_avg_sq.div_(bias_correction2)
                
                step_size = math.sqrt(bias_correction2) / bias_correction1
                
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                if group['weight_decay'] != 0:
                    update = group['lr']*torch.div(exp_avg, denom) + group['weight_decay']*p.data
                else:
                    update = group['lr']*torch.div(exp_avg, denom)
                p.data.add_(-step_size, update)
        
        return loss

In [11]:
# Get data
path = Path('/home/kushaj/Desktop/Data/cifar10')
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
dataset = {x: ImageFolder(path/x, transform=transform) for x in ['train', 'valid']}
data = {x: torch.utils.data.DataLoader(dataset[x], batch_size=1024, shuffle=True, num_workers=8) for x in ['train', 'valid']}

# Create model
device = torch.device('cuda')
torch.backends.cudnn.benchmark = True
model = torchvision.models.resnet18()
model = model.to(device)

criterion = nn.CrossEntropyLoss()

In [4]:
def train(model, opt, epochs=10):
    train_losses = []
    val_losses = []
    val_acc = []
    
    for epoch in range(epochs):
        print(f'Epoch {epoch}/{epochs-1}')
        
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            
            running_loss = 0.0
            running_corrects = 0.0
            
            for inputs, labels in data[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                
                optim.zero_grad()
                
                with torch.set_grad_enabled(phase=='train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)
                
                    if phase == 'train':
                        loss.backward()
                        optim.step()
                    
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
            epoch_loss = running_loss / len(data[phase].dataset)
            epoch_acc = running_corrects.double() / len(data[phase].dataset)
            
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            
            if phase == 'train':
                train_losses.append(epoch_loss)
            else:
                val_losses.append(epoch_loss)
                val_acc.append(epoch_acc.cpu())
                
    return train_losses, val_losses, val_acc

In [6]:
results = {}

In [7]:
optim = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)  

train_losses, val_losses, val_acc = train(model, optim)
results['adam_pytorch_train_losses'] = train_losses
results['adam_pytorch_val_losses'] = val_losses
results['adam_pytorch_val_acc'] = val_acc

Epoch 0/9
train Loss: 1.7505 Acc: 0.4546
valid Loss: 1.4606 Acc: 0.4855
Epoch 1/9
train Loss: 1.0621 Acc: 0.6178
valid Loss: 1.0873 Acc: 0.6157
Epoch 2/9
train Loss: 0.8687 Acc: 0.6905
valid Loss: 1.0754 Acc: 0.6208
Epoch 3/9
train Loss: 0.7061 Acc: 0.7533
valid Loss: 1.0660 Acc: 0.6397
Epoch 4/9
train Loss: 0.5793 Acc: 0.7976
valid Loss: 1.0629 Acc: 0.6475
Epoch 5/9
train Loss: 0.4864 Acc: 0.8293
valid Loss: 1.0970 Acc: 0.6582
Epoch 6/9
train Loss: 0.4018 Acc: 0.8585
valid Loss: 1.0101 Acc: 0.6837
Epoch 7/9
train Loss: 0.3390 Acc: 0.8810
valid Loss: 1.2283 Acc: 0.6479
Epoch 8/9
train Loss: 0.2908 Acc: 0.8972
valid Loss: 1.4540 Acc: 0.6300
Epoch 9/9
train Loss: 0.2420 Acc: 0.9148
valid Loss: 1.1458 Acc: 0.6879


In [27]:
optim = AdamW(model.parameters(), lr=0.001, weight_decay=0.001)

train_losses, val_losses, val_acc = train(model, optim)
results['adamW_noBias_train_losses'] = train_losses
results['adamW_noBias_val_losses'] = val_losses
results['adamW_noBias_val_acc'] = val_acc

Epoch 0/9
train Loss: 1.7523 Acc: 0.4557
valid Loss: 1.5828 Acc: 0.4427
Epoch 1/9
train Loss: 1.0491 Acc: 0.6265
valid Loss: 1.1771 Acc: 0.5794
Epoch 2/9
train Loss: 0.8398 Acc: 0.7043
valid Loss: 1.0735 Acc: 0.6258
Epoch 3/9
train Loss: 0.7018 Acc: 0.7557
valid Loss: 1.0805 Acc: 0.6347
Epoch 4/9
train Loss: 0.6029 Acc: 0.7926
valid Loss: 0.9704 Acc: 0.6761
Epoch 5/9
train Loss: 0.5133 Acc: 0.8201
valid Loss: 0.9381 Acc: 0.6887
Epoch 6/9
train Loss: 0.4442 Acc: 0.8458
valid Loss: 1.0932 Acc: 0.6625
Epoch 7/9
train Loss: 0.3872 Acc: 0.8673
valid Loss: 1.0217 Acc: 0.6827
Epoch 8/9
train Loss: 0.3401 Acc: 0.8843
valid Loss: 1.2005 Acc: 0.6493
Epoch 9/9
train Loss: 0.2978 Acc: 0.8984
valid Loss: 1.1359 Acc: 0.6698


In [10]:
optim = AdamW(model.parameters(), lr=0.001, weight_decay=0.01, bias_correct=True)

train_losses, val_losses, val_acc = train(model, optim)
results['adamW_Bias_train_losses'] = train_losses
results['adamW_Bias_val_losses'] = val_losses
results['adamW_Bias_val_acc'] = val_acc

Epoch 0/9
train Loss: 3.4060 Acc: 0.1352
valid Loss: 3.1564 Acc: 0.1315
Epoch 1/9
train Loss: 3.1115 Acc: 0.1358
valid Loss: 2.8878 Acc: 0.1286
Epoch 2/9
train Loss: 2.8247 Acc: 0.1358
valid Loss: 2.6565 Acc: 0.1241
Epoch 3/9
train Loss: 2.6043 Acc: 0.1362
valid Loss: 2.5004 Acc: 0.1206
Epoch 4/9


KeyboardInterrupt: 

In [12]:
optim = AdamW(model.parameters(), lr=0.01, weight_decay=0.01, bias_correct=True)

train_losses, val_losses, val_acc = train(model, optim)
results['adamW_Bias_train_losses'] = train_losses
results['adamW_Bias_val_losses'] = val_losses
results['adamW_Bias_val_acc'] = val_acc

Epoch 0/9
train Loss: 3.6042 Acc: 0.2347
valid Loss: 4.0405 Acc: 0.1621
Epoch 1/9
train Loss: 3.8407 Acc: 0.2432
valid Loss: 4.3930 Acc: 0.1608
Epoch 2/9
train Loss: 4.4393 Acc: 0.2424
valid Loss: 5.0241 Acc: 0.1548
Epoch 3/9


Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/kushaj/anaconda3/envs/PyTorch/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/home/kushaj/anaconda3/envs/PyTorch/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/home/kushaj/anaconda3/envs/PyTorch/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/kushaj/anaconda3/envs/PyTorch/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/home/kushaj/anaconda3/envs/PyTorch/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
  File "/home/kushaj/anaconda3/envs/PyTorch/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/home/kushaj/anaconda3/env

KeyboardInterrupt: 

In [None]:
optim = AdamW(model.parameters(), lr=0.01, norm_weight=True)

In [6]:
results = np.load('results.npy').item()