In [1]:
import torch
from torch import nn
from torch.nn import functional as F
import torchvision
import torchvision.datasets as datasets
import time 

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Load both MNIST splits from ./data (downloading them if needed); the
# training split is built first, then the test split. No transform is applied.
mnist_trainset, mnist_testset = (
    datasets.MNIST(root='./data', train=is_train, download=True, transform=None)
    for is_train in (True, False)
)

In [4]:
# Take the image tensors (`.data`) and the labels (`.targets`) out of the
# dataset objects and place each of them on the selected `device`.
mnist_train = mnist_trainset.data.to(device)
mnist_test = mnist_testset.data.to(device)

mnist_trainlabel = mnist_trainset.targets.to(device)
mnist_testlabel = mnist_testset.targets.to(device)

In [5]:
# Sanity check: one shape per line, images first (train, test), then labels.
print(
    mnist_train.shape,
    mnist_test.shape,
    mnist_trainlabel.shape,
    mnist_testlabel.shape,
    sep='\n',
)

torch.Size([60000, 28, 28])
torch.Size([10000, 28, 28])
torch.Size([60000])
torch.Size([10000])


In [6]:
class MLP(nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    With the default arguments the layer sizes are 784 -> 128 -> 64 -> 10,
    i.e. exactly the original architecture.

    Args:
        in_features: number of values per flattened sample.
        hidden1: width of the first hidden layer.
        hidden2: width of the second hidden layer.
        num_classes: number of output scores per sample.
    """

    def __init__(self, in_features=784, hidden1=128, hidden2=64, num_classes=10):
        super().__init__()

        # Layers are created in this order on purpose: it fixes the order in
        # which the random initialisation consumes the RNG.
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for `x`.

        `x` is flattened to rows of `in_features` values, so any input whose
        number of elements is a multiple of `in_features` is accepted.
        """
        # `reshape` instead of `view`: `view` raises on non-contiguous inputs
        # (e.g. transposed or strided batches), `reshape` copies when needed.
        x = x.reshape(-1, self.fc1.in_features)  # flatten
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the last layer: the scores are returned as-is.
        return self.fc3(x)

In [7]:
def accuracy(y_pred,y_ground):
    """Return, as a Python number, the fraction of correct predictions.

    The predicted class of each row of `y_pred` is the index of its largest
    value along dimension 1; it is compared element-wise with `y_ground`.
    """
    predicted = y_pred.argmax(dim=1)
    n_correct = (predicted == y_ground).sum()
    return (n_correct / y_ground.size(0)).item()

In [8]:
def train(model, train_input, train_target, test_input, test_target, mini_batch_size, nb_epochs = 100, eta = 1e-3, verbose=False, adam_w=0.3, sgd_w=0.3, dfw_w = 0.4):
    """Train `model` with sequential mini-batches and log per-epoch metrics.

    Args:
        model: module mapping a batch of inputs to class scores.
        train_input, train_target: training samples and labels; batches are
            taken along dimension 0 in storage order (no shuffling).
        test_input, test_target: held-out samples and labels, only evaluated.
        mini_batch_size: maximum number of samples per optimisation step
            (the last batch of an epoch may be smaller).
        nb_epochs: number of passes over the training set.
        eta: learning rate handed to the optimizer.
        verbose: when true, print the summed mini-batch loss after each epoch.
        adam_w, sgd_w, dfw_w: forwarded unchanged to `AdamSGDDFWWeighted`.

    Returns:
        dict with keys 'loss', 'loss_val', 'acc', 'acc_val'; each maps to a
        list holding one value per epoch, computed on the full training set
        and on the test set after that epoch's updates.
    """
    criterion = nn.CrossEntropyLoss()
    # NOTE(review): AdamSGDDFWWeighted is not defined in this notebook; it
    # presumably comes from the `adam_sgd_dfw_mix` star import -- confirm.
    optimizer = AdamSGDDFWWeighted(model.parameters(), lr=eta, momentum=0, adam_w=adam_w, sgd_w=sgd_w, dfw_w=dfw_w)

    # Cast once so the training batches and the evaluation passes see the
    # same dtype (previously only the evaluation inputs were cast).
    train_input = train_input.float()
    test_input = test_input.float()
    nb_samples = train_input.size(0)

    history = {'loss': [], 'loss_val': [], 'acc': [], 'acc_val': []}

    for e in range(nb_epochs):
        total_loss = 0

        for b in range(0, nb_samples, mini_batch_size):
            # The final batch is truncated to the samples that are left.
            batch_len = min(mini_batch_size, nb_samples - b)
            output = model(train_input.narrow(0, b, batch_len))
            loss = criterion(output, train_target.narrow(0, b, batch_len))
            total_loss = total_loss + loss.item()

            optimizer.zero_grad()
            loss.backward()
            # The optimizer's step is given a closure returning the current
            # batch loss as a plain float.
            optimizer.step(lambda: float(loss))

        # Evaluation only: disable autograd so no graph is built for the
        # full-dataset forward passes.
        with torch.no_grad():
            output_train = model(train_input)
            output_val = model(test_input)

            history['loss'].append(criterion(output_train, train_target).item())
            history['loss_val'].append(criterion(output_val, test_target).item())
            history['acc'].append(accuracy(output_train, train_target))
            history['acc_val'].append(accuracy(output_val, test_target))

        if verbose:
            # `total_loss` is the sum of the mini-batch losses of this epoch.
            print('Epoch %d/%d, Cross Entropy Loss: %.3f' %(e+1, nb_epochs, total_loss))

    return history
        

In [9]:
import time
import matplotlib.pyplot as plt
import numpy as np
from adam_sgd_dfw_mix import *

SEED = 123456789  # seeding for weight initialization and train

histories = []

gammas = [1e-2,1e-3, 5e-3, 1e-4]
batch_sizes = [256]

# Every (batch size, learning rate) combination; `mesh` is kept as a
# module-level name because the plotting cell iterates over it as well.
mesh = np.meshgrid(batch_sizes, gammas)

for bs,lr in zip(mesh[0].ravel(),mesh[1].ravel()):
    # meshgrid yields NumPy scalars; hand plain Python numbers to PyTorch.
    bs, lr = int(bs), float(lr)
    print('batch-size: {} | learning-rate: {}'.format(bs,lr))

    # Re-seed before building each model so that every configuration starts
    # from the same initial weights and the curves are directly comparable.
    torch.manual_seed(SEED)
    # reset weights
    mlp = MLP().to(device)

    history = train(mlp, mnist_train.float(), mnist_trainlabel,
                mnist_test.float(), mnist_testlabel,
                mini_batch_size=bs,nb_epochs = 100,eta=lr,verbose=False, adam_w=0.3, sgd_w=0.3, dfw_w=0.4)

    histories.append(history)

batch-size: 256 | learning-rate: 0.01
[tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'), tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'), tensor([ 0.0033, -0.0002, -0.0060, -0.0072, -0.0015, -0.0041,  0.0004, -0.0052,
         0.0267, -0.0061], device='cuda:0')],


RuntimeError: The size of tensor a (784) must match the size of tensor b (10) at non-singleton dimension 1

In [None]:
fig,ax = plt.subplots(2,2,figsize=(15,10))

# Grid position -> (history key, panel title, y-axis label).
panels = {
    (0, 0): ('acc', 'train accuracy, epochs_step 1', 'accuracy'),
    (0, 1): ('acc_val', 'val accuracy, epochs_step 1', 'accuracy'),
    (1, 0): ('loss', 'train loss, epochs_step 1', 'cross-entropy loss'),
    (1, 1): ('loss_val', 'val loss, epochs_step 1', 'cross-entropy loss'),
}

# Pair every configuration with its recorded history. zip() stops at the
# shorter sequence, so only the runs that actually completed are drawn.
configs = zip(mesh[0].ravel(), mesh[1].ravel())
for (bs, lr), history in zip(configs, histories):
    label = 'bs={}, lr={}'.format(bs,lr)
    for (row, col), (key, title, ylabel) in panels.items():
        values = history[key]
        # The x-axis follows the number of recorded epochs instead of a
        # hard-coded 100, so it stays valid if the epoch count changes.
        ax[row, col].plot(range(len(values)), values, label=label)

for (row, col), (key, title, ylabel) in panels.items():
    ax[row, col].legend()
    ax[row, col].set_title(title)
    ax[row, col].set_xlabel('epoch')
    ax[row, col].set_ylabel(ylabel)
plt.show()