In [1]:
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Example is based on instructions from 
# https://github.com/kach/gradient-descent-the-ultimate-optimizer, 
# which contains the original source code for the Gradient Descent: The Ultimate Optimizer paper,
# and the modified source code for experimental results with MNIST dataset.

class MNIST_FullyConnected(nn.Module):
    """Two-layer fully connected classifier with tanh activations.

    Maps ``num_inp`` input features through a hidden layer of ``num_hid``
    units to ``num_out`` outputs and returns log-probabilities over them.
    """

    def __init__(self, num_inp, num_hid, num_out):
        super().__init__()
        self.layer1 = nn.Linear(num_inp, num_hid)
        self.layer2 = nn.Linear(num_hid, num_out)

    def initialize(self):
        """Re-draw both weight matrices in place with Kaiming-uniform init.

        Biases are left untouched. The layers are processed in order
        (layer1, then layer2), so results are reproducible under a fixed seed.
        """
        negative_slope = math.sqrt(5)
        for layer in (self.layer1, self.layer2):
            nn.init.kaiming_uniform_(layer.weight, a=negative_slope)

    def forward(self, x):
        """Return log-probabilities of shape (batch, num_out) for input ``x``.

        ``x`` must be 2-D (batch, num_inp): the softmax is taken over dim 1.
        """
        hidden = torch.tanh(self.layer1(x))
        # The output layer is also squashed by tanh before the log-softmax.
        squashed = torch.tanh(self.layer2(hidden))
        return F.log_softmax(squashed, dim=1)

# Training configuration.
batch_size = 256
epochs = 30

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# MNIST train/test splits, downloaded into ./data when missing; images are
# converted to tensors on access.
mnist_train = torchvision.datasets.MNIST(
    './data',
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)
mnist_test = torchvision.datasets.MNIST(
    './data',
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

# Training batches are reshuffled every epoch; test data is served in order,
# in batches of up to 10000 samples.
dl_train = torch.utils.data.DataLoader(
    mnist_train,
    batch_size=batch_size,
    shuffle=True,
)
dl_test = torch.utils.data.DataLoader(
    mnist_test,
    batch_size=10000,
    shuffle=False,
)

# 28 * 28 flattened pixels in, 128 hidden units, 10 output classes.
model = MNIST_FullyConnected(28 * 28, 128, 10).to(device)

10.2%

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw


100.0%
2.0%


Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz
Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz


100.0%
100.0%


Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz
Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw



In [3]:
# Initializes the module wrapper with respective optimizer stack

# Commented optimizers correspond to the optimizers used to obtain the results.
# Optimizers in which clip is set to True have gradient clipping enabled.
# Gradient clipping is disabled by default.

# NOTE(review): project-local module; consider moving this import to the first cell with the others.
import gdtuo_gradient_clipping as gdtuo

# Active configuration: SGD / SGD(alpha = 0.01), with no gradient clipping for the second (inner) SGD.
# To try another stack, comment this line out and uncomment exactly one of the alternatives below.
gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=0.01))

# SGD / SGD(alpha = 0.01) with gradient clipping for second SGD
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=0.01, clip=True))

# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=0.05))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=0.05, clip=True))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=0.1))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=0.1, clip=True))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=0.5))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=0.5, clip=True))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=1))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=1, clip=True))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=5))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=5, clip=True))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=10))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=10, clip=True))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=50))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=50, clip=True))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=75))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=75, clip=True))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=100))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=100, clip=True))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=100))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=100, clip=True))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=250))
# gdtuo_optimizer = gdtuo.SGD(optimizer=gdtuo.SGD(alpha=250, clip=True))

# Adam / SGD(alpha = 10^-5) with no gradient clipping for SGD as secondary optimizer
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-5))

# Adam / SGD(alpha = 10^-5) with gradient clipping for SGD as secondary optimizer
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-5, clip=True))

# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-5))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-5, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-4))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-4, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-4))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-4, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-3))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-3, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-3))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-3, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-2))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-2, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-2))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-2, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-1))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-1, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-1))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-1, clip=True))

# AdaGrad / SGD(alpha = 0.01) with no gradient clipping for SGD as secondary optimizer
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=0.01))

# AdaGrad / SGD(alpha = 0.01) with gradient clipping for SGD as secondary optimizer
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=0.01, clip=True))

# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=0.05))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=0.05, clip=True))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=0.1))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=0.1, clip=True))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=0.5))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=0.5, clip=True))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=1))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=1, clip=True))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=2.5))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=2.5, clip=True))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=5))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=5, clip=True))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=7.5))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=7.5, clip=True))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=10))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=10, clip=True))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=50))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=50, clip=True))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=100))
# gdtuo_optimizer = gdtuo.AdaGrad(optimizer=gdtuo.SGD(alpha=100, clip=True))

# RMSProp / SGD(alpha = 10^-5) with no gradient clipping for SGD as secondary optimizer
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(1e-5))

# RMSProp / SGD(alpha = 10^-5) with gradient clipping for SGD as secondary optimizer
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(1e-5, clip=True))

# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(5e-5))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(5e-5, clip=True))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(1e-4))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(1e-4, clip=True))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(5e-4))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(5e-4, clip=True))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(1e-3))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(1e-3, clip=True))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(5e-3))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(5e-3, clip=True))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(1e-2))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(1e-2, clip=True))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(2.5e-2))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(2.5e-2, clip=True))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(5e-2))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(5e-2, clip=True))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(7.5e-2))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(7.5e-2, clip=True))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(1e-1))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(1e-1, clip=True))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(5e-1))
# gdtuo_optimizer = gdtuo.RMSProp(optimizer=gdtuo.SGD(5e-1, clip=True))

# Creates the module wrapper that couples `model` with the selected optimizer stack.
# The training and evaluation cells call `mw` (begin / forward / zero_grad / step) instead of `model`.
mw = gdtuo.ModuleWrapper(model, optimizer=gdtuo_optimizer)
# NOTE(review): presumably sets up the initial model/optimizer state; defined in gdtuo_gradient_clipping — confirm there.
mw.initialize()

In [4]:
def train_model(model, dl_train, num_epochs):
    """Train ``model`` on ``dl_train`` for ``num_epochs`` epochs.

    ``model`` is the gdtuo module wrapper: it must expose ``begin``,
    ``forward``, ``zero_grad`` and ``step``. The optimizer hyperparameters are
    printed once before training and again after every epoch, together with
    the sample-weighted mean training loss of that epoch.

    Args:
        model: wrapped model to train.
        dl_train: iterable of ``(images, labels)`` batches with a ``dataset``
            attribute whose length is the number of training samples.
        num_epochs: number of passes over ``dl_train``.
    """
    print()
    print_model_optimizer_parameters(model)

    for epoch in range(1, num_epochs + 1):
        running_loss = 0.0
        for batch_images, batch_labels in dl_train:
            # before each step, enables gradient tracking on desired parameters
            model.begin()

            # Flatten each image to a 784-vector and move the batch to `device`.
            features = batch_images.reshape(-1, 28 * 28).to(device)
            labels = batch_labels.to(device)

            loss = F.nll_loss(model.forward(features), labels)

            model.zero_grad()
            # create_graph=True keeps the backward pass differentiable, so the
            # optimizer stack can differentiate through the update itself.
            loss.backward(create_graph=True)
            model.step()

            # Weight the batch-mean loss by batch size (the last batch may be smaller).
            running_loss += loss.item() * batch_images.shape[0]

        train_loss = running_loss / len(dl_train.dataset)
        print("\nepoch: {}, train loss: {}".format(epoch, train_loss))
        print_model_optimizer_parameters(model)

# Function to output the model's primary optimizer's hyperparameter values
# For example, if the stack was Adam / SGD, it would output the hyperparameter values of Adam
def print_model_optimizer_parameters(model):
    """Print the hyperparameters of the model's top-level optimizer.

    Reads ``model.optimizer``. For a ``gdtuo.NoOpOptimizer`` a notice is
    printed and nothing else. Otherwise one header line with the optimizer
    type is printed, followed by a single line of tab-terminated
    ``name: value`` pairs taken from ``optimizer.parameters``.
    """
    optimizer = model.optimizer
    if isinstance(optimizer, gdtuo.NoOpOptimizer):
        print("No optimizer passed into gdtuo model.")
        return

    # The first matching class, in this order, names the optimizer. Classes are
    # looked up one at a time so later names are only resolved when needed.
    known_optimizers = (
        "SGD",
        "SGDPerParam",
        "AdaGrad",
        "RMSProp",
        "RMSPropAlpha",
        "Adam",
        "AdamBaydin",
    )
    optimizer_type = "unknown"
    for class_name in known_optimizers:
        if isinstance(optimizer, getattr(gdtuo, class_name)):
            optimizer_type = class_name
            break

    print("{} optimizer parameters:".format(optimizer_type))

    hyperparameters = optimizer.parameters
    for name in hyperparameters:
        shown = hyperparameters[name]
        if name == "alpha":
            # The RMSProp variants report alpha squared — presumably the stored
            # value is its square root; confirm against the gdtuo module.
            if isinstance(optimizer, gdtuo.RMSProp) or isinstance(optimizer, gdtuo.RMSPropAlpha):
                shown = torch.square(shown)
        if name in ("beta1", "beta2"):
            # Shown after Adam's clamp rather than as the raw stored value.
            shown = gdtuo.Adam.clamp(shown)
        if name == "gamma":
            # Shown after RMSProp's clamp rather than as the raw stored value.
            shown = gdtuo.RMSProp.clamp(shown)
        print("{}: {}\t".format(name, shown), end="")
    print()

In [5]:
# Train the wrapped model for `epochs` epochs; the loss and the optimizer's
# hyperparameters are printed after every epoch.
train_model(mw, dl_train, epochs)


SGD optimizer parameters:
alpha: 0.009999999776482582	mu: 0.0	


  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass



epoch: 1, train loss: 1.3784865905125936
SGD optimizer parameters:
alpha: 0.1438463181257248	mu: 0.0	

epoch: 2, train loss: 1.0674078020095825
SGD optimizer parameters:
alpha: 0.13962674140930176	mu: 0.0	

epoch: 3, train loss: 1.022485931332906
SGD optimizer parameters:
alpha: 0.13056053221225739	mu: 0.0	

epoch: 4, train loss: 1.0005578675270081
SGD optimizer parameters:
alpha: 0.12352947145700455	mu: 0.0	

epoch: 5, train loss: 0.9862431994438171
SGD optimizer parameters:
alpha: 0.11825825273990631	mu: 0.0	

epoch: 6, train loss: 0.9753477774937948
SGD optimizer parameters:
alpha: 0.11621797829866409	mu: 0.0	

epoch: 7, train loss: 0.9663915136655171
SGD optimizer parameters:
alpha: 0.11194553971290588	mu: 0.0	

epoch: 8, train loss: 0.9586406548182169
SGD optimizer parameters:
alpha: 0.10790853947401047	mu: 0.0	

epoch: 9, train loss: 0.9520755769411723
SGD optimizer parameters:
alpha: 0.1033453419804573	mu: 0.0	

epoch: 10, train loss: 0.9462834394772848
SGD optimizer parameters

In [6]:
def get_accuracy_error(model, dl_data):
    """Compute classification accuracy and error of ``model`` over a data loader.

    Args:
        model: object exposing ``forward(features)`` that returns per-class
            scores of shape (batch, num_classes); here the gdtuo module wrapper.
        dl_data: iterable of ``(features, labels)`` batches with a ``dataset``
            attribute whose length is the total number of samples.

    Returns:
        Tuple ``(accuracy, error)`` of floats in [0, 1] with
        ``error == 1 - accuracy``.

    Raises:
        ZeroDivisionError: if ``dl_data.dataset`` is empty.
    """
    num_correct_classifications = 0
    num_datapoints = len(dl_data.dataset)

    # Evaluation never calls backward(); disabling autograd stops each forward
    # pass from recording a graph that would only cost memory and time.
    with torch.no_grad():
        for features_, labels_ in dl_data:
            features = torch.reshape(features_, (-1, 28 * 28)).to(device)
            labels = labels_.to(device)
            prediction = model.forward(features)
            # Compare the whole batch at once instead of looping over rows in
            # Python (one tensor-to-bool sync per sample).
            predicted_labels = torch.argmax(prediction, dim=1)
            num_correct_classifications += (predicted_labels == labels).sum().item()

    accuracy = num_correct_classifications / num_datapoints
    error = 1 - accuracy

    return accuracy, error

In [7]:
print("results from model")

# Outputs the accuracy and error of the trained model (wrapped with the
# selected optimizer stack) as percentages, first on the training split and
# then on the held-out test split.

train_accuracy, train_error = get_accuracy_error(mw, dl_train)
print("train accuracy: {}%".format(train_accuracy * 100))
print("train error: {}%".format(train_error * 100))

test_accuracy, test_error = get_accuracy_error(mw, dl_test)
print("test accuracy: {}%".format(test_accuracy * 100))
print("test error: {}%".format(test_error * 100))

print()

# Outputs gradient information of the primary optimizer's parameters
# (maximum and average gradient norm per hyperparameter, as shown below).
mw.optimizer.print_parameter_gradient_info()

results from model
train accuracy: 95.325%
train error: 4.674999999999995%
test accuracy: 95.04%
test error: 4.959999999999997%

sgd / sgd / parameter gradient info:
parameter alpha:
	maximum gradient norm: 0.4152931272983551
	average gradient norm: 0.006751393433660269
parameter mu:
	maximum gradient norm: 0.0
	average gradient norm: 0.0
