In [None]:
import math
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Example is based on instructions from
# https://github.com/kach/gradient-descent-the-ultimate-optimizer,
# which contains the original source code for the Gradient Descent: The Ultimate Optimizer paper,
# and the modified source code for experimental results with the CIFAR-10 dataset.

class CIFAR10_CNN(nn.Module):
    """Small CNN classifier: two unpadded 3x3 convolutions, then two linear layers.

    Args:
        inp_dim: Input shape as ``(channels, height, width)``. The channel
            count feeds the first convolution; height and width are used to
            size the first linear layer.
        num_hid: Number of units in the hidden linear layer.
        num_out: Number of output classes.
    """

    def __init__(self, inp_dim, num_hid, num_out):
        super().__init__()

        self.conv1_filters = 16
        self.conv1_kernel_size = (3,3)
        # Use the channel count from inp_dim rather than a hard-coded 3 so that
        # the declared input shape and the first layer always agree.
        self.conv1 = nn.Conv2d(inp_dim[0], self.conv1_filters, self.conv1_kernel_size)

        # A stride-1, unpadded convolution shrinks each spatial dim by (kernel - 1).
        self.conv1_updated_dim_h = inp_dim[1] - self.conv1_kernel_size[0] + 1
        self.conv1_updated_dim_w = inp_dim[2] - self.conv1_kernel_size[1] + 1

        self.conv2_filters = 16
        self.conv2_kernel_size = (3,3)
        self.conv2 = nn.Conv2d(self.conv1_filters, self.conv2_filters, self.conv2_kernel_size)

        self.conv2_updated_dim_h = self.conv1_updated_dim_h - self.conv2_kernel_size[0] + 1
        self.conv2_updated_dim_w = self.conv1_updated_dim_w - self.conv2_kernel_size[1] + 1

        # Flattened size of the second convolution's output feeds the first linear layer.
        self.linear1 = nn.Linear(int(self.conv2_filters * self.conv2_updated_dim_h * self.conv2_updated_dim_w), num_hid)
        self.linear2 = nn.Linear(num_hid, num_out)

    def initialize(self):
        """Re-initialize the weights of all four layers with Kaiming-uniform (a=sqrt(5)).

        Only the weight tensors are touched; biases keep whatever values they had.
        """
        nn.init.kaiming_uniform_(self.conv1.weight, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.conv2.weight, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.linear1.weight, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.linear2.weight, a=math.sqrt(5))

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: Batch whose per-sample shape matches ``inp_dim``.

        Returns:
            Tensor with one row per sample; ``log_softmax`` is taken over dim 1.
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))

        # Flatten everything except the batch dimension.
        x = x.view(x.size(0), -1)
        x = self.linear1(x)
        x = torch.tanh(x)
        x = self.linear2(x)
        # tanh bounds the logits to [-1, 1] before the softmax; kept as in the original.
        x = torch.tanh(x)
        x = F.log_softmax(x, dim=1)

        return x

def get_means_stds(dataset, h, w):
    """Compute per-channel mean and standard deviation over a whole image dataset.

    Args:
        dataset: Iterable of ``(image, label)`` pairs where every ``image`` is a
            tensor of shape ``(channels, h, w)``.
        h: Image height.
        w: Image width.

    Returns:
        ``(means, stds)``: two lists of Python floats, one entry per channel.
    """
    images = [image for image, _label in dataset]

    # Stack to (num_points, num_channels, h, w). The reshape keeps that layout
    # and only validates h and w against the data.
    stacked = torch.stack(images)
    num_points, num_channels = stacked.shape[0], stacked.shape[1]
    stacked = stacked.reshape((num_points, num_channels, h, w))

    # Index the channel axis directly. Reshaping to (channels, points, h, w)
    # would NOT move the axes and would mix pixels from different channels
    # into each slice.
    means = [stacked[:, channel].mean().item() for channel in range(num_channels)]
    stds = [stacked[:, channel].std().item() for channel in range(num_channels)]

    return means, stds

# Run configuration shared by the cells below.
# NOTE(review): no random seed is set in the cells shown, so shuffling and
# weight initialization will differ between runs.
batch_size = 256
epochs = 5
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Per-channel normalization applied after ToTensor().
# NOTE(review): the three channel values are nearly identical, which is unusual
# for RGB statistics. Commonly cited CIFAR-10 values are roughly
# mean (0.491, 0.482, 0.447) and std (0.247, 0.243, 0.262) - confirm whether
# these constants are intended before changing them, since doing so alters results.
normalize = torchvision.transforms.Normalize(mean=[0.474, 0.473, 0.473], std=[0.252, 0.252, 0.251])
transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor(), normalize])

# Downloads the dataset into ./data if it is not already present.
cifar10_train = torchvision.datasets.CIFAR10('./data', train=True, download=True, transform=transforms)
cifar10_test = torchvision.datasets.CIFAR10('./data', train=False, download=True, transform=transforms)

# Optional: recompute the normalization statistics.
# NOTE: cifar10_train already applies `normalize`, so running this as written
# measures the normalized data; use a ToTensor()-only dataset for raw statistics.
# means, std = get_means_stds(cifar10_train, 32, 32)
# print("means: {}".format(means))
# print("std: {}".format(std))

# Training batches are shuffled; evaluation uses large, unshuffled batches.
dl_train = torch.utils.data.DataLoader(cifar10_train, batch_size=batch_size, shuffle=True)
dl_test = torch.utils.data.DataLoader(cifar10_test, batch_size=10000, shuffle=False)

# (channels, height, width) input, 128 hidden units, 10 output classes.
model = CIFAR10_CNN((3, 32, 32), 128, 10).to(device)

In [None]:
# Build the optimizer stack and wrap the model with it.

# Optimizers used to obtain the results.
# Optimizers in which clip is set to True have gradient clipping enabled.
# Gradient clipping is disabled by default.

# NOTE(review): project-local module imported mid-notebook; it must be
# importable from the notebook's working directory.
import gdtuo_gradient_clipping as gdtuo

# Exactly one `gdtuo_optimizer = ...` line below should be active; the
# commented lines are the alternative configurations.

# Adam as primary optimizer, no secondary optimizer
# gdtuo_optimizer = gdtuo.Adam()

# Adam / SGD(alpha = 10^-9) with no gradient clipping for SGD as secondary optimizer
gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-9))

# Adam / SGD(alpha = 10^-9) with gradient clipping for SGD as secondary optimizer
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-9, clip=True))

# Same stack with other secondary-SGD step sizes, without and with clipping
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-8))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-8, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-7))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-7, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-6))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-6, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-5))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-5, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-4))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-4, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=2.5e-3))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=2.5e-3, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-3))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=5e-3, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-3))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-3, clip=True))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-2))
# gdtuo_optimizer = gdtuo.Adam(optimizer=gdtuo.SGD(alpha=1e-2, clip=True))

# Creates module wrapper with optimizer stack functionality
mw = gdtuo.ModuleWrapper(model, optimizer=gdtuo_optimizer)
# NOTE(review): presumably initializes the wrapped model and optimizer state;
# confirm against the gdtuo_gradient_clipping source.
mw.initialize()

In [None]:
def train_model(model, dl_train, num_epochs):
    """Train a wrapped model for ``num_epochs`` passes over ``dl_train``.

    Prints the primary optimizer's hyperparameters before training and again
    after every epoch, together with the sample-weighted mean training loss.

    Args:
        model: Wrapper exposing ``begin``, ``forward``, ``zero_grad``, ``step``
            and an ``optimizer`` attribute.
        dl_train: Iterable of ``(features, labels)`` batches with a ``dataset``
            attribute supporting ``len``.
        num_epochs: Number of epochs to run.
    """
    print()
    print_model_optimizer_parameters(model)

    for epoch in range(1, num_epochs + 1):
        running_loss = 0.0

        for batch_inputs, batch_targets in dl_train:
            # Must run before each step: enables gradient tracking on the
            # parameters the optimizer stack will update.
            model.begin()

            inputs = batch_inputs.to(device)
            targets = batch_targets.to(device)
            loss = F.nll_loss(model.forward(inputs), targets)

            model.zero_grad()
            # create_graph=True keeps the backward pass differentiable.
            loss.backward(create_graph=True)
            model.step()

            # Weight by batch size so a smaller final batch is averaged correctly.
            running_loss += loss.item() * batch_inputs.size(0)

        train_loss = running_loss / len(dl_train.dataset)
        print("\nepoch: {}, train loss: {}".format(epoch, train_loss))
        print_model_optimizer_parameters(model)

# Prints the hyperparameter values of the model's primary optimizer.
# For an Adam / SGD stack this reports Adam's hyperparameters.
def print_model_optimizer_parameters(model):
    """Print the name and current hyperparameters of ``model.optimizer``.

    Some values are transformed before printing: ``alpha`` is squared for
    RMSProp / RMSPropAlpha, ``beta1`` / ``beta2`` are passed through
    ``gdtuo.Adam.clamp``, and ``gamma`` through ``gdtuo.RMSProp.clamp``.

    Args:
        model: Object with an ``optimizer`` attribute; that optimizer exposes a
            ``parameters`` mapping of hyperparameter name to value.
    """
    optimizer = model.optimizer

    if isinstance(optimizer, gdtuo.NoOpOptimizer):
        print("No optimizer passed into gdtuo model.")
        return

    # Checked in this order; the first matching class supplies the label.
    # Classes are looked up lazily so nothing past the first match is touched.
    known_names = (
        "SGD",
        "SGDPerParam",
        "AdaGrad",
        "RMSProp",
        "RMSPropAlpha",
        "Adam",
        "AdamBaydin",
    )
    optimizer_type = next(
        (name for name in known_names if isinstance(optimizer, getattr(gdtuo, name))),
        "unknown",
    )

    print("{} optimizer parameters:".format(optimizer_type))

    hyperparameters = optimizer.parameters

    for key in hyperparameters:
        shown = hyperparameters[key]

        if key == "alpha":
            if isinstance(optimizer, gdtuo.RMSProp) or isinstance(optimizer, gdtuo.RMSPropAlpha):
                shown = torch.square(shown)
        elif key == "beta1" or key == "beta2":
            shown = gdtuo.Adam.clamp(shown)
        elif key == "gamma":
            shown = gdtuo.RMSProp.clamp(shown)

        print("{}: {}\t".format(key, shown), end="")

    print()

In [None]:
train_model(mw, dl_train, epochs)

In [None]:
def get_accuracy_error(model, dl_data):
    """Compute classification accuracy and error rate of ``model`` on ``dl_data``.

    Args:
        model: Object whose ``forward(features)`` returns one row of class
            scores per sample.
        dl_data: Iterable of ``(features, labels)`` batches with a ``dataset``
            attribute supporting ``len``.

    Returns:
        ``(accuracy, error)`` as floats in [0, 1], with ``error == 1 - accuracy``.

    Raises:
        ZeroDivisionError: If ``dl_data.dataset`` is empty.
    """
    num_correct_classifications = 0
    num_datapoints = len(dl_data.dataset)

    # Evaluation needs no gradients; disabling autograd avoids building a graph
    # for every (potentially very large) evaluation batch.
    with torch.no_grad():
        for features_, labels_ in dl_data:
            features, labels = features_.to(device), labels_.to(device)
            prediction = model.forward(features)

            # Predicted class per sample, compared against all labels at once
            # instead of one Python-level comparison per row.
            predicted_labels = prediction.flatten(start_dim=1).argmax(dim=1)
            num_correct_classifications += (predicted_labels == labels.reshape(-1)).sum().item()

    accuracy = num_correct_classifications / num_datapoints
    error = 1 - accuracy

    return accuracy, error

In [None]:
print("results from model")

# Reports accuracy and error, as percentages, for the model trained with the
# configured optimizer stack. The training loader is reused for the train-set
# figures, so its batches arrive shuffled; this does not affect the totals.

train_accuracy, train_error = get_accuracy_error(mw, dl_train)
print("train accuracy: {}%".format(train_accuracy * 100))
print("train error: {}%".format(train_error * 100))

test_accuracy, test_error = get_accuracy_error(mw, dl_test)
print("test accuracy: {}%".format(test_accuracy * 100))
print("test error: {}%".format(test_error * 100))

print()

# Outputs gradient information of the primary optimizer's parameters.
# NOTE(review): print_parameter_gradient_info is defined in the project-local
# gdtuo_gradient_clipping module; its exact output is not visible here.
mw.optimizer.print_parameter_gradient_info()