In [1]:
import torch
from torch import empty, zeros
import math
import copy

In [2]:
torch.manual_seed(1)

<torch._C.Generator at 0x7f3cc2317f90>

In [3]:
class Module(object):
    def __init__(self):
        pass
        
    def forward(self, *input_):
        raise NotImplementedError
        
    def backward(self, *gradwrtoutput):
        raise NotImplementedError
        
    def param(self):
        return []
    
    def update(self, lr):
        pass
    
    def zero_grad(self):
        pass
    
    def init_params(self, xavier_init, xavier_gain):
        pass


In [4]:
class Linear(Module):
    def __init__(self, N_in, N_out, xavier_init=False, xavier_gain=1):
        Module.__init__(self)
        self.N_in = N_in
        self.N_out = N_out
        
        self.init_params(xavier_init, xavier_gain)
        
        self.gradW = zeros(self.W.shape)
        self.gradb = zeros(self.b.shape)
        
        self.m_weights = zeros(self.gradW.shape)
        self.m_bias = zeros(self.gradb.shape)
        
        self.v_weights = zeros(self.gradW.shape)
        self.v_bias = zeros(self.gradb.shape)
    
    def init_params(self, xavier_init=True, xavier_gain=1):
        if xavier_init:
            xavier_std = xavier_gain * math.sqrt(2.0 / (self.N_in + self.N_out))
        else:
            xavier_std = 1
            
        self.W = empty((self.N_in, self.N_out)).normal_(0, xavier_std)
        self.b = empty((1, self.N_out)).normal_(0, xavier_std)
    
    def forward(self, *input_):
        # out = W * input + b
        x = input_[0].clone()
        
        self.x = x
        
        return self.x.mm(self.W) + self.b
        
    def backward(self, *gradwrtoutput):
        # grad_w += input * x^(l-1).t()
        # grad_b += input
        # out = w.t() * input
        # input = grad of activation function, i.e. dl/ds^(l)
        # x^(l-1) = input of the forward pass
        input_ = gradwrtoutput[0].clone()
        
        self.gradW += self.x.t().mm(input_)
        self.gradb += input_.sum(0)
        
        return input_.mm(self.W.t())
        
    def param(self):
        return [(self.W, self.gradW, self.m_weights, self.v_weights), (self.b, self.gradb, self.m_bias, self.v_bias)]
    
    def update(self, lr):
        self.W.sub_(lr * self.gradW)
        self.b.sub_(lr * self.gradb)
    
    def zero_grad(self):
        self.gradW = zeros(self.W.shape)
        self.gradb = zeros(self.b.shape)

In [5]:
class ReLU(Module):
    def __init__(self, ):
        Module.__init__(self)
        
    def forward(self, *input_):
        s = input_[0].clone()
        self.s = s
        
        s[s < 0] = 0.
        
        return s
        
    def backward(self, *gradwrtoutput):
        # out = f'(s^(l)) * input
        # s^(l) = input of forward pass
        # input = grad of next layer
        input_ = gradwrtoutput[0].clone()
        
        out = self.s.clone()
        out[out > 0] = 1
        out[out < 0] = 0
        
        
        return out.mul(input_)

In [6]:
class LeakyReLU(Module):
    def __init__(self, alpha=0.01):
        Module.__init__(self)
        self.alpha = alpha
        
    def forward(self, *input_):
        s = input_[0].clone()
        self.s = s
        
        s[s < 0] = self.alpha * s[s < 0]
        
        return s
        
    def backward(self, *gradwrtoutput):
        # out = f'(s^(l)) * input
        # s^(l) = input of forward pass
        # input = grad of next layer
        input_ = gradwrtoutput[0].clone()
        
        out = self.s.clone()
        out[out > 0] = 1
        out[out < 0] = self.alpha
        
        
        return out.mul(input_)

In [7]:
class Tanh(Module):
    def __init__(self):
        Module.__init__(self)
        
    def forward(self, *input_):
        s = input_[0].clone()
        self.s = s
        
        return s.tanh()
        
    def backward(self, *gradwrtoutput):
        # out = f'(s^(l)) * input
        # s^(l) = input of forward pass
        # input = grad of next layer
        input_ = gradwrtoutput[0].clone()
        
        out = self.s.clone()
        out = 1 - out.tanh().pow(2)
        
        return out.mul(input_)

In [8]:
class Sigmoid(Module):
    def __init__(self):
        Module.__init__(self)
        
    def forward(self, *input_):
        s = input_[0].clone()
        self.s = s
        
        return s.sigmoid()
        
    def backward(self, *gradwrtoutput):
        # out = f'(s^(l)) * input
        # s^(l) = input of forward pass
        # input = grad of next layer
        input_ = gradwrtoutput[0].clone()
        
        out = self.s.clone()
        out = out.sigmoid() * (1 - out.sigmoid())
        
        return out.mul(input_)

In [9]:
class Sequential(Module):
    def __init__(self, *modules, xavier_init=None, xavier_gain=1):
        Module.__init__(self)
        self.modules = modules
        self.xavier_init = xavier_init
        if xavier_init is not None:
            for module in self.modules:
                module.init_params(xavier_init, xavier_gain)
        
    def forward(self, *input_):
        x = input_[0].clone()
        
        for m in self.modules:
            x = m.forward(x)
        
        return x
        
    def backward(self, *gradwrtoutput):
        x = gradwrtoutput[0].clone()
        
        for i, m in enumerate(reversed(self.modules)):
            #print("{} : {}".format(i, x))
            x = m.backward(x)
        
    def param(self):
        params = []
        
        for m in self.modules:
            for param in m.param():
                params.append(param)
        
        return params

    def update(self, lr):
        for m in self.modules:
            m.update(lr)
    
    def zero_grad(self):
        for m in self.modules:
            m.zero_grad()

In [10]:
"""class Loss: 
    ...
"""
class LossMSE(Module):
    def __init__(self):
        Module.__init__(self)
        
    def forward(self, y, target):
        # out = e^2
        # e = (y - f(x))
        self.y = y.clone()
        target_onehot = zeros((target.shape[0], 2)) 
        self.target = target_onehot.scatter_(1, target.view(-1, 1), 1)
        
        self.e = (self.y - self.target)
        self.n = self.y.size(0)
        
        
        return self.e.pow(2).sum()
        
    def backward(self):
        # out = 2 * e
        
        return 2 * self.e # / self.n        

In [11]:
class LossCrossEntropy(Module):
    def __init__(self):
        Module.__init__(self)
        
    def forward(self, y, target):
        self.y = y.clone()
        self.target = target.clone()
        sm = torch.softmax(self.y, dim=1)
        likelihood = -torch.log(torch.clamp(sm[range(target.size(0)), target], min=1e-3, max=None))
        return likelihood.mean()
        
    def backward(self):
        sm = torch.softmax(self.y, dim=1)
        target_onehot = zeros((self.target.shape[0], 2)) 
        target_onehot = target_onehot.scatter_(1, self.target.view(-1, 1), 1)
        return sm - target_onehot

In [12]:
class Optimizer:
    def __init__(self, model, nb_epochs, mini_batch_size, lr, criterion):
        self.model = model
        self.nb_epochs = nb_epochs
        self.lr = lr
        self.mini_batch_size = mini_batch_size
        self.criterion = criterion
    
    def train(self, train_input, train_target, verbose=True):
        for e in range(self.nb_epochs):
            sum_loss = 0.
        
            for b in range(0, train_input.size(0), self.mini_batch_size):
                self.model.zero_grad()
                
                output = self.model.forward(train_input.narrow(0, b, self.mini_batch_size))
                loss = self.criterion.forward(output, train_target.narrow(0, b, self.mini_batch_size))
            
                sum_loss += loss
            
                l_grad = self.criterion.backward()
                self.model.backward(l_grad)
                self.step()
                
            if verbose:
                print("{} iteration: loss={}".format(e, sum_loss))
        return self.model
    
    def step(self):
        raise NotImplementedError

class SGD(Optimizer):
    def __init__(self, model, nb_epochs = 50, mini_batch_size=1, lr=1e-4, criterion=LossMSE()):
        Optimizer.__init__(self, model, nb_epochs, mini_batch_size, lr, criterion)
    
    def step(self):
        self.model.update(self.lr)

In [13]:
class Adam(Optimizer):
    def __init__(self, model, nb_epochs = 50, mini_batch_size=1, lr=1e-3, criterion=LossMSE(), 
                 b1=0.9, b2=0.999, epsilon=1e-8):
        Optimizer.__init__(self, model, nb_epochs, mini_batch_size, lr, criterion)
        
        self.b1 = b1
        self.b2 = b2
        self.epsilon = epsilon
        self.t = 0
    
    def step(self):
        self.t += 1
        
        for (param, grad, m, v) in self.model.param():
            g = grad.clone()
            
            m = self.b1 * m + (1 - self.b1) * g
            v = self.b2 * v + (1 - self.b2) * g.pow(2)
            
            m_biasc = m / (1 - self.b1 ** self.t)
            v_biasc = v / (1 - self.b2 ** self.t)
            
            param.sub_(self.lr * m_biasc/(v_biasc.sqrt() + self.epsilon))

In [14]:
class Evaluator:
    def __init__(self, model):
        self.model = model
    
    def test(self, test_input, test_target):
        num_samples = test_input.size(0)
        prediction = self.model.forward(test_input)
        self.model.zero_grad()
        predicted_class = torch.argmax(prediction, axis=1)
        accuracy = sum(predicted_class == test_target).float() / num_samples
        return accuracy

In [15]:
class CrossValidate:
    def __init__(self, model, optimizer):
        self.model = model
        self.optimizer = optimizer
        self.best_params = None
        
    def cross_validate(self, k, values, verbose=True):
        pass
    
    def set_params(self):
        pass
    
    def train(self, train_input, train_target, verbose=True):
        self.optimizer.train(train_input, train_target, verbose)

In [26]:
class SGDCV(CrossValidate):
    def __init__(self, model, nb_epochs = 50, mini_batch_size=1, lr=1e-4, criterion=LossMSE()):
        optimizer = SGD(model=model, nb_epochs=nb_epochs, mini_batch_size=mini_batch_size, 
                     lr=lr, criterion=criterion)
        CrossValidate.__init__(self, model, optimizer)
        
    def cross_validate(self, k=5, values={"lr": [1e-5, 1e-4, 1e-3, 1e-2]}, verbose=True):
        train_datasets = []
        test_datasets = []
        for i in range(k):
            train_datasets.append(generate_disc_set(1000))
            test_datasets.append(generate_disc_set(1000))
        
        if "lr" not in values:
            raise ValueError("Expected learning rate values to cross-validate...")
        
        possible_lrs = values["lr"]
        
        score_means = []
        score_vars = []
        for lr in possible_lrs:
            if verbose:
                print("Validating (lr={})... ".format(lr), end='')
            
            scores = []
            
            optim = SGD(model=copy.deepcopy(self.model), nb_epochs=self.optimizer.nb_epochs, mini_batch_size=self.optimizer.mini_batch_size, 
                     lr=lr, criterion=self.optimizer.criterion)
            
            for (train_input, train_target), (test_input, test_target) in zip(train_datasets, test_datasets):
                optim.model = copy.deepcopy(self.model)
                
                trained_model = optim.train(train_input, train_target, verbose=False)
                
                evaluator = Evaluator(optim.model)
                accuracy = evaluator.test(test_input, test_target)
                
                scores.append(accuracy)
            
            scores = torch.FloatTensor(scores)
            scores_mean = torch.mean(scores).item()
            scores_var = torch.std(scores).item()
            
            score_means.append(scores_mean)
            score_vars.append(scores_var)
            
            if verbose:
                print("Score : {0:.3f} (+/- {1:.3f}) ".format(scores_mean, scores_var))
                
        best_score = {}
        
        i = max(enumerate(score_means), key=lambda x: x[1])[0]
        
        best_score["lr"] = possible_lrs[i]
        best_score["mean"] = score_means[i]
        best_score["std"] = score_vars[i]
        
        self.best_params = best_score
        
        return dict(zip(possible_lrs, zip(score_means, score_vars))), best_score
    
        
    def set_params(self):
        if self.best_params is not None:
            lr = self.best_params["lr"]
            
            self.optimizer = SGD(model=self.model, nb_epochs=self.optimizer.nb_epochs, mini_batch_size=self.optimizer.mini_batch_size, 
                     lr=lr, criterion=self.optimizer.criterion)

In [23]:
class AdamCV(CrossValidate):
    def __init__(self, model, nb_epochs = 50, mini_batch_size=1, lr=1e-4, criterion=LossMSE(),
                b1=0.9, b2=0.999, epsilon=1e-8):
        optimizer = Adam(model=model, nb_epochs=nb_epochs, mini_batch_size=mini_batch_size, 
                              lr=lr, criterion=criterion, b1=b1, b2=b2, epsilon=epsilon)
        CrossValidate.__init__(self, model, optimizer)
        
    def cross_validate(self, k=5, values={"lr": [1e-5, 1e-4, 1e-3, 1e-2], "b1": [0.9], 
                                          "b2": [0.999], "epsilon": [1e-8]}, verbose=True):
        train_datasets = []
        test_datasets = []
        for i in range(k):
            train_datasets.append(generate_disc_set(1000))
            test_datasets.append(generate_disc_set(1000))
        
        if "lr" not in values or "b1" not in values or "b1" not in values or "epsilon" not in values:
            raise ValueError("Expected learning rate values to cross-validate...")
            
        if "b1" not in values:
            raise ValueError("Expected b1 values to cross-validate...")

        if "b2" not in values:
            raise ValueError("Expected b2 values to cross-validate...")

        if "epsilon" not in values:
            raise ValueError("Expected epsilon values to cross-validate...")
        
        lrs = values["lr"]
        b1s = values["b1"]
        b2s = values["b2"]
        epsilons = values["epsilon"]
        param_grid = [(lr, b1, b2, epsilon)
                        for lr in lrs
                        for b1 in b1s
                        for b2 in b2s
                        for epsilon in epsilons]
        
        score_means = []
        score_vars = []
        for (lr, b1, b2, epsilon) in param_grid:
            if verbose:
                print("Validating (lr={}, b1={}, b2={}, epsilon={})... ".format(lr, b1, b2, epsilon), end='')
            
            scores = []
            
            optim = Adam(model=copy.deepcopy(self.model), nb_epochs=self.optimizer.nb_epochs, mini_batch_size=self.optimizer.mini_batch_size, 
                     lr=lr, criterion=self.optimizer.criterion,  b1=b1, b2=b2, epsilon=epsilon)
            
            for (train_input, train_target), (test_input, test_target) in zip(train_datasets, test_datasets):
                optim.model = copy.deepcopy(self.model)
                trained_model = optim.train(train_input, train_target, verbose=False)
                
                evaluator = Evaluator(optim.model)
                accuracy = evaluator.test(test_input, test_target)
                
                scores.append(accuracy)
            
            scores = torch.FloatTensor(scores)
            scores_mean = torch.mean(scores).item()
            scores_var = torch.std(scores).item()
            
            score_means.append(scores_mean)
            score_vars.append(scores_var)
            
            if verbose:
                print("Score : {0:.3f} (+/- {1:.3f}) ".format(scores_mean, scores_var))
                
        best_score = {}
        
        i = max(enumerate(score_means), key=lambda x: x[1])[0]
        
        best_score["lr"] = param_grid[i][0]
        best_score["b1"] = param_grid[i][1]
        best_score["b2"] = param_grid[i][2]
        best_score["epsilon"] = param_grid[i][3]
        best_score["mean"] = score_means[i]
        best_score["std"] = score_vars[i]
        
        self.best_params = best_score
        
        return dict(zip(param_grid, zip(score_means, score_vars))), best_score

            
    def set_params(self):
        if self.best_params is not None:
            lr = self.best_params["lr"]
            b1 = self.best_params["b1"]
            b2 = self.best_params["b2"]
            epsilon = self.best_params["epsilon"]
            
            self.optimizer = Adam(model=self.model, nb_epochs=self.optimizer.nb_epochs, mini_batch_size=self.optimizer.mini_batch_size, 
                     lr=lr, criterion=self.optimizer.criterion,  b1=b1, b2=b2, epsilon=epsilon)

In [18]:
def generate_disc_set(nb):
    input = empty(nb, 2).uniform_(-1, 1)
    target = input.pow(2).sum(1).sub(2 / math.pi).sign().add(1).div(2).long()
    return input, target

train_input, train_target = generate_disc_set(1000)
test_input, test_target = generate_disc_set(1000)

In [20]:
model = Sequential(Linear(2, 25), ReLU(),
                   Linear(25, 25), ReLU(),
                   Linear(25, 25), ReLU(),
                   Linear(25, 2))

optimizer = AdamCV(model, mini_batch_size=100, criterion=LossCrossEntropy())

values = {"lr": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2], "b1": [0.9, 0.8], 
          "b2": [0.999, 0.888], "epsilon": [1e-8, 1e-7, 1e-6]}

optimizer.cross_validate(k=3, values=values)

optimizer.set_params()

optimizer.train(train_input, train_target, verbose=False)

evaluator = Evaluator(model)

evaluator.test(test_input, test_target)

Validating (lr=1e-06, b1=0.9, b2=0.999, epsilon=1e-08)... Score : 0.754 (+/- 0.001) 
Validating (lr=1e-06, b1=0.9, b2=0.999, epsilon=1e-07)... Score : 0.754 (+/- 0.001) 
Validating (lr=1e-06, b1=0.9, b2=0.999, epsilon=1e-06)... Score : 0.754 (+/- 0.001) 
Validating (lr=1e-06, b1=0.9, b2=0.888, epsilon=1e-08)... Score : 0.748 (+/- 0.003) 
Validating (lr=1e-06, b1=0.9, b2=0.888, epsilon=1e-07)... Score : 0.748 (+/- 0.003) 
Validating (lr=1e-06, b1=0.9, b2=0.888, epsilon=1e-06)... Score : 0.748 (+/- 0.003) 
Validating (lr=1e-06, b1=0.8, b2=0.999, epsilon=1e-08)... Score : 0.763 (+/- 0.004) 
Validating (lr=1e-06, b1=0.8, b2=0.999, epsilon=1e-07)... Score : 0.763 (+/- 0.004) 
Validating (lr=1e-06, b1=0.8, b2=0.999, epsilon=1e-06)... Score : 0.763 (+/- 0.004) 
Validating (lr=1e-06, b1=0.8, b2=0.888, epsilon=1e-08)... Score : 0.748 (+/- 0.005) 
Validating (lr=1e-06, b1=0.8, b2=0.888, epsilon=1e-07)... Score : 0.748 (+/- 0.005) 
Validating (lr=1e-06, b1=0.8, b2=0.888, epsilon=1e-06)... Score :

tensor(0.9810)

In [27]:
model = Sequential(Linear(2, 25), ReLU(),
                   Linear(25, 25), ReLU(),
                   Linear(25, 25), ReLU(),
                   Linear(25, 2))

optimizer = SGDCV(model, mini_batch_size=100, criterion=LossCrossEntropy())

values = {"lr": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]}

optimizer.cross_validate(k=3, values=values)

optimizer.set_params()

optimizer.train(train_input, train_target, verbose=False)

evaluator = Evaluator(model)

evaluator.test(test_input, test_target)

Validating (lr=1e-06)... Score : 0.844 (+/- 0.011) 
Validating (lr=1e-05)... Score : 0.969 (+/- 0.008) 
Validating (lr=0.0001)... Score : 0.981 (+/- 0.003) 
Validating (lr=0.001)... Score : 0.972 (+/- 0.005) 
Validating (lr=0.01)... Score : 0.514 (+/- 0.013) 


tensor(0.9710)

In [None]:
# model = Sequential(Linear(2, 25), ReLU(),
#                    Linear(25, 25), ReLU(),
#                    Linear(25, 25), ReLU(),
#                    Linear(25, 2))
# model = Sequential(Linear(2, 25), Tanh(),
#                    Linear(25, 25), Tanh(),
#                    Linear(25, 25), Tanh(),
#                    Linear(25, 2))
model = Sequential(Linear(2, 25), Sigmoid(),
                   Linear(25, 25), Sigmoid(),
                   Linear(25, 25), Sigmoid(),
                   Linear(25, 2))

evaluator = EvaluatorAdam(model, mini_batch_size=100)

In [None]:
values = {"lr": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2], "b1": [0.9, 0.8], 
          "b2": [0.999, 0.888], "epsilon": [1e-8, 1e-7, 1e-6]}
evaluator.cross_validate(values=values)

In [None]:
best_lr = 1e-3
model = Sequential(Linear(2, 25), ReLU(),
                   Linear(25, 25), Tanh(),
                   Linear(25, 25), Sigmoid(),
                   Linear(25, 2), xavier_init=False)
optimizer = Adam(model, lr=best_lr, mini_batch_size=100, criterion=LossCrossEntropy())
model = optimizer.train(train_input, train_target)
evaluator.model = model
evaluator.test(test_input, test_target)