In [78]:
import copy
import math

import torch
from torch import empty, zeros

In [71]:
# Seed torch's global random number generator so that the randomly drawn
# data sets and initial weights below can be reproduced.
torch.manual_seed(1)

<torch._C.Generator at 0x1e5001063b0>

In [2]:
class Module(object):
    """Abstract building block of the framework.

    Concrete modules override ``forward`` and ``backward``. The remaining
    hooks default to the behaviour of a module without trainable parameters.
    """

    def __init__(self):
        """Nothing to initialise at the base level."""

    def forward(self, *input_):
        """Compute the module output; must be overridden by subclasses."""
        raise NotImplementedError

    def backward(self, *gradwrtoutput):
        """Propagate a gradient backwards; must be overridden by subclasses."""
        raise NotImplementedError

    def param(self):
        """Return the list of parameters; empty for the base class."""
        return list()

    def update(self, lr):
        """Apply a parameter update with step size ``lr``; a no-op here."""
        return None

    def zero_grad(self):
        """Reset accumulated gradients; a no-op here."""
        return None


In [3]:
class Linear(Module):
    """Fully connected layer computing ``x.mm(W) + b``.

    Attributes:
        W: weight tensor of shape ``(N_in, N_out)``.
        b: bias tensor of shape ``(1, N_out)``.
        gradW, gradb: gradient accumulators with the shapes of ``W`` and ``b``.
    """

    def __init__(self, N_in, N_out):
        """Create the layer.

        Args:
            N_in: number of input features.
            N_out: number of output features.
        """
        super(Linear, self).__init__()
        self.N_in = N_in
        self.N_out = N_out

        # Parameters are drawn from a standard normal distribution.
        self.W = empty((N_in, N_out)).normal_()
        self.b = empty((1, N_out)).normal_()

        # Accumulators start at zero: `backward` adds into them, so any other
        # initial value would leak into the first update if `zero_grad` were
        # not called beforehand.
        self.gradW = zeros(self.W.shape)
        self.gradb = zeros(self.b.shape)

    def forward(self, *input_):
        """Return ``x.mm(W) + b`` for ``x = input_[0]``.

        A copy of ``x`` is cached in ``self.x`` for the backward pass.
        """
        self.x = input_[0].clone()
        return self.x.mm(self.W) + self.b

    def backward(self, *gradwrtoutput):
        """Accumulate parameter gradients and return the gradient w.r.t. the input.

        Args:
            gradwrtoutput: ``gradwrtoutput[0]`` is the gradient of the loss
                with respect to this layer's output.

        Returns:
            The gradient of the loss with respect to the layer's input,
            ``grad.mm(W.t())``.
        """
        grad_out = gradwrtoutput[0]

        # dL/dW = x^T . grad ; dL/db = sum of grad over the first dimension.
        self.gradW += self.x.t().mm(grad_out)
        self.gradb += grad_out.sum(0)

        return grad_out.mm(self.W.t())

    def param(self):
        """Return ``[(W, gradW), (b, gradb)]``."""
        return [(self.W, self.gradW), (self.b, self.gradb)]

    def update(self, lr):
        """Take one gradient-descent step of size ``lr`` in place."""
        self.W.sub_(lr * self.gradW)
        self.b.sub_(lr * self.gradb)

    def zero_grad(self):
        """Reset the gradient accumulators to zero.

        The tensors are zeroed in place so that the pairs previously returned
        by `param` keep pointing at the live gradient tensors.
        """
        self.gradW.zero_()
        self.gradb.zero_()

In [4]:
class ReLU(Module):
    """Rectified linear unit activation, ``max(0, s)`` element-wise."""

    def __init__(self):
        super(ReLU, self).__init__()

    def forward(self, *input_):
        """Return ``input_[0]`` with negative entries replaced by zero.

        A private copy of the pre-activation input is cached in ``self.s``.
        The returned tensor is a separate object, so in-place changes made by
        the caller to the output cannot corrupt the cached value.
        """
        self.s = input_[0].clone()
        return self.s.clamp(min=0)

    def backward(self, *gradwrtoutput):
        """Return ``f'(s) * gradwrtoutput[0]``.

        The derivative is 1 where the cached input is strictly positive and 0
        elsewhere (including at exactly 0).
        """
        grad_out = gradwrtoutput[0]
        mask = (self.s > 0).to(grad_out.dtype)
        return mask.mul(grad_out)

    def param(self):
        """ReLU has no trainable parameters."""
        return []

    def zero_grad(self):
        """Nothing to reset."""
        pass

In [5]:
class Tanh(Module):
    """Hyperbolic tangent activation applied element-wise."""

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, *input_):
        """Cache a copy of ``input_[0]`` in ``self.s`` and return its tanh."""
        self.s = input_[0].clone()
        return self.s.tanh()

    def backward(self, *gradwrtoutput):
        """Return ``(1 - tanh(s)^2) * gradwrtoutput[0]`` for the cached ``s``."""
        upstream = gradwrtoutput[0].clone()

        # Derivative of tanh evaluated at the cached forward input.
        activation = self.s.clone().tanh()
        local_grad = 1 - activation.pow(2)

        return local_grad.mul(upstream)

    def param(self):
        """Tanh has no trainable parameters."""
        return list()

    def zero_grad(self):
        """Nothing to reset."""
        return None

In [6]:
class Sequential(Module):
    """Container that chains modules, feeding each output into the next."""

    def __init__(self, *modules):
        """Store ``modules`` in the order in which they are applied."""
        super(Sequential, self).__init__()
        self.modules = modules

    def forward(self, *input_):
        """Run ``input_[0]`` through every module in order and return the result."""
        x = input_[0].clone()

        for m in self.modules:
            x = m.forward(x)

        return x

    def backward(self, *gradwrtoutput):
        """Back-propagate ``gradwrtoutput[0]`` through the modules in reverse order.

        Returns:
            The gradient with respect to the container's input. It is returned
            so that a Sequential can itself be used as a module inside
            another Sequential.
        """
        x = gradwrtoutput[0].clone()

        for m in reversed(self.modules):
            x = m.backward(x)

        return x

    def param(self):
        """Return the concatenation of the parameters of all modules."""
        return [p for m in self.modules for p in m.param()]

    def update(self, lr):
        """Forward the update step with step size ``lr`` to every module."""
        for m in self.modules:
            m.update(lr)

    def zero_grad(self):
        """Reset the gradient accumulators of every module."""
        for m in self.modules:
            m.zero_grad()

In [58]:
"""class Loss: 
    ...
"""
class LossMSE(Module):
    """Sum-of-squared-errors loss against one-hot encoded integer targets.

    Note that, despite the name, `forward` returns the sum (not the mean) of
    the squared errors, and `backward` returns the gradient of that sum.
    """

    def __init__(self):
        super(LossMSE, self).__init__()

    def forward(self, y, target):
        """Return ``sum((y - onehot(target))^2)``.

        Args:
            y: predictions, indexed as ``(sample, class)``.
            target: integer class indices, one per sample; it is reshaped to a
                column and used as the index of ``scatter_``.

        The error ``y - onehot(target)`` is cached in ``self.e`` for `backward`.
        """
        self.y = y.clone()

        # The one-hot encoding takes its width (number of classes) and dtype
        # from the predictions instead of assuming exactly two classes.
        onehot = self.y.new_zeros(self.y.shape)
        self.target = onehot.scatter_(1, target.view(-1, 1), 1)

        self.e = self.y - self.target
        self.n = self.y.size(0)

        return self.e.pow(2).sum()

    def backward(self):
        """Return ``2 * e``, the gradient of the summed squared error w.r.t. ``y``.

        No division by the batch size ``self.n`` is applied, consistently with
        `forward` returning a sum.
        """
        return 2 * self.e

In [250]:
class LossCrossEntropy(Module):
    """Softmax cross-entropy loss for integer class targets."""

    def __init__(self):
        super(LossCrossEntropy, self).__init__()

    def forward(self, y, target):
        """Return the mean negative log-likelihood of the target classes.

        Args:
            y: raw scores (logits), indexed as ``(sample, class)``.
            target: integer class indices, one per sample.

        The probability of the target class is clamped from below at 1e-3
        before taking the logarithm, which bounds the per-sample loss.
        """
        self.y = y.clone()
        self.target = target.clone()

        probs = self.y.softmax(dim=1)
        # Probability assigned to the correct class of each sample.
        picked = probs.gather(1, self.target.view(-1, 1)).squeeze(1)
        likelihood = -picked.clamp(min=1e-3).log()
        return likelihood.mean()

    def backward(self):
        """Return ``softmax(y) - onehot(target)``.

        NOTE: this is not divided by the batch size, whereas `forward`
        averages over the batch, so it corresponds to the gradient of the
        summed (not averaged) loss and ignores the clamp. The scale is kept
        as-is because learning rates were chosen against it.
        """
        probs = self.y.softmax(dim=1)
        # One-hot width follows the number of columns of the predictions
        # rather than being fixed to two classes.
        target_onehot = probs.new_zeros(probs.shape)
        target_onehot = target_onehot.scatter_(1, self.target.view(-1, 1), 1)
        return probs - target_onehot

In [248]:
class Optimizer:
    """Interface shared by optimizers: subclasses provide `train` and `step`."""

    def __init__(self):
        """The base class keeps no state."""

    def train(self):
        """Run the optimisation; must be overridden by subclasses."""
        raise NotImplementedError

    def step(self):
        """Perform a single update; must be overridden by subclasses."""
        raise NotImplementedError

class SGD(Optimizer):
    """Mini-batch stochastic gradient descent over a model exposing
    ``zero_grad``, ``forward``, ``backward`` and ``update``."""

    def __init__(self, model, nb_epochs = 50, mini_batch_size=1, lr=1e-4, criterion=None):
        """Configure the optimizer.

        Args:
            model: the model to train.
            nb_epochs: number of passes over the training set.
            mini_batch_size: number of samples per update.
            lr: step size passed to ``model.update``.
            criterion: loss object with ``forward(output, target)`` and
                ``backward()``. Defaults to a fresh ``LossMSE()``; the
                instance is created per optimizer because the loss caches
                state between its forward and backward calls and must not be
                shared through a default argument.
        """
        super(SGD, self).__init__()
        self.model = model
        self.nb_epochs = nb_epochs
        self.lr = lr
        self.mini_batch_size = mini_batch_size
        self.criterion = LossMSE() if criterion is None else criterion

    def step(self):
        """Apply one parameter update with the current learning rate."""
        self.model.update(self.lr)

    def train(self, train_input, train_target, verbose=True):
        """Train ``self.model`` and return it.

        Args:
            train_input: training samples, split into batches along dimension 0.
            train_target: targets aligned with ``train_input`` along dimension 0.
            verbose: when true, print the summed batch losses after each epoch.
        """
        nb_samples = train_input.size(0)

        for e in range(self.nb_epochs):
            sum_loss = 0.

            for b in range(0, nb_samples, self.mini_batch_size):
                # The last batch may be shorter when the number of samples is
                # not a multiple of the batch size.
                batch_len = min(self.mini_batch_size, nb_samples - b)
                batch_input = train_input.narrow(0, b, batch_len)
                batch_target = train_target.narrow(0, b, batch_len)

                self.model.zero_grad()
                output = self.model.forward(batch_input)
                loss = self.criterion.forward(output, batch_target)

                sum_loss += loss

                l_grad = self.criterion.backward()
                self.model.backward(l_grad)
                self.step()

            if verbose:
                print("{} iteration: loss={}".format(e, sum_loss))
        return self.model

In [110]:
class Evaluator:
    """Measures classification accuracy and cross-validates the learning rate."""

    def __init__(self, model, optimizer):
        """Keep the model to evaluate and the optimizer used for (re)training."""
        self.model = model
        self.optimizer = optimizer

    def cross_validate(self, k=5, possible_lrs=(1e-5, 1e-4, 1e-3, 1e-2)):
        """Estimate the test accuracy obtained with each candidate learning rate.

        ``k`` independent (train, test) pairs of 1000 samples each are drawn
        with ``generate_disc_set``. For every learning rate and every pair, a
        freshly initialised network is trained with ``self.optimizer`` and
        scored with `test`.

        Side effects: ``self.optimizer.model``, ``self.optimizer.lr`` and
        ``self.model`` are overwritten; after the call they refer to the last
        trained network and the last learning rate tried.

        Args:
            k: number of (train, test) data set pairs.
            possible_lrs: learning rates to compare.

        Returns:
            ``(score_means, score_vars)``: two dicts keyed by learning rate
            holding the mean and the standard deviation (``std``, not the
            variance) of the accuracies.
        """
        # Train set first, then test set, for each fold.
        folds = [(generate_disc_set(1000), generate_disc_set(1000)) for _ in range(k)]

        score_means = {}
        score_vars = {}
        for lr in possible_lrs:
            print("Validating:", lr)
            accuracies = zeros(len(folds))
            for i, ((train_input, train_target), (test_input, test_target)) in enumerate(folds):
                # Use the optimizer owned by this evaluator, not a global one.
                self.optimizer.model = Sequential(Linear(2, 25), ReLU(), Linear(25, 25), ReLU(), Linear(25, 25), ReLU(), Linear(25, 2))
                self.optimizer.lr = lr
                self.model = self.optimizer.train(train_input, train_target, verbose=False)
                accuracies[i] = self.test(test_input, test_target)
            score_means[lr] = accuracies.mean()
            score_vars[lr] = accuracies.std()

        return score_means, score_vars

    def test(self, test_input, test_target):
        """Return the fraction of samples whose arg-max prediction equals the target.

        Args:
            test_input: samples passed to ``self.model.forward``.
            test_target: expected class index of each sample.

        Returns:
            The accuracy as a 0-dimensional float tensor.
        """
        num_samples = test_input.size(0)
        prediction = self.model.forward(test_input)
        predicted_class = prediction.argmax(dim=1)
        # Tensor reduction instead of Python's builtin sum over elements.
        return (predicted_class == test_target).sum().float() / num_samples

In [9]:
def generate_disc_set(nb):
    """Sample ``nb`` points uniformly in [-1, 1]^2 with a binary label.

    The label is 1 when the squared norm of the point exceeds ``2 / pi`` and
    0 otherwise.

    Returns:
        A pair ``(points, labels)``: an ``(nb, 2)`` float tensor and an
        ``(nb,)`` long tensor.
    """
    points = empty(nb, 2).uniform_(-1, 1)
    squared_norm = points.pow(2).sum(1)
    # sign() yields -1/0/+1; shifting and halving maps it to 0/0.5/1, and
    # long() truncates to a 0/1 label.
    labels = squared_norm.sub(2 / math.pi).sign().add(1).div(2).long()
    return points, labels

# Draw two independent data sets of 1000 points each: one for training and
# one held out for testing.
train_input, train_target = generate_disc_set(1000)
test_input, test_target = generate_disc_set(1000)

In [10]:
# Display the sampled training inputs as a quick sanity check.
train_input

tensor([[ 0.8252,  0.2783],
        [ 0.0929,  0.2911],
        [ 0.4758,  0.5343],
        ...,
        [-0.2414, -0.2005],
        [-0.9865, -0.1295],
        [-0.6723, -0.1999]])

In [111]:
# Network with 2 inputs, three hidden layers of 25 units with ReLU
# activations, and 2 outputs (one score per class).
model = Sequential(Linear(2, 25), ReLU(),
                   Linear(25, 25), ReLU(),
                   Linear(25, 25), ReLU(),
                   Linear(25, 2))
# Alternative with Tanh activations, kept for reference:
# model = Sequential(Linear(2, 25), Tanh(),
#                    Linear(25, 25), Tanh(),
#                    Linear(25, 25), Tanh(),
#                    Linear(25, 2))

# SGD with mini-batches of 5 samples; the remaining settings (number of
# epochs, learning rate, criterion) keep the defaults of SGD.
optimizer = SGD(model, mini_batch_size=5)
evaluator = Evaluator(model, optimizer)

In [112]:
# Compare the default candidate learning rates; the displayed result is the
# pair (mean accuracy per learning rate, standard deviation per learning rate).
evaluator.cross_validate()

Validating: 1e-05
Validating: 0.0001
Validating: 0.001
Validating: 0.01


({1e-05: tensor(0.8174),
  0.0001: tensor(0.7730),
  0.001: tensor(0.5034),
  0.01: tensor(0.5034)},
 {1e-05: tensor(0.0337),
  0.0001: tensor(0.2425),
  0.001: tensor(0.0098),
  0.01: tensor(0.0098)})

In [253]:
# NOTE(review): the cross-validation run shown earlier used the default MSE
# criterion with mini-batches of 5 and reported its highest mean accuracy at
# 1e-5, whereas this cell trains with cross-entropy and mini-batches of 100 at
# 1e-4 — confirm that this learning rate was validated for this configuration.
best_lr = 1e-4
# Train a freshly initialised network of the same architecture.
model = Sequential(Linear(2, 25), ReLU(),
                   Linear(25, 25), ReLU(),
                   Linear(25, 25), ReLU(),
                   Linear(25, 2))
optimizer = SGD(model, lr=best_lr, mini_batch_size=100, criterion=LossCrossEntropy())
model = optimizer.train(train_input, train_target)
# Point the existing evaluator at the newly trained model and report its
# accuracy on the held-out test set.
evaluator.model = model
evaluator.test(test_input, test_target)

0 iteration: loss=20.889606475830078
1 iteration: loss=7.32117223739624
2 iteration: loss=3.96122407913208
3 iteration: loss=1.9598121643066406
4 iteration: loss=1.200587272644043
5 iteration: loss=0.9994472861289978
6 iteration: loss=0.9138118028640747
7 iteration: loss=0.8422287702560425
8 iteration: loss=0.7800658941268921
9 iteration: loss=0.7308159470558167
10 iteration: loss=0.6977884769439697
11 iteration: loss=0.6656010746955872
12 iteration: loss=0.6381933093070984
13 iteration: loss=0.6255884170532227
14 iteration: loss=0.599713146686554
15 iteration: loss=0.5800283551216125
16 iteration: loss=0.5602816343307495
17 iteration: loss=0.5460494756698608
18 iteration: loss=0.5315064787864685
19 iteration: loss=0.5161387324333191
20 iteration: loss=0.5063460469245911
21 iteration: loss=0.49479174613952637
22 iteration: loss=0.48522859811782837
23 iteration: loss=0.47490543127059937
24 iteration: loss=0.4643429219722748
25 iteration: loss=0.453312486410141
26 iteration: loss=0.44643

tensor(0.9790)