In [None]:
import numpy as np

In [20]:
class Module(object):
    """
    Basically, you can think of a module as of a something (black box)
    which can process `input` data and produce `output` data.
    This is like applying a function which is called `forward`:

        output = module.forward(input)

    The module should be able to perform a backward pass: to differentiate the `forward` function.
    More, it should be able to differentiate it if is a part of chain (chain rule).
    The latter implies there is a gradient from previous step of a chain rule.

        gradInput = module.backward(input, gradOutput)

    State kept on every instance:
      - `output`:    result of the last forward pass (None until computed),
      - `gradInput`: result of the last backward pass (None until computed),
      - `training`:  boolean mode flag, switched with `train()` / `evaluate()`.
    """

    def __init__(self):
        self.output = None
        self.gradInput = None
        # Plain boolean flag. It must not share its name with a method:
        # an instance attribute shadows a method of the same name, which
        # made the former `training()` method impossible to call.
        self.training = True

    def forward(self, input):
        """
        Takes an input object, and computes the corresponding output of the module.
        """
        return self.updateOutput(input)

    def backward(self, input, gradOutput):
        """
        Performs a backpropagation step through the module, with respect to the given input.

        This includes
         - computing a gradient w.r.t. `input` (is needed for further backprop),
         - computing a gradient w.r.t. parameters (to update parameters while optimizing).

        Returns the `gradInput` field as left by `updateGradInput`.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Computes the output using the current parameter set of the class and input.
        This function returns the result which is stored in the `output` field.

        Make sure to both store the data in `output` field and return it.

        The base implementation does nothing and returns None.
        The easiest possible override is:

            self.output = input
            return self.output
        """
        pass

    def updateGradInput(self, input, gradOutput):
        """
        Computing the gradient of the module with respect to its own input.
        This is returned in `gradInput`. Also, the `gradInput` state variable is updated accordingly.

        The shape of `gradInput` is always the same as the shape of `input`.

        Make sure to both store the gradients in `gradInput` field and return it.

        The base implementation does nothing and returns None.
        The easiest possible override is:

            self.gradInput = gradOutput
            return self.gradInput
        """
        pass

    def accGradParameters(self, input, gradOutput):
        """
        Computing the gradient of the module with respect to its own parameters.
        No need to override if module has no parameters (e.g. ReLU).
        """
        pass

    def zeroGradParameters(self):
        """
        Zeroes `gradParams` variable if the module has params.
        """
        pass

    def getParameters(self):
        """
        Returns a list with its parameters.
        If the module does not have parameters return empty list.
        """
        return []

    def getGradParameters(self):
        """
        Returns a list with gradients with respect to its parameters.
        If the module does not have parameters return empty list.
        """
        return []

    def train(self):
        """
        Sets training mode for the module (`self.training = True`).
        Training and testing behaviour differs for Dropout, BatchNorm.
        """
        self.training = True

    def evaluate(self):
        """
        Sets evaluation mode for the module (`self.training = False`).
        Training and testing behaviour differs for Dropout, BatchNorm.
        """
        self.training = False

    def __repr__(self):
        """
        Pretty printing. Should be overridden in every module if you want
        to have readable description.
        """
        return "Module"
    

class Criterion(object):
    """
    Base class for loss functions.

    `forward(input, target)` delegates to `updateOutput` and
    `backward(input, target)` delegates to `updateGradInput`; subclasses
    override those two methods and store their results in the `output`
    and `gradInput` fields.
    """

    def __init__(self):
        # Initialised here so the base `updateOutput` / `updateGradInput`
        # can return them instead of raising AttributeError.
        self.output = None
        self.gradInput = None

    def forward(self, input, target):
        """
            Given an input and a target, compute the loss function
            associated to the criterion and return the result.

            For consistency this function should not be overridden,
            all the code goes in `updateOutput`.
        """
        return self.updateOutput(input, target)

    def backward(self, input, target):
        """
            Given an input and a target, compute the gradients of the loss function
            associated to the criterion and return the result.

            For consistency this function should not be overridden,
            all the code goes in `updateGradInput`.
        """
        return self.updateGradInput(input, target)

    def updateOutput(self, input, target):
        """
        Function to override. The base version returns the current
        `output` field unchanged (None until a subclass sets it).
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Function to override. The base version returns the current
        `gradInput` field unchanged (None until a subclass sets it).
        """
        return self.gradInput

    def __repr__(self):
        """
        Pretty printing. Should be overridden in every module if you want
        to have readable description.
        """
        return "Criterion"

    
class Sequential(Module):
    """
         This class implements a container, which processes `input` data sequentially.

         `input` is processed by each module (layer) in self.modules consecutively.
         The resulting array is called `output`.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []
        # Input seen by each module during the last forward pass.
        # Created here (not only in `add`) so that `backward` on a fresh
        # container does not fail with AttributeError.
        self.inputs = []

    def add(self, module):
        """
        Adds a module to the container.

        The cached per-module inputs are dropped because they no longer
        line up with the new module list.
        """
        self.modules.append(module)
        self.inputs = []

    def updateOutput(self, input):
        """
        Basic workflow of FORWARD PASS:

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        The input handed to every module is remembered in `self.inputs`
        so that `backward` can replay it.
        """
        self.inputs = []
        y = input
        for mod in self.modules:
            self.inputs.append(y)
            y = mod.forward(y)
        self.output = y
        return self.output

    def backward(self, input, gradOutput):
        """
        Workflow of BACKWARD PASS:

            g_{n-1} = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2} = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1 = module[1].backward(y_0, g_2)
            gradInput = module[0].backward(input, g_1)

        Each module receives the input it saw during the forward pass
        (taken from `self.inputs`), NOT the `input` of this container.
        The `input` argument itself is not read here; a forward pass must
        have been run beforehand to fill the cache.
        """
        g = gradOutput
        for mod, inp in zip(reversed(self.modules), reversed(self.inputs)):
            g = mod.backward(inp, g)

        self.gradInput = g
        return self.gradInput

    def zeroGradParameters(self):
        """
        Calls `zeroGradParameters` on every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Gathers parameters: one entry per contained module, each entry
        being that module's own `getParameters()` result.
        """
        return [x.getParameters() for x in self.modules]

    def getGradParameters(self):
        """
        Gathers gradients w.r.t. parameters: one entry per contained module,
        each entry being that module's own `getGradParameters()` result.
        """
        return [x.getGradParameters() for x in self.modules]

    def __repr__(self):
        string = "".join([str(x) + '\n' for x in self.modules])
        return string

    def __getitem__(self, x):
        return self.modules[x]

    def train(self):
        """
        Sets training mode for the container and every contained module.

        Named `train` because `training` is the boolean flag set by the
        base constructor; a method with that name is shadowed by the
        instance attribute and cannot be called.
        """
        self.training = True
        for module in self.modules:
            switch = getattr(module, "train", None)
            if callable(switch):
                switch()
            else:
                # Child only exposes the boolean flag: set it directly.
                module.training = True

    def evaluate(self):
        """
        Sets evaluation mode for the container and every contained module.
        """
        self.training = False
        for module in self.modules:
            module.evaluate()

In [25]:
class Linear(Module):
    """
    Fully-connected layer: `output = input . W^T + b`.

    When no weights are supplied, `W` is created with shape (n_out, n_in)
    and `b` with shape (n_out,), both drawn uniformly from [-a, a] with
    a = sqrt(6) / sqrt(n_in + n_out).

    The module should work with 2D input of shape (n_samples, n_feature).
    """

    def __init__(self, n_in, n_out, w=None, b=None):
        super(Linear, self).__init__()

        # Half-width of the uniform initialisation interval.
        bound = np.sqrt(6) / np.sqrt(n_in + n_out)

        # Caller-supplied arrays are used as-is (no copy is made).
        self.W = np.random.uniform(-bound, bound, (n_out, n_in)) if w is None else w
        self.b = np.random.uniform(-bound, bound, n_out) if b is None else b

        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """Forward pass: store and return `input . W^T + b`."""
        projected = np.dot(input, self.W.T)
        self.output = projected + self.b
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backward pass w.r.t. the input: store and return `gradOutput . W`."""
        grad_wrt_input = np.dot(gradOutput, self.W)
        self.gradInput = grad_wrt_input
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Backward pass w.r.t. the parameters.

        Overwrites (does not add to) `gradW` with `gradOutput^T . input`
        and `gradb` with the column sums of `gradOutput`, then returns both.
        """
        column_sums = np.sum(gradOutput, axis=0)
        self.gradW = np.dot(gradOutput.T, input)
        self.gradb = np.ones_like(self.b) * column_sums
        return self.gradW, self.gradb

    def zeroGradParameters(self):
        """Reset both gradient buffers to zero in place."""
        for buf in (self.gradW, self.gradb):
            buf.fill(0)

    def getParameters(self):
        """Return `[W, b]`."""
        return [self.W, self.b]

    def getGradParameters(self):
        """Return `[gradW, gradb]`."""
        return [self.gradW, self.gradb]

In [22]:
class SoftMax(Module):
    """
    Row-wise softmax: every row of the 2D input is exponentiated and
    normalised so that it sums to one.
    """

    def __init__(self):
        super(SoftMax, self).__init__()

    def _softmax(self, input):
        """Return the row-wise softmax of `input` (rows along axis 0)."""
        # Subtract the row maximum for numerical stability; this leaves
        # the result unchanged because softmax is shift-invariant.
        shifted = input - input.max(axis=1, keepdims=True)
        exp = np.exp(shifted)
        return exp / np.sum(exp, axis=1, keepdims=True)

    def updateOutput(self, input):
        """Forward pass: store and return the row-wise softmax of `input`."""
        self.output = self._softmax(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass: store and return dL/d_input.

        With s = softmax(input) the row-wise Jacobian-vector product is

            gradInput = s * (gradOutput - sum_k(s_k * gradOutput_k))

        It is computed per row, so no (n_samples x n_samples) intermediate
        matrix is built.
        """
        s = self._softmax(input)
        # Per-row inner product <s, gradOutput>, kept as a column for broadcasting.
        inner = np.sum(s * gradOutput, axis=1, keepdims=True)
        self.gradInput = s * (gradOutput - inner)
        return self.gradInput


In [23]:
class ReLU(Module):
    """Element-wise rectifier: `max(0, x)`."""

    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, input):
        """Forward pass: store and return the element-wise maximum of 0 and `input`."""
        rectified = np.maximum(0, input)
        self.output = rectified
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backward pass: let `gradOutput` through only where `input` is strictly positive."""
        passes = input > 0
        self.gradInput = passes * gradOutput
        return self.gradInput

In [26]:
class LeakyReLU(Module):
    """
    Element-wise leaky rectifier: negative entries are multiplied by
    `alpha`, all other entries are passed through unchanged.
    """

    def __init__(self, alpha=0):
        super(LeakyReLU, self).__init__()
        self.alpha = alpha

    def _slopes(self, input):
        """Return the per-element multiplier: `alpha` where `input < 0`, otherwise 1."""
        below_zero = (input < 0)
        return below_zero * self.alpha + np.logical_not(below_zero)

    def updateOutput(self, input):
        """Forward pass: store and return `input` scaled by the per-element slope."""
        self.output = self._slopes(input) * input
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Backward pass: store and return `gradOutput` scaled by the per-element slope."""
        self.gradInput = self._slopes(input) * gradOutput
        return self.gradInput

In [None]:
class ELU(Module):
    """
    Element-wise exponential linear unit:

        f(x) = x                      for x >= 0
        f(x) = alpha * (exp(x) - 1)   for x <  0
    """

    def __init__(self, alpha=0):
        super(ELU, self).__init__()
        self.alpha = alpha

    def f(self, x):
        """
        Apply the ELU function to `x` and return a new array.

        The argument is never modified: the forward input is reused by the
        backward pass (and cached by containers), so changing it in place
        would corrupt the gradients.
        """
        res = np.array(x)  # np.array copies by default
        if not np.issubdtype(res.dtype, np.floating):
            # Integer/bool storage would truncate the fractional results.
            res = res.astype(np.float64)
        is_negative = (res < 0)
        # expm1(x) is exp(x) - 1 without the cancellation error near zero.
        res[is_negative] = self.alpha * np.expm1(res[is_negative])
        return res

    def updateOutput(self, input):
        """Forward pass: store and return `f(input)`; `input` is left untouched."""
        self.output = self.f(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass: store and return dL/d_input.

        The local derivative is 1 for x >= 0 and alpha * exp(x) for x < 0
        (which equals f(x) + alpha).
        """
        input = np.asarray(input)
        local_grad = (input >= 0).astype(np.float64)
        is_negative = (input < 0)
        local_grad[is_negative] = self.alpha * np.exp(input[is_negative])
        self.gradInput = local_grad * gradOutput
        return self.gradInput

In [None]:
class SoftPlus(Module):
    """Element-wise softplus: `log(1 + exp(x))`."""

    def __init__(self):
        super(SoftPlus, self).__init__()

    def updateOutput(self, input):
        """
        Forward pass: store and return `log(1 + exp(input))`.

        Evaluated as `logaddexp(0, input)` = log(exp(0) + exp(input)), which
        does not overflow to inf for large inputs the way the literal
        formula does.
        """
        self.output = np.logaddexp(0, input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass: store and return `sigmoid(input) * gradOutput`.

        The sigmoid is evaluated as exp(x - softplus(x)); the exponent is
        never positive, so the exponential cannot overflow.
        """
        sigmoid = np.exp(input - np.logaddexp(0, input))
        self.gradInput = sigmoid * gradOutput
        return self.gradInput

In [None]:
class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood loss averaged over the first axis:

        loss = -1/n * sum(target * log(input)),   n = input.shape[0]

    NOTE(review): presumably `input` holds per-class probabilities and
    `target` one-hot labels of the same shape -- confirm against callers.
    """

    # Lower bound applied to `input` before the log / division. An exact
    # zero (e.g. from softmax underflow) would otherwise give
    # 0 * log(0) = NaN in the loss and 0 / 0 = NaN in the gradient.
    EPS = 1e-15

    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def updateOutput(self, input, target):
        """Store and return the loss; `input` is clamped from below at `EPS`."""
        safe_input = np.maximum(input, self.EPS)
        self.output = -1.0 / input.shape[0] * np.sum(target * np.log(safe_input))
        return self.output

    def updateGradInput(self, input, target):
        """Store and return d(loss)/d(input) = -target / (n * input), with the same clamp."""
        safe_input = np.maximum(input, self.EPS)
        self.gradInput = -1.0 / input.shape[0] * target / safe_input
        return self.gradInput

In [None]:
class BatchMeanSubtraction(Module):
    """
    Subtracts a per-column mean (taken over axis 0) from the input.

    In training mode the mean of the current batch is subtracted and a
    moving average of the batch means is maintained:

        mean = alpha * mean + (1 - alpha) * batch_mean

    In evaluation mode the stored moving average is subtracted instead.
    """

    def __init__(self, alpha=0):
        super(BatchMeanSubtraction, self).__init__()
        self.alpha = alpha
        self.mean = None  # moving average; None until the first training batch

    def updateOutput(self, input):
        """Forward pass: store and return `input` minus the applicable mean."""
        if self.training:
            batch_mean = np.mean(input, axis=0)
            if self.mean is None:
                self.mean = batch_mean
            else:
                self.mean = self.alpha * self.mean + (1 - self.alpha) * batch_mean
            # Subtract the batch mean itself: this is the function that
            # `updateGradInput` differentiates. Subtracting the moving
            # average here would disagree with it whenever alpha != 0.
            shift = batch_mean
        elif self.mean is not None:
            shift = self.mean
        else:
            # Evaluation before any training batch: no statistics stored
            # yet, so fall back to the mean of the given input.
            shift = np.mean(input, axis=0)

        self.output = input - shift
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass of batch-mean subtraction: store and return
        `gradOutput` minus its own mean over axis 0.
        """
        self.gradInput = gradOutput - np.mean(gradOutput, axis=0)
        return self.gradInput

In [28]:
from scipy.stats import bernoulli

class Dropout(Module):
    """
    Dropout layer.

    `p` is the probability passed to `bernoulli.rvs`, i.e. the chance
    that a mask entry is 1 and the element is KEPT. In training mode the
    input is multiplied by a freshly sampled 0/1 mask; in evaluation mode
    it is multiplied by `p`.
    """

    def __init__(self, p=0.5):
        super(Dropout, self).__init__()
        self.p = p
        self.mask = None  # mask sampled by the last training-mode forward pass

    def updateOutput(self, input):
        """Forward pass: store and return the masked (training) or scaled (evaluation) input."""
        if self.training:
            self.mask = bernoulli.rvs(self.p, size=input.shape)
            self.output = self.mask * input
        else:
            # Must be `self.p`: a bare `p` is not defined in this scope.
            self.output = self.p * input

        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass matching the forward mode: the gradient goes through
        the last sampled mask in training mode and is scaled by `p` in
        evaluation mode (where no mask is applied).
        """
        if self.training:
            self.gradInput = self.mask * gradOutput
        else:
            self.gradInput = self.p * gradOutput
        return self.gradInput