# Framework

In [23]:
import numpy as np

In [25]:
class Module:
    """Common interface shared by every layer in the framework.

    A module exposes a forward pass, a backward pass, its trainable
    parameters with their gradients, and a train/eval mode switch.
    """

    def __init__(self):
        # Modules are created in training mode.
        self._train = True

    def forward(self, input):
        """Compute the module output for ``input``; subclasses override this."""
        raise NotImplementedError

    def backward(self, input, grad_output):
        """Compute the gradient w.r.t. ``input``; subclasses override this."""
        raise NotImplementedError

    def parameters(self):
        """Return the trainable parameters (none for the base class)."""
        return list()

    def grad_parameters(self):
        """Return the parameter gradients (none for the base class)."""
        return list()

    def train(self):
        """Switch the module to training mode."""
        self._train = True

    def eval(self):
        """Switch the module to evaluation mode."""
        self._train = False

In [26]:
class Sequential(Module):
    """Container that applies its layers one after another.

    Args:
        *layers: modules to chain, in the order they are applied. Each
            layer is expected to store its last forward result in
            ``layer.output`` because ``backward`` reads it.
    """

    def __init__(self, *layers):
        super().__init__()
        self.layers = layers

    def forward(self, input):
        """Feed ``input`` through every layer and return the final output."""
        for layer in self.layers:
            input = layer.forward(input)

        self.output = input
        return self.output

    def backward(self, input, grad_output):
        """Propagate ``grad_output`` back through the layers.

        Args:
            input: the value that was passed to ``forward``.
            grad_output: gradient of the loss w.r.t. the container output.

        Returns:
            Gradient of the loss w.r.t. ``input``.
        """
        # An empty container is the identity, so the gradient is unchanged.
        if not self.layers:
            return grad_output

        # Layer k was fed the output of layer k-1; the first layer was fed
        # the container's own input.
        layer_inputs = [input] + [layer.output for layer in self.layers[:-1]]

        for layer, layer_input in zip(reversed(self.layers),
                                      reversed(layer_inputs)):
            grad_output = layer.backward(layer_input, grad_output)

        return grad_output

    def parameters(self):
        """Return the parameters of all layers as one flat list."""
        res = []
        for layer in self.layers:
            res.extend(layer.parameters())
        return res

    def grad_parameters(self):
        """Return the parameter gradients of all layers as one flat list."""
        res = []
        for layer in self.layers:
            res.extend(layer.grad_parameters())
        return res

    def train(self):
        """Switch the container and every layer to training mode."""
        super().train()
        for layer in self.layers:
            layer.train()

    def eval(self):
        """Switch the container and every layer to evaluation mode."""
        super().eval()
        for layer in self.layers:
            layer.eval()

In [27]:
class Linear(Module):
    """Fully connected layer computing ``input @ W + b``.

    Args:
        dim_in: number of input features.
        dim_out: number of output features.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()

        # Uniform initialisation in [-1/sqrt(dim_in), 1/sqrt(dim_in)].
        stdv = 1. / np.sqrt(dim_in)
        self.W = np.random.uniform(-stdv, stdv, size=(dim_in, dim_out))
        self.b = np.random.uniform(-stdv, stdv, size=dim_out)

    def forward(self, input):
        """Return ``input @ W + b``; ``input`` is expected to be (batch, dim_in)."""
        self.output = np.dot(input, self.W) + self.b
        return self.output

    def backward(self, input, grad_output):
        """Store the parameter gradients and return the input gradient.

        Args:
            input: the value that was passed to ``forward``.
            grad_output: gradient of the loss w.r.t. the layer output.

        Returns:
            Gradient of the loss w.r.t. ``input``.
        """
        self.grad_W = np.dot(input.T, grad_output)
        # Sum over the batch, exactly like grad_W does; averaging here would
        # shrink the bias gradient by an extra factor of 1/batch_size.
        self.grad_b = np.sum(grad_output, axis=0)

        return np.dot(grad_output, self.W.T)

    def parameters(self):
        """Return ``[W, b]``."""
        return [self.W, self.b]

    def grad_parameters(self):
        """Return ``[grad_W, grad_b]`` as computed by the last ``backward``."""
        return [self.grad_W, self.grad_b]

## Activation functions

**ReLU**
$$
ReLU(x)=
\begin{cases}
x, & x > 0\\
0, & x \leq 0\\
\end{cases}
$$

In [28]:
class ReLU(Module):
    """Rectified linear unit: keeps positive values and zeroes the rest."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return the element-wise maximum of ``input`` and 0."""
        rectified = np.maximum(input, 0)
        self.output = rectified
        return rectified

    def backward(self, input, grad_output):
        """Let the gradient through only where ``input`` was positive."""
        is_positive = input > 0
        return np.multiply(grad_output, is_positive)

**Leaky Rectified Linear Unit**
$$
LeakyReLU_k(x)=
\begin{cases}
x, & x > 0\\
kx, & x \leq 0\\
\end{cases}
$$

In [29]:
class LeakyReLU(Module):
    """Leaky ReLU: identity for positive inputs, ``slope * x`` otherwise.

    Args:
        slope: multiplier applied to non-positive inputs.
    """

    def __init__(self, slope=0.03):
        super().__init__()
        self.slope = slope

    def forward(self, input):
        """Return the element-wise maximum of ``input`` and ``slope * input``."""
        leaked = self.slope * input
        self.output = np.maximum(input, leaked)
        return self.output

    def backward(self, input, grad_output):
        """Scale the gradient by 1 where ``input > 0`` and by ``slope`` elsewhere."""
        positive = (input > 0)
        # Gradient flowing through the identity branch.
        passed = np.multiply(grad_output, positive)
        # Gradient flowing through the leaky branch.
        leaked = np.multiply(self.slope * grad_output, 1 - positive)
        return passed + leaked

**Sigmoid** 
$$\sigma(x) = \frac{1}{1+e^{-x}}$$

<img width='500px' src='https://upload.wikimedia.org/wikipedia/commons/thumb/5/53/Sigmoid-function-2.svg/2000px-Sigmoid-function-2.svg.png'>

$$
\begin{align}
\sigma'(x) &= (\frac{1}{1+e^{-x}})'
\\         &= \frac{e^{-x}}{(1+e^{-x})^2}
\\         &= \frac{1+e^{-x}-1}{(1+e^{-x})^2}
\\         &= \frac{1+e^{-x}}{(1+e^{-x})^2} - \frac{1}{(1+e^{-x})^2}
\\         &= \frac{1}{1+e^{-x}} - \frac{1}{(1+e^{-x})^2}
\\         &= \sigma(x) - \sigma(x)^2
\\         &= \sigma(x)(1 - \sigma(x))
\end{align}
$$

In [30]:
class Sigmoid(Module):
    """Logistic sigmoid activation ``1 / (1 + exp(-x))``.

    Args:
        slope: unused; kept only so existing callers that pass it keep working.
    """

    def __init__(self, slope=0.03):
        super().__init__()

    def forward(self, input):
        """Return the element-wise sigmoid of ``input``."""
        # The exponent must be negated: 1 / (1 + exp(x)) would be sigmoid(-x).
        self.output = 1 / (1 + np.exp(-np.asarray(input)))
        return self.output

    def backward(self, input, grad_output):
        """Return ``grad_output * s * (1 - s)`` using the cached output ``s``.

        Relies on ``self.output`` from the preceding ``forward`` call.
        """
        grad_input = grad_output * self.output * (1 - self.output)
        return grad_input

**Softmax**

$$ \sigma(x)_k = \frac{e^{x_k}}{\sum_{i=1}^n e^{x_i} }$$

In [143]:
class SoftMax(Module):
    """Row-wise softmax over a 2-D ``(batch, classes)`` input."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return the softmax of every row of ``input``."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # the exponentials from overflowing.
        shifted = np.subtract(input, input.max(axis=1, keepdims=True))
        exps = np.exp(shifted)
        self.output = exps / np.sum(exps, axis=1, keepdims=True)

        return self.output

    def backward(self, input, grad_output):
        """Return the gradient w.r.t. ``input``.

        With ``s`` the cached softmax output, the per-row Jacobian is
        ``diag(s) - s s^T``, so
        ``grad_input = s * grad_output - s * sum(s * grad_output)``.
        This is evaluated for the whole batch at once instead of building
        one Jacobian matrix per sample.

        Relies on ``self.output`` from the preceding ``forward`` call.
        """
        weighted = grad_output * self.output
        row_totals = np.sum(weighted, axis=1, keepdims=True)
        grad_input = weighted - self.output * row_totals

        return grad_input

## Regularization

**Dropout**

In [32]:
class Dropout(Module):
    """Dropout regularisation.

    In training mode every element is kept with probability ``p`` and
    zeroed otherwise. In evaluation mode the input is scaled by ``p``.

    Args:
        p: probability of *keeping* an element.
    """

    def __init__(self, p=0.5):
        super().__init__()

        self.p = p
        # Binary keep-mask sampled by the most recent training-mode forward.
        self.mask = None

    def forward(self, input):
        """Apply a fresh random mask (training) or scale by ``p`` (eval)."""
        if self._train:
            # Remember the mask so backward can reuse exactly the same one.
            self.mask = np.random.binomial(1, self.p, size=np.shape(input))
            self.output = input * self.mask
        else:
            self.output = input * self.p
        return self.output

    def backward(self, input, grad_output):
        """Route the gradient through the same mask that forward applied.

        Raises:
            RuntimeError: in training mode when no forward pass has been
                run yet, so there is no mask to apply.
        """
        if self._train:
            if self.mask is None:
                raise RuntimeError(
                    "Dropout.backward called before forward in training mode")
            # Use the stored mask; reconstructing it as output / input gives
            # NaN wherever the input is exactly zero.
            grad_input = grad_output * self.mask
        else:
            grad_input = grad_output * self.p
        return grad_input

**BatchNorm**

<img width='300px' src='https://wiseodd.github.io/img/2016-07-04-batchnorm/00.png'>


In [1]:
class BatchNorm(Module):
    """Batch normalisation over axis 0 (the batch axis).

    Training mode normalises with the statistics of the current batch and
    updates exponential running averages of them; evaluation mode
    normalises with the running averages.

    Args:
        gamma: scale applied after normalisation.
        beta: shift applied after normalisation.
        eps: small constant added to the variance for numerical stability.
    """

    def __init__(self, gamma=0.3, beta=0, eps=1e-8):
        super().__init__()
        self.gamma = gamma
        self.beta = beta
        self.eps = eps
        # Running mean and running *variance* (despite the name `sigma`).
        # NOTE(review): the running variance starts at 0, so eval-mode
        # output is biased until enough training batches have been seen.
        self.mu = 0
        self.sigma = 0

    def forward(self, input):
        """Normalise ``input`` and apply the affine transform."""
        if self._train:
            mean = np.mean(input, axis=0)
            var = np.var(input, axis=0)

            self.mu = 0.9 * self.mu + 0.1 * mean
            self.sigma = 0.9 * self.sigma + 0.1 * var
        else:
            mean = self.mu
            var = self.sigma

        normalized = (input - mean) / np.sqrt(var + self.eps)
        self.output = self.gamma * normalized + self.beta
        return self.output

    def backward(self, input, grad_output):
        """Store ``d_gamma``/``d_beta`` and return the input gradient.

        Args:
            input: the value that was passed to ``forward``.
            grad_output: gradient of the loss w.r.t. the layer output.

        Returns:
            Gradient of the loss w.r.t. ``input``.
        """
        if self._train:
            # Mean and variance are taken over axis 0, so every 1/N factor
            # in the chain rule uses the batch size, not the feature count.
            batch_size = input.shape[0]

            mean = np.mean(input, axis=0)
            var = np.var(input, axis=0)
            centered = input - mean
            inv_std = 1. / np.sqrt(var + self.eps)
            normalized = centered * inv_std

            d_norm = self.gamma * grad_output
            d_var = -0.5 * np.sum(d_norm * centered, axis=0) * inv_std ** 3
            d_mean = -np.sum(d_norm, axis=0) * inv_std
            d_mean = d_mean - 2 * d_var * np.sum(centered, axis=0) / batch_size

            grad_input = d_norm * inv_std + 2 * d_var * centered / batch_size
            grad_input = grad_input + d_mean / batch_size
        else:
            # With fixed running statistics the layer is an element-wise
            # affine map, so its derivative is just gamma / std.
            inv_std = 1. / np.sqrt(self.sigma + self.eps)
            normalized = (input - self.mu) * inv_std
            grad_input = self.gamma * grad_output * inv_std

        self.d_gamma = np.sum(grad_output * normalized, axis=0)
        self.d_beta = np.sum(grad_output, axis=0)

        return grad_input

    def parameters(self):
        """Return ``[gamma, beta]``."""
        return [self.gamma, self.beta]

    def grad_parameters(self):
        """Return ``[d_gamma, d_beta]`` as computed by the last ``backward``."""
        return [self.d_gamma, self.d_beta]

NameError: name 'Module' is not defined

## Criterion

In [35]:
class Criterion:
    """Common interface for loss functions."""

    def forward(self, input, target):
        """Compute the loss for ``input`` against ``target``; subclasses override."""
        raise NotImplementedError

    def backward(self, input, target):
        """Compute the loss gradient w.r.t. ``input``; subclasses override."""
        raise NotImplementedError

In [36]:
class MSE(Criterion):
    """Squared-error loss, summed over all elements and divided by batch size."""

    def forward(self, input, target):
        """Return ``sum((input - target) ** 2) / batch_size``."""
        residual = input - target
        squared_total = np.sum(np.power(residual, 2))
        self.output = squared_total / input.shape[0]
        return self.output

    def backward(self, input, target):
        """Return ``2 * (input - target) / batch_size``."""
        n_samples = input.shape[0]
        self.grad_output = (input - target) * 2 / n_samples
        return self.grad_output

In [49]:
class CrossEntropy(Criterion):
    """Cross-entropy loss on predicted probabilities.

    ``input`` holds probabilities (for example the output of a softmax
    layer) and ``target`` the matching target distribution; the loss is
    ``-sum(target * log(input)) / batch_size``.

    Args:
        eps: probabilities are clipped to ``[eps, 1 - eps]`` before the
            logarithm or division so neither can blow up.
    """

    def __init__(self, eps=1e-9):
        super().__init__()
        self.eps = eps

    def forward(self, input, target):
        """Return the cross-entropy averaged over the batch."""
        batch_size = input.shape[0]
        input_clamp = np.clip(input, self.eps, 1 - self.eps)

        self.output = -np.sum(target * np.log(input_clamp)) / batch_size
        return self.output

    def backward(self, input, target):
        """Return the gradient of ``forward`` w.r.t. ``input``.

        This is the derivative w.r.t. the *probabilities*,
        ``-target / input / batch_size``. The softmax Jacobian is applied by
        the softmax layer's own ``backward``; returning the fused
        ``(input - target)`` form here would apply that Jacobian twice.
        """
        batch_size = input.shape[0]
        # Same clipping as forward so the two passes agree.
        input_clamp = np.clip(input, self.eps, 1 - self.eps)
        self.grad_output = -target / input_clamp / batch_size

        return self.grad_output