In [1]:
import numpy as np

In [2]:
class Module:
    """Abstract base class shared by every layer.

    A layer exposes `forward`/`backward` for the computation, the
    `parameters`/`grad_parameters` pair for its trainable state, and a
    train/eval switch stored in the `_train` flag.
    """

    def __init__(self):
        # Every layer starts out in training mode.
        self._train = True

    def forward(self, input):
        """Compute the layer output for `input`; subclasses must override."""
        raise NotImplementedError()

    def backward(self, input, grad_output):
        """Propagate `grad_output` back to `input`; subclasses must override."""
        raise NotImplementedError()

    def parameters(self):
        """Return the trainable arrays of the layer (none by default)."""
        return list()

    def grad_parameters(self):
        """Return the gradients matching `parameters()` (none by default)."""
        return list()

    def train(self):
        """Switch the layer to training mode."""
        self._train = True

    def eval(self):
        """Switch the layer to evaluation mode."""
        self._train = False

In [3]:
class Sequential(Module):
    """Container that applies its layers one after another.

    The layers are stored in the order given to the constructor. `forward`
    feeds the output of each layer into the next one; `backward` walks the
    layers in reverse order.
    """

    def __init__(self, *layers):
        super().__init__()
        self.layers = layers

    def forward(self, input):
        """Run `input` through every layer and cache the final result.

        With no layers the input is returned unchanged.
        """
        for layer in self.layers:
            input = layer.forward(input)

        self.output = input
        return self.output

    def backward(self, input, grad_output):
        """Back-propagate `grad_output` through the layers in reverse order.

        `input` is the value that was passed to `forward`. Every layer but
        the first receives the cached `output` attribute of its predecessor
        as its input, so `forward` must have been called beforehand.
        With no layers `grad_output` is returned unchanged, mirroring
        `forward`.
        """
        for i in range(len(self.layers) - 1, -1, -1):
            # The first layer saw the container's input; the others saw the
            # output cached by the layer before them.
            layer_input = input if i == 0 else self.layers[i - 1].output
            grad_output = self.layers[i].backward(layer_input, grad_output)

        return grad_output

    def parameters(self):
        """Return the parameters of all layers as one flat list."""
        res = []
        for layer in self.layers:
            res.extend(layer.parameters())
        return res

    def grad_parameters(self):
        """Return the parameter gradients of all layers as one flat list."""
        res = []
        for layer in self.layers:
            res.extend(layer.grad_parameters())
        return res

    def train(self):
        """Switch the container and all of its layers to training mode."""
        # Keep the container's own flag in sync with its children.
        self._train = True
        for layer in self.layers:
            layer.train()

    def eval(self):
        """Switch the container and all of its layers to evaluation mode."""
        self._train = False
        for layer in self.layers:
            layer.eval()

In [4]:
class Linear(Module):
    """Fully connected layer computing `input @ W + b`.

    `W` has shape `(dim_in, dim_out)` and `b` has shape `(dim_out,)`; both
    are drawn uniformly from `[-1/sqrt(dim_in), 1/sqrt(dim_in)]`.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()

        stdv = 1. / np.sqrt(dim_in)
        self.W = np.random.uniform(-stdv, stdv, size=(dim_in, dim_out))
        self.b = np.random.uniform(-stdv, stdv, size=dim_out)

        # Gradients exist from the start so grad_parameters() is usable
        # before the first backward pass.
        self.grad_W = np.zeros_like(self.W)
        self.grad_b = np.zeros_like(self.b)

    def forward(self, input):
        """Return `input @ W + b` and cache it in `self.output`."""
        self.output = np.dot(input, self.W) + self.b
        return self.output

    def backward(self, input, grad_output):
        """Store the parameter gradients and return the gradient w.r.t. `input`.

        `grad_output` is the gradient of the loss w.r.t. this layer's output.
        Assumes `input` is 2-D `(batch, dim_in)` — TODO confirm with callers.
        """
        # Both parameter gradients accumulate (sum) over the batch axis so
        # that W and b are scaled consistently; any batch averaging is the
        # job of the loss that produced grad_output.
        self.grad_b = np.sum(grad_output, axis=0)
        self.grad_W = np.dot(input.T, grad_output)

        grad_input = np.dot(grad_output, self.W.T)
        return grad_input

    def parameters(self):
        """Return `[W, b]`."""
        return [self.W, self.b]

    def grad_parameters(self):
        """Return `[grad_W, grad_b]`, in the same order as `parameters()`."""
        return [self.grad_W, self.grad_b]

# Activation functions

**ReLU**

In [5]:
class ReLU(Module):
    """Rectified linear unit: element-wise `max(x, 0)`."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Clamp negative entries of `input` to zero and cache the result."""
        rectified = np.maximum(input, 0)
        self.output = rectified
        return rectified

    def backward(self, input, grad_output):
        """Let the gradient through only where `input` was strictly positive."""
        is_positive = input > 0
        return grad_output * is_positive

**LeakyReLU**

In [6]:
class LeakyReLU(Module):
    """Leaky ReLU: `x` for positive entries, `slope * x` otherwise."""

    def __init__(self, slope=0.03):
        super().__init__()

        self.slope = slope

    def forward(self, input):
        """Apply the activation element-wise and cache the result."""
        # Select per element instead of multiplying by boolean masks:
        # a mask product evaluates 0 * inf = NaN for infinite activations.
        self.output = np.where(input > 0, input, input * self.slope)
        return self.output

    def backward(self, input, grad_output):
        """Scale `grad_output` by 1 where `input > 0` and by `slope` elsewhere."""
        local_grad = np.where(input > 0, 1., self.slope)
        grad_input = grad_output * local_grad

        return grad_input

**Sigmoid**

In [7]:
class Sigmoid(Module):
    """Logistic sigmoid activation: `1 / (1 + exp(-x))`."""

    def __init__(self, slope=0.03):
        # `slope` is unused; it is kept only so existing calls that pass it
        # keep working.
        super().__init__()

    def forward(self, input):
        """Apply the sigmoid element-wise and cache the result."""
        self.output = 1. / (1. + np.exp(-input))

        return self.output

    def backward(self, input, grad_output):
        """Return `grad_output * s * (1 - s)` with `s = sigmoid(input)`.

        The sigmoid is recomputed from `input`, so the result does not
        depend on a preceding `forward` call.
        """
        sig = 1. / (1. + np.exp(-input))
        grad_input = grad_output * sig * (1. - sig)

        return grad_input

**SoftMax**

In [8]:
class SoftMax(Module):
    """Row-wise softmax over axis 1.

    The reductions use `axis=1`, so `input` must have at least two
    dimensions; presumably it is `(batch, classes)` — confirm with callers.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _softmax(input):
        """Return the softmax of each row of `input`."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # exp() from overflowing.
        shifted = np.exp(np.subtract(input, input.max(axis=1, keepdims=True)))
        return shifted / np.sum(shifted, axis=1, keepdims=True)

    def forward(self, input):
        """Compute the row-wise softmax and cache it in `self.output`."""
        self.output = self._softmax(input)
        return self.output

    def backward(self, input, grad_output):
        """Return the gradient of the loss w.r.t. `input` as an array.

        For one row with softmax output `s` and upstream gradient `g`, the
        Jacobian-vector product is `s * (g - sum(g * s))`; this is evaluated
        for all rows at once instead of building each Jacobian explicitly.
        """
        probs = self._softmax(input)
        weighted = np.sum(grad_output * probs, axis=1, keepdims=True)
        grad_input = probs * (grad_output - weighted)

        return grad_input

# Regularization

**Dropout**

In [9]:
class Dropout(Module):
    """Inverted dropout: zero each element with probability `p` in training.

    Surviving elements are scaled by `1 / (1 - p)` so the expected value of
    the output equals the input. In evaluation mode the layer is the
    identity.
    """

    def __init__(self, p=0.5):
        super().__init__()

        self.p = p
        self.mask = None

    def forward(self, input):
        """Apply a freshly sampled dropout mask (training) or pass through (eval)."""
        if self._train:
            keep_prob = 1. - self.p
            kept = np.random.binomial(1, keep_prob, size=input.shape)
            if keep_prob > 0:
                # Scale by the keep probability (not by p) so that
                # E[mask] == 1 for every p, not only for p == 0.5.
                self.mask = kept / keep_prob
            else:
                # p == 1 drops everything; avoid dividing by zero.
                self.mask = kept.astype(float)
            self.output = self.mask * input
        else:
            self.output = input
        return self.output

    def backward(self, input, grad_output):
        """Apply the mask sampled by the last training `forward` to the gradient."""
        if self._train:
            grad_input = self.mask * grad_output
        else:
            grad_input = grad_output
        return grad_input

**BatchNorm**

In [10]:
class BatchNorm(Module):
    """Batch normalisation over the batch axis with a fixed scale `gamma`.

    In training mode every feature (column) is normalised with the mean and
    variance of the current batch, and running estimates of both are
    updated. In evaluation mode the running estimates are used instead.

    Parameters:
        num_features: number of columns of the input; sizes the running
            statistics.
        gamma: scale applied to the normalised input (a scalar or an array
            that broadcasts against a row of the input).
        momentum: weight of the previous running estimate in each update.
        eps: constant added to the variance before taking the square root.

    `gamma` is not reported by `parameters()`; its gradient is stored in
    `self.grad_gamma` (one value per feature) after each `backward`.
    Assumes 2-D input `(batch, num_features)` — TODO confirm with callers.
    """

    def __init__(self, num_features, gamma, momentum=0.9, eps=1e-8):
        super().__init__()
        self.gamma = gamma
        self.momentum = momentum
        self.eps = eps
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)

    def forward(self, input):
        """Normalise `input` per feature, scale by `gamma`, cache the result."""
        if self._train:
            # Statistics are taken over the batch axis (axis 0) so that they
            # match the reductions performed in backward().
            self.mu = np.mean(input, axis=0)
            self.sigma = np.var(input, axis=0)  # note: holds the variance

            keep = self.momentum
            self.running_mean = keep * self.running_mean + (1. - keep) * self.mu
            self.running_var = keep * self.running_var + (1. - keep) * self.sigma

            mean, var = self.mu, self.sigma
        else:
            mean, var = self.running_mean, self.running_var

        input_norm = (input - mean) / np.sqrt(var + self.eps)
        self.output = self.gamma * input_norm
        return self.output

    def backward(self, input, grad_output):
        """Return the gradient w.r.t. `input` and store `self.grad_gamma`.

        In training mode the batch statistics cached by the last `forward`
        are used and the gradient flows through the mean and the variance.
        In evaluation mode the running statistics are constants, so the
        layer is a per-feature affine map.
        """
        if self._train:
            n = input.shape[0]
            input_mu = input - self.mu
            std_inv = 1. / np.sqrt(self.sigma + self.eps)

            grad_input_norm = grad_output * self.gamma
            grad_sigma = np.sum(grad_input_norm * input_mu, axis=0) * -.5 * std_inv**3
            grad_mu = (np.sum(grad_input_norm * -std_inv, axis=0)
                       + grad_sigma * np.mean(-2. * input_mu, axis=0))

            grad_input = ((grad_input_norm * std_inv)
                          + (grad_sigma * 2 * input_mu / n)
                          + (grad_mu / n))
            input_norm = input_mu * std_inv
        else:
            std_inv = 1. / np.sqrt(self.running_var + self.eps)
            grad_input = grad_output * self.gamma * std_inv
            input_norm = (input - self.running_mean) * std_inv

        # d(output)/d(gamma) is the normalised input, not the raw input.
        self.grad_gamma = np.sum(grad_output * input_norm, axis=0)

        return grad_input

# Loss functions

In [11]:
class Criterion:
    """Abstract base class for loss functions.

    A criterion compares a prediction (`input`) with a `target`: `forward`
    evaluates the loss and `backward` its gradient w.r.t. `input`.
    """

    def forward(self, input, target):
        """Evaluate the loss; subclasses must override."""
        raise NotImplementedError()

    def backward(self, input, target):
        """Evaluate the gradient of the loss w.r.t. `input`; subclasses must override."""
        raise NotImplementedError()

**MSE**

In [12]:
class MSE(Criterion):
    """Squared-error loss: sum of squared differences divided by the batch size."""

    def forward(self, input, target):
        """Return the summed squared error divided by `input.shape[0]`.

        `target` is reshaped to the shape of `input` before subtracting.
        """
        residual = input - target.reshape(input.shape)
        squared_error = np.power(residual, 2)
        self.output = np.sum(squared_error) / input.shape[0]
        return self.output

    def backward(self, input, target):
        """Return the gradient of the loss w.r.t. `input`."""
        n_rows = input.shape[0]
        residual = input - target.reshape(input.shape)
        doubled = residual * 2
        return doubled / n_rows

**CrossEntropy**

In [13]:
class CrossEntropy(Criterion):
    """Cross-entropy between predicted probabilities and target distributions.

    `input` holds probabilities (for example the output of a softmax
    layer), not raw scores. `target` has the same shape as `input`; the
    original code indexed its second axis, so it is presumably a one-hot
    `(batch, classes)` matrix — confirm with callers. Soft labels work as
    well.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        """Return the loss averaged over the batch as a scalar."""
        # Clamp so that the logarithm of zero is never taken.
        eps = 1e-9
        input_clamp = np.clip(input, eps, 1 - eps)

        batch_size = target.shape[0]

        # Average over samples (axis 0), consistent with MSE; the number of
        # classes plays no role in the normalisation.
        self.output = -np.sum(target * np.log(input_clamp)) / batch_size

        return self.output

    def backward(self, input, target):
        """Return the gradient of the batch-mean loss w.r.t. `input`.

        This is the derivative with respect to the probabilities,
        `-target / input / batch_size`; the Jacobian of the layer that
        produced the probabilities is applied by that layer's `backward`.
        """
        eps = 1e-9
        input_clamp = np.clip(input, eps, 1 - eps)

        batch_size = target.shape[0]
        grad_input = -target / input_clamp / batch_size

        return grad_input