# Introduction to Deep Learning

## Imports

In [1]:
import numpy as np
import pandas as pd
from functools import partial


## Defining Entities.

In [115]:
class HyperParameters:
    """Plain container for the settings used when building and training a network.

    Every argument is stored unchanged on the instance; ``mini_batch_size`` is
    stored under the attribute name ``batch_size``.

    Args:
        learning_rate: step size applied to the gradients.
        epochs: number of training passes.
        mini_batch_size: stored as ``batch_size``.
        beta, beta1, beta2: stored as given.
            NOTE(review): presumably momentum / Adam coefficients -- confirm.
        layers: list of layer sizes; ``[10, 20, 10]`` when omitted.
    """

    def __init__(
        self,
        learning_rate=0.01,
        epochs=10,
        mini_batch_size=None,
        beta=.9,
        layers=None,
        beta1=.9,
        beta2=.998,
    ):
        # Build the default list per instance so instances never share one list.
        self.layers = [10, 20, 10] if layers is None else layers
        self.learning_rate, self.epochs = learning_rate, epochs
        self.batch_size = mini_batch_size
        self.beta, self.beta1, self.beta2 = beta, beta1, beta2


class WeightAndBias:
    """Weight matrices and bias vectors for every layer of the network.

    ``weights`` and ``biases`` are lists whose entry 0 is an empty DataFrame
    placeholder, so the parameters of layer ``l`` sit at index ``l``.
    """

    def __init__(self, number_features, layers, initialisation_type="random"):
        """Create the parameters and print the list of weight shapes.

        Args:
            number_features: size of the input layer.
            layers: list of layer sizes that follow the input layer.
            initialisation_type: stored on the instance; not consulted here.
        """
        self.initialisation_type = initialisation_type
        self.layers = [number_features] + layers

        # (units in this layer, units in the previous layer) per weight matrix
        shapes = [
            (fan_out, fan_in)
            for fan_in, fan_out in zip(self.layers[:-1], self.layers[1:])
        ]

        # Small Gaussian weights, drawn layer by layer in order.
        self.weights = [pd.DataFrame()]
        for fan_out, fan_in in shapes:
            self.weights.append(np.random.randn(fan_out, fan_in) * 0.01)

        # Biases start at zero, one column vector per layer.
        self.biases = [pd.DataFrame()]
        for fan_out, _ in shapes:
            self.biases.append(np.zeros([fan_out, 1]))

        print(shapes)

    def update_learning_parameters(self):
        """Placeholder; does nothing."""
        pass
    

class ActivationFunctions:
    """Activation functions, their derivatives and the cross-entropy loss.

    An instance resolves a list of activation names (one per layer) into two
    parallel lists, ``activation_functions`` and ``derivative_functions``.
    Entry 0 of both lists is ``None`` so that layer ``l`` is found at index ``l``.

    The ``*_derivative`` functions come in two flavours:

    * ``tanh_derivative`` / ``relu_derivative`` take the pre-activation ``z``.
    * ``softmax_derivative`` / ``sigmoid_derivative`` take ``(y, a)`` and return
      ``a - y``, so they only fit a layer whose derivative is called with the
      targets and the activations.
    """

    # Slope used by ``relu`` for non-positive inputs (leaky ReLU).
    _RELU_NEGATIVE_SLOPE = 0.0001

    def __init__(self, layers, activation_functions=None):
        """Resolve activation names to callables.

        Args:
            layers: list of layer sizes; only its length is used, to build the
                default of ``tanh`` for every layer but the last and
                ``softmax`` for the last.
            activation_functions: optional list of names such as ``'tanh'``,
                ``'relu'``, ``'sigmoid'``, ``'softmax'``.

        Raises:
            AttributeError: if a name (or its ``_derivative``) is not defined
                on this class.
        """
        if activation_functions is None:
            activation_functions = ['tanh'] * (len(layers) - 1) + ['softmax']

        # getattr instead of eval: same lookup, but a name can never be
        # executed as arbitrary code.
        self.activation_functions = [None] + [
            getattr(ActivationFunctions, name) for name in activation_functions
        ]
        self.derivative_functions = [None] + [
            getattr(ActivationFunctions, f'{name}_derivative')
            for name in activation_functions
        ]

    @staticmethod
    def sigmoid(z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        return 1 / (1 + np.exp(-z))

    @staticmethod
    def relu(z):
        """Return leaky ReLU: ``z`` where positive, a small multiple of ``z`` elsewhere."""
        return np.where(z > 0, z, ActivationFunctions._RELU_NEGATIVE_SLOPE * z)

    @staticmethod
    def tanh(z):
        """Return the hyperbolic tangent of ``z`` element-wise.

        ``np.tanh`` saturates cleanly for large ``|z|``, so no clipping is
        needed, and it matches what ``tanh_derivative`` uses.
        """
        return np.tanh(z)

    @staticmethod
    def softmax(z):
        """Return the softmax of ``z`` along axis 0 (one distribution per column).

        The column maximum is subtracted before exponentiating. This is
        mathematically exact and cannot overflow, whereas clipping the logits
        to a fixed range distorts the result (e.g. logits 30 and 25 would both
        clip to 20 and come out as equal probabilities).
        """
        shifted = z - np.max(z, axis=0, keepdims=True)
        exponentials = np.exp(shifted)
        return exponentials / np.sum(exponentials, axis=0, keepdims=True)

    @staticmethod
    def softmax_derivative(y, a):
        """Return ``a - y`` for targets ``y`` and activations ``a``."""
        return a - y

    @staticmethod
    def sigmoid_derivative(y, a):
        """Return ``a - y`` for targets ``y`` and activations ``a``."""
        return a - y

    @staticmethod
    def tanh_derivative(z):
        """Return ``1 - tanh(z)**2``, the derivative of tanh at ``z``."""
        return (1 - np.tanh(z) ** 2)

    @staticmethod
    def relu_derivative(z):
        """Return the derivative of the leaky ``relu`` above at ``z``.

        The slope for non-positive inputs matches ``relu`` (returning 0 there
        would be the derivative of a different function).
        """
        return np.where(z > 0, 1.0, ActivationFunctions._RELU_NEGATIVE_SLOPE)

    @staticmethod
    def calculate_loss(a, y, m):
        """Return the cross-entropy ``-1/m * sum(y * log(a))``.

        Args:
            a: predicted probabilities.
            y: targets, same shape as ``a``.
            m: divisor, the number of examples.

        Probabilities are floored at the smallest positive normal float so an
        exact 0 yields a large finite penalty (or 0 where ``y`` is 0) instead
        of ``-inf`` / ``nan``.
        """
        safe_a = np.maximum(a, np.finfo(float).tiny)
        return -1 / m * np.sum(np.multiply(y, np.log(safe_a)))
    

class NeuralNetwork:
    """Fully connected feed-forward network trained on the whole training set per epoch.

    Data is column-oriented: ``X_train`` has shape (features, examples) and the
    targets are subtracted element-wise from the output activations, so
    ``y_train`` must have shape (size of last layer, examples).

    Per-layer state is kept in lists indexed by layer number ``l`` (1-based):
    ``Z`` / ``A`` hold pre-activations / activations of the most recent
    forward pass (``A[0]`` is the input), and ``dZ`` / ``dW`` / ``db`` hold
    the gradients of the most recent backward pass.
    """

    def __init__(self, X_train, y_train, HyperParameters, activation_functions=None):
        """Store the data, print its dimensions and create the parameters.

        Args:
            X_train: training inputs, shape (features, examples).
            y_train: training targets, shape (size of last layer, examples).
            HyperParameters: a hyper-parameter *instance* (the parameter name
                shadows the class of the same name); its ``layers``,
                ``learning_rate`` and ``epochs`` attributes are read.
            activation_functions: optional list of activation names, one per
                layer, forwarded to ``ActivationFunctions``.
        """
        self.X_train, self.y_train = X_train, y_train
        self.n, self.m = X_train.shape  # n features (rows), m training examples (columns)

        # NOTE(review): there is no newline between the two string parts, so
        # the output reads "...features: <n>shape of y_train ..."; kept as is.
        print(f"number of training examples: {self.m}\nnumber of features: {self.n}"
              f"shape of y_train {self.y_train.shape}")
        self.hp = HyperParameters
        self.layers = self.hp.layers
        self.no_l = len(self.layers)  # number of layers, input layer excluded

        self.act_function_obj = ActivationFunctions(self.layers, activation_functions=activation_functions)

        self.lp = WeightAndBias(self.n, self.layers)  # lp --> learning parameters

    # TODO: move this onto the weights class.
    def update_learning_parameters(self):
        """Apply one gradient-descent step to every layer using ``dW`` / ``db``."""
        for l in range(1, self.no_l + 1):
            self.lp.biases[l] = self.lp.biases[l] - self.hp.learning_rate * self.db[l]
            self.lp.weights[l] = self.lp.weights[l] - self.hp.learning_rate * self.dW[l]

    def forward_propagation(self, X_test=None):
        """Run a forward pass and store the result in ``self.Z`` / ``self.A``.

        Args:
            X_test: inputs of shape (features, examples); the training inputs
                are used when omitted.

        Note that ``self.Z`` and ``self.A`` are overwritten on every call,
        including calls made with ``X_test``.
        """
        # TODO: the way the forward pass is called needs to be changed.
        if X_test is not None:
            X = X_test
        else:
            X = self.X_train
        self.Z, self.A = [0] + [None] * self.no_l, [X] + [None] * self.no_l
        activation_functions = self.act_function_obj.activation_functions

        for l in range(1, self.no_l + 1):
            self.Z[l] = np.dot(self.lp.weights[l], self.A[l - 1]) + self.lp.biases[l]
            self.A[l] = activation_functions[l](self.Z[l])

    def back_propagation(self):
        """Compute ``dZ``, ``dW`` and ``db`` for every layer from the last forward pass.

        The last layer's derivative function is called with
        ``(y_train, activations)``; every earlier layer's derivative function
        is called with that layer's pre-activation ``Z[l]``.
        """
        derivative_functions = self.act_function_obj.derivative_functions
        self.dZ = [None] + [None] * self.no_l
        self.dW = [None] + [None] * self.no_l
        self.db = [None] + [None] * self.no_l

        # Output layer.
        self.dZ[self.no_l] = derivative_functions[self.no_l](self.y_train, self.A[self.no_l])
        self.dW[self.no_l] = 1 / self.m * np.dot(self.dZ[self.no_l], self.A[self.no_l - 1].T)
        self.db[self.no_l] = 1 / self.m * np.sum(self.dZ[self.no_l], axis=1, keepdims=True)

        # Internal invariants: every gradient has the shape of what it updates.
        assert self.dZ[self.no_l].shape == self.Z[self.no_l].shape
        assert self.db[self.no_l].shape == self.lp.biases[self.no_l].shape
        assert self.dW[self.no_l].shape == self.lp.weights[self.no_l].shape

        # Remaining layers, from the last hidden layer back to layer 1.
        for l in range(self.no_l - 1, 0, -1):

            self.dZ[l] = np.dot(self.lp.weights[l + 1].T, self.dZ[l + 1]) * derivative_functions[l](self.Z[l])
            self.dW[l] = 1 / self.m * np.dot(self.dZ[l], self.A[l - 1].T)
            self.db[l] = 1 / self.m * np.sum(self.dZ[l], axis=1, keepdims=True)

            assert self.dZ[l].shape == self.Z[l].shape
            assert self.dW[l].shape == self.lp.weights[l].shape
            assert self.db[l].shape == self.lp.biases[l].shape

    def train_nn(self, verbose=False, per_epoch_log=100):
        """Train for ``hp.epochs`` epochs and print the loss of the last epoch.

        Each epoch runs forward pass -> backward pass -> parameter update on
        the full training set.

        Args:
            verbose: when true, also print the loss every ``per_epoch_log`` epochs.
            per_epoch_log: logging interval in epochs.
        """
        # With no epochs there is neither a loop variable nor any activations
        # to report on; the final print below would fail.
        if self.hp.epochs <= 0:
            return

        for epoch in range(self.hp.epochs):
            self.forward_propagation()

            if verbose and epoch % per_epoch_log == 0:
                print(f"epochs {epoch} loss: ", ActivationFunctions.calculate_loss(self.A[self.no_l], self.y_train, self.m))

            self.back_propagation()

            self.update_learning_parameters()

        # Loss of the last forward pass, i.e. measured before the final update.
        print(f"epochs {epoch} loss: ", ActivationFunctions.calculate_loss(self.A[self.no_l], self.y_train, self.m))

    def predict(self, X_test):
        """Return 0/1 class indicators for ``X_test``.

        Args:
            X_test: inputs of shape (features, examples).

        Returns:
            Integer array with one row per example and one column per output
            unit; an entry is 1 where the unit equals that row's maximum.

        Side effect: ``self.Z`` and ``self.A`` now hold the activations for
        ``X_test``.
        """
        self.forward_propagation(X_test=X_test)
        preds = self.A[self.no_l].T
        return (preds == preds.max(axis=1)[:, None]).astype(int)


## Load Training Data

In [88]:
# Read the training set from 'train.csv' (path relative to the working
# directory). Later cells use its 'label' column as the target and all
# remaining columns, divided by 255, as input features.
train_data = pd.read_csv('train.csv')

In [117]:
m = 4096  # number of training rows to use (the full set would be train_data.shape[0])
test_samples = 10  # rows held out for testing, taken right after the training slice

# Drop the label column once instead of once per slice.
features = train_data.drop('label', axis=1)

# Divide by 255 so the features are scaled down before training.
X = features.iloc[0:m] / 255
X_test = features.iloc[m:m + test_samples] / 255

# Raw integer labels of the test slice.
y_test = train_data.label.iloc[m:m + test_samples]


def one_hot_encoding_y(train_data, num_classes=10):
    """Return a one-hot matrix built from ``train_data.label``.

    Args:
        train_data: object whose ``label`` attribute holds integer class ids
            (a DataFrame with a ``label`` column in this notebook).
        num_classes: number of columns of the encoding. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        Float array of shape (number of labels, num_classes) in which row ``i``
        has a 1 in column ``label[i]`` and 0 elsewhere.

    Raises:
        IndexError: if a label is ``>= num_classes``.
    """
    labels = np.asarray(train_data.label)
    encoded = np.zeros((labels.size, num_classes))
    # Fancy indexing sets exactly one entry per row.
    encoded[np.arange(labels.size), labels] = 1
    return encoded
# One-hot encode the labels once, then slice the train and test parts from the
# same matrix (rows 0..m-1 for training, the next test_samples rows for testing).
one_hot_labels = one_hot_encoding_y(train_data)
y = one_hot_labels[:m]
y_test = one_hot_labels[m:m + test_samples]

# The reshapes do not change the data; they fail loudly if train_data has
# fewer rows than requested or the encoding does not have 10 columns.
y = np.reshape(y, (m, 10))
y_test = np.reshape(y_test, (test_samples, 10))

# One hidden layer of 256 units and a 10-unit output layer.
# (An earlier configuration was [100, 15, 10].)
layers = [256, 10]
activation_functions = ['tanh'] * (len(layers) - 1) + ['softmax']

# The network expects column-oriented data, hence the transposes.
nn = NeuralNetwork(X.T, y.T, HyperParameters(layers=layers, learning_rate=0.5, epochs=100),
                   activation_functions=activation_functions)


number of training examples: 4096
number of features: 784shape of y_train (10, 4096)
[(256, 784), (10, 256)]


In [125]:
# Train for the configured number of epochs, printing the loss at every 50th
# epoch and once more after the last epoch.
# NOTE(review): the recorded epoch-0 loss (~0.28) is far below ln(10) ~= 2.30,
# which a 10-class softmax with freshly initialised 0.01-scaled weights would
# give. Together with the jump in cell numbers this suggests the cell was
# re-run on an already trained network -- confirm before quoting these numbers.
nn.train_nn(verbose=True, per_epoch_log=50)

epochs 0 loss:  0.28413482305468574
epochs 50 loss:  0.2348511969028756
epochs 99 loss:  0.20196587701948127


In [126]:
def prob_preds(preds):
    """Turn each row of scores into 0/1 indicators of that row's maximum."""
    return (preds == preds.max(axis=1)[:, None]).astype(int)


# nn.A holds the activations of the most recent forward pass; index nn.no_l is
# the output layer whatever the number of layers. Transposed so that each row
# is one example.
train_pred_labels = np.argmax(prob_preds(nn.A[nn.no_l].T), axis=1)
train_true_labels = np.argmax(y, axis=1)

r = np.sum(train_true_labels == train_pred_labels)
w = np.sum(train_true_labels != train_pred_labels)
print(f"total number of examples: {m}\nnumber of right predictions: {r}\nnumber of wrong predictions: {w}\n"
     f"accuracy on train: {r/m * 100}")

total number of examples: 4096
number of right predictions: 3880
number of wrong predictions: 216
accuracy on train: 94.7265625


In [127]:
# predict() already returns one row of 0/1 class indicators per test example,
# so the predicted label is simply the position of the 1 in each row.
preds = nn.predict(X_test.T)
test_pred_labels = np.argmax(preds, axis=1)
test_true_labels = np.argmax(y_test, axis=1)

r = np.sum(test_true_labels == test_pred_labels)
w = np.sum(test_true_labels != test_pred_labels)
r, w

(9, 1)