<a href="https://colab.research.google.com/github/MLandML/MLandML/blob/main/neural_network_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nnfs

In [None]:
#@title scihou NN using only numpy

import numpy as np
import matplotlib.pyplot as plt
#import pandas as pd


import nnfs

from nnfs.datasets import spiral_data

nnfs.init()

np.random.seed(0)

class Layer_Dense:
    """Fully connected layer: ``output = inputs . weights + biases``."""

    def __init__(self, n_inputs, n_outputs):
        """Create the layer's parameters.

        Weights are an (n_inputs, n_outputs) array of standard-normal draws
        scaled by 0.01; biases are a (1, n_outputs) row of zeros.
        """
        weight_shape = (n_inputs, n_outputs)
        self.weights = np.random.randn(*weight_shape) * 0.01
        self.biases = np.zeros((1, n_outputs))

    def forward(self, inputs):
        """Store ``inputs`` and compute the affine output into ``self.output``."""
        self.inputs = inputs
        projected = np.dot(inputs, self.weights)
        self.output = projected + self.biases

    def backward(self, dvalues):
        """Compute gradients from the upstream gradient ``dvalues``.

        Sets ``self.dweights``, ``self.dbiases`` (summed over axis 0, kept as
        a row) and ``self.dinputs`` (gradient passed to the previous layer).
        """
        inputs_transposed = self.inputs.T
        self.dweights = np.dot(inputs_transposed, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        weights_transposed = self.weights.T
        self.dinputs = np.dot(dvalues, weights_transposed)

class Activation_ReLu:
    """Rectified linear unit: keeps positive values, replaces the rest with 0."""

    def forward(self, inputs):
        """Remember ``inputs`` and store ``max(0, inputs)`` in ``self.output``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Store in ``self.dinputs`` a copy of ``dvalues`` zeroed where input <= 0.

        ``dvalues`` itself is left unmodified.
        """
        gradient = dvalues.copy()
        inactive = self.inputs <= 0
        gradient[inactive] = 0
        self.dinputs = gradient

class Softmax_Activation:
    """Softmax activation: turns each row of scores into probabilities."""

    def forward(self, inputs):
        """Compute row-wise softmax of ``inputs`` into ``self.output``.

        ``inputs`` is indexed along axis 1 for the per-row maximum and sum,
        so it must be at least 2-D (one row per sample).
        """
        self.inputs = inputs
        # Subtracting the row maximum does not change the softmax result but
        # keeps np.exp from overflowing on large scores.
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(shifted)
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def backward(self, dvalues):
        """Propagate ``dvalues`` through the softmax into ``self.dinputs``.

        Must be called after ``forward``. For one sample with output ``s`` the
        Jacobian is ``diag(s) - s s^T``; multiplying it by the upstream
        gradient ``d`` gives ``s * (d - sum(s * d))``, which is evaluated here
        for all rows at once instead of building each Jacobian.
        """
        dvalues = np.asarray(dvalues)
        weighted_sum = np.sum(self.output * dvalues, axis=1, keepdims=True)
        self.dinputs = self.output * (dvalues - weighted_sum)

class Loss:
    """Base class for losses.

    Subclasses provide ``forward(outputs, y)`` returning per-sample losses.
    """

    def calculate(self, outputs, y):
        """Return the mean of the per-sample losses from ``self.forward``."""
        return np.mean(self.forward(outputs, y))

class Categorical_cross_entropy(Loss):
    """Categorical cross-entropy loss over predicted class probabilities.

    Labels may be sparse (1-D class indices) or one-hot (2-D, same shape as
    the predictions).
    """

    def forward(self, y_pred, y_true):
        """Return the per-sample negative log of the true-class probability.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        y_true = np.asarray(y_true)
        # Clip on both sides so that log(0) cannot occur.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        samples = len(y_pred_clipped)

        if y_true.ndim == 1:
            # Sparse labels: pick the predicted probability of the true class.
            correct_confidences = y_pred_clipped[range(samples), y_true]
        elif y_true.ndim == 2:
            # One-hot labels: the mask keeps only the true-class probability.
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            raise ValueError(
                f'y_true must be 1-D or 2-D, got {y_true.ndim} dimensions')

        return -np.log(correct_confidences)

    def backward(self, y_pred, y_true):
        """Store the gradient of the mean loss w.r.t. ``y_pred`` in ``self.dinputs``.

        The gradient is ``-y_true / y_pred`` divided by the number of
        samples. Predictions are clipped exactly as in ``forward`` so the
        division cannot hit zero.

        Raises:
            ValueError: if ``y_true`` is neither 1-D nor 2-D.
        """
        y_true = np.asarray(y_true)
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
        samples = len(y_pred_clipped)
        labels = len(y_pred_clipped[0])

        if y_true.ndim == 1:
            # Expand sparse class indices to one-hot rows.
            y_true = np.eye(labels)[y_true]
        elif y_true.ndim != 2:
            raise ValueError(
                f'y_true must be 1-D or 2-D, got {y_true.ndim} dimensions')

        self.dinputs = -y_true / y_pred_clipped / samples

"""class Accuracy:
    def calculate(self,y_pred,y_true):
        predictions=np.argmax(y_pred,axis=1)

        if len(y_true.shape)==1:
            self.accuracy=np.mean(predictions==y_true)
        
        if len(y_true.shape)==2:
            class_targets=np.argmax(y_true,axis=1)
            self.accuracy=np.mean(predictions==class_targets)"""
        
class Activation_Softmax_Loss_Categorical_Crossentropy():
    """Softmax activation fused with categorical cross-entropy loss."""

    def __init__(self):
        """Create the wrapped softmax activation and cross-entropy loss."""
        self.activation = Softmax_Activation()
        self.loss = Categorical_cross_entropy()

    def forward(self, inputs, y_true):
        """Run the activation on ``inputs`` and return the loss against ``y_true``.

        The activation's output is also exposed as ``self.output``.
        """
        softmax = self.activation
        softmax.forward(inputs)
        probabilities = softmax.output
        self.output = probabilities
        return self.loss.calculate(probabilities, y_true)

    def backward(self, dvalues, y_true):
        """Store the combined gradient in ``self.dinputs``.

        A copy of ``dvalues`` has 1 subtracted at each sample's true class
        and is then divided by the number of samples. One-hot ``y_true`` is
        first reduced to class indices.
        """
        n_samples = len(dvalues)
        labels = np.argmax(y_true, axis=1) if y_true.ndim == 2 else y_true

        gradient = dvalues.copy()
        gradient[range(n_samples), labels] -= 1
        self.dinputs = gradient / n_samples

class Optimizer_SGD:
    """Stochastic gradient descent with optional decay and momentum.

    Args:
        learning_rate: initial step size.
        decay: learning-rate decay factor; 0 disables decay.
        epsilon: momentum coefficient applied to the previous update;
            0 disables momentum.

    Call order per training step: ``pre_update_params``, ``update_params``
    for each layer, then ``post_update_params``.
    """

    def __init__(self, learning_rate=1., decay=0., epsilon=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon
        self.iterations = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` as lr / (1 + decay * iterations)."""
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1 / (1. + self.decay * self.iterations))

    def update_params(self, layer):
        """Update ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases``. With momentum enabled
        the previous updates are kept on the layer as ``weight_momentum`` and
        ``bias_momentum``.
        """
        if self.epsilon:
            # Lazily create the per-layer momentum buffers.
            if not hasattr(layer, 'weight_momentum'):
                layer.weight_momentum = np.zeros_like(layer.weights)
            if not hasattr(layer, 'bias_momentum'):
                layer.bias_momentum = np.zeros_like(layer.biases)

            weight_updates = self.epsilon * layer.weight_momentum - self.current_learning_rate * layer.dweights
            layer.weight_momentum = weight_updates
            bias_updates = self.epsilon * layer.bias_momentum - self.current_learning_rate * layer.dbiases
            layer.bias_momentum = bias_updates
        else:
            # Use the decayed rate here too; the base rate would ignore decay.
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        """Advance the iteration counter used by the decay schedule."""
        self.iterations += 1

class Optimizer_AdaGrad:
    """AdaGrad: per-parameter steps scaled by accumulated squared gradients."""

    def __init__(self, learning_rate=1., decay=0., delta=1e-7):
        """Store hyper-parameters; ``delta`` is added to the sqrt denominator."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.delta = delta
        self.iterations = 0

    def pre_update_params(self):
        """Apply learning-rate decay for the current iteration, if enabled."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Accumulate squared gradients on ``layer`` and step its parameters.

        ``layer.weight_cache`` / ``layer.bias_cache`` are created on first use
        and, like the parameters, are modified in place.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        step = self.current_learning_rate

        layer.weight_cache += layer.dweights ** 2
        weight_scale = np.sqrt(layer.weight_cache) + self.delta
        layer.weights -= step * layer.dweights / weight_scale

        layer.bias_cache += layer.dbiases ** 2
        bias_scale = np.sqrt(layer.bias_cache) + self.delta
        layer.biases -= step * layer.dbiases / bias_scale

    def post_update_params(self):
        """Count one completed optimisation step."""
        self.iterations += 1

class Optimizer_RMSProp:
    """RMSProp: steps scaled by a moving average of squared gradients."""

    def __init__(self, learning_rate=0.001, decay=0., delta=1e-7, rho=0.9):
        """Store hyper-parameters; ``rho`` is the moving-average retention."""
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.delta = delta
        self.rho = rho
        self.iterations = 0

    def pre_update_params(self):
        """Apply learning-rate decay for the current iteration, if enabled."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Refresh the layer's squared-gradient averages and step its parameters.

        ``layer.weight_cache`` / ``layer.bias_cache`` are created on first use
        and replaced with new arrays on each call; the parameters themselves
        are updated in place.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        rho = self.rho
        step = self.current_learning_rate

        weight_avg = rho * layer.weight_cache + (1 - rho) * layer.dweights ** 2
        layer.weight_cache = weight_avg
        layer.weights -= step * layer.dweights / (np.sqrt(weight_avg) + self.delta)

        bias_avg = rho * layer.bias_cache + (1 - rho) * layer.dbiases ** 2
        layer.bias_cache = bias_avg
        layer.biases -= step * layer.dbiases / (np.sqrt(bias_avg) + self.delta)

    def post_update_params(self):
        """Count one completed optimisation step."""
        self.iterations += 1

class Optimizer_RMSProp_Nesterov_momentum:
    """RMSProp combined with Nesterov momentum.

    Args:
        learning_rate: initial step size.
        decay: learning-rate decay factor; 0 disables decay.
        epsilon: momentum coefficient; with 0 the update reduces to plain
            RMSProp.
        rho: retention factor of the squared-gradient moving average.
        delta: small constant added to the square root so that a zero
            gradient does not produce 0/0.

    Gradients are computed outside the optimizer, at the current parameters,
    so the Nesterov look-ahead is applied in its reformulated form::

        v <- epsilon * v - lr * g / (sqrt(cache) + delta)
        p <- p + epsilon * v - lr * g / (sqrt(cache) + delta)

    Each call to ``update_params`` applies this step exactly once.
    """

    def __init__(self, learning_rate=0.001, decay=0., epsilon=0., rho=0.9, delta=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon
        self.rho = rho
        self.delta = delta
        self.iterations = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` as lr / (1 + decay * iterations)."""
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))

    def _ensure_state(self, layer):
        """Create any missing cache/momentum buffer on ``layer``."""
        buffers = (
            ('weight_cache', layer.weights),
            ('weight_momentum', layer.weights),
            ('bias_cache', layer.biases),
            ('bias_momentum', layer.biases),
        )
        # Each buffer is checked on its own so a layer that already carries
        # momentum (e.g. from another optimizer) still gets its cache.
        for name, like in buffers:
            if not hasattr(layer, name):
                setattr(layer, name, np.zeros_like(like))

    def _step(self, grad, cache, momentum):
        """Return ``(new_cache, new_momentum, parameter_delta)`` for one tensor."""
        cache = self.rho * cache + (1 - self.rho) * grad ** 2
        scaled_grad = self.current_learning_rate * grad / (np.sqrt(cache) + self.delta)
        momentum = self.epsilon * momentum - scaled_grad
        # Look-ahead: move along the freshly updated velocity plus one more
        # gradient step.
        return cache, momentum, self.epsilon * momentum - scaled_grad

    def update_params(self, layer):
        """Update ``layer.weights`` and ``layer.biases`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` and keeps
        ``weight_cache``, ``weight_momentum``, ``bias_cache`` and
        ``bias_momentum`` on the layer.
        """
        self._ensure_state(layer)

        layer.weight_cache, layer.weight_momentum, weight_updates = self._step(
            layer.dweights, layer.weight_cache, layer.weight_momentum)
        layer.bias_cache, layer.bias_momentum, bias_updates = self._step(
            layer.dbiases, layer.bias_cache, layer.bias_momentum)

        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        """Advance the iteration counter used by the decay schedule."""
        self.iterations += 1

class Optimizer_Adam:
    """Adam: bias-corrected first and second gradient moments drive the step."""

    def __init__(self, learning_rate=0.001, decay=0., delta=1e-7, beta_1=0.9, beta_2=0.999):
        """Store hyper-parameters.

        ``beta_1`` / ``beta_2`` are the retention factors of the momentum and
        cache averages; ``delta`` is added to the sqrt denominator.
        """
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.delta = delta
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.iterations = 0

    def pre_update_params(self):
        """Apply learning-rate decay for the current iteration, if enabled."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def _moment_update(self, grad, momentum, cache):
        """Return ``(new_momentum, new_cache, parameter_delta)`` for one tensor."""
        step_number = self.iterations + 1
        momentum = self.beta_1 * momentum + (1 - self.beta_1) * grad
        cache = self.beta_2 * cache + (1 - self.beta_2) * grad ** 2
        # Bias correction: both averages start at zero.
        momentum_corrected = momentum / (1 - self.beta_1 ** step_number)
        cache_corrected = cache / (1 - self.beta_2 ** step_number)
        delta_param = -self.current_learning_rate * momentum_corrected / (np.sqrt(cache_corrected) + self.delta)
        return momentum, cache, delta_param

    def update_params(self, layer):
        """Update the layer's moment buffers and step its parameters in place.

        Buffers ``weight_cache``, ``weight_momentum``, ``bias_cache`` and
        ``bias_momentum`` are created on the layer on first use.
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.weight_momentum = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)
            layer.bias_momentum = np.zeros_like(layer.biases)

        layer.weight_momentum, layer.weight_cache, weight_delta = self._moment_update(
            layer.dweights, layer.weight_momentum, layer.weight_cache)
        layer.weights += weight_delta

        layer.bias_momentum, layer.bias_cache, bias_delta = self._moment_update(
            layer.dbiases, layer.bias_momentum, layer.bias_cache)
        layer.biases += bias_delta

    def post_update_params(self):
        """Count one completed optimisation step."""
        self.iterations += 1


# Training data: spiral dataset, 100 samples for each of 3 classes.
X_train, y_train = spiral_data(samples=100, classes=3)

# Network: dense(2 -> 64) -> ReLU -> dense(64 -> 3) -> softmax + cross-entropy.
layer1 = Layer_Dense(2, 64)
activation1 = Activation_ReLu()
layer2 = Layer_Dense(64, 3)

final_activation_loss = Activation_Softmax_Loss_Categorical_Crossentropy()

# Several optimizers are configured here; only optimizer_adam is used by the
# loop below.
optimizer = Optimizer_SGD(learning_rate=1., decay=1e-3, epsilon=0.9)
optimizer2 = Optimizer_AdaGrad(decay=1e-4)
optimizer3 = Optimizer_RMSProp(learning_rate=0.02, decay=1e-4, rho=0.999)
optimizer4 = Optimizer_RMSProp_Nesterov_momentum(
    learning_rate=0.1, decay=1e-4, rho=0.999, epsilon=0.9)
optimizer_adam = Optimizer_Adam(learning_rate=0.05, decay=5e-6)

for epoch in range(10001):

    # Forward pass through the whole network.
    layer1.forward(X_train)
    activation1.forward(layer1.output)
    layer2.forward(activation1.output)
    loss = final_activation_loss.forward(layer2.output, y_train)

    # Accuracy is measured against class indices, so one-hot labels are
    # reduced with argmax first.
    predictions = np.argmax(final_activation_loss.output, axis=1)
    if y_train.ndim == 2:
        y_train = np.argmax(y_train, axis=1)
    accuracy = np.mean(predictions == y_train)

    # Progress report every 100th epoch.
    if epoch % 100 == 0:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, '
              f'lr: {optimizer_adam.current_learning_rate}')

    # Backward pass, from the loss back to the first layer.
    final_activation_loss.backward(final_activation_loss.output, y_train)
    layer2.backward(final_activation_loss.dinputs)
    activation1.backward(layer2.dinputs)
    layer1.backward(activation1.dinputs)

    # Parameter update.
    optimizer_adam.pre_update_params()
    optimizer_adam.update_params(layer2)
    optimizer_adam.update_params(layer1)
    optimizer_adam.post_update_params()



In [None]:
# Evaluate the trained network on a newly generated spiral sample.
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass only; no parameters are updated here.
layer1.forward(X_test)
activation1.forward(layer1.output)
layer2.forward(activation1.output)
loss = final_activation_loss.forward(layer2.output, y_test)

# Compare predicted class indices with the labels (argmax for one-hot labels).
predictions = np.argmax(final_activation_loss.output, axis=1)
if y_test.ndim == 2:
    y_test = np.argmax(y_test, axis=1)
accuracy = np.mean(predictions == y_test)

print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

