In [1]:
import numpy as np 


# Multi-layer perceptron for classification

In [None]:
class MLP():
    """
    Fully connected feed-forward network trained with full-batch gradient descent.

    Hidden layers use a sigmoid activation. The last layer outputs raw logits,
    which are converted to class probabilities by a softmax and scored with a
    cross-entropy loss.

    Parameters live in ``self.weights`` (keys ``"w1"``, ``"w2"``, ...) and
    ``self.bias`` (keys ``"b1"``, ``"b2"``, ...). ``forward`` stores the
    pre-activations ``"Z<k>"`` and activations ``"A<k>"`` of its last call in
    ``self.cache`` so that ``backward_prop`` can reuse them.
    """

    def __init__(self, architecture):
        """
        :param architecture: list [n1,n2,n3] corresponding to number of neurons
            per hidden layer (input and output sizes are inferred from the data
            in ``initialization``)
        """
        # Copy into a list so tuples are accepted too; ``initialization``
        # concatenates this with other lists.
        self.architecture = list(architecture)
        self.nb_layer = len(self.architecture)
        self.weights, self.bias = {}, {}
        self.cache = {}

    def initialization(self, data, y):
        """
        Draw every weight matrix and bias row from a standard normal distribution.

        :param data: 2-D array (samples x features); only ``data.shape[1]`` is read
        :param y: class labels; the output layer gets one unit per distinct value
        """
        n_classes = len(np.unique(y))
        all_dimension = [data.shape[1]] + self.architecture + [n_classes]

        weights = {}
        bias = {}
        for layer, (n_in, n_out) in enumerate(
                zip(all_dimension, all_dimension[1:]), start=1):
            # np.random.randn takes the dimensions as separate positional
            # arguments; passing a single tuple raises TypeError.
            weights[f'w{layer}'] = np.random.randn(n_in, n_out)
            bias[f'b{layer}'] = np.random.randn(1, n_out)

        self.weights, self.bias = weights, bias

        print("Initialized layers:", all_dimension)

    def activation(self, z):
        """Element-wise sigmoid, ``1 / (1 + exp(-z))``."""
        return 1 / (1 + np.exp(-z))

    def activation_derivative(self, z):
        """Derivative of the sigmoid evaluated at the pre-activation ``z``."""
        s = self.activation(z)
        return s * (1 - s)

    def softmax(self, logits):
        """
        Row-wise softmax.

        :param logits: 2-D array (samples x classes)
        :return: array of the same shape whose rows sum to 1
        """
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large logits.
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=1, keepdims=True)

    def loss(self, logits, target):  # Cross entropy loss for classification
        """
        Mean cross-entropy between softmax(logits) and the true labels.

        :param logits: 2-D array (samples x classes) returned by ``forward``
        :param target: integer class label per sample; each label is used
            directly as a column index into the probability matrix
        :return: tuple ``(mean loss, probabilities)``
        """
        N = len(target)
        probas = self.softmax(logits)
        # Pick, for each row, the probability assigned to its true class.
        log_likelihood = -np.log(probas[range(N), target])
        return log_likelihood.mean(), probas

    def forward(self, data):
        """
        Run a forward pass and cache the intermediate values for backprop.

        :param data: 2-D array (samples x features)
        :return: logits of the output layer (no softmax applied)
        """
        self.cache = {}  # Store activation and output values for backprop
        nb_layers = len(self.architecture) + 1  # hidden layers + output
        next_input = data
        self.cache["A0"] = next_input

        for layer in range(1, nb_layers + 1):
            w = self.weights[f"w{layer}"]
            b = self.bias[f"b{layer}"]

            next_input = next_input @ w + b
            self.cache[f"Z{layer}"] = next_input

            # The output layer stays linear; softmax is applied in the loss.
            if layer < nb_layers:
                next_input = self.activation(next_input)
                self.cache[f"A{layer}"] = next_input

        self.cache[f"A{nb_layers}"] = next_input

        return next_input

    def backward_prop(self, logits, target):
        """
        Compute the gradients of the mean cross-entropy loss by backpropagation.

        Relies on ``self.cache`` as filled by the preceding ``forward`` call.

        :param logits: output of ``forward`` for the current batch
        :param target: integer class label per sample
        :return: tuple ``(grads_w, grads_b)`` of dicts keyed like
            ``self.weights`` / ``self.bias``
        """
        grads_w = {}
        grads_b = {}
        nb_layers = len(self.architecture) + 1
        n_samples = len(target)
        probas = self.softmax(logits)

        # Gradient of the mean loss w.r.t. the logits: (softmax - one_hot) / N
        dZ = probas.copy()
        dZ[range(n_samples), target] -= 1
        dZ /= n_samples

        for layer in range(nb_layers, 0, -1):
            A_prev = self.cache[f"A{layer-1}"]

            grads_w[f"w{layer}"] = A_prev.T @ dZ
            grads_b[f"b{layer}"] = np.sum(dZ, axis=0, keepdims=True)

            # Propagate to the previous layer only when there is one; the
            # gradient w.r.t. the input data is never needed.
            if layer > 1:
                dA_prev = dZ @ self.weights[f"w{layer}"].T
                dZ = dA_prev * self.activation_derivative(self.cache[f"Z{layer-1}"])

        return grads_w, grads_b

    def gradient_descent(self, grads_w, grads_b, lr):
        """
        Apply one in-place gradient-descent step to all weights and biases.

        :param grads_w: dict of weight gradients keyed like ``self.weights``
        :param grads_b: dict of bias gradients keyed like ``self.bias``
        :param lr: learning rate (step size)
        """
        for key in self.weights:
            self.weights[key] -= lr * grads_w[key]

        for key in self.bias:
            self.bias[key] -= lr * grads_b[key]

    def fit(self, data, target, epoch=100, lr=0.01):
        """
        Train on the full batch for ``epoch`` gradient-descent steps.

        :param data: 2-D array (samples x features)
        :param target: integer class label per sample
        :param epoch: number of full-batch updates to perform
        :param lr: learning rate
        :return: list with the loss measured before each update
        """
        # Allow calling fit() directly on a fresh model; parameters that were
        # already initialized (or trained) are left untouched.
        if not self.weights:
            self.initialization(data, target)

        loss_history = []

        # ``epoch`` is a count, so it must be wrapped in range() to iterate.
        for _ in range(epoch):
            logits = self.forward(data)
            loss, _probas = self.loss(logits, target)
            loss_history.append(loss)

            grads_w, grads_b = self.backward_prop(logits, target)
            self.gradient_descent(grads_w, grads_b, lr=lr)

        return loss_history

    
