In [None]:
# Network interfaces
    
class CostFunction:
    """Interface shared by the cost (loss) functions of the network.

    Both methods are placeholders: they perform no computation and return
    ``None``. Concrete cost functions are expected to override them.
    """

    def calculate(self, value : list[float], predicted_value : list[float]) -> float:
        """Return the cost of a prediction (placeholder, returns None).

        Args:
            value (list[float]): expected values
            predicted_value (list[float]): values produced by the network

        Returns:
            float: the cost; the base implementation returns None
        """
        return None

    def calculate_d(self, value : list[float], predicted_value : list[float]) -> float:
        """Return the derivative of the cost (placeholder, returns None).

        Args:
            value (list[float]): expected values
            predicted_value (list[float]): values produced by the network

        Returns:
            float: the derivative of the cost; the base implementation returns None
        """
        return None
    
class ActivationFunction:
    """Interface shared by the activation functions of the network.

    Both methods are placeholders that return ``None``; concrete activation
    functions are expected to override them.
    """

    def activation(self, x : list[float]) -> list[float]:
        """Apply the activation function (placeholder, returns None).

        Args:
            x (list[float]): input values

        Returns:
            list[float]: activated values; the base implementation returns None
        """
        return None

    def activation_d(self, x : list[float]) -> list[float]:
        """Apply the derivative of the activation function (placeholder, returns None).

        Args:
            x (list[float]): input values

        Returns:
            list[float]: derivative values; the base implementation returns None
        """
        return None
    
class Layer:
    """Interface shared by the layers of the network.

    ``forward`` and ``backward`` are placeholders that return ``None``;
    concrete layers are expected to override them.
    """

    def __init__(self, cost_function : CostFunction):
        """Store the cost function and reset the cached input/output.

        Args:
            cost_function (CostFunction): cost function associated with the layer
        """
        # Cached by concrete layers during the forward pass.
        self.input = None
        self.output = None
        self.cost_function = cost_function

    def forward(self, input_value : list[float]) -> list[float]:
        """Run the forward propagation of the layer (placeholder, returns None).

        Args:
            input_value (list[float]): input values of the layer

        Returns:
            list[float]: output values of the layer; the base implementation returns None
        """
        return None

    def backward(self, error : list[float], learning_rate : float) -> list[float]:
        """Run the backward propagation of the layer (placeholder, returns None).

        Args:
            error (list[float]): error with respect to the output of this layer
            learning_rate (float): learning rate

        Returns:
            list[float]: error with respect to the input of this layer;
                the base implementation returns None
        """
        return None

In [None]:
class Neuron:
    """Single neuron reading a fixed subset of the values of its input vector."""

    def __init__(self, input_indices : list[int], activation_function : "ActivationFunction"):
        """Create a neuron with random weights and bias in [-0.5, 0.5).

        Args:
            input_indices (list[int]): positions of the input vector read by this neuron
            activation_function (ActivationFunction): object exposing ``activation``
                and ``activation_d``
        """
        self.weights = np.random.rand(len(input_indices)) - 0.5
        self.bias = np.random.random() - 0.5
        self.input_indices = input_indices

        # Values cached by forward() and consumed by backward().
        self.last_output = None
        self.last_input = None
        self.last_weighted_sum = None
        self.delta = None

        self.activation_function = activation_function

    def forward(self, inputs):
        """Compute the output of the neuron for a full input vector.

        Args:
            inputs: 1-D sequence of input values; only ``input_indices`` are read

        Returns:
            the activated weighted sum of the selected inputs
        """
        # Fancy indexing with a list of indices needs an ndarray, not a list.
        inputs = np.asarray(inputs, dtype=float)
        self.last_input = inputs

        weighted_sum = np.dot(inputs[self.input_indices], self.weights) + self.bias
        # The derivative is evaluated at the pre-activation value, so keep it.
        self.last_weighted_sum = weighted_sum

        # ActivationFunction objects are not callable: use .activation().
        output = self.activation_function.activation(weighted_sum)

        self.last_output = output
        return output

    def backward(self, output_error, learning_rate, inputs=None):
        """Update weights and bias by gradient descent and propagate the error.

        Args:
            output_error: derivative of the cost with respect to this neuron's output
            learning_rate (float): step size of the update
            inputs: optional full input vector; when given it replaces the one
                cached by ``forward``

        Returns:
            np.ndarray: error with respect to each selected input, in the same
                order as ``input_indices``
        """
        if inputs is not None:
            self.last_input = np.asarray(inputs, dtype=float)

        self.delta = output_error * self.activation_function.activation_d(self.last_weighted_sum)

        # Must be computed with the weights used in the forward pass,
        # i.e. before they are updated below.
        input_error = self.delta * self.weights

        # Gradient descent: step against the gradient (same sign convention
        # as FullyConnectedLayer.backward).
        self.weights -= learning_rate * self.delta * self.last_input[self.input_indices]
        self.bias -= learning_rate * self.delta
        return input_error

    def get_input_indices(self):
        """Return the positions of the input vector read by this neuron."""
        return self.input_indices

In [None]:
# Connection layers

class FullyConnectedLayer(Layer):
    """Dense layer: every output is a weighted sum of every input plus a bias.

    Samples are handled as rows, so inputs have shape (batch, input_size) and
    outputs have shape (batch, output_size).
    """

    def __init__(self, input_size, output_size, cost_function=None):
        """Create the layer with random weights and biases in [-0.5, 0.5).

        Args:
            input_size (int): number of input values per sample
            output_size (int): number of output values per sample
            cost_function (CostFunction, optional): forwarded to ``Layer``. Defaults to None.
        """
        # Initialise the attributes declared by the base class (input, output, cost_function).
        super().__init__(cost_function)
        self.weights = np.random.rand(input_size, output_size) - 0.5
        self.bias = np.random.rand(1, output_size) - 0.5

    def forward(self, input_data):
        """Compute ``input_data @ weights + bias``.

        Args:
            input_data: one sample (1-D) or a batch of samples (2-D, one per row)

        Returns:
            np.ndarray: outputs with shape (batch, output_size)
        """
        # Always store a 2-D array so that the transpose in backward() is meaningful
        # even when a single 1-D sample is given.
        self.input = np.atleast_2d(np.asarray(input_data, dtype=float))
        self.output = np.dot(self.input, self.weights) + self.bias
        return self.output

    def backward(self, output_error, learning_rate):
        """Update weights and bias by gradient descent and propagate the error.

        Args:
            output_error: derivative of the cost with respect to the layer output,
                shape (batch, output_size)
            learning_rate (float): step size of the update

        Returns:
            np.ndarray: derivative of the cost with respect to the layer input,
                shape (batch, input_size)
        """
        output_error = np.atleast_2d(np.asarray(output_error, dtype=float))

        # Computed before the update so it uses the forward-pass weights.
        input_error = np.dot(output_error, self.weights.T)
        weights_error = np.dot(self.input.T, output_error)

        self.weights -= learning_rate * weights_error
        # Sum over the batch so the update matches the (1, output_size) bias shape.
        self.bias -= learning_rate * np.sum(output_error, axis=0, keepdims=True)
        return input_error

class NotFullyConnectedLayer(Layer):
    """Sparse layer: each neuron reads only its own subset of the input values."""

    def __init__(self, cost_function: CostFunction, input_indices: list[list[int]],
                 activation_function: "ActivationFunction | None" = None):
        """Create one neuron per entry of ``input_indices``.

        Args:
            cost_function (CostFunction): forwarded to ``Layer``
            input_indices (list[list[int]]): for each neuron, the positions of the
                input vector it reads
            activation_function (ActivationFunction, optional): activation shared by
                all neurons. Defaults to None, meaning ``Linear()`` (identity).
        """
        super().__init__(cost_function)
        self.input_indices = input_indices
        # Neuron requires an activation function; resolve the default lazily
        # because Linear is defined later in the file.
        if activation_function is None:
            activation_function = Linear()
        self.neurons = [Neuron(indices, activation_function) for indices in input_indices]

    def forward(self, input_value: list[float]) -> list[float]:
        """Feed the input vector to every neuron.

        Args:
            input_value (list[float]): 1-D input vector

        Returns:
            list[float]: one output per neuron
        """
        # Neurons select their inputs with a list of indices, which needs an ndarray.
        self.input = np.asarray(input_value, dtype=float)
        outputs = [neuron.forward(self.input) for neuron in self.neurons]
        self.output = outputs
        return outputs

    def backward(self, error: list[float], learning_rate: float) -> list[float]:
        """Update every neuron and accumulate the error of the input vector.

        Args:
            error (list[float]): error with respect to the output of each neuron
            learning_rate (float): step size of the update

        Returns:
            list[float]: error with respect to each value of the input vector
        """
        prev_layer_error = np.zeros(len(self.input))

        for neuron, neuron_error in zip(self.neurons, error):
            # The propagated error must use the weights of the forward pass,
            # so copy them before the neuron updates itself.
            forward_weights = neuron.weights.copy()
            neuron.backward(neuron_error, learning_rate)
            # Several neurons may read the same input: their contributions add up.
            prev_layer_error[neuron.input_indices] += neuron.delta * forward_weights

        return prev_layer_error

class ActivationLayer(Layer):
    """Layer applying an activation function element-wise to its input."""

    def __init__(self, activation_function : ActivationFunction, cost_function=None):
        """Store the activation function used by the layer.

        Args:
            activation_function (ActivationFunction): function applied by the layer
            cost_function (CostFunction, optional): forwarded to ``Layer``. Defaults to None.
        """
        # Initialise the attributes declared by the base class (input, output, cost_function).
        super().__init__(cost_function)
        self.activation_function = activation_function

    def forward(self, input_data):
        """Apply the activation function to the input.

        Args:
            input_data: input values of the layer

        Returns:
            the activated values
        """
        self.input = input_data
        self.output = self.activation_function.activation(input_data)
        return self.output

    def backward(self, output_error, learning_rate):
        """Propagate the error through the activation function.

        Args:
            output_error: error with respect to the output of this layer
            learning_rate (float): unused, the layer has no trainable parameter

        Returns:
            error with respect to the input of this layer
        """
        return self.activation_function.activation_d(self.input) * output_error

In [None]:
# Cost functions

class MeanError(CostFunction):
    """Mean of the signed differences between expected and predicted values."""

    @staticmethod
    def calculate(value, predicted_value):
        """Return ``mean(value - predicted_value)``.

        Args:
            value: expected values
            predicted_value: values produced by the network

        Returns:
            float: mean signed error
        """
        value = np.asarray(value, dtype=float)
        predicted_value = np.asarray(predicted_value, dtype=float)
        return np.mean(value - predicted_value)

    @staticmethod
    def calculate_d(value, predicted_value):
        """Return the derivative of the cost with respect to ``predicted_value``.

        Every prediction contributes ``-predicted / n`` to the mean, so each
        partial derivative is ``-1 / n`` (same convention as
        ``MeanSquaredError.calculate_d``, which also differentiates with
        respect to the prediction).

        Args:
            value: expected values
            predicted_value: values produced by the network (unused, the
                derivative is constant)

        Returns:
            np.ndarray: array shaped like ``value`` filled with ``-1 / value.size``
        """
        value = np.asarray(value, dtype=float)
        return np.full(value.shape, -1.0 / value.size)

    # Former name of calculate_d, kept so existing callers keep working.
    calculate_prime = calculate_d

class MeanSquaredError(CostFunction):
    """Mean of the squared differences between expected and predicted values."""

    @staticmethod
    def calculate(value, predicted_value):
        """Return ``mean((value - predicted_value) ** 2)``.

        Args:
            value: expected values
            predicted_value: values produced by the network

        Returns:
            float: mean squared error
        """
        value = np.asarray(value, dtype=float)
        predicted_value = np.asarray(predicted_value, dtype=float)
        return np.mean(np.power(value - predicted_value, 2))

    # Declared static like calculate(): without it, calling the method on an
    # instance would bind the instance to `value`.
    @staticmethod
    def calculate_d(value, predicted_value):
        """Return the derivative of the cost with respect to ``predicted_value``.

        Args:
            value: expected values
            predicted_value: values produced by the network

        Returns:
            np.ndarray: ``2 * (predicted_value - value) / value.size``
        """
        value = np.asarray(value, dtype=float)
        predicted_value = np.asarray(predicted_value, dtype=float)
        return 2 * (predicted_value - value) / value.size

# To be implemented
def mean_absolute_error(value, predicted_value):
    """Return the mean of the absolute differences between the two inputs.

    Args:
        value: expected values (array or plain sequence)
        predicted_value: values produced by the network (array or plain sequence)

    Returns:
        float: mean absolute error
    """
    # Plain lists do not support subtraction, so convert before computing.
    value = np.asarray(value, dtype=float)
    predicted_value = np.asarray(predicted_value, dtype=float)
    return np.mean(np.abs(value - predicted_value))

def log_loss(value, predicted_value, epsilon=1e-15):
    """Return the binary cross-entropy between targets and predicted probabilities.

    Args:
        value: expected values (0 or 1)
        predicted_value: predicted probabilities
        epsilon (float, optional): distance kept from 0 and 1 when clipping the
            predictions. Defaults to 1e-15.

    Returns:
        float: mean of ``-(y * log(p) + (1 - y) * log(1 - p))``
    """
    value = np.asarray(value, dtype=float)
    predicted_value = np.asarray(predicted_value, dtype=float)

    # Clip on both sides: p == 0 breaks log(p) and p == 1 breaks log(1 - p).
    predicted_value = np.clip(predicted_value, epsilon, 1 - epsilon)

    return -np.mean(value * np.log(predicted_value) + (1 - value) * np.log(1 - predicted_value))



In [None]:
# Activation functions

# --------BINARY STEP FUNCTIONS---------- #
# Binary step function depends on a threshold value that decides whether a neuron should be activated or not.


class BinaryStep(ActivationFunction):
    """Binary step activation: 0 for negative inputs, 1 otherwise."""

    def activation(self, x : list[float]) -> list[float]:
        """Binary step function (map input values to 0 or 1)

        Args:
            x (list[float]): input values

        Returns:
            list[float]: binary step values

        note: not really useful, it cannot produce multi-valued outputs and it
            provides no gradient (the derivative is always 0).
        """
        x = np.asarray(x, dtype=float)
        # np.array() around a generator would only wrap the generator object;
        # np.where evaluates the condition element-wise.
        return np.where(x < 0, 0.0, 1.0)

    def activation_d(self, x : list[float]) -> list[float]:
        """Binary step derivative function

        Args:
            x (list[float]): input values

        Returns:
            list[float]: binary step derivative values (always 0)
        """
        return np.zeros_like(np.asarray(x, dtype=float))

# --------LINEAR FUNCTIONS---------- #
# The function does nothing to the weighted sum of the inputs: it simply returns the value it was given.

class Linear(ActivationFunction):
    """Identity activation: the output equals the input."""

    def activation(self, x : list[float]) -> list[float]:
        """Linear function (map input values to themselves, linear regression)

        Args:
            x (list[float]): input values

        Returns:
            list[float]: linear values

        note: not really useful, the derivative does not depend on the input value,
            so gradient descent is not very effective.
            If only linear activations are used, no matter the number of layers,
            the last layer is a linear function of the first one, so the network
            is equivalent to a single-layer network.
        """
        return np.array(x)

    def activation_d(self, x : list[float]) -> list[float]:
        """Linear derivative function

        Args:
            x (list[float]): input values

        Returns:
            list[float]: linear derivative values (always 1)
        """
        # np.array() around a generator would only wrap the generator object.
        return np.ones_like(np.asarray(x, dtype=float))

# --------NON LINEAR FUNCTIONS---------- #
# These allow backpropagation, making it possible to create complex mappings between the input and the output of the network.

class Sigmoid(ActivationFunction):
    """Logistic sigmoid activation: ``1 / (1 + exp(-x))``."""

    def activation(self, x : list[float]) -> list[float]:
        """Sigmoid function (map input values to an S shape between 0 and 1)

        Args:
            x (list[float]): input values

        Returns:
            list[float]: sigmoid values

        note: Useful since it is smooth (no jumps) and maps values between 0 and 1,
            so it can be used as a probability; positive inputs are pushed towards 1
            and negative inputs towards 0.
            BUT the derivative is significant only roughly between -3 and 3 and is
            almost 0 elsewhere, so it suffers from the vanishing gradient problem:
            far from that range it is very hard to learn anything.
        note2: The sigmoid is not symmetric around zero: its output is always
            positive. When all neuron outputs share the same sign, training becomes
            harder and less stable.
        """
        x = np.asarray(x, dtype=float)
        # exp(-|x|) never overflows; pick the algebraically equivalent form
        # of the sigmoid for each sign of x.
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def activation_d(self, x : list[float]) -> list[float]:
        """Sigmoid derivative function

        Args:
            x (list[float]): input values

        Returns:
            list[float]: sigmoid derivative values, ``s * (1 - s)`` with ``s = sigmoid(x)``
        """
        sig = self.activation(x)
        return sig * (1.0 - sig)

class Tanh(ActivationFunction):
    """Hyperbolic tangent activation."""

    def activation(self, x : list[float]) -> list[float]:
        """Tanh function (map input values to an S shape between -1 and 1, centered at 0)

        Args:
            x (list[float]): input values

        Returns:
            list[float]: tanh values

        note: Similar to the sigmoid but symmetric around 0, so it can easily map
            values as neutral, strongly negative or strongly positive.
        note2: Usually used in hidden layers, before another layer, to help the
            network learn better.
        note3: It also suffers from the vanishing gradient problem, but its gradient
            is much steeper than the sigmoid one, so it is usually preferred.
        """
        activated = np.tanh(x)
        return activated

    def activation_d(self, x : list[float]) -> list[float]:
        """Tanh derivative function

        Args:
            x (list[float]): input values

        Returns:
            list[float]: tanh derivative values, ``1 - tanh(x)^2``
        """
        tanh_x = np.tanh(x)
        return 1 - tanh_x * tanh_x

class Relu(ActivationFunction):
    """Rectified linear unit activation: ``max(0, x)``."""

    def activation(self, x : list[float]) -> list[float]:
        """Relu function

        Args:
            x (list[float]): input values (any shape)

        Returns:
            list[float]: relu values

        note: It does not activate all neurons at the same time, which makes it far
            more computationally efficient than the other functions; it also
            accelerates convergence thanks to its linearity.
        note2: The downside is that during backpropagation the gradient is 0 for
            negative inputs, so the neuron is dead and cannot learn anymore.
        """
        x = np.asarray(x)
        # Element-wise on any shape, replacing the nested loops that only
        # accepted 2-D input.
        return np.where(x < 0, 0, x)

    def activation_d(self, x : list[float]) -> list[float]:
        """Relu derivative function

        Args:
            x (list[float]): input values (any shape)

        Returns:
            list[float]: relu derivative values (0 for negative inputs, 1 otherwise)
        """
        x = np.asarray(x)
        return np.where(x < 0, 0, 1)

class LeakyRelu(ActivationFunction):
    """Leaky rectified linear unit: ``x`` for ``x >= 0``, ``alpha * x`` otherwise."""

    def activation(self, x, alpha=0.1):
        """Leaky/Parametric relu function (similar to relu but doesn't fully suppress negative values)

        Args:
            x (list[float]): input values
            alpha (float, optional): slope applied to negative values. Defaults to 0.1.

        Returns:
            list[float]: Leaky relu values

        note: since the output is not 0 when x < 0, the gradient is not 0 either,
            so the neuron is not dead and can still learn.
            The downsides are that every neuron must be computed and that
            predictions may not be consistent for negative values.
        """
        x = np.asarray(x, dtype=float)
        # np.array() around a generator would only wrap the generator object.
        return np.where(x < 0, alpha * x, x)

    def activation_d(self, x, alpha=0.1):
        """Leaky relu derivative function

        Args:
            x (list[float]): input values
            alpha (float, optional): slope applied to negative values. Defaults to 0.1.

        Returns:
            list[float]: Leaky relu derivative values (``alpha`` for negative inputs, 1 otherwise)
        """
        x = np.asarray(x, dtype=float)
        return np.where(x < 0, alpha, 1.0)

class Elu(ActivationFunction):
    """Exponential linear unit: ``x`` for ``x >= 0``, ``alpha * (exp(x) - 1)`` otherwise."""

    def activation(self, x, alpha=0.1):
        """Elu function (similar to leaky relu but with a smooth curve)

        Args:
            x (list[float]): input values
            alpha (float, optional): scale of the negative branch. Defaults to 0.1.

        Returns:
            list[float]: Elu values

        note: strong alternative to the parametric relu, but the computation is
            more expensive because of the exponential.
        note2: for very negative inputs the output saturates at ``-alpha``.
        """
        x = np.asarray(x, dtype=float)
        # np.where evaluates both branches, so clamp the exponent to <= 0 to
        # avoid overflow warnings for large positive inputs.
        negative_branch = alpha * np.expm1(np.minimum(x, 0.0))
        return np.where(x < 0, negative_branch, x)

    def activation_d(self, x, alpha=0.1):
        """Elu derivative function

        Args:
            x (list[float]): input values
            alpha (float, optional): scale of the negative branch. Defaults to 0.1.

        Returns:
            list[float]: Elu derivative values (``alpha * exp(x)`` for negative
                inputs, 1 otherwise)
        """
        x = np.asarray(x, dtype=float)
        # d/dx [alpha * (exp(x) - 1)] = alpha * exp(x), not alpha + x.
        negative_slope = alpha * np.exp(np.minimum(x, 0.0))
        return np.where(x < 0, negative_slope, 1.0)

class Softmax(ActivationFunction):
    """Softmax activation, normalised over the last axis of the input."""

    def activation(self, value : list[float]) -> list[float]:
        """Softmax function (map values to a probability distribution between 0 and 1,
        it is a generalization of the sigmoid function)

        Args:
            value (list[float]): input values, at least 1-D; each row (last axis)
                is normalised independently

        Returns:
            list[float]: softmax values

        note: Very common in multi-class classification problems: the output is a
            probability distribution, so the values of each row sum to 1.
        """
        value = np.asarray(value, dtype=float)
        # Subtracting the row maximum leaves the result unchanged but keeps
        # exp() from overflowing.
        shifted = value - np.max(value, axis=-1, keepdims=True)
        exponentials = np.exp(shifted)
        return exponentials / np.sum(exponentials, axis=-1, keepdims=True)

    def activation_d(self, value : list[float]) -> list[float]:
        """Softmax derivative function (diagonal of the Jacobian only)

        Args:
            value (list[float]): input values

        Returns:
            list[float]: ``s * (1 - s)`` with ``s = softmax(value)``, same shape
                as the input

        note: the full derivative of the softmax is a Jacobian matrix,
            ``d s_i / d x_j = s_i * (delta_ij - s_j)``. Only its diagonal is
            returned here so that the result can be multiplied element-wise with
            the output error, as ActivationLayer.backward does; the off-diagonal
            terms ``-s_i * s_j`` are therefore ignored.
        """
        probabilities = self.activation(value)
        return probabilities * (1.0 - probabilities)

class Swish(ActivationFunction):
    """Swish activation: ``x * sigmoid(x)``."""

    @staticmethod
    def _sigmoid(x):
        """Return the logistic sigmoid of an ndarray without overflowing."""
        decay = np.exp(-np.abs(x))
        return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def activation(self, x : list[float]) -> list[float]:
        """Swish function (the input multiplied by its sigmoid, with a non 0
        gradient for negative values)

        Args:
            x (list[float]): input values

        Returns:
            list[float]: swish values

        note: This often matches or outperforms relu, but it is more computationally
            expensive. It is a smooth function (especially around 0) and, unlike
            relu, it does not set negative values to 0, which might be useful.
        """
        x = np.asarray(x, dtype=float)
        return x * self._sigmoid(x)

    def activation_d(self, x : list[float]) -> list[float]:
        """Swish derivative function

        Args:
            x (list[float]): input values

        Returns:
            list[float]: swish derivative values,
                ``swish(x) + sigmoid(x) * (1 - swish(x))``
        """
        x = np.asarray(x, dtype=float)
        sig = self._sigmoid(x)
        swish = x * sig
        return swish + sig * (1.0 - swish)

class Gelu(ActivationFunction):
    """GELU activation, computed with its tanh approximation."""

    def activation(self, x : list[float]) -> list[float]:
        """Gelu function (similar to swish but with a different shape)

        Args:
            x (list[float]): input values

        Returns:
            list[float]: gelu values,
                ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))``

        note: It has been found to outperform other activation functions on computer
            vision, natural language processing and speech recognition tasks;
            the downside is that it is more computationally expensive.
        """
        x = np.asarray(x, dtype=float)
        scale = np.sqrt(2 / np.pi)
        inner = scale * (x + 0.044715 * np.power(x, 3))
        return 0.5 * x * (1 + np.tanh(inner))

    def activation_d(self, x : list[float]) -> list[float]:
        """Gelu derivative function

        Args:
            x (list[float]): input values

        Returns:
            list[float]: gelu derivative values (derivative of the tanh approximation)
        """
        x = np.asarray(x, dtype=float)
        scale = np.sqrt(2 / np.pi)
        inner = scale * (x + 0.044715 * np.power(x, 3))
        tanh_inner = np.tanh(inner)
        # Derivative of `inner`: 0.134145 is 3 * 0.044715.
        inner_d = scale * (1 + 0.134145 * np.power(x, 2))
        # Product rule on 0.5 * x * (1 + tanh(inner)).
        return 0.5 * (1 + tanh_inner) + 0.5 * x * (1 - np.power(tanh_inner, 2)) * inner_d
    
# To be implemented

def linear_softmax(value : list[float]) -> list[float]:
    """Scale the values by their sum taken along the first axis.

    Args:
        value (list[float]): input values

    Returns:
        list[float]: each value divided by the sum over axis 0
    """
    total = np.sum(value, axis=0)
    return value / total
