# Activation Function

### Sigmoid

In [None]:
import numpy as np

def sigmoid(x):
    """Compute the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The exponential is only ever evaluated at ``-|x|``, so it cannot
    overflow for large-magnitude inputs (the naive form overflows inside
    ``exp(-x)`` once ``x`` is very negative).

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x`` with the same shape as ``x``; a NumPy scalar
        when ``x`` is a scalar.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- same value, stable form.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return result[()]

def sigmoid_derivative(x):
    """Return the derivative of the sigmoid evaluated at ``x``.

    Uses the identity ``s'(x) = s(x) * (1 - s(x))`` where ``s`` is
    ``sigmoid``. Note that ``x`` is the pre-activation input, not the
    sigmoid output.
    """
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

### Tanh

In [None]:
def tanh(x):
    """Return the element-wise hyperbolic tangent of ``x`` (thin wrapper over ``np.tanh``)."""
    activation = np.tanh(x)
    return activation

def tanh_derivative(x):
    """Return the derivative of tanh at ``x``, computed as ``1 - tanh(x)^2``.

    ``x`` is the pre-activation input, not the tanh output.
    """
    activation = np.tanh(x)
    squared = activation ** 2
    return 1 - squared

### ReLU

In [None]:
def relu(x):
    """Return the rectified linear unit of ``x``: element-wise ``max(0, x)``."""
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Return the ReLU derivative at ``x``: 1.0 where ``x > 0``, else 0.0.

    The value at exactly ``x == 0`` is 0.0 because the comparison is strict.
    """
    is_positive = x > 0
    return np.where(is_positive, 1.0, 0.0)

### Leaky ReLU

In [None]:
def leaky_relu(x, alpha=0.01):
    """Return the leaky ReLU of ``x``.

    Elements with ``x > 0`` pass through unchanged; all others are scaled
    by ``alpha`` (default 0.01).
    """
    is_positive = x > 0
    leaked = alpha * x
    return np.where(is_positive, x, leaked)

def leaky_relu_derivative(x, alpha=0.01):
    """Return the leaky ReLU derivative at ``x``.

    The slope is 1.0 where ``x > 0`` and ``alpha`` everywhere else
    (including exactly ``x == 0``, since the comparison is strict).
    """
    is_positive = x > 0
    return np.where(is_positive, 1.0, alpha)

### Softmax
The derivative of softmax is a bit more complex, as it requires computing a Jacobian matrix for the full vector output.

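In practice, softmax is usually paired with the cross-entropy loss and the two are differentiated together: the gradient of the loss with respect to the logits $z$ then simplifies to $\hat{y} - y$, so the full Jacobian never has to be formed during backpropagation.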

In [None]:
def softmax(z, axis=0):
    """Compute a numerically stable softmax of ``z`` along ``axis``.

    Args:
        z: Array-like of logits.
        axis: Axis along which the outputs are normalised to sum to 1.
            Defaults to 0, i.e. each column of a 2-D input is one
            distribution.

    Returns:
        Array with the same shape as ``z`` whose entries along ``axis``
        are non-negative and sum to 1.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalise; np.max would raise on a zero-size reduction.
        return np.exp(z)
    # Softmax is invariant to adding a constant along `axis`. Shifting by the
    # maximum makes the largest exponent exp(0) == 1, so exp cannot overflow
    # to inf (which would otherwise produce inf / inf == nan).
    shifted = z - np.max(z, axis=axis, keepdims=True)
    e_z = np.exp(shifted)
    return e_z / np.sum(e_z, axis=axis, keepdims=True)

def cross_entropy_loss(y_true, y_pred, eps=1e-12):
    """Compute the categorical cross-entropy ``-sum(y_true * log(y_pred))``.

    Args:
        y_true: Target distribution; assumed to be one-hot encoded.
        y_pred: Predicted probabilities, same shape as ``y_true``.
        eps: Lower bound applied to ``y_pred`` before taking the log.

    Returns:
        The summed cross-entropy as a scalar. Because of the ``eps`` floor
        the result is always finite for finite inputs: a true class that
        was predicted with probability 0 contributes ``-log(eps)`` rather
        than ``inf``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Without the floor, an exact-zero prediction gives log(0) == -inf, and
    # for classes where y_true == 0 the product 0 * -inf is nan, which would
    # poison the whole sum.
    safe_pred = np.maximum(y_pred, eps)
    return -np.sum(y_true * np.log(safe_pred))

def softmax_cross_entropy_derivative(y_true, z):
    """Return the gradient of softmax + cross-entropy with respect to the logits ``z``.

    Applies ``softmax`` to ``z`` and subtracts ``y_true`` from the result,
    i.e. the combined gradient ``softmax(z) - y_true``.
    """
    probabilities = softmax(z)
    gradient = probabilities - y_true
    return gradient

# Loss function

### Mean squared error (MSE) loss
##### Definition: $MSE(y, \hat{y})=\frac{1}{n}\Sigma_i(y_i-\hat{y}_i)^2$
##### Derivative: $\frac{\partial MSE}{\partial \hat{y}}=\frac{2}{n}(\hat{y} - y)$

In [None]:
import numpy as np

def mse_loss(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    The squared differences are averaged over every element.
    """
    residual = y_true - y_pred
    squared_error = residual ** 2
    return np.mean(squared_error)

def mse_loss_derivative(y_true, y_pred):
    """Return the gradient of the MSE with respect to ``y_pred``.

    Computes ``(2 / n) * (y_pred - y_true)`` where ``n`` is ``y_true.size``,
    so ``y_true`` must expose a ``size`` attribute (e.g. a NumPy array).
    """
    scale = 2 / y_true.size
    error = y_pred - y_true
    return scale * error

### Binary cross-entropy loss
##### Definition: $BCE(y, \hat{y})=-\frac{1}{n}\sum_{i=1}^{n}[y_i\log(\hat{y}_i)+(1-y_i)\log(1-\hat{y}_i)]$
##### Derivative: $\frac{\partial BCE}{\partial \hat{y}_i}=\frac{1}{n}\left(-\frac{y_i}{\hat{y}_i}+\frac{1-y_i}{1-\hat{y}_i}\right)$

### Categorical cross-entropy loss
##### Definition: $CE(y, \hat{y})=-\sum_i y_i \log(\hat{y}_i)$
##### Derivative: $\frac{\partial CE}{\partial \hat{y}_i}=-\frac{y_i}{\hat{y}_i}$

# Regularization

### L1 regularization
The loss function becomes $L_{reg}=L+\lambda\Sigma_w|w|$

The gradient would be $\frac{\partial L_{reg}}{\partial w}=\frac{\partial L}{\partial w}+\lambda \cdot sign(w)$, where $sign(w)$ is a function that returns -1 if w<0, 0 if w=0, and 1 if w>0.

In [None]:
def l1_regularization(weights, lambda_reg):
    """Return the L1 penalty ``lambda_reg * sum(|weights|)``."""
    magnitudes = np.abs(weights)
    total_magnitude = np.sum(magnitudes)
    return lambda_reg * total_magnitude

def l1_regularization_derivative(weights, lambda_reg):
    """Return the (sub)gradient of the L1 penalty: ``lambda_reg * sign(weights)``.

    ``np.sign`` yields 0 for zero-valued weights, so those entries get a
    zero gradient.
    """
    signs = np.sign(weights)
    return lambda_reg * signs

### L2 regularization
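The loss function becomes $L_{reg}=L+\lambda\Sigma_w w^2$

The gradient would be $\frac{\partial L_{reg}}{\partial w}=\frac{\partial L}{\partial w}+2\lambda w$ (some texts instead scale the penalty by $\frac{\lambda}{2}$ so that the gradient is simply $\lambda w$; the code below uses the form without the $\frac{1}{2}$ factor)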

In [None]:
def l2_regularization(weights, lambda_reg):
    """Return the L2 penalty ``lambda_reg * sum(weights ** 2)``.

    Note: no 1/2 factor is applied, so the matching gradient is
    ``2 * lambda_reg * weights``.
    """
    squared = np.square(weights)
    sum_of_squares = np.sum(squared)
    return lambda_reg * sum_of_squares

def l2_regularization_derivative(weights, lambda_reg):
    """Return the gradient of the L2 penalty: ``2 * lambda_reg * weights``.

    This is the derivative of ``lambda_reg * sum(weights ** 2)`` (the
    penalty without a 1/2 factor).
    """
    scale = 2 * lambda_reg
    return scale * weights