# A regression neural network
### 2 layers, ReLU as activation function, MSE loss

In [None]:
import numpy as np

# ReLU activation function and its derivative
def relu(x):
    """Apply the rectified linear unit element-wise.

    Each entry of ``x`` is compared against zero with ``np.maximum`` and the
    larger of the two is kept, so negative entries become zero.
    """
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

def relu_derivative(x):
    """Return the element-wise slope of ReLU evaluated at ``x``.

    Entries where ``x > 0`` map to 1.0; every other entry (including exactly
    zero) maps to 0.0.
    """
    is_positive = x > 0
    slope_on, slope_off = 1.0, 0.0
    return np.where(is_positive, slope_on, slope_off)

# MSE loss function and its derivative
def mse_loss(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    The element-wise difference ``y_true - y_pred`` is squared and then
    averaged over all of its elements through the result's ``.mean()``.
    """
    squared_error = (y_true - y_pred) ** 2
    return squared_error.mean()

def mse_derivative(y_true, y_pred):
    """Return the element-wise gradient of the mean squared error w.r.t. ``y_pred``.

    The result is ``2 * (y_pred - y_true) / n`` where ``n`` is the number of
    elements of the (possibly broadcast) difference.

    :param y_true: target values, broadcastable against ``y_pred``
    :param y_pred: predicted values
    :return: gradient with the shape of ``y_pred - y_true``
    """
    error = y_pred - y_true
    # Normalise by the element count of the difference rather than by
    # y_true.size: a mean over ``(y_true - y_pred) ** 2`` averages over the
    # broadcast shape, so the two counts differ whenever y_true is broadcast.
    # For equally shaped inputs the value is the same as before.
    return 2 * error / np.size(error)


class NeuralNetwork:
    """Two-layer fully connected regression network.

    The prediction is ``relu(x @ w1 + b1) @ w2 + b2``. Training uses plain
    gradient descent on the MSE loss over the whole batch passed in.

    Parameters are stored as NumPy arrays on the instance:
        w1: layer 1 weights, size (input_dim, hidden_dim)
        b1: layer 1 bias, size (hidden_dim,), broadcast over the batch
        w2: layer 2 weights, size (hidden_dim, output_dim)
        b2: layer 2 bias, size (output_dim,), broadcast over the batch
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        Create the parameters: uniform random weights, zero biases.

        :param input_dim: number of input features
        :param hidden_dim: number of hidden units
        :param output_dim: number of regression outputs
        """
        self.w1 = np.random.rand(input_dim, hidden_dim)
        self.b1 = np.zeros(hidden_dim)
        self.w2 = np.random.rand(hidden_dim, output_dim)
        self.b2 = np.zeros(output_dim)

    def forward_pass(self, x):
        """
        Run the network on a batch of inputs.

        :param x: input with size (batch_size, input_dim)
        :return: tuple ``(pred_y, layer1, activated_layer1)``; ``pred_y`` has
            size (batch_size, output_dim), ``layer1`` is the hidden
            pre-activation and ``activated_layer1`` its ReLU output, both
            with size (batch_size, hidden_dim). The two hidden arrays are
            returned because backward_pass needs them.
        """
        layer1 = np.matmul(x, self.w1) + self.b1
        activated_layer1 = relu(layer1)
        pred_y = np.matmul(activated_layer1, self.w2) + self.b2

        return pred_y, layer1, activated_layer1

    def backward_pass(self, lr, x, y_true, y_pred, layer1, activated_layer1):
        """
        Backpropagate the MSE loss and take one gradient-descent step.

        :param lr: learning rate (step size of the update)
        :param x: input with size (batch_size, input_dim)
        :param y_true: targets; expected to have the same size as y_pred,
            (batch_size, output_dim)
        :param y_pred: predictions returned by forward_pass
        :param layer1: hidden pre-activation returned by forward_pass
        :param activated_layer1: hidden activation returned by forward_pass
        :return: MSE loss of ``y_pred``, i.e. the loss before this update
        """
        loss = mse_loss(y_true, y_pred)

        # gradient y_pred
        dloss_dy_pred = mse_derivative(y_true, y_pred)  # (batch_size, output_dim)

        # gradient w2
        dloss_dw2 = np.dot(np.transpose(activated_layer1), dloss_dy_pred)  # (hidden_dim, output_dim)

        # gradient bias b2: the bias is shared by every sample, so sum over the batch
        dloss_db2 = np.sum(dloss_dy_pred, axis=0)

        # gradient activated_layer1 (uses w2 before it is updated below)
        dloss_dactivated_layer1 = np.dot(dloss_dy_pred, np.transpose(self.w2))  # (batch_size, hidden_dim)

        # gradient layer1: ReLU passes gradient only where the pre-activation was positive
        dloss_dlayer1 = dloss_dactivated_layer1 * relu_derivative(layer1)  # (batch_size, hidden_dim)

        # gradient w1
        dloss_dw1 = np.dot(np.transpose(x), dloss_dlayer1)  # (input_dim, hidden_dim)

        # gradient b1
        dloss_db1 = np.sum(dloss_dlayer1, axis=0)

        # All gradients are computed above before any parameter is modified.
        self.w2 -= lr * dloss_dw2
        self.b2 -= lr * dloss_db2
        self.w1 -= lr * dloss_dw1
        self.b1 -= lr * dloss_db1

        return loss

    def training_loop(self, x, y, iteration, lr):
        """
        Train on the full batch ``(x, y)`` for a fixed number of steps.

        :param x: input with size (batch_size, input_dim)
        :param y: targets with size (batch_size, output_dim)
        :param iteration: number of forward/backward steps to run
        :param lr: learning rate passed to backward_pass
        :return: list with the loss recorded at each step (empty when
            ``iteration`` is 0)
        """
        losses = []
        for _ in range(iteration):
            pred_y, layer1, activated_layer1 = self.forward_pass(x)
            losses.append(
                self.backward_pass(lr, x, y, pred_y, layer1, activated_layer1)
            )
        return losses