# The basic architecture and functionality of Neural Networks

<center><img src="images/Neuron.drawio.png" width="800" height="500" /></center>

<center><img src="images/nn_layers.png" width="800" height="500"></center>


* $X$ - is our input features in the form of a vector
* $w$ - is the weights we initialize randomly and will be updated during training
* $b$ - is the bias we initialize randomly and will be updated during training
* `Dense Layer` - one hidden layer in the network with $n$ neurons (fully connected network)
* `Forward pass` - process of calculating the input * weights + bias for each neuron in the hidden layer and then applying an activation function to the result
* `Activation function` - a function that is applied to the result of the forward pass to introduce non-linearity into the network
* `Output` - the final result of the network  

## Neural Network Implementation

Code adapted from: nnfs.io

### Adaline Perceptron

In [None]:
import numpy as np

class Perceptron:
    """Single-neuron binary classifier with a step activation.

    ``weights[0]`` holds the bias and ``weights[1:]`` the per-feature
    weights. ``errors`` collects, for each epoch, how many samples
    triggered a non-zero weight update.
    """

    def __init__(self, n_inputs, epochs=10, learning_rate=0.01):
        self.errors = []
        # One extra slot at index 0 for the bias term
        self.weights = np.zeros(n_inputs + 1)
        self.learning_rate = learning_rate
        self.epochs = epochs

    def predict(self, inputs):
        """Return 1 when the weighted sum plus bias is positive, else 0."""
        bias, coefficients = self.weights[0], self.weights[1:]
        net_input = np.dot(inputs, coefficients) + bias
        return 1 if net_input > 0 else 0

    def fit(self, X, y):
        """Run ``epochs`` passes over the (X, y) pairs and return ``self``."""
        for _ in range(self.epochs):
            misclassified = 0
            for sample, label in zip(X, y):
                # Step is zero whenever the prediction already matches
                step = self.learning_rate * (label - self.predict(sample))
                self.weights[1:] += step * sample
                self.weights[0] += step
                misclassified += int(step != 0.0)
            self.errors.append(misclassified)
        return self

## Neural Network implementation

In [None]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
from nnfs.datasets import vertical_data

nnfs.init()

### Dense Layer Implementation

In [None]:
class Dense:
    """Fully connected layer computing ``inputs . weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        # Biases start at zero; weights are small Gaussian draws
        self.biases = np.zeros((1, n_neurons))
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)

    def forward(self, inputs):
        """Store ``inputs`` (needed by ``backward``) and compute ``self.output``."""
        self.inputs = inputs
        weighted = np.dot(inputs, self.weights)
        self.output = weighted + self.biases

    def backward(self, dvalues):
        """Given the gradient w.r.t. this layer's output, set
        ``dinputs``, ``dweights`` and ``dbiases``."""
        # Gradient passed on to whatever produced this layer's inputs
        self.dinputs = np.dot(dvalues, self.weights.T)
        # Parameter gradients; the bias gradient sums over the sample axis
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        self.dweights = np.dot(self.inputs.T, dvalues)

### Activation Function: ReLU

In [None]:
class Activation_ReLU:
    """Rectified linear unit: element-wise ``max(0, x)``."""

    def forward(self, inputs):
        """Keep ``inputs`` for the backward pass and store the clamped output."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Pass gradients through only where the forward input was positive."""
        blocked = self.inputs <= 0
        # Work on a copy so the caller's gradient array is left untouched
        gradient = dvalues.copy()
        gradient[blocked] = 0
        self.dinputs = gradient

### Softmax

In [None]:
class Activation_Softmax:
    """Row-wise softmax over a 2-D batch of scores."""

    def forward(self, inputs):
        """Store ``inputs`` and write per-row probabilities to ``self.output``."""
        self.inputs = inputs
        # Subtracting each row's maximum keeps exp() from overflowing
        row_max = np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(inputs - row_max)
        row_totals = np.sum(exp_values, axis=1, keepdims=True)
        self.output = exp_values / row_totals

    def backward(self, dvalues):
        """Multiply each sample's softmax Jacobian by its upstream gradient."""
        self.dinputs = np.empty_like(dvalues)

        for row, (probs, upstream) in enumerate(zip(self.output, dvalues)):
            # Column vector so the outer product below is (n, n)
            column = probs.reshape(-1, 1)
            # Jacobian entries: p_i * (delta_ij - p_j)
            jacobian_matrix = np.diagflat(column) - np.dot(column, column.T)
            self.dinputs[row] = np.dot(jacobian_matrix, upstream)

### Calculating loss

In [None]:
class Loss:
    """Base class for losses; subclasses provide ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses given by ``self.forward``."""
        per_sample = self.forward(output, y)
        return np.mean(per_sample)

### Cross Entropy

In [None]:
class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy for sparse (1-D) or one-hot (2-D) labels."""

    def forward(self, y_pred, y_true):
        """Return the negative log of the confidence at each true class."""
        n_samples = len(y_pred)

        # Clip away exact 0 (log(0) is -inf) and, symmetrically, exact 1
        # so the clipping does not drag the mean towards either side
        clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_rank = len(y_true.shape)
        if label_rank == 1:
            # Sparse class indices: pick one confidence per row
            confidences = clipped[range(n_samples), y_true]
        elif label_rank == 2:
            # One-hot rows: the mask zeroes every non-target confidence
            confidences = np.sum(clipped * y_true, axis=1)

        return -np.log(confidences)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to ``-y_true / dvalues`` divided by the sample count."""
        n_samples = len(dvalues)
        # The width of the first row gives the number of labels
        n_labels = len(dvalues[0])

        # Sparse labels are expanded to one-hot rows first
        if len(y_true.shape) == 1:
            y_true = np.eye(n_labels)[y_true]

        raw_gradient = -y_true / dvalues
        # Divide by the sample count to normalize the gradient
        self.dinputs = raw_gradient / n_samples

### Categorical Cross Entropy

In [None]:
class Activation_Softmax_Loss_CategoricalCrossEntropy():
    """Softmax activation and categorical cross-entropy fused into one step."""

    def __init__(self):
        self.loss = Loss_CategoricalCrossEntropy()
        self.activation = Activation_Softmax()

    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, expose it as ``self.output``, return the loss."""
        self.activation.forward(inputs)
        self.output = self.activation.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Set ``self.dinputs`` to ``(dvalues - one_hot(y_true)) / n_samples``.

        ``dvalues`` is expected to hold the softmax outputs; the code itself
        only subtracts 1 at each sample's true-class position.
        """
        n_samples = len(dvalues)

        # One-hot rows are reduced to class indices for the fancy index below
        if len(y_true.shape) == 2:
            class_ids = np.argmax(y_true, axis=1)
        else:
            class_ids = y_true

        # Copy first so the caller's array is not modified
        gradient = dvalues.copy()
        gradient[range(n_samples), class_ids] -= 1
        self.dinputs = gradient / n_samples

### Create our model

In [None]:
# Spiral dataset requested with samples=100 and classes=3
X, y = spiral_data(samples=100, classes=3)

# Network pieces, in the order data flows through them:
# Dense(2 -> 3), ReLU, Dense(3 -> 3), then softmax fused with cross-entropy
dense1 = Dense(2, 3)
activation1 = Activation_ReLU()
dense2 = Dense(3, 3)
loss_activation = Activation_Softmax_Loss_CategoricalCrossEntropy()

# Forward pass: every stage consumes the previous stage's output
dense1.forward(X)
activation1.forward(dense1.output)
dense2.forward(activation1.output)

# The fused object keeps the class probabilities in .output
# and returns the mean loss for the batch
loss = loss_activation.forward(dense2.output, y)

# Probabilities of the first five samples, followed by the loss
print(loss_activation.output[:5])
print('loss:', loss)

There are two layers in this neural network: the first is a dense (fully connected) layer with 2 input features and 3 output values, and the second is also a dense layer but with 3 input features and 3 output values, essentially taking the output from the first layer as its input. Between these two dense layers, a Rectified Linear Unit (ReLU) activation function is applied to introduce non-linearity and help the model learn from the data. After the data passes through the second dense layer, a Softmax function coupled with a Categorical Cross-Entropy loss function is applied to calculate the probability distribution of the classes and the loss value, respectively. A forward pass is performed through the network to compute the outputs and the loss value, which is then printed out at the end along with the output of the first few samples.

## Optimization using Gradient Descent

### Hackers method

In [None]:
# Random-search ("hill climbing") optimisation: nudge every parameter by a
# small random amount, keep the nudge if the loss improved, otherwise undo it.
X, y = vertical_data(samples=100, classes=3)

# create model
dense1 = Dense(2, 3) # first dense layer, 2 inputs (each sample has 2 features), 3 outputs
activation1 = Activation_ReLU()
dense2 = Dense(3, 3) # second dense layer, 3 inputs, 3 outputs
activation2 = Activation_Softmax()

# create loss function
loss_function = Loss_CategoricalCrossEntropy()

# Track the best loss seen so far and a snapshot of the parameters that
# produced it; copies are needed because the layers are modified in place
lowest_loss = 9999999 # some initial value
best_dense1_weights = dense1.weights.copy()
best_dense1_biases = dense1.biases.copy()
best_dense2_weights = dense2.weights.copy()
best_dense2_biases = dense2.biases.copy()

for iteration in range(10000):
    # Perturb the current parameters with small Gaussian noise
    # (the shapes passed to randn mirror the layer sizes created above)
    dense1.weights += 0.05 * np.random.randn(2, 3)
    dense1.biases += 0.05 * np.random.randn(1, 3)
    dense2.weights += 0.05 * np.random.randn(3, 3)
    dense2.biases += 0.05 * np.random.randn(1, 3)

    # Forward pass of the training data through the whole network
    dense1.forward(X)
    activation1.forward(dense1.output)
    dense2.forward(activation1.output)
    activation2.forward(dense2.output)

    # Mean cross-entropy loss of the softmax output against the targets
    loss = loss_function.calculate(activation2.output, y)

    # Accuracy: fraction of samples whose highest-probability class
    # (argmax along axis 1) equals the target label
    predictions = np.argmax(activation2.output, axis=1)
    accuracy = np.mean(predictions == y)

    # If loss is smaller - print and save weights and biases aside
    if loss < lowest_loss:
        print('New set of weights found, iteration:', iteration,
              'loss:', loss, 'acc:', accuracy)
        best_dense1_weights = dense1.weights.copy()
        best_dense1_biases = dense1.biases.copy()
        best_dense2_weights = dense2.weights.copy()
        best_dense2_biases = dense2.biases.copy()
        lowest_loss = loss
    # Otherwise roll the layers back to the best snapshot so the next
    # perturbation starts from the best parameters found so far
    else:
        dense1.weights = best_dense1_weights.copy()
        dense1.biases = best_dense1_biases.copy()
        dense2.weights = best_dense2_weights.copy()
        dense2.biases = best_dense2_biases.copy()

### Derivative of Loss

To learn how to adjust weights and biases, we need to know how they affect the loss. We can measure this by calculating the derivative of the loss with respect to the weights and biases, which is done using the chain rule of calculus. The chain rule gives the derivative of a composition of functions (a function applied to the output of another function). In this case, it gives the derivative of the loss function with respect to the weights and biases used inside the dense layers.

$$\frac{\partial L}{\partial w} = \frac{\partial L}{\partial \hat{y}} \frac{\partial \hat{y}}{\partial z} \frac{\partial z}{\partial w}$$

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def f(x):
    """Linear example function: return twice ``x``."""
    doubled = 2 * x
    return doubled

# Integer sample points 0..4 and the function values at those points
x = np.arange(5)
y = f(x)

# Draw the sampled function as a line and display the figure
plt.plot(x, y)
plt.show()

If we wanted to calculate the slope, we would need to calculate the tangent
at a given point. We can do this by using the slope formula:
$$\text{slope} = \frac{y_2 - y_1}{x_2 - x_1} = \frac{\text{rise}}{\text{run}}$$

We are interested in the impact that x has on y, so we can rewrite the slope formula as:

$$slope = \frac{\Delta y}{\Delta x}$$

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def f(x):
    """Quadratic example function: return ``2 * x**2``."""
    squared = x ** 2
    return 2 * squared

# Sample f densely on [0, 5) so the plotted curve looks smooth
X = np.arange(0, 5, 0.001)
y = f(X)

plt.plot(X, y)

# One colour per tangent line drawn below
colors = ['r', 'g', 'b', 'y', 'm']


def approximate_tan_line(x, approximate_derivative, intercept=None):
    """Evaluate the line ``approximate_derivative * x + intercept`` at ``x``.

    Args:
        x: Point at which to evaluate the line.
        approximate_derivative: Slope of the line.
        intercept: y-intercept of the line. When omitted, the module-level
            ``b`` is used, which keeps two-argument calls working.

    Returns:
        The y-value of the line at ``x``.
    """
    if intercept is None:
        # Legacy call style: relies on `b` having been assigned beforehand
        intercept = b
    return (approximate_derivative * x) + intercept


for i, color in enumerate(colors):
    # Second point sits a tiny step to the right of the first one
    p2_delta = 0.0001
    x1 = i
    x2 = x1 + p2_delta

    y1 = f(x1)
    y2 = f(x2)

    print((x1, y1), (x2, y2))

    # Difference quotient between the two nearby points
    approximate_derivative = (y2 - y1) / (x2 - x1)
    # Intercept chosen so that the line passes through (x2, y2)
    b = y2 - (approximate_derivative * x2)

    # Draw the line a little to either side of the point of tangency
    to_plot = [x1 - 0.9, x1, x1 + 0.9]

    plt.scatter(x1, y1, c=color)
    plt.plot(to_plot,
             [approximate_tan_line(point, approximate_derivative, b) for point in to_plot],
             c=color)

    print('Approximate derivative for f(x)', f'where x = {x1} is {approximate_derivative}')

plt.show()


1. **Import Statements**:
   - `import matplotlib.pyplot as plt` and `import numpy as np` are standard import statements used to bring in the Matplotlib and NumPy libraries, which are often used for data visualization and numerical operations, respectively.

2. **Function Definition**:
   - `def f(x): return 2*x**2` defines a simple quadratic function $f(x) = 2x^2$.

3. **Creating Data Points**:
   - `X = np.array(np.arange(0, 5, 0.001))` creates an array of x-values ranging from 0 to 5, with a step of 0.001.
   - `y = f(X)` computes the corresponding y-values by applying the function $f$ to each x-value.

4. **Initial Plot**:
   - `plt.plot(X, y)` plots the quadratic function using the generated x and y values.

5. **Colors Array**:
   - `colors = ['r', 'g', 'b', 'y', 'm']` sets up an array of color codes to be used for different tangent lines.

6. **Tangent Line Approximation and Plotting**:
   - A for loop `for i in range(5):` iterates through the integers 0 through 4.
   - Within the loop:
       - Two points, $(x_1, y_1)$ and $(x_2, y_2)$, very close to each other, are chosen along the curve.
       - The slope (approximate derivative) of the tangent line at $x_1$ is calculated using the difference quotient formula: $\frac{{y_2 - y_1}}{{x_2 - x_1}}$.
       - The y-intercept $b$ of the tangent line is found using the point-slope form of a linear equation.
       - Three x-values around $x_1$ are chosen, and the tangent line is plotted over these points using the `plt.plot()` and `plt.scatter()` functions.
       - The approximate derivative is printed to the console.

7. **Displaying the Plot**:
   - `plt.show()` displays the plot.

### Implementation into neural network architecture: partial derivatives and chain rule

In the backward pass, we need to calculate the derivative of the loss with respect to the weights and biases. This is done using the chain rule of calculus. The chain rule is used to calculate the derivative of a function inside of a function.

* The partial derivative measures how much impact a given parameter has on the output of a function
* The gradient is a vector with one element per input, where each element is the partial derivative of the loss with respect to that input



## Backpropagation

Backpropagation is a central algorithm in the training of feedforward artificial neural networks and has foundational importance in the field of machine learning and artificial intelligence. It is based on the calculus principle of chain rule for derivatives, and is used to minimize the error in the neural network's predictions by adjusting the weights of the connections between neurons. Below is a detailed explanation and its relation to broader concepts in data science and mathematics.

### Explanation:
Backpropagation, short for "backward propagation of errors," is a method used during the training of neural networks. Here are the steps broken down:

1. **Forward Pass**: 
    - Input data is fed forward through the network.
    - Each layer computes an output based on the input and its weights.
    - The final output is compared to the target value to compute the error using a loss function (e.g., mean squared error).

2. **Backward Pass**:
    - The error is then propagated backward through the network.
    - The gradient of the loss function with respect to each weight is computed using the chain rule of calculus.
    - This involves computing the derivative of the loss function with respect to the network's output, multiplied by the derivative of the network's output with respect to each weight.

3. **Weight Update**:
    - The weights are then updated in a way to minimize the error, typically using a gradient descent algorithm or one of its variants.
    - The amount by which each weight is adjusted is proportional to the negative of the gradient, scaled by a learning rate.

### Summary:
Imagine you're teaching a robot to catch a ball. At first, it misses a lot. But each time it misses, you tell it what it did wrong, and it makes a slight adjustment. Over time, it gets better at catching the ball. 

Backpropagation in neural networks is similar. The network makes a guess about the input data, checks how wrong or right the guess is, then goes back and adjusts the weights a bit to improve for next time. It does this many times, learning from the errors, and getting better with each iteration.

### Python Code Example:
Here's a simplified version of how backpropagation might be implemented in Python:

In [None]:
import numpy as np

# Assume we have some input data X, and target labels y
# Toy data: the four 2-bit inputs with XOR-pattern targets
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

# One weight per input feature, drawn at random
weights = np.random.rand(2, 1)


def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def sigmoid_derivative(x):
    """Sigmoid slope expressed through ``x``, a value the sigmoid already produced."""
    complement = 1 - x
    return x * complement


# Every epoch's adjustment array is kept for later inspection
adjustments_history = []

for epoch in range(10000):
    # Forward pass: weighted sum of the inputs squashed by the sigmoid
    input_layer = X
    weighted_sum = np.dot(input_layer, weights)
    outputs = sigmoid(weighted_sum)

    # How far each prediction is from its target
    error = y - outputs

    # Backward pass: scale the error by the sigmoid slope at each output
    adjustments = error * sigmoid_derivative(outputs)
    adjustments_history.append(adjustments)

    # Accumulate the adjustments over all samples into the weights
    weights += np.dot(input_layer.T, adjustments)

In [None]:
# Stack the per-epoch adjustment arrays into a single ndarray
adjustments_history = np.array(adjustments_history)
# Show the adjustments recorded for the first three epochs
adjustments_history[:3]

### Relation to Broader Concepts:

Backpropagation exemplifies how fundamental mathematical concepts, particularly calculus and linear algebra, are applied in the realm of machine learning and data science to optimize models for better predictions. It also provides a pathway to understanding how errors can be minimized in predictive modeling, which is a core aspect of data science. The algorithm's efficacy in minimizing error during training elucidates the importance of optimization techniques in machine learning and data science.

<center><img src="./images/backpropagation.png" height="500" width="800"></center>

visualized: https://nnfs.io/pro/

### PyTorch Implementation of backprop


In [None]:
import torch

# Create a tensor with 5 elements
# Plain float32 tensor with 5 elements (requires_grad is not set here)
x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32)
x  # display the tensor

In [None]:
## requires_grad=True: ask autograd to track operations on this tensor

x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32, requires_grad=True)
x  # display the tensor

In [None]:
## operations on the tensor

# Computational Graph is created in the forward pass:
# y is the element-wise square of x
y = x**2

* Computes the gradients w.r.t. the parameters of the model

$$\frac{\partial y}{\partial x}$$

In [None]:
y  # the displayed grad_fn=<PowBackward0> is the function autograd will use for backpropagation

In [None]:
# One more forward operation extending the graph: z = 2*y + x
# (the backward pass only happens once .backward() is called)
z = 2*y + x
z  # display the tensor

In [None]:
z.mean()  # mean of all the elements in the tensor, i.e. a single scalar value

In [None]:
## d(z.mean())/dx

# backward() is called on the scalar z.mean(), so no gradient argument is passed
z.mean().backward()

In [None]:
x.grad  # gradient with respect to x, stored here by the backward() call on z.mean()

Vector-Jacobian product (chain rule)

The Jacobian matrix is the matrix of all first-order partial derivatives of a function's outputs with respect to its inputs.

<center><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e343f872b676a0e64646f27593d03c77c53cbaf3" height="400" width="800"></center>

In [None]:
# Fresh tracked tensor, replacing the earlier x
x = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32, requires_grad=True)
# y is non-scalar: one squared value per element of x
y = x**2

y  # display the tensor

In [None]:
# Vector for the vector-Jacobian product: one weight per element of y
j = torch.tensor([0.1, 1.0, 0.001, 0.0001, 0.00001], dtype=torch.float32)
# y is not a scalar, so backward() needs a vector of the same shape as y.
# Call it on y (the computed output), not on the leaf x, so that x.grad
# receives the product of j with the Jacobian of y = x**2.
y.backward(j)

In [None]:
## gradient stored in x by the backward() call above
print(x.grad)

In [None]:
## to suspend tracking of gradients

# Operations inside the torch.no_grad() block are not tracked by autograd
with torch.no_grad():
    y = x**2
    print(y)

In [None]:
## Other options to stop tracking gradients

# In-place switch (trailing underscore): x itself stops requiring gradients
x.requires_grad_(False)
# detach() returns a tensor; the result is not assigned here, only displayed
x.detach()

In [None]:
## Training example

# Four weights, all ones, tracked by autograd
weights = torch.ones(4, requires_grad=True)

# The gradients are never cleared between iterations, so each backward()
# call adds to weights.grad instead of replacing it
for epoch in range(3):
    model_output = torch.sum(weights * 3)
    model_output.backward()
    print(weights.grad)

    # empty the gradients
    # weights.grad.zero_()

In [None]:
### Calculating the gradients manually

# 1. forward pass (compute the prediction and the loss)
# 2. compute local gradients
# 3. backward pass (d loss / d weights)

import torch

# Scalar weight tracked by autograd
w = torch.tensor(1.0, requires_grad=True)

# A single training pair: input x and target y
x = torch.tensor(1.0)
y = torch.tensor(2.0)

# Forward pass: prediction, then its squared error against the target
y_hat = w * x
residual = y_hat - y
loss = residual**2

print(loss)

In [None]:
## backward pass
# Backpropagate from the scalar loss
loss.backward()
w.grad  # gradient stored on w by the backward() call above

In [None]:
import numpy as np

# linear regression
# Training data for linear regression: every target is twice its input
X = np.array([1, 2, 3, 4], dtype=np.float32)
Y = np.array([2, 4, 6, 8], dtype=np.float32)

# The single model parameter, starting at zero
w = 0.0


def forward(x):
    """Model prediction: scale ``x`` by the current module-level ``w``."""
    return x * w


def loss(y, y_predicted):
    """Mean squared error between ``y_predicted`` and ``y``."""
    residual = y_predicted - y
    return (residual**2).mean()


# gradient = d(loss)/d(w) = 1/N * sum(2x * (w*x - y))
def gradient(x, y, y_predicted):
    """Return the derivative of the mean-squared-error loss with respect to w.

    Args:
        x: Input values.
        y: Target values.
        y_predicted: Model predictions ``w * x`` for the same inputs.

    Returns:
        The mean over all samples of ``2 * x * (y_predicted - y)``.
    """
    # Average the per-sample terms. A dot product would already sum them
    # into a scalar, which makes a following .mean() a no-op and leaves
    # the result N times too large.
    return np.mean(2 * x * (y_predicted - y))


# Prediction made with the weight as it is before the loop runs
print(f'Prediction before training: f(5) = {forward(5):.3f}')

# Settings for plain gradient descent
learning_rate = 0.01
n_iters = 20

for epoch in range(n_iters):
    # Forward pass, then score the predictions against the targets
    y_pred = forward(X)
    l = loss(Y, y_pred)

    # Derivative of the loss with respect to w
    dw = gradient(X, Y, y_pred)

    # Step against the gradient
    w = w - learning_rate * dw

    # Report on every second epoch
    if not epoch % 2:
        print(f'[INFO]: epoch {epoch + 1}: w = {w:.3f}, loss = {l:.8f}')

print(f'Prediction after training: f(5) = {forward(5):.3f}')

In [None]:
import torch

# Same regression problem as tensors: every target is twice its input
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

# The single trainable parameter, tracked by autograd
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)


def forward(x):
    """Model prediction: scale ``x`` by the current module-level ``w``."""
    return w * x


def loss(y, y_predicted):
    """Mean squared error between ``y_predicted`` and ``y``."""
    residual = y_predicted - y
    return (residual**2).mean()


print(f'Prediction before training: f(5) = {forward(5):.3f}')

learning_rate = 0.01
n_iters = 20

for epoch in range(n_iters):
    # Forward pass and loss
    y_pred = forward(X)
    l = loss(Y, y_pred)

    # Backward pass: autograd stores dl/dw in w.grad
    l.backward()

    # The update itself must not be recorded in the graph
    with torch.no_grad():
        w.sub_(learning_rate * w.grad)

    # Clear the gradient, otherwise the next backward() would add to it
    w.grad.zero_()

    # Report on every second epoch
    if not epoch % 2:
        print(f'[INFO]: epoch {epoch + 1}: w = {w:.3f}, loss = {l:.8f}')

print(f'Prediction after training: f(5) = {forward(5):.3f}')

## Putting it all together

* Model Design (input, output size, forward pass)
* Create loss and optimizer
* Training Loop
    * Forward pass - compute prediction
    * Backward pass - gradients
    * Update weights - optimizer.step()

In [None]:
import torch
import torch.nn as nn #neural networks

## linear regression
# Training data as column vectors: one row per sample, one column per feature
X = torch.tensor([[1], [2], [3], [4]], dtype=torch.float32)
Y = torch.tensor([[2], [4], [6], [8]], dtype=torch.float32)

n_samples, n_features = X.shape # 4, 1 (n_samples, n_features)

# Single unseen input used to probe the model before and after training
test = torch.tensor([5], dtype=torch.float32)

# Layer sizes come from the data: inputs from X, outputs from the target width
input_size = n_features
output_size = Y.shape[1]

# Equivalent hand-written module, kept for reference:
# class LinearRegression(nn.Module):
#     def __init__(self, input_dim, output_dim):
#         super(LinearRegression, self).__init__() # super class constructor
#         self.lin = nn.Linear(input_dim, output_dim) # define the linear layer

#     def forward(self, x):
#         return self.lin(x)

model = nn.Linear(input_size, output_size) # == LinearRegression(input_size, output_size)

print(f'Prediction before training: f(5) = {model(test).item():.3f}')

learning_rate = 0.01
n_iters = 20

loss = nn.MSELoss() # mean squared error loss
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # stochastic gradient descent

# training
for epoch in range(n_iters):
    # prediction = forward pass
    y_pred = model(X)

    # loss: nn.MSELoss takes (input, target), i.e. prediction first
    l = loss(y_pred, Y)

    # gradients = backward pass
    l.backward() # dl/dw

    optimizer.step() # update weights

    # zero gradients so the next backward() does not accumulate onto them
    optimizer.zero_grad()

    if epoch % 2 == 0:
        print(f'[INFO]: epoch {epoch + 1}: w = {model.weight.item():.3f}, loss = {l:.8f}')

print(f'Prediction after training: f(5) = {model(test).item():.3f}')