In [1]:
# I tried my best to avoid hard coding in this project:)

#### Importing libraries

In [2]:
import numpy as np
from sympy import symbols, diff
import tensorflow as tf

#### Defining the dataset which is simply XOR:

In [3]:
X = np.array([[0,0], [0,1], [1,1], [1,0]]) # inputs
y = np.array([[0], [1], [0], [1]]) # labels

#### Initialize the layers size

In [4]:
network = {'Input':2, 'Hidden_1': 3, 'Hidden_2':2, 'Output': 1}

#### Define weights, biases and X

In [5]:
# Can initialize weights using Xavier or He normal and based on my activation functions:
def init_weights(n_in, n_out, activation) -> np.ndarray:
    if activation in ['sigmoid', 'tanh']:
        limit = np.sqrt(6 / (n_in + n_out))  # Xavier uniform
        weights = np.random.uniform(-limit, limit, size=(n_in, n_out))
    elif activation == 'relu':
        std_dev = np.sqrt(2 / n_in)  # He normal
        weights = np.random.normal(0, std_dev, size=(n_in, n_out))
    else:
        weights = np.random.normal(0, 0.01, size=(n_in, n_out))
    return weights

# Initialize biases based on count of neurons of each layer
def init_bias(n_neurons):
    biases = np.zeros((1, n_neurons))
    return biases

#### Activation Functions

In [6]:
def relu_func(x:np.ndarray) -> np.ndarray:
    return(np.maximum(0,x))

def relu_derivative(a):
    return (a > 0).astype(float)

def sigmoid_func(x:np.ndarray) -> np.ndarray:
    # sigmoid returns probabilities and not labels!
    p = 1 / (1+np.exp(-x))
    
    labels = (p >= 0.5).astype(int) # this works as boolean, if p>=0.5 returns True and then astype(int) converts to 1!
    return labels

def sigmoid_derivative(a):
    return a * (1 - a)

activations = {'relu': relu_func, 'relu_der': relu_derivative, 'sigmoid': sigmoid_func, 'sigmoid_der': sigmoid_derivative}

In [7]:
# Hyperparameters
lr = 0.01        # learning rate
beta1 = 0.9      # momentum term
beta2 = 0.999    # RMSprop term
epsilon = 1e-8   # small value to prevent division by zero


In [8]:
# Initializing weights and biases
weights = {} # storing different layers weights inside a dict.
biases = {}  # same for biases.

for index, (key, value) in enumerate(network.items()):
    if key == 'Input':
        continue
    elif 'Hidden' in key:
        w = init_weights(list(network.values())[index-1], value, activations['relu'])
        b = np.zeros((1, value))
        weights[f'Hidden_{index}'] = w
        biases[f'Hidden_{index}'] = b
    else:
        w = init_weights(list(network.values())[index-1], value, activations['sigmoid'])
        b = init_bias(value)
        weights['Output'] = w
        biases['Output'] = b
 

In [9]:
# Adam variables
m_w = {k: np.zeros_like(v) for k,v in weights.items()}
v_w = {k: np.zeros_like(v) for k,v in weights.items()}
m_b = {k: np.zeros_like(v) for k,v in biases.items()}
v_b = {k: np.zeros_like(v) for k,v in biases.items()}
t = 0

In [10]:
def feedforward(W, b, X, activation_func):
    z = np.dot(X, W) + b
    return activation_func(z)


#### Training loop

In [13]:
# There is a big training loop but you can view different parts seprated at the end of the notebook

num_epochs = 5000

for epoch in range(1, num_epochs+1):
    t += 1
    a = X
    after_act = {}
    # Forwardpass
    for index, (key, value) in enumerate(network.items()):
        if key == 'Input':
            continue
        elif 'Hidden' in key:
            w = weights[f'Hidden_{index}']
            b = biases[f'Hidden_{index}']
            a = feedforward(w, b, a, activations['relu'])
            after_act[f'Hidden_{index}'] = a
        else:
            w = weights['Output']
            b = biases['Output']
            a = feedforward(w, b, a, activations['sigmoid'])
            after_act['Output'] = a
        
    # Computing delta
    deltas = {}
    error = y - after_act['Output']
    deltas['Output'] = error * sigmoid_derivative(after_act['Output'])
    
    hidden_layers = [k for k in weights.keys() if 'Hidden' in k]
    hidden_layers.sort(reverse=True)
    next_layer = 'Output'
    for layer in hidden_layers:
        deltas[layer] = np.dot(deltas[next_layer], weights[next_layer].T) * relu_derivative(after_act[layer])
        next_layer = layer

        
    # Gradients
    # I know I coded this part really weird-_-
    grad_b = {key: np.sum(delta, axis=0, keepdims=True) for key, delta in deltas.items()}
    grad_b = dict(reversed(list(grad_b.items())))

    grad_w = {}
    for key in deltas:
        prev_layer = X if 'Hidden_1' in key else after_act[list(after_act.keys())[list(deltas.keys()).index(key)-1]]
        grad_w[key] = np.dot(prev_layer.T, deltas[key])
    grad_w = dict(reversed(list(grad_w.items())))

    # Optimization
    for key in weights:
        g_w = grad_w[key]
        g_b = grad_b[key]
        
        # First moment
        m_w[key] = beta1 * m_w[key] + (1 - beta1) * g_w
        m_b[key] = beta1 * m_b[key] + (1 - beta1) * g_b
        
        # Second moment
        v_w[key] = beta2 * v_w[key] + (1 - beta2) * (g_w ** 2)
        v_b[key] = beta2 * v_b[key] + (1 - beta2) * (g_b ** 2)
        
        # Bias-corrected
        m_hat_w = m_w[key] / (1 - beta1**t)
        v_hat_w = v_w[key] / (1 - beta2**t)
        m_hat_b = m_b[key] / (1 - beta1**t)
        v_hat_b = v_b[key] / (1 - beta2**t)
        
        # Update
        weights[key] -= lr * m_hat_w / (np.sqrt(v_hat_w) + epsilon)
        biases[key]  -= lr * m_hat_b / (np.sqrt(v_hat_b) + epsilon)

    # loss Function
    y_hat = np.clip(after_act['Output'], epsilon, 1 - epsilon)
    loss = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
    if epoch % 5 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")   


Epoch 5, Loss: 9.2103
Epoch 10, Loss: 9.2103
Epoch 15, Loss: 9.2103
Epoch 20, Loss: 9.2103
Epoch 25, Loss: 9.2103
Epoch 30, Loss: 9.2103
Epoch 35, Loss: 9.2103
Epoch 40, Loss: 9.2103
Epoch 45, Loss: 9.2103
Epoch 50, Loss: 9.2103
Epoch 55, Loss: 9.2103
Epoch 60, Loss: 9.2103
Epoch 65, Loss: 9.2103
Epoch 70, Loss: 9.2103
Epoch 75, Loss: 9.2103
Epoch 80, Loss: 9.2103
Epoch 85, Loss: 9.2103
Epoch 90, Loss: 9.2103
Epoch 95, Loss: 9.2103
Epoch 100, Loss: 9.2103
Epoch 105, Loss: 9.2103
Epoch 110, Loss: 9.2103
Epoch 115, Loss: 9.2103
Epoch 120, Loss: 9.2103
Epoch 125, Loss: 9.2103
Epoch 130, Loss: 9.2103
Epoch 135, Loss: 9.2103
Epoch 140, Loss: 9.2103
Epoch 145, Loss: 9.2103
Epoch 150, Loss: 9.2103
Epoch 155, Loss: 9.2103
Epoch 160, Loss: 9.2103
Epoch 165, Loss: 9.2103
Epoch 170, Loss: 9.2103
Epoch 175, Loss: 9.2103
Epoch 180, Loss: 9.2103
Epoch 185, Loss: 9.2103
Epoch 190, Loss: 9.2103
Epoch 195, Loss: 9.2103
Epoch 200, Loss: 9.2103
Epoch 205, Loss: 9.2103
Epoch 210, Loss: 9.2103
Epoch 215, L

In [None]:
    for index, (key, value) in enumerate(network.items()):
        if key == 'Input':
            pass
        elif 'Hidden' in key:
            w = init_weights(list(network.values())[index-1], list(network.values())[index],activations['relu'])
            b = init_bias(value)
            
            assert w.shape[0] == a.shape[1] # Preventing error because of a shape mismatch between matrices
            a = feedforward(w,b,a, activations["relu"])
            
            weights[f"Hidden_{index}"] = w
            biases[f"Hidden_{index}"] = b
            after_act[f"Hidden_{index}"] = a
            
        else:
            w = init_weights(list(network.values())[index-1], list(network.values())[index],activations['sigmoid']) # dicts don't have index function, converting to list first
            b = init_bias(value)
            
            assert w.shape[0] == a.shape[1] # Preventing error because of a shape mismatch between matrices
            a = feedforward(w,b,a, activations["sigmoid"])
            
            weights[f"Output"] = w
            biases[f"Output"] = b
            after_act[f"Output"] = a
        

#### FeedForward

In [None]:
# def feedforward(W:np.ndarray, b:np.ndarray, X:np.ndarray, activation_func:callable):
#     z = (np.dot(X,W)) + b
#     a = activation_func(z) # adding activation function 
#     return a

In [None]:
# a = X
# weights = {} # storing different layers weights inside a dict.
# biases = {}  # same for biases.
# after_act = {}

# # A loop  for initializing parameters and feedforward
# for index, (key, value) in enumerate(network.items()):
#     if key == 'Input':
#         pass
#     elif 'Hidden' in key:
#         w = init_weights(list(network.values())[index-1], list(network.values())[index],activations['relu'])
#         b = init_bias(value)
        
#         assert w.shape[0] == a.shape[1] # Preventing error because of a shape mismatch between matrices
#         a = feedforward(w,b,a, activations["relu"])
        
#         weights[f"Hidden_{index}"] = w
#         biases[f"Hidden_{index}"] = b
#         after_act[f"Hidden_{index}"] = a
        
#     else:
#         w = init_weights(list(network.values())[index-1], list(network.values())[index],activations['sigmoid']) # dicts don't have index function, converting to list first
#         b = init_bias(value)
        
#         assert w.shape[0] == a.shape[1] # Preventing error because of a shape mismatch between matrices
#         a = feedforward(w,b,a, activations["sigmoid"])
        
#         weights[f"Output"] = w
#         biases[f"Output"] = b
#         after_act[f"Output"] = a



In [None]:
# epsilon = 1e-15 # log(0) is undefined
# y_hat = np.clip(after_act['Output'], epsilon, 1 - epsilon)
# loss = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

#### BackPropagation

In [None]:
# # Calculating the error and delta

# deltas = {}

# # Output layer delta
# error = y - after_act['Output']
# deltas['Output'] = error * sigmoid_derivative(after_act['Output'])

# # Hidden layers in reverse order
# hidden_layers = [k for k in weights.keys() if 'Hidden' in k]
# hidden_layers.sort(reverse=True)  # or reverse order of insertion if needed

# next_layer = 'Output'
# for layer in hidden_layers:
#     deltas[layer] = np.dot(deltas[next_layer], weights[next_layer].T) * relu_derivative(after_act[layer])
#     next_layer = layer

# # Calcuating gradient
# # bias
# grad_b = np.sum(deltas, axis=0, keepdims=True)
# grad_b = dict(reversed(list(grad_b.items())))

# # weight
# grad_w = {}
# # I know I coded this part really weird-_-
# for index_d, (key_d, value_d) in enumerate(deltas.items()):
#     for index_a, (key_a, value_a) in enumerate(after_act.items()):
#         if key_d == key_a:
#             if key_d == 'Hidden_1':
#                 grad_w[key_d] = np.dot(X.T, value_d)
#             else:
#                 grad_w[key_d] = np.dot(np.transpose(list(after_act.values())[index_a-1]), value_d)
# # The order was incorrect
# grad_w = dict(reversed(list(grad_w.items())))

#### Optimization

In [None]:
# # Hyperparameters
# lr = 0.01        # learning rate
# beta1 = 0.9      # momentum term
# beta2 = 0.999    # RMSprop term
# epsilon = 1e-8   # small value to prevent division by zero

# # Initialize m, v for all weights and biases
# m_w = {k: np.zeros_like(v) for k, v in weights.items()}
# v_w = {k: np.zeros_like(v) for k, v in weights.items()}
# m_b = {k: np.zeros_like(v) for k, v in biases.items()}
# v_b = {k: np.zeros_like(v) for k, v in biases.items()}

# t = 0  # timestep

# # Inside your training loop:
# t += 1

# for key in weights:
#     # Get gradients (from backprop)
#     g_w = grad_w[key]
#     g_b = grad_b[key]

#     # Update biased first moment estimate
#     m_w[key] = beta1 * m_w[key] + (1 - beta1) * g_w
#     m_b[key] = beta1 * m_b[key] + (1 - beta1) * g_b

#     # Update biased second raw moment estimate
#     v_w[key] = beta2 * v_w[key] + (1 - beta2) * (g_w ** 2)
#     v_b[key] = beta2 * v_b[key] + (1 - beta2) * (g_b ** 2)

#     # Compute bias-corrected first and second moment estimates
#     m_hat_w = m_w[key] / (1 - beta1 ** t)
#     v_hat_w = v_w[key] / (1 - beta2 ** t)
#     m_hat_b = m_b[key] / (1 - beta1 ** t)
#     v_hat_b = v_b[key] / (1 - beta2 ** t)

#     # Update weights and biases
#     weights[key] -= lr * m_hat_w / (np.sqrt(v_hat_w) + epsilon)
#     biases[key]  -= lr * m_hat_b / (np.sqrt(v_hat_b) + epsilon)
