In [1]:
# I tried my best to avoid hard coding in this project:)

#### Importing libraries

In [2]:
import numpy as np
from scipy.misc import derivative

#### Defining the dataset which is simply XOR:

In [3]:
# XOR truth table: each row of X is one (x1, x2) input pair and the
# matching row of y holds the expected XOR output for that pair.
X = np.array([
    [0, 0],
    [0, 1],
    [1, 1],
    [1, 0],
])  # inputs, shape (4, 2)
y = np.array([0, 1, 0, 1]).reshape(-1, 1)  # labels as a column, shape (4, 1)

#### Initialize the layer sizes

In [4]:
# Number of neurons per layer, listed in forward order.
# The forward-pass loop walks this dict by position, so insertion order matters.
network = {
    'Input': 2,
    'Hidden_1': 3,
    'Hidden_2': 2,
    'Output': 1,
}

#### Define weights, biases and X

In [5]:
# At first I wanted to define the weights this simply, but then I learned something cooler ;)
# W = np.random.randn(2,2)

In [6]:
# Can initialize weights using Xavier or He normal and based on my activation functions:
def init_weights(n_in, n_out, activation) -> np.ndarray:
    """Draw a random (n_in, n_out) weight matrix suited to the given activation.

    Parameters
    ----------
    n_in : number of units feeding the layer (rows of the result).
    n_out : number of units in the layer (columns of the result).
    activation : activation *name* as a string. 'sigmoid' and 'tanh' select
        Xavier-uniform, 'relu' selects He-normal. Any other value (including
        a non-string) falls back to a normal draw with standard deviation 0.01.

    Returns
    -------
    np.ndarray of shape (n_in, n_out), sampled from NumPy's global RNG.
    """
    shape = (n_in, n_out)

    if activation in ('sigmoid', 'tanh'):
        # Xavier uniform: U(-bound, bound) with bound = sqrt(6 / (fan_in + fan_out))
        bound = np.sqrt(6 / (n_in + n_out))
        return np.random.uniform(-bound, bound, size=shape)

    if activation == 'relu':
        # He normal: N(0, sqrt(2 / fan_in))
        return np.random.normal(0, np.sqrt(2 / n_in), size=shape)

    # Unrecognised activation: small zero-centred Gaussian noise.
    return np.random.normal(0, 0.01, size=shape)

# Initialize biases based on count of neurons of each layer
def init_bias(n_neurons):
    """Return an all-zero bias row vector of shape (1, n_neurons)."""
    return np.zeros((1, n_neurons))

#### Activation Functions

In [7]:
def relu_func(x:np.ndarray) -> np.ndarray:
    """Rectified linear unit: element-wise maximum of 0 and `x`."""
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(a):
    """Element-wise ReLU slope: 1.0 where `a` is strictly positive, else 0.0."""
    is_active = a > 0
    return is_active.astype(float)

def sigmoid_func(x:np.ndarray) -> np.ndarray:
    """Logistic sigmoid, returning probabilities in [0, 1] (not hard labels).

    The activation has to return the continuous probability: the cross-entropy
    loss and `sigmoid_derivative` (a * (1 - a)) are both computed from it, and
    a thresholded 0/1 output makes that derivative identically zero.
    When class labels are needed, threshold the result at the call site,
    e.g. `(p >= 0.5).astype(int)`.

    Parameters
    ----------
    x : array-like of pre-activations.

    Returns
    -------
    np.ndarray of floats with the same shape as `x`.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) is always in (0, 1], so it cannot overflow for large |x|.
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- the same value.
    return np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))

def sigmoid_derivative(a):
    """Sigmoid slope written in terms of the sigmoid's own output `a`: a * (1 - a)."""
    one_minus_a = 1 - a
    return a * one_minus_a

# Lookup table: activation name -> function, and '<name>_der' -> its derivative.
activations = {
    'relu': relu_func,
    'relu_der': relu_derivative,
    'sigmoid': sigmoid_func,
    'sigmoid_der': sigmoid_derivative,
}

#### FeedForward

In [8]:
def feedforward(W:np.ndarray, b:np.ndarray, X:np.ndarray, activation_func:callable):
    """Run one dense layer: activation_func(X . W + b).

    Parameters
    ----------
    W : weight matrix multiplied on the right of `X`.
    b : bias added to the product (broadcast by NumPy).
    X : layer input.
    activation_func : callable applied to the pre-activation.

    Returns
    -------
    Whatever `activation_func` returns for the pre-activation.
    """
    pre_activation = np.dot(X, W) + b
    return activation_func(pre_activation)

In [9]:
a = X           # running activation; starts as the raw inputs
weights = {}    # layer name -> weight matrix of shape (n_in, n_out)
biases = {}     # layer name -> bias row vector of shape (1, n_out)
after_act = {}  # layer name -> activation output of that layer

# Dicts cannot be indexed by position, so take the sizes as a list once.
layer_sizes = list(network.values())

# Initialise each layer's parameters and push the data through it, in order.
# NOTE: weights come from NumPy's global RNG; seed it beforehand for reproducible runs.
for index, (key, value) in enumerate(network.items()):
    if key == 'Input':
        continue  # the input layer has no parameters of its own

    # Hidden layers use ReLU; the final layer uses sigmoid.
    is_hidden = 'Hidden' in key
    act_name = 'relu' if is_hidden else 'sigmoid'
    layer_name = f"Hidden_{index}" if is_hidden else "Output"

    # init_weights dispatches on the activation *name*. Passing the function
    # object (activations[...]) matches none of its branches, so every layer
    # would silently get the N(0, 0.01) fallback instead of He / Xavier.
    w = init_weights(layer_sizes[index - 1], value, act_name)
    b = init_bias(value)

    assert w.shape[0] == a.shape[1]  # Preventing error because of a shape mismatch between matrices
    a = feedforward(w, b, a, activations[act_name])

    weights[layer_name] = w
    biases[layer_name] = b
    after_act[layer_name] = a

In [10]:
# Binary cross-entropy averaged over all samples.
epsilon = 1e-15  # keeps predictions away from exactly 0 and 1, where log is undefined
y_hat = np.clip(after_act['Output'], epsilon, 1 - epsilon)

positive_term = y * np.log(y_hat)            # contributes where the label is 1
negative_term = (1 - y) * np.log(1 - y_hat)  # contributes where the label is 0
loss = -np.mean(positive_term + negative_term)

#### BackPropagation

In [12]:
# Calculating the error and the per-layer deltas

deltas = {}

# Output layer delta: (target - prediction) scaled by the sigmoid slope.
# sigmoid_derivative expects after_act['Output'] to hold sigmoid output values.
# NOTE(review): the sign (y - y_hat) and the extra sigmoid-slope factor follow
# the squared-error delta rule, while the loss cell computes cross-entropy --
# confirm this matches the parameter-update step, which is not shown here.
error = y - after_act['Output']
deltas['Output'] = error * sigmoid_derivative(after_act['Output'])

# Hidden layers, last to first. `weights` is filled in forward order, so
# reversing its insertion order gives the backward order. Sorting the names as
# strings would be wrong for 10+ hidden layers ('Hidden_10' < 'Hidden_2').
hidden_layers = [k for k in weights if 'Hidden' in k][::-1]

next_layer = 'Output'
for layer in hidden_layers:
    # Pull the next layer's delta back through its weights, then gate it by
    # the ReLU slope of this layer's activation.
    deltas[layer] = np.dot(deltas[next_layer], weights[next_layer].T) * relu_derivative(after_act[layer])
    next_layer = layer



In [42]:
# Calculating gradients

# Bias gradients: one entry per layer, summing that layer's delta over the
# samples (axis 0). keepdims keeps the (1, n_neurons) shape used for the biases.
# (np.sum must be applied to each layer's array, not to the `deltas` dict itself.)
grad_b = {name: np.sum(delta, axis=0, keepdims=True) for name, delta in deltas.items()}

# Weight gradients: (input fed into the layer).T . (that layer's delta).
# The first layer is fed X; every later layer is fed the previous layer's
# activation. `after_act` is in forward order, so pair each layer name with
# the activation that precedes it.
layer_names = list(after_act)
layer_inputs = dict(zip(layer_names, [X] + [after_act[name] for name in layer_names[:-1]]))

grad_w = {
    name: np.dot(np.transpose(layer_inputs[name]), delta)
    for name, delta in deltas.items()
    if name in layer_inputs
}