In [48]:
import numpy as np

def sigmoid_derivative(input):
  return input * (1 - input)

class Sigmoid:
  def sigmoid(self,input):
    return 1 / (1 + np.exp(-input))

  def sigmoid_derivative(self,input):
    return input * (1 - input)

  def forward(self,input):
    return self.sigmoid(input)

  def backward(self,input):
    return self.sigmoid_derivative(input)


class ReLU:
  def relu(self,input):
    return np.maximum(0,input)

  def relu_derivative(self,input):
    return np.where(input > 0, 1, 0)

  def forward(self,input):
    return self.relu(input)

  def backward(self,input):
    return self.relu_derivative(input)

class MSE:
  def mse(self,target,output):
    return np.mean(0.5*(output - target)**2)

  def mse_derivative(self,target,output):
    return  target - output

  def forward(self,target,output):
    return self.mse(target,output)

  def backward(self,target,output):
    return self.mse_derivative(target,output)
  
class Linear:
    def forward(self, input):
        return input

    def backward(self, input):
        # Derivative of linear function is 1
        return np.ones_like(input)

class NeuralNetLayer:
  def __init__(self,input_size,output_size):
    self.input_size = input_size
    self.output_size = output_size
    self.weights = np.random.rand(input_size,output_size) * np.sqrt(2. / input_size)
    self.bias = np.random.rand(1,output_size) * np.sqrt(2. / input_size)

  def forward(self,input):
    assert input.shape[-1] == self.input_size, "Shape not same"
    output = (input @ self.weights) + self.bias
    return output


  def backward(self,input,error,lr):
    grad = error @ self.weights.T 
    self.weights +=  lr * (input.T @ error)
    self.bias += lr * error.mean(axis=0)
    return grad

In [None]:
np.random.seed(5416)
x = np.random.randn(32,16)
target = np.random.randn(32,1)
layer1 = NeuralNetLayer(16,64)
hidden_activation = ReLU()

layer2 = NeuralNetLayer(64,1)
output_activation = Linear()
loss = MSE()


def normalize(x):
    return x / np.linalg.norm(x,ord=1,axis=0)

for i in range(10000):
    # Forward Pass
    hidden_layer = layer1.forward(x)
    hidden_layer = normalize(hidden_layer)
    hidden_layer = hidden_activation.forward(hidden_layer)
    output_layer = layer2.forward(hidden_layer)
    output_layer = output_activation.forward(output_layer)  # sometimes number goes big and sigmoid output becomes 1 so in that case use normalization
    loss_output = loss.forward(target,output_layer)

    
    '''
    Bad Explanation Disclaimer!!:
    (--- Iam a lazy person and I don't put much stress on my mind to find good words
    thats why I make terrible explanations that nobody understands. Its all in the
    mind that I can't explain in words cuz I dont have words. If the explanation
    makes sense to  you  thats good and if dont go to hell. ---)

    Take mean of gradient value of each sample in batch when using batch input,
    Forward pass run in parallel for each sample in input. But only one backward
    pass runs cuz changing every weight parallely (i.e parallel write to the same
    weight matrix) does not make sense and also tensors or numpy array broadcast
    does not allow this. Thats the whole idea of batch processing in neural networks.

    Usually pytorch or other implementations take averge of loss and gradients but here
    Iam not saving gradient separately so taking average of gradients across batch axis
    and then update the weights
    
    Weights become too large that relu is simple return 1 for all weights and model is not
    going anywhere to need to normalize. It will also prevent from weights overflow

    Model is still not learning anything weights are large at initialization need to normalize
    them while initializing   

    Still not converging ah I found the problem my targets are in range of -1 to 1 and my 
    output layer activation function is sigmoid with range of 0 to 1 so its never gonna
    converge need to change the activation function to linear
    
    Yay problem solved model is converging very fast and learning the input data distribution
    .Now I can sleep peacefully
    '''


    # Backward Pass
    error = loss.backward(target,output_layer)
    error = output_activation.backward(output_layer) * error
    
    error = layer2.backward(hidden_layer,error,0.01)
    error = hidden_activation.backward(hidden_layer) * error
    error = layer1.backward(x,error,0.01)
    if i % 1000 == 0:
      print(loss_output)



0.6580494166764389
0.10331399201683275
0.04109411379240492
0.01958084072263737
0.01030952355629797
0.006077023391939303
0.003952374451300624
0.002794787831239696
0.0021102089301917415
0.0016552431261428518
