![image](images\mainRNN.png)
![image](images\dWh.png)
![image](images\dWx.jpg)
![image](images\dB.jpg)

![image](images\dcdw.png)

In [79]:
#from matplotlib import pyplot as plt
import random, math
import numpy as np

def sigmoid(x):
    return 1/(1+np.exp(-x))

def swish(x):
    return x*sigmoid(x)

def relu(x):
    return np.maximum(0,x)

def softmax(x):
    p = np.exp(x - np.max(x))
    return p/np.sum(p)

def activation_function(z,act):
    if act=="sigmoid":
        return sigmoid(z)
    elif act == "swish":
        return swish(z)
    elif act == "relu":
        return relu(z)
    elif act=="tanh":
        return np.tanh(z)
    elif act=="softmax":
        return softmax(z)
    
class Layer():
    def __init__(self,input_size,output_size,act=None):
        self.size = input_size
        self.output_size = output_size
        self.activation = act
        self.wx = np.random.uniform(-1,1,(self.output_size,self.size))
        self.wh = np.random.uniform(-1,1,(self.output_size,self.output_size))
        self.wy = np.random.uniform(-1,1,(self.output_size,self.output_size))
        self.bias = np.random.uniform(-1,1,(output_size,1))
        # Changes required to the below two definitions
        self.A = np.random.uniform(-1,1,(0,output_size,1))
        self.Y = np.random.uniform(-1,1,(0,output_size,1))
        self.H = np.random.uniform(-1,1,(0,output_size,1))
        
    def forward_propagation(self, input_data):
        if len(self.A)-1 >= 0:
            #print("H shape: " + str(self.H.shape) + ", toAppend shape: " + str(np.array([np.dot(self.wx,input_data) + np.dot(self.wh,self.A[-1]) + self.bias]).shape))
               
            toAppend1 = np.array([np.dot(self.wx,input_data) + np.dot(self.wh,self.A[-1]) + self.bias])
            self.H = np.append(self.H, toAppend1, axis=0)
            toAppend2 = np.array([activation_function(self.H[-1], self.activation)])
            self.A = np.append(self.A, toAppend2, axis=0)
            toAppend3 = np.array([np.dot(self.wy, self.A[-1])])
            self.Y = np.append(self.Y, toAppend3, axis=0)
            #print("wy shape" + str(self.H.shape))
        else:
            #print("wx*inppdata"+str(np.array(input_data).shape))
            #print("H"+str(self.H.shape))
            toAppend = np.array([np.dot(self.wx, input_data) + self.bias])
            self.H = np.append(self.H, toAppend, axis=0)
            #print("self.H: " + str(self.H.shape))
            self.A = activation_function(self.H, self.activation)
            self.Y = np.array([np.dot(self.wy, self.A[-1])])
            #print("Y shape" + str(self.Y.shape))
        #print("A[-1] " + str(self.A.shape))
    def derMSE(self, target):
        return 2*(self.Y[-1] - target)
    
    def BPTT(self, input_data, gradient, learningRate):

        grad = np.tile(gradient, (1,self.wy.shape[1]))
        der_Wy_tiled = np.tile(self.A[-1].T, (self.wy.shape[0], 1))
        #print(str(grad.shape))
        #print(str(der_Wy_tiled.shape))
        der_Wy = np.multiply(grad, der_Wy_tiled)
        
        der_A = np.multiply(grad, self.wy)
        der_A_summed = np.sum(der_A, axis=0, keepdims=True).T
        der_A_act = np.array(der_A_summed)
        der_A_act.fill(1)
        if(self.activation == "tanh"):
            der_A_act = np.multiply(der_A_act, 1-np.power(self.A[-1], 2))
        der_A_tiled = np.tile(der_A_act, (1, self.wx.shape[1]))
        der_X = np.multiply(der_A_tiled, self.wx)
        der_X_summed = np.sum(der_X, axis=0, keepdims=True)
        #print("self.Y: " + str(self.Y[-1]))
        #print("der.X: " + str(der_X))
        
        der_Wh = 0
        der_Wx = 0
        der_bias = 0
        
        der_Wh = np.empty(self.A[-1].shape)
        der_Wh.fill(1)
        if len(self.A) > 1:
            for i in range(1, len(self.A)-1):
                der_A = 0
                if(self.activation == "tanh"):
                    der_A = np.multiply(der_A, 1-np.power(self.A[i], 2))
                der_Wh = np.multiply(der_Wh, der_A)
                if i == len(self.A)-1:
                    break
                sum_Wh = np.sum(self.wh, axis=0, keepdims=True).T
                der_Wh = np.multiply(sum_Wh, der_Wh)
                der_Wh = der_Wh + self.A[i]
        der_Wh = np.multiply(der_Wh, der_A_summed)
        
        
        der_Wx = np.tile(np.array(input_data[0]).T, (self.wx.shape[0], 1))
        if len(self.A) > 1:
            for i in range(1, len(self.A)-1):
                der_A = 0
                if(self.activation == "tanh"):
                    der_A = np.multiply(der_A, 1-np.power(self.A[i], 2))
                der_A = np.tile(der_A, (1, self.wh.shape[1]))
                der_Wx = np.multiply(der_Wx, der_A)
                der_Wx = np.multiply(der_Wx, self.wh)
                if i == len(self.A)-1:
                    break
                sum_Wx = np.tile(np.array(input_data[i+1]).T, (self.wx.shape[0], 1))
                der_Wx = der_Wx + sum_Wx
        der_A_tiledX = np.tile(der_A_summed, (1, self.wx.shape[1]))
        print("der_A_tiledX shape" + str(der_A_tiledX.shape))
        der_Wx = np.multiply(der_Wx, der_A_tiledX)
        
        """PARAMETER UPDATES"""
        
        self.wh = self.wh - learningRate * der_Wh
        
        self.wx = self.wx - learningRate * der_Wx
        print("wx shape" + str(self.wx.shape))
        #self.bias = self.bias - learningRate * der_Wb
        self.wy = self.wy - learningRate * der_Wy
    
        return der_X
    
    def clear(self):
        self.A = np.random.uniform(-1,1,(0,self.output_size,1))
        self.H = np.random.uniform(-1,1,(0,self.output_size,1))
        self.Y = np.random.uniform(-1,1,(0,self.output_size,1))
    
    '''def descent(self, input_data, gradient, learningRate):
        
        if(self.activation == "tanh"):
            derZ = 1 - np.power(self.A, 2)
        elif(self.activation == "swish"):
            derZ = swish(self.Z) + sigmoid(self.Z) * (1 - swish(self.Z))
        reps = (self.weights.shape[0], 1)
        derWeights = np.tile(input_data.transpose(), reps)        # Size -> a(L-1)*a(L)
        """derWeights is a matrix with derivatives of Z WRT weights, which is transposed inputs, 
        repeated in rows n-times, where n is number of neurons.
        
    Example:
    
        input_data = [
            [2],
            [1],
            [0]
        ]
        
        derWeights = [
            [2, 1, 0],
            [2, 1, 0],
            [2, 1, 0],
            ... n-rows
        ]
        
        """

        
        """ The below part adds the gradient to the derivative of A WRT input_data and passes this new
        gradient through return, to be used as the gradient for next layer's descent.
            dA/dX is made with 2 steps: adding the backprop gradient to the derivative of A WRT Z and then adding
        the derivative of Z WRT input_data (chain rule).
        
        1.
        Since A is a matrix shaped Nx1, where N is the number of outputs, the receiving gradient from the upper layer
        must be the same shape. Therefore we can multiply the gradient and the derivative together element-wise.
        
    Example:
        
         derZ = [3x1]
         gradient = [3x1]
         firstGrad = [3x1] *(elementwise) [3x1] = [3x1]
        
        """
        firstGrad = np.multiply(gradient, derZ)             # ∂C/∂a(L) * ∂(act)/∂Z    a(L)*1 * a(L)*1 = a(L)*1
        
        """
        2.
        What's left is adding the gradient of Z WRT input_data. 
        
        This turns out to be the weights matrix. Now we have to multiply the firstGrad gradient to these weights 
        element-wise but since the firstGrad is Nx1 shape and the weights are NxM, where M are the features, 
        we need to reshape the gradient matrix to match the weights matrix by cloning gradient's columns:
        """
        secondGrad = np.tile(firstGrad, (1, self.weights.shape[1]))          #Size -> a(L)*1 -> a(L)*a(L-1)
        
        """
        Finally we multiply (E-W) secondGrad to the weights matrix:
        """
        derX = np.multiply(self.weights,secondGrad)         #Size -> a(L)*a(L-1) * a(L)*a(L-1) = a(L)*a(L-1)
        
        """But because same inputs are multiplied with many weights, we can sum those weights together. It turns out
        that we can sum columns to do that"""
        
        derX = np.sum(derX, axis=0, keepdims=True)       #Size -> 1*a(L-1) 
        derBias = 1
        
        weightGrad = np.multiply(derWeights, np.tile(firstGrad, (1, self.weights.shape[1])))
        self.weights = self.weights - learningRate * weightGrad
        self.bias = self.bias - learningRate * firstGrad
        
        """We return transposed matrix, because we desire inputs with a shape of Nx1 and right now finalGrad is 
        transposed"""
        return derX.transpose()     # Size -> a(L-1)*1
    
        
"""class NeuralNetwork():
    def __init__(self):
        self.layers=[]
        self.epochs=10
        self.learning_rate = 0.008
    
    def add_layer(self,input_size,output_size,activation=None):
        new_layer = Layer(input_size,output_size,activation)
        self.layers.append(new_layer)
        
    def forward_propagation(self,layer_no):
        current_layer = self.layers[layer_no-1]
        prev_layer = self.layers[layer_no-2]
        act = current_layer.activation
        input_data = prev_layer.A
        self.Z = np.dot(weights,input_data)+self.bias
        result = activation_function(self.Z,act)    # array containing neuron values
        current_layer.A = result            #After forward propogation, fills in the neurons in that layer
        return result
    
    def full_forward_propagation(self, input_data):
        #print("layer 0 forward_propagation")
        
        self.layers[0].forward_propagation(input_data)        # From input data to first layer
        for i in range(1, len(self.layers)):
            #print("layer " + str(i) + " forward_propagation")
            
            self.layers[i].forward_propagation(self.layers[i-1].A)      #From layer i-1 to layer i 
        return self.layers[len(self.layers)-1].A
    
    def back_propagation(self, input_data, target):
        gradient = self.layers[len(self.layers)-1].derMSE(target)     # ∂C/∂a(L)   Size -> a(L)*1
        for i in range(0, len(self.layers)-1):
            index = len(self.layers)-1 - i
            #print("Layer " + str(index) + " backpropagation")
            gradient = self.layers[index].descent(self.layers[index-1].A, gradient, self.learning_rate)   #a(L)*1
        self.layers[0].descent(input_data, gradient, self.learning_rate)
            
            
    def predict(self,test_data):
        self.layers[0].forward_propagation(test_data)
        for i in range(1, len(self.layers)):
            self.layers[i].forward_propagation(self.layers[i-1].A)
        return self.layers[len(self.layers)-1].A"""
        '''
train_data = [
    [[0.0],[0.0]],
    [[0.2],[0.2]],
    [[0.4],[0.4]],
    [[0.6],[0.6]],
    [[0.8],[0.8]]
]

class RNN():
    def __init__(self):
        self.layers = []
        self.learning_rate = 0.008
    
    def add_layer(self,input_size,output_size,activation=None):
        new_layer = Layer(input_size,output_size,activation)
        self.layers.append(new_layer)
        
    def forward_propagation(self,layer_no):
        current_layer = self.layers[layer_no-1]
        prev_layer = self.layers[layer_no-2]
        act = current_layer.activation
        input_data = prev_layer.A
        self.H = np.dot(weights,input_data)+self.bias
        result = activation_function(self.H,act)    # array containing neuron values
        current_layer.A = result            #After forward propogation, fills in the neurons in that layer
        return result
    
    def full_forward_propagation(self, input_data):
        #print("layer 0 forward_propagation")
        #print("Input_data shape " + str(np.array(input_data).shape))
        self.layers[0].forward_propagation(input_data)        # From input data to first layer
        for i in range(1, len(self.layers)):
            #print("layer " + str(i) + " forward_propagation")
            
            #print(str(self.layers[i-1].Y.shape))
            self.layers[i].forward_propagation(self.layers[i-1].Y[-1])      #From layer i-1 to layer i 
        return self.layers[-1].Y[-1]
    
    def back_propagation_through_time(self,input_data,target):
        grad = self.layers[-1].derMSE(target)
        #print("First MSE grad" + str(grad))
        for i in range(1, len(self.layers)-1):
            #print("inside loop, iteration " + str(i))
            grad = self.layers[-i].BPTT(self.layers[-i-1].Y, grad, self.learning_rate)
        self.layers[0].BPTT(input_data, grad, self.learning_rate)
    
    def clear_memory(self):
        for i in range(0, len(self.layers)):
            self.layers[i].clear()
        
network = RNN()
network.add_layer(2, 2, "tanh")
network.add_layer(2, 1, "tanh")
network.add_layer(1, 2, "tanh")

for i in range(0, 16000):
    network.clear_memory()
    for j in range(0,len(train_data)-1):
        if i%399==0 and j%1 == 0:
            print(i,j)
            print("----------------")
            print("Input_data:\n" + str(train_data[j]))
            print("Forward pass:\n" + str(network.full_forward_propagation(train_data[j])))
            print("Target Data:\n" + str(train_data[j+1]))
            network.back_propagation_through_time(train_data, train_data[j+1])
        else:
            network.full_forward_propagation(train_data[j])
            network.back_propagation_through_time(train_data, train_data[j+1])
print("------------------------------------------------")
print("Model Training Completed Successfully !!")


0 0
----------------
Input_data:
[[0.0], [0.0]]
Forward pass:
[[0.0237183 ]
 [0.28809817]]
Target Data:
[[0.2], [0.2]]
der_A_tiledX shape(2, 1)
wx shape(2, 1)
der_A_tiledX shape(2, 2)
wx shape(2, 2)
0 1
----------------
Input_data:
[[0.2], [0.2]]
Forward pass:
[[-0.00537289]
 [ 0.1678916 ]]
Target Data:
[[0.4], [0.4]]
der_A_tiledX shape(2, 1)
wx shape(2, 1)
der_A_tiledX shape(2, 2)
wx shape(2, 2)
0 2
----------------
Input_data:
[[0.4], [0.4]]
Forward pass:
[[0.0322534 ]
 [0.22192716]]
Target Data:
[[0.6], [0.6]]
der_A_tiledX shape(2, 1)
wx shape(2, 2)
der_A_tiledX shape(2, 2)
wx shape(2, 2)
0 3
----------------
Input_data:
[[0.6], [0.6]]


ValueError: shapes (2,2) and (1,1) not aligned: 2 (dim 1) != 1 (dim 0)

In [57]:
test_data = np.empty((0, 3, 1))
while len(test_data) < 20:
    X_loc = random.random()*2-1
    Y_loc = random.random()*2-1
    radius = random.random()
    test_data = np.append(test_data, [[[X_loc], [Y_loc], [radius]]], axis=0)
for x in test_data:
    pass
    #print("Prediction :\n {} \n ----------------\n".format(x), network.predict(x),"\n--------------------------------")