![image](images/YAH.png)

In [173]:
import random, math
import numpy as np

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x), element-wise.

    The argument is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` cannot overflow for very large magnitudes.
    """
    bounded = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-bounded))

def swish(x):
    """Return the swish activation ``x * sigmoid(x)``, element-wise."""
    gate = sigmoid(x)
    return x * gate

def relu(x):
    """Return the rectified linear unit ``max(0, x)``, element-wise."""
    floor = 0
    return np.maximum(floor, x)

def softmax(x):
    """Return the softmax of ``x`` normalised over ALL of its elements.

    The global maximum is subtracted before exponentiation so that large
    inputs do not overflow; this shift does not change the result.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials)

def MSE(target, x):
    """Return the element-wise squared error ``(target - x) ** 2``.

    Despite the name, no mean is taken: the result has the broadcast shape
    of ``target - x``.
    """
    residual = target - x
    return np.power(residual, 2)

def activation_function(z, act, derivative=False, activated_value=None):
    """Apply the activation named ``act`` to ``z``, or return its derivative.

    Parameters:
        z: pre-activation value(s); scalar or array.
        act: one of "sigmoid", "swish", "relu", "tanh", "softmax", or
            ``None`` for the identity (no activation).
        derivative: when true, return the element-wise derivative of the
            activation with respect to ``z`` instead of the activation.
        activated_value: optional cached result of the forward activation
            for this ``z``. It is only used when ``derivative`` is true;
            when omitted, the value is recomputed from ``z``.

    Returns:
        An array (or scalar) with the same shape as ``z``.

    Raises:
        ValueError: if ``act`` is not a supported name. Previously an
            unknown name silently returned ``None``.
    """
    if act is None:
        # No activation: identity forward, constant slope of one backward.
        return np.ones_like(z, dtype=float) if derivative else z

    if not derivative:
        if act == "sigmoid":
            return sigmoid(z)
        if act == "swish":
            return swish(z)
        if act == "relu":
            return relu(z)
        if act == "tanh":
            return np.tanh(z)
        if act == "softmax":
            return softmax(z)
        raise ValueError("unknown activation: " + repr(act))

    if act == "sigmoid":
        # s'(z) = s(z) * (1 - s(z)); the old code returned s(z) itself.
        s = sigmoid(z) if activated_value is None else activated_value
        return np.multiply(s, 1 - s)
    if act == "swish":
        # f'(z) = f(z) + sigmoid(z) * (1 - f(z))
        f = swish(z) if activated_value is None else activated_value
        return f + np.multiply(sigmoid(z), 1 - f)
    if act == "relu":
        # Slope is 1 where z > 0 and 0 elsewhere (0 is used at z == 0);
        # the old code returned relu(z) itself.
        return np.where(np.asarray(z) > 0, 1.0, 0.0)
    if act == "tanh":
        t = np.tanh(z) if activated_value is None else activated_value
        return 1 - np.power(t, 2)
    if act == "softmax":
        # NOTE(review): the true softmax derivative is a full Jacobian
        # (diag(s) - s s^T). Callers in this file consume derivatives
        # element-wise, so only the Jacobian diagonal s * (1 - s) is
        # returned; off-diagonal terms are ignored. The old code returned
        # softmax(z) itself.
        s = softmax(z) if activated_value is None else activated_value
        return np.multiply(s, 1 - s)
    raise ValueError("unknown activation: " + repr(act))
    
    

class RNN():
    """A stack of recurrent ``Layer`` objects trained one time step at a time.

    Each layer keeps its own history of hidden states; ``forward_pass``
    appends one step to every layer and ``clear_memory`` resets them.
    """

    def __init__(self, l_rate):
        """Create an empty network that trains with learning rate ``l_rate``."""
        self.layers = []
        self.learning_rate = l_rate

    def add_layer(self, input_size, output_size, activation=None, last_activation=None):
        """Append a new ``Layer`` to the end of the stack.

        ``last_activation`` is now forwarded to the layer; it used to be
        accepted here and then silently dropped.
        """
        new_layer = Layer(input_size, output_size, activation, last_activation)
        self.layers.append(new_layer)

    def forward_pass(self, input_data):
        """Feed one time step through every layer and return the final output.

        Each layer receives the most recent output (``Y[-1]``) of the layer
        before it. Raises ``IndexError`` when the network has no layers.
        """
        if not self.layers:
            raise IndexError("forward_pass() called on a network with no layers")

        signal = input_data
        for layer in self.layers:
            layer.forward_pass(signal)
            signal = layer.Y[-1]
        return signal

    def clear_memory(self):
        """Reset the recorded time-step history of every layer."""
        for layer in self.layers:
            layer.clear_memory()

    def backpropagation_through_time(self, input_data, target):
        """Update every layer's weights for the latest step and return the loss.

        The loss gradient is taken from the last layer's ``der_MSE`` and is
        passed backwards layer by layer. Every layer but the first is given
        the full output history (``Y``) of the layer before it as its
        inputs; the first layer is given ``input_data``.

        Returns:
            The squared error reported by the last layer's ``der_MSE``.
        """
        gradient, loss = self.layers[-1].der_MSE(target)
        # Walk from the last layer down to index 1; layer 0 is handled
        # separately because its inputs are the raw ``input_data``.
        for idx in range(len(self.layers) - 1, 0, -1):
            upstream_outputs = self.layers[idx - 1].Y
            gradient = self.layers[idx].descent(gradient, upstream_outputs, self.learning_rate)
        self.layers[0].descent(gradient, input_data, self.learning_rate)
        return loss
            
class Layer():
    """A single recurrent layer that records every time step it processes.

    For each step ``forward_pass`` computes
        H_t = weights_X . x_t + bias   (+ weights_H . A_{t-1} after step 0)
        A_t = activation(H_t)
        Y_t = weights_Y . A_t          (then ``last_activation`` if set)
    and appends H_t, A_t and Y_t to ``self.H``, ``self.A`` and ``self.Y``,
    arrays of shape (steps, output_size, 1).
    """

    def __init__(self, input_size, output_size, activation=None, last_activation=None):
        """Create the layer with uniformly random weights and empty history.

        Parameters:
            input_size: number of rows expected in each input column vector.
            output_size: number of hidden units and of outputs.
            activation: activation name passed to ``activation_function``
                for the hidden state.
            last_activation: optional activation name applied to the output.
        """
        self.input_size = input_size
        self.output_size = output_size
        self.activation = activation
        self.last_activation = last_activation

        # Draw order (X, H, Y, bias) is kept so seeded runs stay reproducible.
        self.weights_X = self.weights_init(output_size, input_size)
        self.weights_H = self.weights_init(output_size, output_size)
        self.weights_Y = self.weights_init(output_size, output_size)
        self.bias = self.weights_init(output_size, 1)

        # Start with empty (0, output_size, 1) histories for A, Y and H.
        self.clear_memory()

    def weights_init(self, rows, cols):
        """Return a (rows, cols) array drawn uniformly from [-1, 1)."""
        return np.random.uniform(-1, 1, (rows, cols))

    def forward_pass(self, input_data):
        """Process one time step and append H, A and Y to the history."""
        new_H = self.weights_X.dot(input_data) + self.bias
        if len(self.A) > 0:
            # Recurrent contribution from the previous hidden activation.
            new_H += self.weights_H.dot(self.A[-1])
        self.H = np.append(self.H, np.array([new_H]), axis=0)

        new_A = activation_function(self.H[-1], self.activation)
        self.A = np.append(self.A, np.array([new_A]), axis=0)

        new_Y = self.weights_Y.dot(self.A[-1])
        if self.last_activation is not None:
            new_Y = activation_function(new_Y, self.last_activation)
        self.Y = np.append(self.Y, np.array([new_Y]), axis=0)

    def clear_memory(self):
        """Discard all recorded time steps."""
        empty_history = (0, self.output_size, 1)
        self.A = np.empty(empty_history)
        self.Y = np.empty(empty_history)
        self.H = np.empty(empty_history)

    def der_MSE(self, target):
        """Return ``(gradient, loss)`` of the squared error for the last output.

        ``gradient`` is ``2 * (Y[-1] - target)`` and ``loss`` is the
        element-wise ``(target - Y[-1]) ** 2`` (no mean is taken).
        """
        latest_output = self.Y[-1]
        return 2 * (latest_output - target), np.power(target - latest_output, 2)

    def _input_sums(self, step_input):
        """Return an (output_size, 1) column whose entries all equal sum(step_input)."""
        tiled = np.tile(step_input.T, (self.weights_X.shape[0], 1))
        return np.sum(tiled, axis=1, keepdims=True)

    def _unroll(self, seed, step_derivs, recurrent_sums, carry):
        """Accumulate a per-unit factor across all recorded time steps.

        Starting from ``seed``, each step multiplies by that step's
        activation derivative; every step except the last then multiplies
        by ``recurrent_sums`` and adds ``carry(step)``.
        """
        acc = seed
        final_step = len(step_derivs) - 1
        for step, deriv in enumerate(step_derivs):
            acc = np.multiply(acc, deriv)
            if step == final_step:
                break
            acc = np.multiply(acc, recurrent_sums)
            acc += carry(step)
        return acc

    def descent(self, gradient, input_data, learning_rate):
        """Apply one gradient step to all weights; return the input gradient.

        Parameters:
            gradient: loss gradient with respect to this layer's latest
                output, as a column vector.
            input_data: the sequence of inputs this layer received, indexed
                by time step (at least as many entries as recorded steps).
            learning_rate: step size for the in-place weight updates.

        Returns:
            Column vector of the loss gradient with respect to the latest
            input, for the preceding layer.

        Raises:
            IndexError: if no forward step has been recorded.

        NOTE(review): the weights_H, weights_X and bias gradients are
        heuristic. The recurrence uses the column sums of ``weights_H``
        rather than a matrix product, and the weights_H / weights_X
        gradients are one column tiled across the matrix, so every column
        gets the same update. ``dC_A`` likewise uses column sums of
        ``weights_Y``. The derivative of ``last_activation`` is not applied.
        These are left as they were - confirm whether they are intended.
        """
        input_matrix = np.array(input_data)
        steps = len(self.A)
        if steps == 0:
            raise IndexError("descent() requires at least one recorded forward_pass() step")

        last_A = self.A[-1]
        # Activation derivative at every recorded step, computed once and
        # shared by the three unrolled accumulations below.
        step_derivs = [
            activation_function(self.H[t], self.activation, derivative=True, activated_value=self.A[t])
            for t in range(steps)
        ]
        # Weights only change at the end of this method, so this is loop-invariant.
        wh_col_sums = np.sum(self.weights_H, axis=0, keepdims=True).T

        # Output weights: outer product of the gradient with the last activation.
        dY_Wy = np.tile(last_A.T, (self.weights_Y.shape[0], 1))
        dC_Wy = np.multiply(np.tile(gradient, (1, self.weights_Y.shape[1])), dY_Wy)

        dY_A = np.sum(self.weights_Y, axis=0, keepdims=True).T
        dC_A = np.multiply(gradient, dY_A)

        # Chain rule: dC/dH = dC/dA * dA/dH. This used to multiply by the raw
        # ``gradient`` (dC/dY), which skipped the weights_Y factor even
        # though dC_A had just been computed.
        dC_H = np.multiply(dC_A, step_derivs[-1])
        dC_H_tiled = np.tile(dC_H, (1, self.weights_X.shape[1]))
        dC_X = np.sum(np.multiply(self.weights_X, dC_H_tiled), axis=0, keepdims=True).T

        ones = np.ones(last_A.shape)

        dA_Wh = self._unroll(ones, step_derivs, wh_col_sums, lambda t: self.A[t])
        dC_Wh = np.tile(np.multiply(dA_Wh, dC_A), (1, self.weights_H.shape[1]))

        dA_Wx = self._unroll(
            self._input_sums(input_matrix[0]),
            step_derivs,
            wh_col_sums,
            lambda t: self._input_sums(input_matrix[t + 1]),
        )
        dC_Wx = np.tile(np.multiply(dA_Wx, dC_A), (1, self.weights_X.shape[1]))

        dA_B = self._unroll(ones, step_derivs, wh_col_sums, lambda t: 1)
        dC_B = np.multiply(dA_B, dC_A)

        # Weight updates, applied only after every gradient has been computed.
        self.weights_Y -= learning_rate * dC_Wy
        self.weights_H -= learning_rate * dC_Wh
        self.bias -= learning_rate * dC_B
        self.weights_X -= learning_rate * dC_Wx

        return dC_X
    

In [174]:
# Build a four-layer network: 1 -> 5 -> 5 -> 5 -> 1, tanh on the first three
# layers and swish on the last, trained with a learning rate of 0.003.
network = RNN(0.003)
network.add_layer(input_size=1, output_size=5, activation="tanh")
network.add_layer(input_size=5, output_size=5, activation="tanh")
network.add_layer(input_size=5, output_size=5, activation="tanh")
network.add_layer(input_size=5, output_size=1, activation="swish")

In [175]:
# Training sequence: the pattern 0.0, 0.4, 0.8, 0.4 repeated four times,
# each value wrapped as a 1x1 nested list (one column vector per time step).
input_data = [[[value]] for value in (0.0, 0.4, 0.8, 0.4) * 4]

In [None]:
# Train the network to predict the next element of ``input_data`` from the
# current one, stopping early when the summed loss stops improving.
epochs = 9000
batch = 2  # NOTE(review): not used anywhere in this cell
early_stop = 500
loss = 0
prevLoss = 999
# Every element except the last is an input whose target is its successor.
num_samples = len(input_data) - 1
for e in range(epochs + 1):
    network.clear_memory()
    loss = 0
    report = e % 500 == 0  # print progress every 500th epoch
    for i in range(num_samples):
        target = input_data[i + 1]
        output = network.forward_pass(input_data[i])
        loss += network.backpropagation_through_time(input_data, target)
        if report:
            print("---------------")
            print("epoch " + str(e))
            print("sample " + str(i))
            print("output: \n" + str(output))
            print("target: \n" + str(target))
    if prevLoss <= loss:
        early_stop -= 1
    else:
        # NOTE(review): patience starts at 500 but is reset to 100 after any
        # improvement - confirm this asymmetry is intended.
        early_stop = 100
    if report:
        # Average over the samples actually summed (was len(input_data),
        # one more than the number of losses accumulated).
        print("average loss: \n" + str(loss / max(num_samples, 1)))

    prevLoss = loss
    if early_stop <= 0:
        break
print("------------training finished successfully!------------")
        

---------------
epoch 0
sample 0
output: 
[[0.02107509]]
target: 
[[0.4]]
---------------
epoch 0
sample 1
output: 
[[0.40143647]]
target: 
[[0.8]]
---------------
epoch 0
sample 2
output: 
[[0.22151854]]
target: 
[[0.4]]
---------------
epoch 0
sample 3
output: 
[[0.39152724]]
target: 
[[0.0]]
---------------
epoch 0
sample 4
output: 
[[0.19492891]]
target: 
[[0.4]]
---------------
epoch 0
sample 5
output: 
[[0.24803932]]
target: 
[[0.8]]
---------------
epoch 0
sample 6
output: 
[[0.05496231]]
target: 
[[0.4]]
---------------
epoch 0
sample 7
output: 
[[0.16281699]]
target: 
[[0.0]]
---------------
epoch 0
sample 8
output: 
[[0.00956509]]
target: 
[[0.4]]
---------------
epoch 0
sample 9
output: 
[[0.18329167]]
target: 
[[0.8]]
---------------
epoch 0
sample 10
output: 
[[0.05921047]]
target: 
[[0.4]]
---------------
epoch 0
sample 11
output: 
[[0.32104968]]
target: 
[[0.0]]
---------------
epoch 0
sample 12
output: 
[[0.16029798]]
target: 
[[0.4]]
---------------
epoch 0
sample 13
o