<img src="images\GRU_formulas.jpg" width=800>
<img src="images\GRU_shapes.jpg" width=200>
<img src="images\GRU_derivatives.jpg" width=800>

In [29]:
import random, math
import numpy as np
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"]=16,5


def load_data_txt(filename, dimensions, max_len=None):
    f = open(filename, "r")
    data = []
    print("loading data")
    for line in f:
        for c in line:
            if ord(c)-ord('A') > 0 and ord(c)-ord('A') < dimensions:
                new_data = []
                for i in range(0, dimensions):
                    new_data.append([0])
                    
                new_data[ord(c)-ord('A')] = [1]
                data.append(new_data)
                if max_len != None:
                    if len(data) >= max_len:
                        print("all loaded")    
                        return data
                    
    print("all loaded")    
    return data
            
def sigmoid(x):
    return 1/(1+np.exp(-np.clip(x, -500, 500)))

def swish(x):
    return x*sigmoid(x)

def relu(x):
    return np.maximum(0,x)

def softmax(x):
    p = np.exp(x - np.max(x))
    return p/np.sum(p)

def MSE(target, x):
    return np.power(target - x, 2)

def activation_function(z, act, derivative=False, activated_value=None):
    if derivative == False:
        if act=="sigmoid":
            return sigmoid(z)
        elif act == "swish":
            return swish(z)
        elif act == "relu":
            return relu(z)
        elif act=="tanh":
            return np.tanh(z)
        elif act=="softmax":
            return softmax(z)
    else:
        if act=="sigmoid":
            return activated_value * (1 - activated_value)
        elif act == "swish":
            return activated_value + sigmoid(z) * (1 - activated_value)
        elif act == "relu":  
            x = activated_value
            x[x<=0] = 0
            x[x>0] = 1
            return x
        
        elif act=="tanh":
            return 1 - np.power(activated_value, 2)
        elif act=="softmax":
            return activated_value

class RNN():
    
    def __init__(self, l_rate):
        self.layers = []
        self.learning_rate = l_rate
        
    def add_layer(self, input_size, output_size, last_activation=None):
        new_layer = Layer(input_size, output_size, last_activation)
        self.layers.append(new_layer)
        
    def forward_pass(self, input_data):
        self.layers[0].forward_pass(input_data)
        for i in range(1, len(self.layers)):
            prev_layer_out = self.layers[i-1].h[-1]        # Takes in the output of the previous layer. h[-1] is the hidden state of the last timestep of the layer
            self.layers[i].forward_pass(prev_layer_out)
            
        return self.layers[-1].h[-1]
            
    def clear_memory(self):
        for i in range(0, len(self.layers)):
            self.layers[i].clear_memory()
            
    def backpropagation_through_time(self, input_data, target, max_steps):
        gradient, loss = self.layers[-1].der_MSE(target)
        for i in range(1, len(self.layers)):
            inputs = self.layers[-i-1].h
            gradient = self.layers[-i].descent(gradient, inputs, self.learning_rate, max_steps)
        self.layers[0].descent(gradient, input_data, self.learning_rate, max_steps)
        return loss
    
    def predict(self, input_data):
        next_letter = []
        out_letter = []
        for i in range(0, len(input_data)):
            next_letter = self.forward_pass(input_data[i])
            
        out_letter = next_letter
        print_output = ""
        for i in range(500, 700):
            maximum = 0;
            max_index = 0;
            for k in range(0, 3):
                if next_letter[k][0] > maximum:
                    maximum = next_letter[k][0]
                    max_index = k
            out_letter = np.empty(next_letter.shape)
            out_letter.fill(0)
            out_letter[max_index,0] = 1
            print_output += chr(ord('A')+max_index)
            next_letter = self.forward_pass(out_letter)
        print(print_output)
    
class Layer():
    
    def __init__(self, input_size, output_size, last_activation=None):
        self.input_size = input_size # N
        self.output_size = output_size # M
        self.last_activation = last_activation
        
        self.Wz = self.weights_init(output_size, input_size) # NxM
        self.Wr = self.weights_init(output_size, input_size) # NxM
        self.Wh = self.weights_init(output_size, input_size) # NxM
        
        self.Uz = self.weights_init(output_size, output_size) # NxN
        self.Ur = self.weights_init(output_size, output_size) # NxN
        self.Uh = self.weights_init(output_size, output_size) # NxN
        
        self.bz = self.weights_init(output_size, 1, bias=True) # Nx1
        self.br = self.weights_init(output_size, 1, bias=True) # Nx1
        self.bh = self.weights_init(output_size, 1, bias=True) # Nx1
        
        # T = time steps, starts with 0
        
        self.z       = np.empty((0,output_size,1)) # TxNx1, z(t) = Nx1
        self.r       = np.empty((0,output_size,1)) # TxNx1, r(t) = Nx1
        self.h_tilde = np.empty((0,output_size,1)) # TxNx1, h_tilde(t) = Nx1
        self.h       = np.empty((0,output_size,1)) # TxNx1, h(t) = Nx1
        self.inputs  = np.empty((0,input_size,1))  # TxMx1, inputs(t) = Mx1
        
        self.v_Wy = 0
        self.v_Wh = 0
        self.v_B = 0
        self.v_Wx = 0
        
    def weights_init(self, rows, cols, bias=False):
        variance = math.sqrt(1/self.input_size)
        if bias:
            return np.random.uniform(0,0.01,(rows, cols))
        return np.random.uniform(-variance,variance,(rows, cols))
    
    
    def forward_pass(self, input_data):
        
        self.inputs = np.append(self.inputs, [input_data])
        prev_h = []
        if len(self.h) > 0:
            prev_h = self.h[-1]
        else:
            prev_h = np.empty((self.h.shape[1], self.h.shape[2]))
            prev_h.fill(0)
        
        new_z = self.Wz @ input_data + self.Uz @ prev_h + self.bz
        new_z = activation_function(new_z, "sigmoid")
        self.z = np.append(self.z, [new_z], axis=0)
        
        new_r = self.Wr @ input_data + self.Ur @ prev_h + self.br
        new_r = activation_function(new_r, "sigmoid")
        self.r = np.append(self.r, [new_r], axis=0)
        
        new_h_tilde = self.Wh @ input_data + self.r[-1] * (self.Uh @ prev_h) + self.bh
        new_h_tilde = activation_function(new_h_tilde, "tanh")
        self.h_tilde = np.append(self.h_tilde, [new_h_tilde], axis=0)
        
        new_h = self.z[-1] * self.h_tilde[-1] + (1 - self.z[-1]) * prev_h
        self.h = np.append(self.h, [new_h], axis=0)
        
        
        if len(self.z) > 30:
            self.z = np.delete(self.z, 0, 0)
        if len(self.r) > 30:
            self.r = np.delete(self.r, 0, 0)
        if len(self.h_tilde) > 30:
            self.h_tilde = np.delete(self.h_tilde, 0, 0)
        if len(self.h) > 30:
            self.h = np.delete(self.h, 0, 0)
            
            
        
    def clear_memory(self):
        self.z       = np.empty((0,self.output_size,1))
        self.r       = np.empty((0,self.output_size,1))
        self.h_tilde = np.empty((0,self.output_size,1))
        self.h       = np.empty((0,self.output_size,1))
        
        self.v_Wy = 0
        self.v_Wh = 0
        self.v_B = 0
        self.v_Wx = 0
        
        
    def der_MSE(self, target):
        if self.last_activation != None:
            return 2*(self.h[-1] - target) * activation_function(self.h[-1], self.last_activation, derivative=True, activated_value=self.h[-1]), np.power(target - self.h[-1], 2)
        return 2*(self.h[-1] - target), np.power(target - self.h[-1], 2)
     
    
    def descent(self, gradient, input_data, learning_rate, max_steps):
        input_matrix = np.array(input_data)
        first_index = max(len(self.h) - max_steps, 0)
        
        step = 0
        inputs_reshaped = np.sum(self.inputs[first_index], axis=0, keepdims=True)
        inputs_reshaped = np.tile(inputs_reshaped, (self.output_size, 1))
        
        
        dr_Wr = activation_function(self.r[first_index], "sigmoid", derivative=True, activated_value=self.r[first_index])
        dr_Wr *= (inputs_reshaped)
        #dr_Wr = np.tile(dr_Wr, (1, self.input_size))
        
        dz_Wr = activation_function(self.z[first_index], "sigmoid", derivative=True, activated_value=self.z[first_index])
        dz_Wr *= (inputs_reshaped)
        
        
        dhtilde_Wr = activation_function(self.h_tilde[first_index], "tanh", derivative=True, activated_value=self.h_tilde[first_index])
        dhtilde_Wr *= (inputs_reshaped)
        
        dh_Wr = dz_Wr * (-self.h_tilde[first_index]) + dhtilde_Wr * (1 - self.z[first_index])
        
        
        
        dr_Wz = dr_Wr
        #dr_Wr = np.tile(dr_Wr, (1, self.input_size))
        dz_Wz = dz_Wr
        
        dhtilde_Wz = dhtilde_Wr
        
        dh_Wz = dz_Wz * (-self.h_tilde[first_index]) + dhtilde_Wz * (1 - self.z[first_index])
        
        
        
        dz_Uz = activation_function(self.z[first_index], "sigmoid", derivative=True, activated_value=self.z[first_index])
        dr_Uz = activation_function(self.r[first_index], "sigmoid", derivative=True, activated_value=self.r[first_index])
        dhtilde_Uz = activation_function(self.h_tilde[first_index], "tanh", derivative=True, activated_value=self.h_tilde[first_index])
        dh_Uz = dz_Uz * (-self.h_tilde[first_index]) + dhtilde_Uz * (1 - self.z[first_index])
        
        dr_x = activation_function(self.r[first_index], "sigmoid", derivative=True, activated_value=self.r[first_index])
        dr_x = np.tile(dr_x, (1, self.input_size))
        dr_x = np.sum(dr_x, axis=0, keepdims=True).T
        Wr_reshaped = np.sum(self.Wr, axis=0, keepdims=True).T
        dr_x *= (Wr_reshaped)
        
        dz_x = activation_function(self.z[first_index], "sigmoid", derivative=True, activated_value=self.z[first_index])
        dz_x = np.tile(dz_x, (1, self.input_size))
        dz_x = np.sum(dz_x, axis=0, keepdims=True).T
        Wz_reshaped = np.sum(self.Wz, axis=0, keepdims=True).T
        dz_x *= (Wz_reshaped)
        
        dhtilde_x = activation_function(self.h_tilde[-1], "tanh", derivative=True, activated_value=self.h_tilde[-1])
        dhtilde_x = np.tile(dhtilde_x, (1, self.input_size)) # from Nx1 to NxM
        dhtilde_x = np.sum(dhtilde_x, axis=0, keepdims=True).T # from NxM to Mx1
        Wh_reshaped = np.sum(self.Wh, axis=0, keepdims=True).T # from NxM to Mx1
        prev_h = np.empty((self.output_size, 1)) if len(self.h) < 2 else self.h[-2]
        Uhprev_h = np.sum(np.tile(self.Uh @ prev_h, (1, self.input_size)), axis=0, keepdims=True).T
        dhtilde_x *= (Wh_reshaped + dr_x * Uhprev_h)
        
        h_subtract = np.tile(prev_h - self.h_tilde[-1], (1, self.input_size))
        h_subtract = np.sum(h_subtract, axis=0, keepdims=True).T
        one_minus = np.tile((1 - self.z[-1]), (1, self.input_size))
        one_minus = np.sum(one_minus, axis=0, keepdims=True).T
        dh_x = dz_x * h_subtract + dhtilde_x * one_minus
        
        step = 1
        
        while step + first_index < len(self.h): 
            
            inputs_reshaped = np.sum(self.inputs[first_index+step], axis=0, keepdims=True)
            inputs_reshaped = np.tile(inputs_reshaped, (self.output_size, 1)) # X goes from Mx1 to Nx1
            #print("first_index: " + str(first_index))
        
            """Wr gradient steps"""
            dr_Wr = activation_function(self.r[first_index+step], "sigmoid", derivative=True, activated_value=self.r[first_index+step])
            
            dr_Wr *= (inputs_reshaped + self.Ur @ dh_Wr)
            
            dz_Wr = activation_function(self.z[first_index+step], "sigmoid", derivative=True, activated_value=self.z[first_index+step])

            
            dz_Wr *= (inputs_reshaped + self.Uz @ dh_Wr)
            
            dhtilde_Wr = activation_function(self.h_tilde[first_index+step], "tanh", derivative=True, activated_value=self.h_tilde[first_index+step])
            
            dhtilde_Wr *= (inputs_reshaped + dr_Wr * (self.Uh @ self.h[first_index+step-1]) + dh_Wr * (self.Uh @ self.r[first_index+step]))
            
            dh_Wr = dz_Wr * (self.h[first_index+step-1] - self.h_tilde[first_index+step]) \
            + dh_Wr * self.z[first_index+step] + dhtilde_Wr * (1 - self.z[first_index+step])
            
            
            """Wz gradient steps"""
            
            dr_Wz = activation_function(self.r[first_index+step], "sigmoid", derivative=True, activated_value=self.r[first_index+step])
            dr_Wz *= (inputs_reshaped + self.Ur @ dh_Wz)
            
            dz_Wz = activation_function(self.z[first_index+step], "sigmoid", derivative=True, activated_value=self.z[first_index+step])
            dz_Wz *= (inputs_reshaped + self.Uz @ dh_Wz)
            dhtilde_Wz = activation_function(self.h_tilde[first_index+step], "tanh", derivative=True, activated_value=self.h_tilde[first_index+step])
            dhtilde_Wz *= (inputs_reshaped + dr_Wz * (self.Uh @ self.h[first_index+step-1]) + dh_Wz * (self.Uh @ self.r[first_index+step]))
            
            dh_Wz = dz_Wz * (self.h[first_index+step-1] - self.h_tilde[first_index+step]) \
            + dh_Wz * self.z[first_index+step] + dhtilde_Wz * (1 - self.z[first_index+step])
            
            """Uz gradient steps"""
            
            dz_Uz = activation_function(self.z[first_index+step], "sigmoid", derivative=True, activated_value=self.z[first_index+step])
            dz_Uz *= (self.h[first_index-1+step] + self.Uz @ dh_Uz)
            
            dr_Uz = activation_function(self.r[first_index+step], "sigmoid", derivative=True, activated_value=self.r[first_index+step])
            dr_Uz *= (self.h[first_index-1+step] + self.Ur @ dh_Uz)
            
            dhtilde_Uz = activation_function(self.h_tilde[first_index+step], "tanh", derivative=True, activated_value=self.h_tilde[first_index+step])
            dhtilde_Uz *= ((self.Uh @ self.h[first_index-1+step]) * dr_Uz + self.r[first_index+step] * self.h[first_index-1+step] + dh_Uz * (self.Uh @ self.r[first_index+step]))
            
            dh_Uz = dz_Uz * (self.h[first_index-1+step]-self.h_tilde[first_index+step]) + dh_Uz * self.z[first_index+step] + dhtilde_Uz * (1 - self.z[first_index+step])
            
            
            step+=1
                
        
        print(str(dh_Uz.shape))
        dh_Wr *= gradient
        dh_Wz *= gradient
        dh_Uz *= gradient
        
        dh_Wr = np.tile(dh_Wr, (1, self.input_size))
        dh_Wz = np.tile(dh_Wz, (1, self.input_size))
        dh_Uz = np.tile(dh_Uz, (1, self.input_size))
        
        self.Wr -= learning_rate * dh_Wr
        self.Wz -= learning_rate * dh_Wz
        self.Uz -= learning_rate * dh_Uz
        
        return dh_x
    

In [30]:
network = RNN(0.005)
network.add_layer(3, 1)
network.add_layer(1, 3,"softmax")

In [31]:
#input_data = load_data_txt("FinalText.txt", 5, 150)


input_data = [
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
    [[0],[1],[0]],
    [[1],[0],[0]],
    [[0],[0],[1]],
    [[0],[1],[0]],
]


In [32]:
epochs = 14000
batch = 2
early_stop = 500
loss = 0
prevLoss = 999
losses = []

for e in range(0, epochs+1):
    network.clear_memory()
    loss = 0
    outputs = []
    print("---------------")
    print("epoch " + str(e))
    network.learning_rate /= 1.00025
    for i in range(0, len(input_data)-1):
        output = network.forward_pass(input_data[i])
        outputs.append(output)
        loss += network.backpropagation_through_time(input_data, input_data[i+1], 8)
  
        if e%10 == 0 and i%131 == 0:
            print("---------------")
            print("epoch " + str(e))
            print("learning rate: " + str(network.learning_rate))
            print("sample " + str(i))
            print("output: \n" + str(output))
            print("target: \n" + str(input_data[i+1]))
    
    if e%10 == 0:        
        print("-----------------average loss: " + str(loss/len(input_data))) 
        plt.plot(np.squeeze(outputs))
        plt.ylabel('outputs')
        plt.xlabel('iterations')
        plt.show()
        
        plt.plot(np.squeeze(losses))
        plt.ylabel('loss')
        plt.xlabel('iterations')
        plt.grid()
        plt.show()
        network.predict(input_data)
        
        
    prevLoss = loss
    if e > 10:
        losses.append(loss/len(input_data))
    if e > 90:
        del losses[0]
    
    
print("------------training finished successfully!------------")
        

---------------
epoch 0


ValueError: non-broadcastable output operand with shape (1,1) doesn't match the broadcast shape (1,3)

In [None]:
print(str(network.layers[-1].weights_H))

In [None]:
print(str(network.layers[-2].weights_X))

In [None]:
print(str(network.layers[-2].H))