In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

In [2]:
class RNN:
    """Fully connected feed-forward classifier with a softmax output layer.

    Despite its name the network has no recurrent connections: the forward
    pass is a plain stack of dense layers.  Hidden layers are declared with
    add_layer(); fit() prepends the input layer, appends a softmax output
    layer with one unit per class, initialises the weights and trains them
    with mini-batch gradient steps.

    Attributes:
        W: list of weight matrices, one per layer transition (set by fit).
        b: list of bias vectors, one per layer transition (set by fit).
        M: list of hidden layer sizes, in the order they were added.
        A: activation name of each hidden layer.
        D: dropout probability of each hidden layer.
        Yhat: output of the most recent forward pass (the last mini-batch
            after fit(), the full prediction after predict()).
        costs: loss value of every mini-batch seen during the last fit().
        lossType: loss name used by the last fit().
        optimizer: optimizer name used by the last fit().
    """

    # Class-level placeholders; reset() gives every instance its own copies.
    # Weights
    W = []
    # Hidden layer sizes
    M = []
    # Bias
    b = []
    # Dropout percentage
    D = []
    # Activation function for each layer
    A = []
    Yhat = []
    costs = []
    lossType = 'ce'

    # Numerical constants shared by the optimizers and the loss functions.
    _EPS = 10e-8
    _RMS_DECAY = 0.99
    _ADAM_DECAY1 = 0.99
    _ADAM_DECAY2 = 0.999
    _LOG_FLOOR = 1e-12

    def __init__(self):
        self.reset()

    def __sigmoid(self, Z):
        """Return the element-wise logistic sigmoid of Z."""
        return 1 / (1 + np.exp(-Z))

    def __tanh(self, Z):
        """Return the element-wise hyperbolic tangent of Z."""
        return np.tanh(Z)

    def __relu(self, Z):
        """Return Z with every negative entry replaced by zero."""
        return Z * (Z > 0)

    def __actf(self, Z, ty):
        """Apply the activation named ty to Z; unknown names fall back to ReLU."""
        if ty == 'sigmoid':
            return self.__sigmoid(Z)
        elif ty == 'tanh':
            return self.__tanh(Z)
        elif ty == 'softmax':
            return self.__softmax(Z)
        elif ty == 'none':
            return Z
        else:  # ty == 'relu'
            return self.__relu(Z)

    def __actf_dv(self, Z, ty):
        """Return the derivative of activation ty, given its *output* Z.

        Unknown names fall back to the ReLU derivative, mirroring __actf.
        """
        if ty == 'sigmoid':
            return Z * (1 - Z)
        elif ty == 'tanh':
            return 1 - Z * Z
        elif ty == 'none':
            # The identity has slope one everywhere (it used to be treated
            # as ReLU, which blocked the gradient for negative outputs).
            return np.ones_like(Z)
        else:  # ty == 'relu'
            return np.where(Z > 0, 1, 0)

    def __softmax(self, A):
        """Return the row-wise softmax of the 2-D array A."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large logits.
        expA = np.exp(A - A.max(axis=1, keepdims=True))
        return expA / expA.sum(axis=1, keepdims=True)

    def __cost(self, T, Y, ty):
        """Dispatch to the loss named ty ('ce', anything else means sparse)."""
        if ty == 'ce':
            return self.__cross_entropy(T, Y)
        else:
            return self.__sparse_cat(T, Y)

    def __cross_entropy(self, T, Y):
        """Summed cross-entropy between indicator targets T and outputs Y."""
        # The floor avoids 0 * log(0) = nan when an output underflows to 0.
        return (-T * np.log(np.clip(Y, self._LOG_FLOOR, None))).sum()

    def __sparse_cat(self, T, Y):
        """Summed cross-entropy where T holds integer class indices."""
        picked = Y[np.arange(len(Y)), np.asarray(T)]
        return -np.log(np.clip(picked, self._LOG_FLOOR, None)).sum()

    def classification_rate(self, T):
        """Print and return the share of rows of Yhat whose argmax equals T.

        Yhat is whatever the last forward pass produced, so call predict()
        on the matching inputs first.

        Raises:
            ValueError: if no forward pass has been stored yet.
        """
        if len(self.Yhat) == 0:
            raise ValueError('no predictions stored; call predict() first')
        Yp = np.argmax(self.Yhat, axis=1)
        rate = np.mean(T == Yp)
        print('Classification rate: ', rate)
        return rate

    def add_layer(self, L, a='sigmoid', d=0):
        """Append a hidden layer.

        Args:
            L: number of units.
            a: activation name ('sigmoid', 'tanh', 'relu', 'none').
            d: dropout probability, clamped to the range [0, 1].
        """
        self.M.append(L)
        self.A.append(a)
        self.D.append(max(min(d, 1), 0))

    def __shuffle(self, X, Y, y):
        """Return X, Y and y reordered by one shared random permutation."""
        p = np.random.permutation(len(X))
        return X[p], Y[p], y[p]

    def __forward(self, M, W, b, A, D, training=True):
        """Run a forward pass, filling M[1:] in place.

        Args:
            M: list whose first entry is the input batch; the remaining
                entries are overwritten with the layer outputs.
            W, b: weights and biases per layer transition.
            A, D: activation name and dropout probability per transition.
            training: dropout is only applied when this is true.

        Returns:
            (Y, M, cache) where Y is the output of the last layer and
            cache[i] is (output before dropout, dropout mask or None) for
            the layer stored in M[i + 1].
        """
        cache = []
        for i in range(1, len(W) + 1):
            raw = self.__actf(M[i - 1].dot(W[i - 1]) + b[i - 1], A[i - 1])
            mask = None
            out = raw
            rate = D[i - 1]
            if training and rate > 0:
                # Inverted dropout: drop individual units and rescale the
                # survivors so no correction is needed at prediction time.
                keep = 1.0 - rate
                kept = np.random.random(raw.shape) >= rate
                mask = kept / keep if keep > 0 else kept.astype(float)
                out = raw * mask
            M[i] = out
            cache.append((raw, mask))
        return M[-1], M, cache

    def __direction(self, grad, key, state, optimizer, t):
        """Turn a raw gradient into the step direction of the optimizer.

        Args:
            grad: gradient of one parameter array.
            key: hashable id of that parameter inside state.
            state: dict holding the running statistics of the optimizer.
            optimizer: 'none', 'rms' or 'adam'.
            t: 1-based count of updates performed so far.
        """
        if optimizer == 'rms':
            if key not in state:
                state[key] = np.ones_like(grad)
            decay = self._RMS_DECAY
            state[key] = decay * state[key] + (1 - decay) * np.square(grad)
            return grad / (np.sqrt(state[key]) + self._EPS)
        if optimizer == 'adam':
            if key not in state:
                # Zero initial moments are what the bias correction below
                # compensates for.
                state[key] = (np.zeros_like(grad), np.zeros_like(grad))
            d1, d2 = self._ADAM_DECAY1, self._ADAM_DECAY2
            m, v = state[key]
            m = d1 * m + (1 - d1) * grad
            v = d2 * v + (1 - d2) * np.square(grad)
            state[key] = (m, v)
            # Bias correction depends on the number of steps taken so far,
            # not on the total number of epochs requested.
            m_hat = m / (1 - d1 ** t)
            v_hat = v / (1 - d2 ** t)
            return m_hat / (np.sqrt(v_hat) + self._EPS)
        return grad

    def fit(self, X, y, epochs=20000, batchSize=0, learnR=10e-6, reg=0, lossType='ce', optimizer='none'):
        """Initialise the weights and train the network on (X, y).

        Every call starts from freshly drawn weights; the layers declared
        with add_layer() are left untouched, so fit() can be called again.

        Args:
            X: 2-D array-like of samples (rows) by features (columns).
            y: 1-D array-like of integer class labels; the labels are used
                as column indices, so they are expected to be 0..K-1 where
                K is the number of distinct labels.
            epochs: number of passes over the data.
            batchSize: mini-batch size, clamped to the range [1, N]; the
                default of 0 therefore means batches of one sample.  Samples
                that do not fill a whole batch are skipped in each epoch.
            learnR: learning rate.
            reg: L2 regularisation strength.
            lossType: 'ce' records the indicator-matrix cross-entropy, any
                other value the sparse categorical form.
            optimizer: 'none' (plain gradient steps), 'rms' or 'adam'.

        Raises:
            ValueError: on an unknown optimizer, an empty X, or X and y of
                different lengths.
        """
        if optimizer not in ('none', 'rms', 'adam'):
            raise ValueError("optimizer must be 'none', 'rms' or 'adam'")

        X = np.asarray(X)
        y = np.asarray(y)
        N = X.shape[0]
        if N == 0:
            raise ValueError('X must contain at least one sample')
        if len(y) != N:
            raise ValueError('X and y must contain the same number of samples')

        K = len(np.unique(y))
        # Work on copies so the declared hidden layers survive training.
        acts = list(self.A) + ['softmax']
        drops = list(self.D) + [0]
        sizes = [X.shape[1]] + list(self.M) + [K]

        # Regulate batch size
        batchSize = max(min(batchSize, N), 1)
        iterations = N // batchSize

        # Set indicator matrix
        Y = np.zeros((N, K))
        Y[np.arange(N), y] = 1

        # Set weights
        W = []
        b = []
        for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
            W.append(np.random.randn(fan_in, fan_out) / np.sqrt(fan_in + fan_out))
            b.append(np.random.randn(fan_out) / np.sqrt(fan_out))

        state = {}
        costs = []
        Yp = None
        step = 0
        for e in range(epochs):
            X, Y, y = self.__shuffle(X, Y, y)
            for i in range(iterations):
                lo = i * batchSize
                hi = lo + batchSize
                batchX, batchY, sparseY = X[lo:hi], Y[lo:hi], y[lo:hi]

                M = [batchX] + [None] * len(W)
                Yp, M, cache = self.__forward(M, W, b, acts, drops, training=True)
                target = batchY if lossType == 'ce' else sparseY
                costs.append(self.__cost(target, Yp, lossType))

                # Adjust weights, walking from the output layer backwards.
                # delta is (target - output), so parameters move *along* it.
                step += 1
                delta = batchY - Yp
                for n in range(len(W) - 1, -1, -1):
                    dw = M[n].T.dot(delta)
                    # One bias gradient per unit, not one scalar per layer.
                    db = delta.sum(axis=0)
                    if n > 0:
                        # Propagate the error with the weights that produced
                        # it, i.e. before they are updated below.
                        raw, mask = cache[n - 1]
                        upstream = delta.dot(W[n].T)
                        if mask is not None:
                            upstream = upstream * mask
                        delta = upstream * self.__actf_dv(raw, acts[n - 1])
                    w_dir = self.__direction(dw, ('W', n), state, optimizer, step)
                    b_dir = self.__direction(db, ('b', n), state, optimizer, step)
                    W[n] += learnR * (w_dir - reg * W[n])
                    b[n] += learnR * (b_dir - reg * b[n])
            if e % 1000 == 0:
                print(e, costs[-1])

        self.W = W
        self.b = b
        self.lossType = lossType
        self.optimizer = optimizer
        if Yp is not None:
            self.Yhat = Yp
        self.costs = costs

    def predict(self, X):
        """Return the class probabilities for X and store them in Yhat.

        Dropout is disabled, so repeated calls give identical results.

        Raises:
            RuntimeError: if fit() has not been called yet.
        """
        if not self.W:
            raise RuntimeError('call fit() before predict()')
        hidden = len(self.W) - 1
        # Only the layers that existed when fit() ran have weights.
        acts = list(self.A[:hidden]) + ['softmax']
        drops = list(self.D[:hidden]) + [0]
        M = [np.asarray(X)] + [None] * len(self.W)
        Yp, _, _ = self.__forward(M, self.W, self.b, acts, drops, training=False)
        self.Yhat = Yp
        return Yp

    def plot_cost(self):
        """Plot the per-batch costs recorded by the last fit()."""
        plt.plot(self.costs)
        plt.show()

    def reset(self):
        """Drop all layers, weights and training history."""
        self.W = []
        self.b = []
        self.M = []
        self.A = []
        self.D = []
        self.lossType = 'ce'
        self.optimizer = 'none'
        self.Yhat = []
        self.costs = []