In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

In [101]:
class RNN:
    # Weights
    W = []
    # Layers
    M = []
    # Bias
    b = []
    # Activation function for each layer
    A = []
    Yhat = []
    costs = []
    lossType = 'mse'
    optimizer = 'none'
    
    def __init__(self):
        self.reset()
    
    def __sigmoid(self, Z):
        return 1 / (1 + np.exp(-Z))
    
    def __tanh(self, Z):
        return np.tanh(Z)
    
    def __relu(self,Z):
        return Z * (Z > 0)
    
    # Calls specified activation function
    def __actf(self,Z,ty):
        if ty == 'sigmoid':
            return self.__sigmoid(Z)
        elif ty == 'tanh':
            return self.__tanh(Z)
        elif ty == 'softmax':
            return self.__softmax(Z)
        elif ty == 'none':
            return Z
        else:# ty == 'relu':
            return self.__relu(Z)
        
        # Calls specified activation function derivative
    def __actf_dv(self,Z,ty):
        if ty == 'sigmoid':
            return Z*(1-Z)
        elif ty == 'tanh':
            return (1-Z*Z)
        elif ty == 'none':
            return Z
        else:# ty == 'relu':
            return np.where(Z > 0, 1, 0)
        
    def __softmax(self,A):
        expA = np.exp(A)
        return expA / expA.sum(axis=1,keepdims=True)
    
    # Cross-entropy cost for softmax
    def __cost(self,T,Y,ty):
        if ty == 'ce':
            return self.__cross_entropy(T,Y)
        elif ty == 'mse':
            return self.__mean_squared(T,Y)
    
    def __cross_entropy(self,T,Y):
        tot = (-T * np.log(Y))
        return tot.sum()
    
    def __mean_squared(self,T,Y):
        tot = np.square(T - Y)
        return tot.sum()
    
    def classification_rate(self,T):
        Yhat = NeuralNetwork.Yhat
        Yp = np.argmax(Yhat,axis=1)
        print('Classification rate: ', np.mean(T == Yp))
    
    # Adds hidden layer with L nodes, d dropout
    def add_layer(self,L,a='sigmoid'):
        self.M.append(L)
        self.A.append(a)
        
    def __shuffle(self,X,Y):
        assert len(X) == len(Y)
        p = np.random.permutation(len(X))
        return X[p],Y[p]
        
            
    # Parameters(M:Layers,W:Weights,b:bias,A:activation function,D:Dropout)
    def __forward(self,M,W,b,A):
        Y = []
        yp = []
        X = M[0]
        
        for i in range(len(X)):
            
            ht_prev = M[1][i]
            for t in range(len(X[i])):
                ht = self.__actf((X[i][t].dot(W[0]) + ht_prev.dot(W[1]) + b[0]),A[0])
                yp.append(ht.dot(W[2]) + b[1])
                #yp = softmax(yp)
                ht_prev = ht
                
            M[1][i] = ht
            Y.append(yp[i][-1])
        
        Y = np.array(Y).reshape(len(Y),-1)
        M[2] = Y
        return Y,M
    
    def fit(self,X,Y,epochs=20000,batchSize=0,learnR=10e-6,reg=0,lossType='mse',optimizer='none'):
        W = self.W
        b = self.b
        M = self.M
        A = self.A
        
        # Initialize layers for M
        N = X.shape[0]
        M[0] = np.random.randn(N,M[0])
            
        # Add input and layer to M
        K = Y.shape[1]
        M.insert(0,X)
        M.append(np.random.randn(N,K))
        A.append('none')
        l = learnR
        
        # Regulate batch size
        batchSize = min(batchSize, N)
        batchSize = max(batchSize, 1)
            
        # Set weights
        for i in range(len(M)-1):
            if i == (len(M)-2):

                W.append(np.random.randn(M[i].shape[1],K) / np.sqrt(M[i].shape[1] + K))
                b.append(np.random.randn(K) / np.sqrt(K))
            else:
                # input to hidden
                W.append(np.random.randn(M[i][i].shape[1],M[i+1].shape[1]) / np.sqrt(M[i][i].shape[1] + M[i+1].shape[1]))
                b.append(np.random.randn(M[i+1].shape[1]) / np.sqrt(M[i+1].shape[1]))
                # hidden to hidden
                W.append(np.random.randn(M[i+1].shape[1],M[i+1].shape[1]) / np.sqrt(M[i+1].shape[1] + M[i+1].shape[1]))
                b.append(np.random.randn(M[i+1].shape[1]) / np.sqrt(M[i+1].shape[1]))
        # Set cache (if using rmsprop/adam)
        if optimizer == 'rms':
            wCache = []
            bCache = []
            eps = 10e-8
            decay = 0.99
            for i in range(len(M)-1):
                if i == (len(M)-2):
                    wCache.append(np.ones((M[i].shape[1],K)))
                    bCache.append(np.ones((K)))
                else:
                    wCache.append(np.ones((M[i].shape[1],M[i+1].shape[1])))
                    bCache.append(np.ones((M[i+1].shape[1])))
        elif optimizer == 'adam':
            wM = []
            wV = []
            bM = []
            bV = []
            eps = 10e-8
            decay1 = 0.9
            decay2 = 0.999
            for i in range(len(M)-1):
                if i == (len(M)-2):
                    wM.append(np.zeros((M[i].shape[1],K)))
                    wV.append(np.zeros((M[i].shape[1],K)))
                    bM.append(np.zeros((K)))
                    bV.append(np.zeros((K)))
                else:
                    wM.append(np.zeros((M[i].shape[1],M[i+1].shape[1])))
                    wV.append(np.zeros((M[i].shape[1],M[i+1].shape[1])))
                    bM.append(np.zeros((M[i+1].shape[1])))
                    bV.append(np.zeros((M[i+1].shape[1])))
        
        costs = []
        for e in range(epochs):
            iterations = N // batchSize
            #X,Y,y = self.__shuffle(X,Y,y)
            for i in range(iterations):
                start = i * batchSize
                end = (i+1) * batchSize
                batchX, batchY = X[start:end],Y[start:end]
                del M[0]
                M.insert(0,batchX)
                #print(sparceY)
                Yp,M = self.__forward(M,W,b,A)
                
                Z = M
                #cost = self.__cost(Y,Yp)
                cost = self.__cost(batchY,Yp,lossType)
                costs.append(cost)
                

                # Adjust weights
                #S = (Y - Yp)
                S = (batchY - Yp)
                n = len(M)-1
                Zt = batchY
                for i in range(len(M)-2):
                    # Weight and bias derivative
                    if lossType == 'softmax':
                        dw = Z[n].T.dot(Zt)
                        db = Zt.sum()
                    elif lossType == 'mse':
                        dw = (2/N) * Z[n].T.dot(Zt)
                        if n == 0:
                            mlast = [i[0] for i in M[0]]
                            mlast = np.array(mlast)
                            dw = dw[-1]#(2/N) * mlast.T.dot(Zt)
                        db = (2/N) * Zt.sum()
                    
                    #print(i, n)
                    #print(f"M[{n}]: {Z[n].shape}, Zt: {Zt.shape}, M.T.dot(Zt): {(Z[n].T.dot(Zt)).shape}, W[{n}]: {W[n].shape}")
                    if optimizer == 'none':
                        #if n == 1:
                        #    W[n-1] += l * (dw1 - reg*W[n])
                        #    b[n-1] += l * (db1 - reg*b[n])
                        #print(f"W[{n+1}]: {W[n+1].shape}, dw: {dw.shape}")
                        W[n] += l * (dw - reg*W[n])
                        b[n] += l * (db - reg*b[n])
                    elif optimizer == 'rms':
                        # rmsprop
                        wCache[n] = (decay * wCache[n]) + (1-decay) * np.square(dw)
                        bCache[n] = (decay * bCache[n]) + (1-decay) * np.square(db)
                        wDenominator = np.sqrt(wCache[n]) + eps
                        bDenominator = np.sqrt(bCache[n]) + eps
                        
                        W[n] += l * ((dw/wDenominator) - reg*W[n])
                        b[n] += l * ((db/bDenominator) - reg*b[n])
                    elif optimizer == 'adam':
                        
                        #print(wM.shape)
                        wM[n] = (decay1 * wM[n]) + (1-decay1) * dw
                        wV[n] = (decay1 * wV[n]) + (1-decay1) * np.square(dw)
                        bM[n] = (decay2 * bM[n]) + (1-decay2) * db
                        bV[n] = (decay2 * bV[n]) + (1-decay2) * np.square(db)
                        wMhat = wM[n]/(1-decay1**(epochs+1))
                        wVhat = wV[n]/(1-decay1**(epochs+1))
                        bMhat = bM[n]/(1-decay2**(epochs+1))
                        bVhat = bV[n]/(1-decay2**(epochs+1))
                        wDenom = np.sqrt(wVhat) + eps
                        bDenom = np.sqrt(bVhat) + eps
                        #print(W[n].shape, wM[n].shape, wDenom.shape)
                        W[n] += l * ((wMhat/wDenom) - reg*W[n])
                        b[n] += l * ((bMhat/bDenom) - reg*b[n])
                    
                    
                    #if i == 0:
                        #print(f"actf: {A[n]}")
                        #Zt = Zt.dot(W[n+1].T)*self.__actf_dv(Z[n],A[n-1])
                    #elif i != (len(M)-2):
                        #Update Zt
                        #print("In else")
                        #print(f"actf: {A[n-1]}")
                    #print(n)
                    if n == 1 and lossType == 'mse':
                        Zt = Zt
                    else:
                        Zt = Zt.dot(W[n].T)*self.__actf_dv(Z[n],A[n-1])

                    n -= 1
            if e % 100 == 0:
                print(e,costs[-1])
        self.W = W
        self.b = b
        self.M = M
        self.Yhat = Yp
        self.costs = costs
    
    def predict(self,X):
        W = self.W
        b = self.b
        M = self.M
        A = self.A
        del M[0]
        M.insert(0,X)
        Yp,Z = self.__forward(M,W,b,A,D)
        NeuralNetwork.Yhat = Yp
        return Yp
    
    def plot_cost(self):
        costs = self.costs
        plt.plot(costs)
        plt.show()
    
    def reset(self):
        self.W = []
        self.b = []
        self.M = []
        self.A = []
        self.lossType = 'mse'
        self.optimizer = 'none'
        self.Yhat = []
        self.costs = []

In [57]:
series = np.sin(0.1*np.arange(200))

In [58]:
T = 10
X = []
Y = []
for t in range(len(series) - T):
    x = series[t:t+T]
    X.append(x)
    y = series[t+T]
    Y.append(y)

X = np.array(X).reshape(-1,T,1)
Y = np.array(Y).reshape(len(Y),-1)
N = len(X)

In [102]:
model = RNN()

In [103]:
model.add_layer(15,'tanh')

In [104]:
# (X, Y, loss type, # iterations, batch size, learning rate, regulization)
# Loss type('ce': cross-entropy, 'sce': sparse cross-entropy)
lossType = 'mse'
epochs = 20000
batchSize = N
learnR = 10e-4
reg = 0
optimizer = 'none'
model.fit(X,Y,epochs,batchSize,learnR,lossType=lossType,optimizer=optimizer)

0 106.41648679047788
100 101.1148997708629
200 101.11602646962766
300 101.11715283093504
400 101.11827885470925
500 101.11940454087477
600 101.1205298893563
700 101.12165490007905
800 101.12277957296827
900 101.12390390794965
1000 101.12502790494916
1100 101.126151563893
1200 101.12727488470777
1300 101.12839786732027
1400 101.12952051165762
1500 101.13064281764721
1600 101.13176478521679
1700 101.13288641429432
1800 101.13400770480811
1900 101.13512865668673
2000 101.13624926985904
2100 101.1373695442542


KeyboardInterrupt: 

In [84]:
W[2].shape

(15, 1)

## Individual methods / testing

In [3]:
def forward(M,W,b):
    X = M[0]
    Y = []
    yp = []
    ht_prev = M[1][0]
    for i in range(len(X)):
        for t in range(len(X[i])):
            ht = tanh((X[i][t].dot(W[0]) + ht_prev.dot(W[1]) + b[0]))
            yp.append(ht.dot(W[2]) + b[1])
            #yp = softmax(yp)
            ht_prev = ht
        M[1][i] = ht
        Y.append(yp[i][-1])
    
    Y = np.array(Y).reshape(len(Y),-1)               
    return Y,M

In [4]:
def mse(T,Y):
    tot = np.square(T - Y)
    return tot.sum()

In [5]:
def sigmoid(Z):
    return 1 / (1 + np.exp(-Z))

In [6]:
def tanh(Z):
    return np.tanh(Z)

In [7]:
def actf_dv(Z):
    return Z*(1-Z)

In [8]:
def softmax(A):
    expA = np.exp(A)
    return expA / expA.sum(axis=1,keepdims=True)

In [9]:
series = np.sin(0.1*np.arange(200))

In [10]:
T = 10
D = 1
H = 15
X = []
Y = []
for t in range(len(series) - T):
    x = series[t:t+T]
    X.append(x)
    y = series[t+T]
    Y.append(y)

X = np.array(X).reshape(-1,T,1)
Y = np.array(Y).reshape(len(Y),-1)
N = len(X)

In [11]:
# Initialize layers for M
M = []
W = []
b = []
M.append(np.zeros((N,H)))

# Add input and layer to M
K = 1
M.insert(0,X)
M.append(np.random.randn(N,K))

In [19]:
# Set weights
W = []
b = []
for i in range(len(M)-1):
    if i == (len(M)-2):
        
        W.append(np.random.randn(M[i].shape[1],K) / np.sqrt(M[i].shape[1] + K))
        b.append(np.random.randn(K) / np.sqrt(K))
    else:
        # input to hidden
        W.append(np.random.randn(M[i][i].shape[1],M[i+1].shape[1]) / np.sqrt(M[i][i].shape[1] + M[i+1].shape[1]))
        #b.append(np.random.randn(M[i+1].shape[1]) / np.sqrt(M[i+1].shape[1]))
        # hidden to hidden
        W.append(np.random.randn(M[i+1].shape[1],M[i+1].shape[1]) / np.sqrt(M[i+1].shape[1] + M[i+1].shape[1]))
        b.append(np.random.randn(M[i+1].shape[1]) / np.sqrt(M[i+1].shape[1]))

In [13]:
x = M[0]

In [20]:
b3 = np.random.randn(1,10,15)

In [15]:
ht = tanh((M[0][0].dot(W[0]) + M[1][0].dot(W[1]) + b[0]))

In [16]:
b[0].shape

(10, 15)

In [17]:
y1 = ht.dot(W[2])

In [21]:
Yp,M = forward(M,W,b)

In [75]:
Yp.shape

(190, 1)

In [91]:
b[1].shape

(15,)

In [25]:
M[1].shape

(190, 15)

In [22]:
zt = (Y-Yp)

In [24]:
dw = (2/N) * M[2].T.dot(zt)

In [26]:
zt1 = (zt).dot(W[2].T)*tanh(M[2])

In [27]:
dw1 = (2/N) * M[1].T.dot(zt1)

In [29]:
dw2 = (2/N) * M[0].T.dot(zt1)

In [32]:
W[0].shape

(1, 15)

In [34]:
zt1.shape

(190, 15)

In [49]:
mlast = [i[-1] for i in M[0]]
mlast = np.array(mlast)

In [54]:
M[0][2][-1]

array([0.89120736])

In [55]:
mlast.shape

(190, 1)

In [97]:
x = np.random.randn((50))

In [100]:
x.dot(x)

38.590652295603974

In [18]:
Z = M[1]
Zt = Y

In [19]:
b1 = b[1]
b0 = b[0]

In [20]:
Z = M

In [21]:
w2 = np.vstack([b1,W[2]])
w1 = np.vstack([b0,W[1]])
z2 = np.vstack([np.ones((len(Z[2][0]))),Z[2]])
z1 = np.vstack([np.ones((len(Z[1][0]))),Z[1]])

In [32]:
z1.shape

(191, 15)

In [30]:
Y1 = np.vstack([np.ones(1),Y])

In [89]:
b2.shape

(1,)

In [33]:
dw = np.linalg.solve(np.dot(z2.T, z2), np.dot(z2.T, Y1))

In [15]:
dw1 = np.linalg.solve(np.dot(Yp.T, Yp), np.dot(Yp.T, Y))

In [36]:
dw

array([[0.00980991]])

In [16]:
dw1

array([[0.00013429]])

In [118]:
y1 = Y.reshape(N,-1)

In [136]:
re = x[0][1].dot(W[0]) + M[1][0].dot(W[1]) + b[0]

In [32]:
ht_p = M[1]

In [33]:
re3 = sigmoid((M[0][0][0].dot(W[0]) + ht_p[0].dot(W[1]) + b[0]))

In [34]:
re3.shape

(15,)

In [139]:
re1 = re.dot(W[2]) + b[1]

In [141]:
len(re1)

15

In [148]:
re2 = re1.reshape(len(re1),-1)

In [147]:
re1

array([-0.32032633, -0.30753756, -0.06444399, -0.3179915 , -0.35411167,
       -0.43051232, -0.30951572, -0.51114223, -0.1661792 , -0.78485887,
       -0.27389987, -0.02894201,  0.28752142, -0.49888721,  0.10897052])

In [149]:
sof = softmax(re2)

In [150]:
sof

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]])

In [152]:
re2

array([[-0.32032633],
       [-0.30753756],
       [-0.06444399],
       [-0.3179915 ],
       [-0.35411167],
       [-0.43051232],
       [-0.30951572],
       [-0.51114223],
       [-0.1661792 ],
       [-0.78485887],
       [-0.27389987],
       [-0.02894201],
       [ 0.28752142],
       [-0.49888721],
       [ 0.10897052]])