In [1]:
import numpy as np
import pandas as pd

In [2]:
# lets we have four sentences 
s1 = 'mango is yellow color'
s2 = 'banana is pink color'
s3 = 'hair has black color'

# Now what is and how the matrices shapes define
# create vocab
l = [*s1.split(),*s2.split(),*s3.split()]
l.append('<end>')
vocab = sorted(set(l))
one_hot_vector_vocab = np.array(pd.get_dummies(vocab))
print(pd.get_dummies(vocab))

   <end>  banana  black  color  hair  has  is  mango  pink  yellow
0      1       0      0      0     0    0   0      0     0       0
1      0       1      0      0     0    0   0      0     0       0
2      0       0      1      0     0    0   0      0     0       0
3      0       0      0      1     0    0   0      0     0       0
4      0       0      0      0     1    0   0      0     0       0
5      0       0      0      0     0    1   0      0     0       0
6      0       0      0      0     0    0   1      0     0       0
7      0       0      0      0     0    0   0      1     0       0
8      0       0      0      0     0    0   0      0     1       0
9      0       0      0      0     0    0   0      0     0       1


In [3]:
# now we create two list which stores [character_to_index] values and [index_to_character] values
char2idx = {value:index for index,value in enumerate(vocab)}
idx2char = np.array(vocab)

In [4]:
# max length of time step
seq_length = max(len(s1.split()),len(s2.split()),len(s3.split()))
seq_length

4

# Preparing Input for multi batch

In [5]:
# For three batch batch  (3*10*4) loop 4 times i.e (3*10) (3*10) (3*10) (3*10)

############################################################################################
#          input        #       shape       #        target          #       shape         #
############################################################################################
# [mango ,banana,hair ]  (3*10) onehotvector    [  is  , is  , has ]   (1*10) onehotvector #
# [ is   ,  is  ,has  ]  (3*10) onehotvector    [yellow,pink ,black]   (1*10) onehotvector #
# [yellow,pink  ,black]  (3*10) onehotvector    [color ,color,color]   (1*10) onehotvector #
# [color ,color ,color]  (3*10) onehotvector    [<end> ,<end>,<end>]   (1*10) onehotvector #
############################################################################################

In [6]:
# Input array for multi batch size
x1 = np.array([s1.split(),s2.split(),s3.split()]).T[0] 
x2 = np.array([s1.split(),s2.split(),s3.split()]).T[1]
x3 = np.array([s1.split(),s2.split(),s3.split()]).T[2]
x4 = np.array([s1.split(),s2.split(),s3.split()]).T[3]

In [7]:
print([char2idx[char] for char in x1])  # first time step
print([char2idx[char] for char in x2])  # second time step
print([char2idx[char] for char in x3])  # third time step
print([char2idx[char] for char in x4])  # forth time step

[7, 1, 4]
[6, 6, 5]
[9, 8, 2]
[3, 3, 3]


# Preparing target variable for multi batch

In [8]:
# preparing target varialbes
# target predict the next upcoming words so we only have to expect 3 prediction words and 4 word is end of sentence
y1 = np.array([s1.split(),s2.split(),s3.split()]).T[1]
y2 = np.array([s1.split(),s2.split(),s3.split()]).T[2]
y3 = np.array([s1.split(),s2.split(),s3.split()]).T[3]
y4 = np.array(['<end>','<end>','<end>'])

In [9]:
print([char2idx[i] for i in y1]) # this is expected targets for first time step
print([char2idx[i] for i in y2]) # this is expected targets for second time step
print([char2idx[i] for i in y3]) # this is expected targets for third time step
print([char2idx[i] for i in y4]) # this is expected targets for forth time step

[6, 6, 5]
[9, 8, 2]
[3, 3, 3]
[0, 0, 0]


In [10]:
print(x1,x2,x3,x4)
print(y1,y2,y3,y4)

['mango' 'banana' 'hair'] ['is' 'is' 'has'] ['yellow' 'pink' 'black'] ['color' 'color' 'color']
['is' 'is' 'has'] ['yellow' 'pink' 'black'] ['color' 'color' 'color'] ['<end>' '<end>' '<end>']


# Forward Propagation

In [11]:
##################################################################################
# input_neuron(3)     ###  1_H_l__2_neuron(2)    #### out_lay_3_neuron(3)        #
##################################################################################
#                      Wax.T  *  Xt               Way.T  * a_current             #
# (3*10)               (2*3)  * (3*10)             (3*2) *  (2*10)               #
# mango       ----------------------->        --------------------->  is         #
# banana      -----------------------> (2*10) --------------------->  is  (3*10) #
# hair        ----------------------->        --------------------->  has        #
##################################################################################

In [12]:
def softmax(a):
    return np.exp(a)/np.exp(a).sum(axis=1).reshape(-1,1)

def rnn_cell_forward(xt, a_prev, parameters):
    """
    Arguments:
    xt         --  (batch_unit_size , vocab_size)
    a_prev     --  (hidden_unit_size, vocab_size)

    parameters -- python dictionary containing:
                        Wax -- (batch_unit_size , hidden_unit_size)
                        Waa -- (hidden_unit_size, hidden_unit_size)
                        Wya -- (hidden_unit_size, output_unit_size)
                        ba --  (hidden_unit_size, 1)
                        by --  (output_unit_size, 1)
    Returns:
    a_next  -- (hidden_unit, vocab_size)
    yt_pred -- (output_unit, vocab_size)
    cache   -- (a_next, a_prev, xt, parameters)
    """
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Way = parameters["Way"]
    ba = parameters["ba"]
    by = parameters["by"]
    
    a_next = np.tanh(np.dot(Waa.T, a_prev) + np.dot(Wax.T, xt) + ba) # current activation bias
    y_cap  = softmax(np.dot(Way.T, a_next) + by)  # output of current time step
    return (a_prev, a_next, y_cap)

In [13]:
def rnn_forward(xt, a_prev, parameters):
    """
    Arguments:
    xt         --  (batch_unit_size , vocab_size , timestep)
    a_prev     --  (hidden_unit_size, vocab_size)

    parameters -- python dictionary containing:
                        Wax -- (batch_unit_size , hidden_unit_size)
                        Waa -- (hidden_unit_size, hidden_unit_size)
                        Way -- (hidden_unit_size, output_unit_size)
                        ba --  (hidden_unit_size, 1)
                        by --  (output_unit_size, 1)
    Returns:
    a_next  -- (hidden_unit, vocab_size)
    yt_pred -- (output_unit, vocab_size)
    cache   -- (a_next, a_prev, xt, parameters)
    """
    
    caches = []  # store outputs which we get using single timestep  [need during backpropagation]
    
    hidden_neuron , vocab_size         =  a_prev.shape
    batch_size , vocab_size , timestep =  xt.shape
    hidden_neuron, output_neuron       =  parameters["Way"].shape
    
    activations = np.zeros([hidden_neuron,vocab_size,timestep])  # store activations during each timesteps
    y_caps      = np.zeros([output_neuron,vocab_size,timestep])  # store output at each timesteps

    a_next_t = a_prev  # why? you will get soon
    a0 = 0
    # loop over all time-steps
    for t in range(timestep):
        
        a_prev_t, a_next_t, y_cap_t = rnn_cell_forward(xt[:,:,t], a_next_t, parameters)
        if t == 0:
            a0 = a_prev_t # store only first prev activation
        activations[:,:,t]   = a_next_t
        y_caps[:,:,t]        = y_cap_t
    
    return a0, activations, y_caps

# Back Propagation

In [14]:
def rnn_cell_backward(timestep, y_caps, y, activation, parameters,xt,a0):
    """
    Arguments:
    da_next -- Gradient of loss with respect to next hidden state
    cache   -- python dictionary containing useful values (output of rnn_cell_forward())


    Returns:
    gradients -- python dictionary containing:
                        dx      -- Gradients of input data, of shape              (batch_size ,vocab_size, timestep)
                        da_prev -- Gradients of previous hidden state, of shape   (hidden_unit,vocab_size)
                        dWax    -- Gradients of input-to-hidden weights, of shape (batch_size ,hidden_unit)
                        dWaa    -- Gradients of hidden-to-hidden weights, of shape(hidden_unit,hidden_unit)
                        dba     -- Gradients of bias vector, of shape             (hidden_unit, 1)
    """
    Wax = parameters["Wax"] #(3*2)
    Waa = parameters["Waa"] #(2*2)
    Way = parameters["Way"] #(2*3)
    ba  = parameters["ba"]  #(2*1)
    by  = parameters["by"]  #(3*1)
    
    y_cap_ = y_caps[:,:,timestep]      # current timestep y_pred
    y_     = y[:,:,timestep]           # current timestep y_actual
    a_     = activation[:,:,timestep]  # current timestep a_next
    xt_    = xt[:,:,timestep]          # current timestep input
    a0_    = a0                        # a_prev for first timestep
    
    # compute derivative Way w.r.t to Loss
    #                (2*10) (3*10)-(3*10)
    dL_dWay = np.dot( a_,  (y_cap_ - y_).T)                # (2*3) 
    
    # dL_dWaa = dL_da * da__da_prev * da_da_prev__dWaa 
    dL_dWaa = 0
    for i in range(timestep+1):   
        #                    (2*3)  (3*10)-(3*10) 
        dL_da        = np.dot(Way, (y_cap_ - y_))          # (2*10)
        
        da__da_prev = 1
        for j in reversed(range(i+1,timestep+1)): 
            #                               (2*10)
            da_dtanh   = (1-np.square(activation[:,:,j]))   # (2*10)
            #                               (2*2)  (2*10)   
            da__da_prev= da__da_prev * np.dot(Waa , da_dtanh)# (2*10)
            
        if i == 0: 
            #           (2*10)                    (2*10)  
            da_dWaa   =  a0_   *  (1 - np.square(activation[:,:,i])) # (2*10)        
        else:
            #                          (2*10)                               (2*10)
            da_dWaa   =  (1 - np.square(activation[:,:,i-1]))   *  (1 - np.square(activation[:,:,i])) # (2*10)
            
            #            (2*10)     (2*10)       (10*2)
        dL_dWaa += np.dot(dL_da * da__da_prev , da_dWaa.T)  # (2*2)
    
    # dL_dWax = dL_da * da__da_prev * da_da_prev__dWax 
    dL_dWax = 0
    da__da_prev = 0
    for i in range(timestep+1):  
        dL_da       = np.dot(Way, (y_cap_ - y_))   # (2*10)
        da__da_prev = np.ones_like(a0) # 2*10
        for j in reversed(range(i+1,timestep+1)):      
            da_dtanh     = (1 - np.square(activation[:,:,j]))          # (2*10)
            da__da_prev  = da__da_prev * np.dot(Waa , da_dtanh)        # (2*10)
        
        da_dWax   =  np.dot(xt[:,:,i] , (1 - np.square(activation[:,:,i])).T) # (3*2)
        
                            # 3*2              2*10     10*2
        dL_dWax  +=  np.dot(da_dWax , np.dot(dL_da , da__da_prev.T))       # (3*2)
            
    # dL_da0 = dL_da * da__da_prev
    dL_da = np.dot(Way, (y_cap_ - y_))            # (2*10)
    da_da_prev = 1
    for i in reversed(range(timestep+1)): # 0 1 2 3
        da_dtanh     = (1 - np.square(activation[:,:,i]))          # (2*10)
        da__da_prev  = da__da_prev * np.dot(Waa , da_dtanh)        # (2*10)
    
    dL_da0 = dL_da * da__da_prev                                   # (2*10)
    #dL_dby = dL_ds * ds_dby
    #          (1*10) - (1*10)     
    dL_dby = (y_cap_ - y_).sum(axis = 1).reshape(-1,1)  #   (1*1)
                        
    #dL_dba = dL_da * da_dba
    #       (2*10)       (2*1) (1*10)-(1*10) 
    dL_dba = (a_ * np.dot(Way, (y_cap_ - y_))).sum(axis = 1 ).reshape(-1,1)   # (2*10)
    
    # Store the gradients in a python dictionary
    gradients = {"dL_da0" :dL_da0,
                 "dL_dWaa": dL_dWaa, 
                 "dL_dWax": dL_dWax,
                 "dL_dWay": dL_dWay,
                 "dL_dba" : dL_dba,
                 "dL_dby" : dL_dby
                 }
    
    return gradients

In [15]:
def rnn_backward(a0, activations, y_caps, y1_ohe, parameters,xt):
    """
    Arguments:
    da     -- Upstream gradients of all hidden states, of shape (batch_size, vocab_size, timestep)
    caches -- tuple containing information from the forward pass (rnn_forward)
    
    Returns:
    gradients -- python dictionary containing:
                    dx  -- Gradient w.r.t. the input data, numpy-array of shape (batch_size, vocab_size, timestep)
                    da0 -- Gradient w.r.t the initial hidden state, numpy-array of shape (hidden_unit, vocab_size)
                    dWax-- Gradient w.r.t the input's weight matrix, numpy-array of shape (batch_size, hidden_unit)
                    dWaa-- Gradient w.r.t the hidden state's weight matrix, numpy-arrayof shape (hidden_unit,hidden_unit)
                    dba -- Gradient w.r.t the bias, of shape (hidden_unit, 1)
    """
    
    batch_size, vocab_size, timestep =  y_caps.shape
    hidden_unit = activations.shape[0]
    
    dWax = 0
    dWay = 0
    dWaa = 0
    dba  = 0
    da0  = 0
    dby  = 0
    
    # Loop through all the time steps
    
    for t in range(timestep):
        
        gradient = rnn_cell_backward(t, y_caps, y1_ohe, activations, parameters,xt,a0)
        
        dL_da0      = gradient["dL_da0"]
        dL_dWaa     = gradient["dL_dWaa"]
        dL_dWax     = gradient["dL_dWax"]
        dL_dWay     = gradient["dL_dWay"]   
        dL_dba      = gradient["dL_dba"]
        dL_dby      = gradient["dL_dby"]
        
        da0  += dL_da0
        dWaa += dL_dWaa
        dWax += dL_dWax
        dWay += dL_dWay
        dba  += dL_dba
        dby  += dL_dby
    
    gradients = {"da0": da0, "dWaa": dWaa, "dWax": dWax, "dWay": dWay,"dba": dba,'dby':dby}
    return gradients

In [16]:
# loss function
def crossEntropy(y_predict,y_actul):
    cost = []
    for i in range(y_predict.shape[2]): # 4
        y_actual = y_actul[:,:,i]
        y_pred   = y_caps[:,:,i]
        cost.append(-np.sum(y_actual * np.log(y_pred)))
    return np.mean(cost,axis=0)

# one hot encoding
def one_hot_encode(*x):
    one = np.zeros([3,10,4])
    for i1,i2 in enumerate(x):
        for j1,j2 in enumerate(i2):
            one[j1,:,i1] = one_hot_vector_vocab[char2idx[j2]].reshape(1,-1)
    return one

In [29]:
x = one_hot_encode(x1,x2,x3,x4) # (3*10*4)
y = one_hot_encode(y1,y2,y3,y4) # (3*10*4)
np.random.seed(1)
Wax = np.random.randn(3,5)   # (5*2)    ==> batch_size    * hidden_neuron

a0  = np.random.randn(5,10)  # (2*10)   ==> hidden_neuron * vocab_size
Waa = np.random.randn(5,5)   # (2*2)    ==> hidden_neuron * hidden_neuron
ba  = np.random.randn(5,1)   # (2*1)    ==> hidden nueron * 1

Way = np.random.randn(5,3)   # (2*3)    ==> hidden_neuron * output_neuron(or batch_size)
by  = np.random.randn(3,1)   # (3*1)    ==> output_neuron * 1

parameters = {"Waa": Waa, "Wax": Wax, "Way": Way, "ba": ba, "by": by}

#######################
# stochastic gradient #
#######################
alpha = 0.01
parameters = {"Waa": Waa, "Wax": Wax, "Way": Way, "ba": ba, "by": by}
for i in range(500):
    a0, activations, y_caps = rnn_forward(x, a0, parameters)
    gradients               = rnn_backward(a0, activations, y_caps, y, parameters,x)
    print("Final cost: ",crossEntropy(y_caps,y))
    parameters['Waa'] -= (alpha * gradients['dWaa'])
    parameters['Wax'] -= (alpha * gradients['dWax'])
    parameters['Way'] -= (alpha * gradients['dWay'])
    parameters['ba']  -= (alpha * gradients['dba'])
    parameters['by']  -= (alpha * gradients['dby'])
    parameters['by']  -= (alpha * gradients['dby'])
    a0                -= (alpha * gradients['da0'])

Final cost:  8.714958911217202
Final cost:  8.490240889190716
Final cost:  8.27790018110784
Final cost:  8.079808636054477
Final cost:  7.898599061537567
Final cost:  7.736670997966742
Final cost:  7.594902332731643
Final cost:  7.472442930617496
Final cost:  7.367491931192761
Final cost:  7.27782289907985
Final cost:  7.201132485366464
Final cost:  7.1353404646450596
Final cost:  7.0787102246388445
Final cost:  7.02983271713319
Final cost:  6.9875646567829275
Final cost:  6.950969580991405
Final cost:  6.919275004379194
Final cost:  6.891842765219503
Final cost:  6.8681453865151525
Final cost:  6.847742816683187
Final cost:  6.830257622910905
Final cost:  6.815350371468469
Final cost:  6.802699224924156
Final cost:  6.791987771989628
Final cost:  6.782902518754906
Final cost:  6.775141694133786
Final cost:  6.768467670618382
Final cost:  6.7628294613018785
Final cost:  6.757441605123861
Final cost:  6.748105778352948
Final cost:  6.732556416168695
Final cost:  6.713795340851124
Final 

Final cost:  4.379834953367501
Final cost:  4.410234393105546
Final cost:  4.399040368988815
Final cost:  4.392507947870941
Final cost:  4.38226437157717
Final cost:  4.376938102076
Final cost:  4.369868585558799
Final cost:  4.366245895180701
Final cost:  4.3626397778518005
Final cost:  4.360812944825141
Final cost:  4.359689483415532
Final cost:  4.359482977885808
Final cost:  4.359926878566681
Final cost:  4.360935916890213
Final cost:  4.3623971101919015
Final cost:  4.364212943276863
Final cost:  4.366285445846884
Final cost:  4.368520491557525
Final cost:  4.370836460884493
Final cost:  4.373171735831868
Final cost:  4.37548412041868
Final cost:  4.377742872713763
Final cost:  4.379919582949105
Final cost:  4.381983356326275
Final cost:  4.383901417896263
Final cost:  4.385642764732017
Final cost:  4.3871816924227955
Final cost:  4.388499311329548
Final cost:  4.389582952336875
Final cost:  4.39042440917417
Final cost:  4.3910180359257645
Final cost:  4.391359271020101
Final cost

In [30]:
print(np.argmax(x[:,:,0],axis=1))
print(np.argmax(x[:,:,1],axis=1))
print(np.argmax(x[:,:,2],axis=1))
print(np.argmax(x[:,:,3],axis=1))

[7 1 4]
[6 6 5]
[9 8 2]
[3 3 3]


In [31]:
print(np.argmax(y[:,:,0],axis=1))
print(np.argmax(y[:,:,1],axis=1))
print(np.argmax(y[:,:,2],axis=1))
print(np.argmax(y[:,:,3],axis=1))

[6 6 5]
[9 8 2]
[3 3 3]
[0 0 0]


In [32]:
print(np.argmax(y_caps[:,:,0],axis=1))
print(np.argmax(y_caps[:,:,1],axis=1))
print(np.argmax(y_caps[:,:,2],axis=1))
print(np.argmax(y_caps[:,:,3],axis=1))

[6 6 6]
[9 9 2]
[6 6 7]
[8 8 5]


In [33]:
print(y[:,:,0].shape)
print(y_caps[:,:,0].shape)

(3, 10)
(3, 10)


In [34]:
####################
#     ACCURACY     #
####################
print("Actual_input\t\tActual_expected\t\t\tPredicted")
for i in range(4):
    y_ = np.argmax(y[:,:,i],axis=1)
    y_cap = np.argmax(y_caps[:,:,i],axis=1)
    x_ = np.argmax(x[:,:,i],axis=1)
    for p in range(3):
        print(idx2char[x_[p]],end=',')
    print('\t\t',end='')
    for j in range(3):
        print(idx2char[y_[j]],end=',')
    print('\t\t',end='')
    for k in range(3):
        print(idx2char[y_cap[j]],end=',')
    print()

Actual_input		Actual_expected			Predicted
mango,banana,hair,		is,is,has,		is,is,is,
is,is,has,		yellow,pink,black,		black,black,black,
yellow,pink,black,		color,color,color,		mango,mango,mango,
color,color,color,		<end>,<end>,<end>,		has,has,has,
