In [1]:
import numpy as np
np.random.seed(42)

# from ipdb import set_trace as debug

import pandas as pd

In [2]:
# Methods!
def new_state(new_input, old_state):
    """Compute the next hidden state of the vanilla RNN.

    Evaluates ``tanh(Wxh . new_input + Whh . old_state)`` using the
    module-level weight matrices ``Wxh`` and ``Whh``. The homework
    description had no bias terms, so none are added here either.
    """
    input_contribution = np.dot(Wxh, new_input)
    recurrent_contribution = np.dot(Whh, old_state)
    return np.tanh(input_contribution + recurrent_contribution)

def one_hot_encoded(index):
    """Return a ``(vocab_size, 1)`` column that is 1 at ``index`` and 0 elsewhere.

    ``vocab_size`` is read from module scope.
    """
    column = np.zeros(shape=(vocab_size, 1))
    column[index] = 1
    return column

def softmax_output(state):
    """Map a hidden state to a probability distribution over the next character.

    The logits are ``Why . state`` (``Why`` is read from module scope) and
    the result is their softmax, with the same shape as the logits.
    """
    logits = np.dot(Why, state)
    # Softmax is unchanged when a constant is subtracted from every logit.
    # Subtracting the maximum keeps np.exp from overflowing to inf (which
    # would turn the division below into nan) when logits grow large.
    exponentials = np.exp(logits - np.max(logits))
    return exponentials / np.sum(exponentials)  # probabilities for next chars

def cross_entropy_loss(probabilities, y_trues):
    """Sum the negative log-probability assigned to each true character.

    ``probabilities[t]`` is indexed as ``[char_index, 0]``, i.e. it is
    treated as a column of probabilities for time step ``t``;
    ``y_trues[t]`` is the index of the correct character at that step.
    The result is the total over all steps (not an average).
    """
    total = 0
    for step, true_index in enumerate(y_trues):
        total -= np.log(probabilities[step][true_index, 0])
    return total

def forward_pass(inputs, hprev):
    """Run the RNN forward over one window of character indices.

    Returns ``(xs, hs, ps, last_state)`` where ``xs``, ``hs`` and ``ps`` are
    dicts keyed by time step holding the one-hot inputs, hidden states and
    output probabilities, and ``last_state`` is the hidden state after the
    final step (to be used as the next window's previous state).
    """
    # Dicts rather than lists so that key -1 can hold the incoming state:
    # the loop can then always read hs[t - 1], even at t == 0.
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)

    for t, char_index in enumerate(inputs):
        xs[t] = one_hot_encoded(index=char_index)
        hs[t] = new_state(xs[t], hs[t - 1])
        ps[t] = softmax_output(state=hs[t])

    # The key must be len(inputs) - 1, not -1: in this dict -1 is the
    # *previous* state, not "the last element".
    final_step = len(inputs) - 1
    return xs, hs, ps, hs[final_step]

def copy_zeros_in_shape_of_these(*these):
    """Return a tuple with one all-zero array per argument, built with ``np.zeros_like``."""
    return tuple(map(np.zeros_like, these))

def backward_pass(xs, hs, ps, y_trues):
    """Backpropagate through time over one unrolled window.

    Parameters
    ----------
    xs, hs, ps : dict
        Inputs, hidden states and output probabilities keyed by time step,
        as produced by ``forward_pass``. ``hs`` must also contain key -1
        (the state that preceded step 0).
    y_trues : sequence of int
        Index of the correct next character for every time step.

    Returns
    -------
    tuple
        ``(dWxh, dWhh, dWhy)``: gradients with the shapes of the
        module-level ``Wxh``, ``Whh`` and ``Why``, each clipped element-wise
        to ``[-5, 5]``.
    """
    dWxh, dWhh, dWhy = copy_zeros_in_shape_of_these(Wxh, Whh, Why)
    # hs[-1] is used rather than hs[0] so that an empty window (which has no
    # step 0) still works; it is a hidden state like any other entry.
    dh_from_next_state = np.zeros_like(hs[-1])
    # The window length comes from this function's own arguments. Using the
    # module-level `inputs` here would silently couple the result to whatever
    # the training loop last assigned to that global.
    for t in reversed(range(len(y_trues))):
        dy = np.copy(ps[t])
        dy[y_trues[t]] -= 1  # gradient of softmax + cross-entropy w.r.t. the logits

        dWhy += np.dot(dy, hs[t].T)  # y_t = Why * h_t ==> dWhy = dy_t * transpose(h_t)

        dh_from_dy = np.dot(Why.T, dy)  # gradient reaching h_t through the output y_t = Why * h_t
        dh = dh_from_dy + dh_from_next_state  # h_t affects both the current y and the next state

        one_minus_tanh_squared = 1 - hs[t] * hs[t]  # hs[t] IS tanh(...), so this is tanh'
        # see https://socratic.org/questions/what-is-the-derivative-of-tanh-x

        # Chain rule: derivative of the outer function (tanh) times the
        # gradient flowing in gives the gradient w.r.t. the pre-activation.
        dh_over_dtanh = dh * one_minus_tanh_squared

        dWxh += np.dot(dh_over_dtanh, xs[t].T)
        dWhh += np.dot(dh_over_dtanh, hs[t-1].T)

        # Used by the next loop iteration, which is the *previous* time step
        # because we are walking backwards.
        dh_from_next_state = np.dot(Whh.T, dh_over_dtanh)

    for dparam in [dWxh, dWhh, dWhy]:
        # Very large gradients cause exploding updates; a common heuristic
        # is to limit each value.
        np.clip(dparam, -5, 5, out=dparam)  # in-place clipping
        # see https://docs.scipy.org/doc/numpy/reference/generated/numpy.clip.html

    return dWxh, dWhh, dWhy
    
    
def update_parameters():
    """Apply one in-place optimisation step to the global weights.

    Reads the module-level ``optimization``, ``learning_rate``, the weights
    ``Wxh``/``Whh``/``Why`` and their gradients ``dWxh``/``dWhh``/``dWhy``.
    With ``'adagrad'`` it also reads and updates the accumulators
    ``mWxh``/``mWhh``/``mWhy``. All arrays are modified in place.

    Raises
    ------
    ValueError
        If ``optimization`` is neither ``'adagrad'`` nor ``'sgd'``. Without
        this check a misspelt optimiser name would leave the weights
        untouched without any warning.
    """
    if optimization == 'adagrad':
        for param, dparam, adagrad_mem in zip([Wxh, Whh, Why],
                                              [dWxh, dWhh, dWhy],
                                              [mWxh, mWhh, mWhy]):
            adagrad_mem += np.square(dparam)
            # element-wise division and square root; 1e-8 avoids division by zero
            param += -learning_rate * (dparam / np.sqrt(adagrad_mem + 1e-8))
    elif optimization == 'sgd':
        print('SGD sucks man!')
        for param, dparam in zip([Wxh, Whh, Why],
                                 [dWxh, dWhh, dWhy]):
            # plain SGD uses a tenth of the configured learning rate
            param += -learning_rate/10 * dparam
    else:
        raise ValueError(
            "unknown optimization {!r}; expected 'adagrad' or 'sgd'".format(optimization)
        )

            
def sample(initial_state, seed_char_index, n):
    """Generate ``n`` character indices from the model, one step at a time.

    Starting from ``seed_char_index`` and ``initial_state``, each step feeds
    the most recent index through ``forward_pass`` and draws the next index
    from the returned probabilities with ``np.random.choice``. Returns a
    list of ``n + 1`` indices: the seed followed by the sampled ones.
    """
    state = initial_state
    current = seed_char_index
    generated = [current]
    for _ in range(n):
        _, _, step_probs, state = forward_pass([current], state)
        current = np.random.choice(range(vocab_size), p=step_probs[0].ravel())
        generated.append(current)
    return generated


def print_sample(initial_state, seed_char_index, n):
    """Sample ``n`` characters from the model and print them as one string.

    ``initial_state`` and ``seed_char_index`` are forwarded to ``sample``;
    the resulting indices are mapped back to characters through the
    module-level ``ix_to_char`` table.
    """
    # Use the arguments, not the globals `hprev` / `inputs[0]`: otherwise the
    # caller's state and seed are silently ignored.
    indices = sample(initial_state, seed_char_index, n)
    print(''.join(ix_to_char[ix] for ix in indices))


def sliding_window(text, width):
    """Yield ``(window, shifted_window)`` pairs over ``text``.

    Each pair is a slice of ``width`` items and the slice that starts one
    position later. The start position advances by one each time and stops
    early enough that the shifted slice is always full length. Nothing is
    yielded when ``len(text) <= width``.
    """
    stop_before = len(text) - width
    start = 0
    while start < stop_before:
        end = start + width
        yield text[start:end], text[start + 1:end + 1]
        start += 1



### Below are some trials on the ICLR 2017 paper reviews data set (https://www.kaggle.com/ahmaurya/iclr2017reviews/). I use only the abstracts.

In the following I take only one paper abstract and run all epochs on it. When there are only a few possible characters, the RNN overfits and actually produces some real words when sampled.

In [3]:

def get_data(n, path='iclr2017_papers.csv'):
    """Return the first ``n`` non-missing abstracts from the papers CSV.

    Parameters
    ----------
    n : int
        Maximum number of abstracts to return.
    path : str, optional
        Location of the CSV file; it must have an ``abstract`` column.
        Defaults to the file name the notebook has always used, so existing
        ``get_data(n)`` calls are unaffected.

    Returns
    -------
    list
        The ``abstract`` values with missing entries dropped, truncated to
        the first ``n``.
    """
    abstracts = pd.read_csv(path)['abstract'].dropna()
    return abstracts.iloc[:n].tolist()

# Initialise everything for the single-abstract experiment.
abstracts = get_data(1)
data = '\n'.join(abstracts)
# sorted() makes the char <-> index mapping deterministic. Iterating a set of
# strings follows hash order, which Python randomises per process, so
# list(set(data)) would assign different indices after every kernel restart
# and the run would not be reproducible even though numpy is seeded.
chars = sorted(set(data))
vocab_size = len(chars)

print('There are {} characters, {} are unique'.format(len(data), vocab_size))

# Two plain dicts for converting characters to indices and back; simpler than
# dedicated functions.
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

hidden_units = vocab_size*4//5  # size of the hidden layer
seq_length = 18  # number of time steps the RNN is unrolled for per update
learning_rate = 1e-1  # make this 1e-2 for sgd; even then it fluctuates badly, maybe 1e-3 would be better
optimization = 'adagrad'

# Trainable parameters, drawn from a normal distribution with std 0.01.
Wxh = np.random.normal(scale=0.01, size=(hidden_units, vocab_size))  # input to hidden
Whh = np.random.normal(scale=0.01, size=(hidden_units, hidden_units))  # hidden to hidden
Why = np.random.normal(scale=0.01, size=(vocab_size, hidden_units))  # hidden to output


for epoch in range(20):
    epoch_acc_loss = 0
    print(' - ', epoch+1,'. epoch', sep='')
    for num, abstract in enumerate(abstracts):
        abstract_acc_loss = 0
        print(num+1,'. abstract', sep='')
        hprev = np.zeros((hidden_units, 1))  # reset the initial state, i.e. clear the RNN's memory
        if optimization == 'adagrad':
            # Adagrad's squared-gradient accumulators (plain SGD does not use
            # them). Note that they are re-zeroed for every abstract in every
            # epoch, so the effective step size restarts each time.
            mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)

        # Walk over the abstract. sliding_window advances one character at a
        # time, so consecutive windows overlap.
        i = -1  # stays -1 if the abstract is too short to yield any window
        for i, (inputs, expected_outputs) in enumerate(sliding_window(abstract, seq_length)):
            inputs = [char_to_ix[ch] for ch in inputs]
            expected_outputs = [char_to_ix[ch] for ch in expected_outputs]

            xs, hs, ps, hprev = forward_pass(inputs, hprev)

            # loss is the total over the whole window, not a per-character value
            loss = cross_entropy_loss(ps, expected_outputs)
            abstract_acc_loss += loss

            dWxh, dWhh, dWhy = backward_pass(xs, hs, ps, expected_outputs)

            update_parameters()  # reads and updates the module-level parameters

        # Without the guard, an abstract with no window would either raise
        # NameError (first abstract) or divide by a stale window count.
        n_windows = i + 1
        abstract_avg_loss = abstract_acc_loss/n_windows if n_windows else 0.0
        epoch_acc_loss += abstract_avg_loss

    print('average epoch loss', epoch_acc_loss/(num+1))

There are 979 characters, 40 are unique
 - 1. epoch
1. abstract
average epoch loss 28.263477026597336
 - 2. epoch
1. abstract
average epoch loss 22.18818062705942
 - 3. epoch
1. abstract
average epoch loss 20.258261837519267
 - 4. epoch
1. abstract
average epoch loss 19.440408649204738
 - 5. epoch
1. abstract
average epoch loss 19.41274780627505
 - 6. epoch
1. abstract
average epoch loss 18.115550484174843
 - 7. epoch
1. abstract
average epoch loss 17.761373508970326
 - 8. epoch
1. abstract
average epoch loss 17.857820465854278
 - 9. epoch
1. abstract
average epoch loss 17.73807071348975
 - 10. epoch
1. abstract
average epoch loss 17.25307296236233
 - 11. epoch
1. abstract
average epoch loss 17.403057304628394
 - 12. epoch
1. abstract
average epoch loss 17.065101122294386
 - 13. epoch
1. abstract
average epoch loss 17.019714867783623
 - 14. epoch
1. abstract
average epoch loss 16.20878218283214
 - 15. epoch
1. abstract
average epoch loss 15.75983146278173
 - 16. epoch
1. abstract
avera

In [4]:
# Print text generated by the network trained in the previous cell (n=200 sampling steps).
print_sample(hprev, 20, 200)

eneconceperi astabilicpongueProort. Our tabirari, st rantecessary lik.: recursion. f to incessural nugges,or neural architecurante leh urarty necamonust ratecursioretabl nuicssto apor thate a corporate


The following takes the first 10 abstracts and runs epochs on their combined (lower-cased) text.

In [5]:

# Initialise everything for the combined-text experiment: the first 10
# abstracts, joined with newlines and lower-cased.
data = '\n'.join(get_data(10)).lower()
# sorted() makes the char <-> index mapping deterministic. Iterating a set of
# strings follows hash order, which Python randomises per process, so
# list(set(data)) would assign different indices after every kernel restart
# and the run would not be reproducible even though numpy is seeded.
chars = sorted(set(data))
vocab_size = len(chars)

print('There are {} characters, {} are unique'.format(len(data), vocab_size))

# Two plain dicts for converting characters to indices and back; simpler than
# dedicated functions.
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

hidden_units = vocab_size*4//5  # size of the hidden layer
seq_length = 18  # number of time steps the RNN is unrolled for per update
learning_rate = 1e-1  # make this 1e-2 for sgd; even then it fluctuates badly, maybe 1e-3 would be better
optimization = 'adagrad'

# Trainable parameters, drawn from a normal distribution with std 0.01.
Wxh = np.random.normal(scale=0.01, size=(hidden_units, vocab_size))  # input to hidden
Whh = np.random.normal(scale=0.01, size=(hidden_units, hidden_units))  # hidden to hidden
Why = np.random.normal(scale=0.01, size=(vocab_size, hidden_units))  # hidden to output


for num in range(50):
    epoch_acc_loss = 0
    hprev = np.zeros((hidden_units, 1))  # initialise the initial state

    if optimization == 'adagrad':
        # Adagrad's squared-gradient accumulators (plain SGD does not use
        # them). Note that they are re-zeroed at the start of every epoch.
        mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)

    # Walk over the text. sliding_window advances one character at a time,
    # so consecutive windows overlap.
    i = -1  # stays -1 if the text is too short to yield any window
    for i, (inputs, expected_outputs) in enumerate(sliding_window(data, seq_length)):
        inputs = [char_to_ix[ch] for ch in inputs]
        expected_outputs = [char_to_ix[ch] for ch in expected_outputs]

        xs, hs, ps, hprev = forward_pass(inputs, hprev)

        # loss is the total over the whole window, not a per-character value
        loss = cross_entropy_loss(ps, expected_outputs)
        epoch_acc_loss += loss

        dWxh, dWhh, dWhy = backward_pass(xs, hs, ps, expected_outputs)

        update_parameters()  # reads and updates the module-level parameters

    # Without the guard, a text with no window would divide by a stale or
    # undefined window count.
    n_windows = i + 1
    epoch_avg_loss = epoch_acc_loss/n_windows if n_windows else 0.0
    print(num+1,'. epoch', ' average loss ', epoch_avg_loss, sep='')



There are 11075 characters, 53 are unique
1. epoch average loss 30.442283662474296
2. epoch average loss 29.804240122335035
3. epoch average loss 29.024874574189454
4. epoch average loss 28.96572882440713
5. epoch average loss 28.735255303056714
6. epoch average loss 28.73013578900492
7. epoch average loss 28.533745385714447
8. epoch average loss 28.3132707742684
9. epoch average loss 28.53253314656235
10. epoch average loss 28.278888403310134
11. epoch average loss 28.092286291463193
12. epoch average loss 28.216757915585028
13. epoch average loss 28.008012578099795
14. epoch average loss 27.890203206792744
15. epoch average loss 27.760775869816158
16. epoch average loss 27.472150389136388
17. epoch average loss 27.54186228855362
18. epoch average loss 27.57271463106844
19. epoch average loss 27.54594865754184
20. epoch average loss 27.74426489801027
21. epoch average loss 28.147911229596623
22. epoch average loss 27.73776393230429
23. epoch average loss 27.781105105937897
24. epoch a

In [6]:
# Print text generated by the network trained in the previous cell (n=200 sampling steps).
print_sample(hprev, 40, 200)

natilst-of-d-pon contage, on-den conthod coll stat ms aning rom in mom gensfat contsim'ste-aling mad), contl-off-policy, colthe ficientiuct gradienting of stabs on-haing off-m's bleic wetse-polsto gene


The loss drops, but slowly.

I also tried running the RNN on the abstracts without combining them, but nothing worth showing came up in 100 epochs, so I cancelled those runs. Overall I believe that my implementation is correct, but we should not expect too much from a single vanilla RNN.