In [1]:
import numpy as np

In [16]:
# Read the whole training corpus; the context manager closes the file handle
# as soon as the text has been read instead of leaving it open.
with open('sherlock.txt', 'r') as corpus_file:
    data = corpus_file.read()

# Distinct characters in sorted order, so the char <-> int ids are the same on every run
chars = sorted(set(data))
data_size = len(data)
vocab_size = len(chars)
print('Data has {} letters, {} unique letters'.format(data_size, vocab_size))

Data has 54 letters, 3 unique letters


In [17]:
# Lookup tables between characters and integer ids; ids follow the order of `chars`
int_to_char = dict(enumerate(chars))
char_to_int = {ch: idx for idx, ch in int_to_char.items()}
print(char_to_int)

{'1': 0, '2': 1, '3': 2}


In [20]:
# One-hot encoding demo: a (vocab_size, 1) column of zeros with a single 1
# in the row given by the character's integer id
char_vec_onehot = np.zeros((vocab_size, 1))
char_vec_onehot[char_to_int['1'], 0] = 1.0
print(char_vec_onehot.reshape(-1))

[ 1.  0.  0.]


In [19]:
# Hyperparameters
hidden_size = 300       # number of units in the recurrent hidden layer
seq_length = 28         # number of characters (time steps) per training window
learning_rate = 0.15    # base step size of the Adagrad update

# Seed NumPy's global RNG so the weight initialisation below (and any later
# np.random sampling) gives the same values after Restart Kernel -> Run All.
random_seed = 0
np.random.seed(random_seed)

# Model Parameters
# Small random weights: input->hidden, hidden->hidden, hidden->output
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
Why = np.random.randn(vocab_size, hidden_size) * 0.01

# Biases are column vectors initialised to ones
h_bias = np.ones((hidden_size, 1))
y_bias = np.ones((vocab_size, 1))

# Training

In [None]:
def lossFun(inputs, targets, hprev):
    """Run one forward and backward pass of the character-level RNN.

    Reads the module-level parameters Wxh, Whh, Why, h_bias, y_bias and
    vocab_size; it does not modify them.

    Args:
        inputs: sequence of integer character ids, one per time step.
        targets: sequence of integer character ids; targets[t] is the id the
            network should predict at step t. Must have at least len(inputs)
            entries.
        hprev: column vector holding the hidden state before the first step.

    Returns:
        Tuple (loss, dWxh, dWhh, dWhy, dbh, dby, h_last):
        the cross-entropy loss summed over all steps, the gradient of that
        loss with respect to each parameter (clipped element-wise to
        [-5, 5]), and the hidden state after the last step. For an empty
        sequence the loss is 0, all gradients are zero and h_last is a copy
        of hprev.
    """
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0

    # Forward Pass
    for t in range(len(inputs)):
        # Encode as one hot
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + h_bias)
        scores = np.dot(Why, hs[t]) + y_bias  # Unnormalized log probabilities for next char
        # Softmax. Subtracting the largest score first leaves the result
        # unchanged mathematically but keeps np.exp from overflowing to inf
        # (which would turn the probabilities, loss and gradients into NaN).
        shifted = scores - np.max(scores)
        exp_shifted = np.exp(shifted)
        normalizer = np.sum(exp_shifted)
        ps[t] = exp_shifted / normalizer
        # Cross-entropy, i.e. -log(ps[t][target]), written in log-sum-exp
        # form so a vanishing probability does not produce log(0)
        loss += np.log(normalizer) - shifted[targets[t], 0]

    # Backward Pass
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(h_bias), np.zeros_like(y_bias)
    # hs[-1] always exists (hs[0] does not when inputs is empty)
    dhnext = np.zeros_like(hs[-1])
    for t in reversed(range(len(inputs))):
        # Gradient of softmax + cross-entropy w.r.t. the scores:
        # predicted probabilities minus the one-hot target, i.e. -(t_i - y_i)
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        # Hidden state receives gradient from the output and from step t+1
        dh = np.dot(Why.T, dy) + dhnext
        dz = (1 - hs[t] * hs[t]) * dh  # back through tanh: d tanh(z)/dz = 1 - tanh(z)^2
        dbh += dz
        dWxh += np.dot(dz, xs[t].T)
        dWhh += np.dot(dz, hs[t-1].T)
        dhnext = np.dot(Whh.T, dz)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)    # Clip to mitigate exploding gradient

    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [21]:
# Training-loop settings
max_iters = 100000      # number of parameter updates to run
sample_every = 1000     # report the loss and print a sample every this many iterations
sample_length = 200     # number of characters generated for each printed sample

# Every window needs seq_length input characters plus one more for the shifted
# targets; fail early with a clear message instead of an IndexError inside lossFun.
if len(data) < seq_length + 1:
    raise ValueError('data has {} characters but at least {} are required '
                     'for seq_length = {}'.format(len(data), seq_length + 1, seq_length))

iters = 0
p = 0    # index in data where the current window starts
# Adagrad memory: running sum of squared gradients, one array per parameter
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(h_bias), np.zeros_like(y_bias)
# Start the running average at the loss of a uniform guess over one window
smooth_loss = -np.log(1.0/vocab_size) * seq_length
# NOTE: sampleSentence is defined in a later cell of this notebook; that cell
# has to be run before this one, otherwise the first report raises NameError.
while iters < max_iters:
    if p+1+ seq_length > len(data) or iters == 0:
        hprev = np.zeros((hidden_size, 1))        # Clear RNN hidden state
        p = 0                                     # Start at the beginning of data

    # Targets are the inputs shifted one character to the right
    inputs = [char_to_int[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_int[ch] for ch in data[p+1:p+1+seq_length]]

    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    # Print samples at different training intervals
    if iters % sample_every == 0:
        print('Iter: {}, loss {}'.format(iters, smooth_loss))
        sampleSentence(hprev, inputs[0], sample_length)
        print('\n')

    # Adagrad Update (in place, so the module-level parameter arrays change)
    for param, dparam, mem in zip([Wxh, Whh, Why, h_bias, y_bias],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)

    # Move onto next batch of data
    p += seq_length
    iters += 1

Iter: 0, loss 30.761140967083826
12333332332132322231212211332223213122223133121131222211111332331333223221212323132322113211223112133233311123131332232331221323311311323213123332232321231331111133221332121313311332323211113112121122


Iter: 1000, loss 34.34189089863277
21313232233133313331322111222322311131331333131332311311123121223211333211321323133222323112332121131133212232123323213121222222331322321112232123313321131332231213332121132113321323133321113221211212


Iter: 2000, loss 30.60410749769578
32113311312121322313311212322311321313112113211233231321321211232311133221313121232111321113312122213121332121221212121131133123232211223321231223312333111233212332233233112221222133212221112312321232


Iter: 3000, loss 29.228137593540087
23232113333132323322113222321133322231312132213122223323211222322122131212122122211222112221212232122223321111222312313222332322231331132212331321122311231321112223221332112333233131233121131231333222


Iter: 4000, loss 28.693198319841837
3221122132333

KeyboardInterrupt: 

## Generating a Sentence

In [8]:
def sampleSentence(h, seed_char_int, n):
    """Sample n characters from the RNN and print them as a single string.

    Reads the module-level parameters Wxh, Whh, Why, h_bias, y_bias and
    vocab_size, and the int_to_char lookup table. Draws from NumPy's global
    random state.

    Args:
        h: the initial hidden state.
        seed_char_int: the seed character for the sequence (char encoded as int).
            It is fed to the network but is not part of the printed text.
        n: desired sequence length.
    """
    x = np.zeros((vocab_size, 1))
    x[seed_char_int] = 1

    genList = []

    for _ in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + h_bias)
        y = np.dot(Why, h) + y_bias  # Unnormalized log probabilities for next char
        # Softmax with the largest score subtracted first, so np.exp cannot
        # overflow and hand NaN probabilities to np.random.choice
        exp_y = np.exp(y - np.max(y))
        probs = exp_y / np.sum(exp_y)

        # Draw the next character id and feed it back in as the next input
        sampleChar = np.random.choice(vocab_size, p=probs.ravel())
        x = np.zeros((vocab_size, 1))
        x[sampleChar] = 1
        genList.append(sampleChar)

    out_txt = ''.join(int_to_char[ix] for ix in genList)

    print(out_txt)

# Run sample
# NOTE: this rebinds the module-level hprev that the training loop also uses.
hprev = np.zeros((hidden_size, 1))
# Seed with 'a' when the vocabulary contains it; otherwise fall back to the
# first vocabulary character so a corpus without 'a' does not raise KeyError.
seed_char = 'a' if 'a' in char_to_int else chars[0]
sampleSentence(hprev, char_to_int[seed_char], 200)

(éM(ZIITZcbd&5Mq.WTylPwrahtqFMPLBJV0"8WU.ulkWih4D7BeRYHm:-0ZRjB2Zig&.ijrWHw3BbKS3.-m5)pT"YrIKkNf1:5me:L!y"cOx.IJé 8zM(OlW?inOQh-fé
PRuFQNzK))&2lé)DCl!V8"gezD-1
pLwv4:Jjb) ObbsZIIry2ts
DJévLAG8R&WDrlwI
