In [21]:
import numpy as np
import matplotlib.pyplot as plt

In [22]:
# Random seed
np.random.seed(420)

In [23]:
#data = open('shakespeare.txt', 'r').read()
data = open('nescio.txt', 'r').read()
chars = list(set(data))
data_size = len(data)
vocab_size = len(chars)

In [24]:
print(f'Data set is length {data_size}')
print(f'Vocab set is length {vocab_size}')

Data set is length 202367
Vocab set is length 89


In [25]:
# Simple character embedding
char_to_idx = {char:i for i, char in enumerate(chars)}
idx_to_char = {i:char for i, char in enumerate(chars)}

In [26]:
# hyperparameters
hidden_size = 100
seq_length = 25
learning_rate = 1e-1
epochs = 100

In [27]:
# weight parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01 # I think its times 0.01 to avoid exploding gradients
Whh = np.random.randn(hidden_size, hidden_size) * 0.01
Why = np.random.randn(vocab_size, hidden_size) * 0.01

# bias
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))

What should happen:

x = data[0]
y = data[1]

h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h))

y_pred = np.dot(Why, h)

loss = y - y_pred (Simplified, we will use Cross-entropy loss for it)

In [53]:
def forward(xs, targets, hidden, Wxh, Whh, Why, bh, by):
    """Calculate the forward pass
    Calculate the cross-entropy loss, 
    which is based on the softmax functon and the negative log likelyhood.
    """
    
    y_preds = {}
    hs = {}
    softmax_probs = {}
        
    hs[-1] = np.copy(hidden)
    loss = 0
    
    for i in range(len(xs)):
        x = xs[i]
        x_vec = np.zeros((vocab_size, 1)) # vectorize the input
        x_vec[x] = 1

        # Calculate the new hidden, which is based on the input and the previous hidden layer
        hs[i] = np.tanh(np.dot(Wxh, x_vec) + np.dot(Whh, hs[i - 1]) + bh)
        # Predict y
        y_preds[i] = np.dot(Why, hs[i]) + by
        
        softmax_probs[i] = np.exp(y_preds[i]) / np.sum(np.exp(y_preds[i])) # Softmax probabilty
        loss += -np.log(softmax_probs[i][targets[i], 0]) #Negative loss likelyhood
    
    prev_hidden = hs[len(xs) - 1]

    return hs, softmax_probs, loss, prev_hidden

In [54]:
# Test for the forward pass
xs = [char_to_idx[ch] for ch in data[100:125]]
prev_hidden = np.zeros((hidden_size, 1))
targets = [char_to_idx[ch] for ch in data[101:126]]

hs, softmax_probs, loss, prev_hidden = forward(xs, targets, prev_hidden)

# Karpathy code to test
test_hs = {}
test_ys = {}
test_xs = {}
test_loss = 0
test_ps = {}
test_hs[-1] = np.copy(prev_hidden)

for t in range(len(xs)):
    test_xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    test_xs[t][xs[t]] = 1
    test_hs[t] = np.tanh(np.dot(Wxh, test_xs[t]) + np.dot(Whh, test_hs[t-1]) + bh) # hidden state
    test_ys[t] = np.dot(Why, test_hs[t]) + by # unnormalized log probabilities for next chars
    test_ps[t] = np.exp(test_ys[t]) / np.sum(np.exp(test_ys[t])) # probabilities for next chars
    test_loss += -np.log(test_ps[t][targets[t],0]) # softmax (cross-entropy loss)

assert loss - test_loss < 0.01 or test_loss - loss > 0.01 #Klein verschil in loss kan

TypeError: forward() missing 5 required positional arguments: 'Wxh', 'Whh', 'Why', 'bh', and 'by'

In [55]:
def loss_function(y_preds, target): 
    """Calculate the cross-entropy loss, 
    which is based on the softmax functon and the negative log likelyhood."""
    
    softmax_probs = {}
    loss = 0
    
    for i in range(len(y_preds)):
        softmax_probs[i] = np.exp(y_preds[i]) / np.sum(np.exp(y_preds[i])) # Softmax probabilty

        loss += -np.log(softmax_probs[i][target[i], 0]) #Negative loss likelyhood
    
    return softmax_probs, loss

In [56]:
#Test for loss function
target = [char_to_idx[ch] for ch in data[101]]
softmax_probs, loss = loss_function(y_preds, target)

# Karpathy code
test_loss = 0
test_ps = {}
test_ps[0] = np.exp(test_ys[0]) / np.sum(np.exp(test_ys[0])) # probabilities for next chars
test_loss += -np.log(test_ps[0][target[0],0]) # softmax (cross-entropy loss)

assert softmax_probs[0][0] == test_ps[0][0]
assert loss == test_loss

NameError: name 'y_preds' is not defined

In [57]:
def initialize_gradients():
    """Initialize the gradients to 0.
    """
    
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dby, dbh = np.zeros_like(by), np.zeros_like(bh)
    
    return dWxh, dWhh, dWhy, dby, dbh

In [58]:
def backward(softmax_probs, hs, xs, targets):
    """Perform the backward pass"""
    
    dWxh, dWhh, dWhy, dby, dbh = initialize_gradients()
    
    # Initialize empty next hidden layer for the first backprop
    dhnext = np.zeros_like(hs[0])
    
    for i in reversed(range(len(xs))):
        # X to vector
        x = xs[i]    
        x_vec = np.zeros((vocab_size, 1))
        x_vec[x] = 1

        dy = np.copy(softmax_probs[i])
        dy[targets[i]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here

        dby += dy   
        dWhy += np.dot(dy, hs[i].T)
        dh = np.dot(Why.T, dy) + dhnext
        dhraw = (1 - hs[i] * hs[i]) * dh  
        dWxh += np.dot(dhraw, x_vec.T)
        dWhh += np.dot(dhraw, hs[i-1].T)
        dbh += dhraw
        dhnext = np.dot(Whh.T, dhraw)

    # Clip to prevent exploding gradients
    for dparam in [dWhy, dWxh, dWhh, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)
    
    return dWxh, dWhh, dWhy, dby, dbh

In [59]:
# Test for gradient initialization
dWxh, dWhh, dWhy, dby, dbh = initialize_gradients()


# Karpathys code
test_dWxh, test_dWhh, test_dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
test_dbh, test_dby = np.zeros_like(bh), np.zeros_like(by)

assert dWhy.shape == test_dWhy.shape
assert dbh.shape == test_dbh.shape

In [60]:
#Test backward
x = [char_to_idx[ch] for ch in data[100]]
dWxh, dWhh, dWhy, dby, dbh = backward(softmax_probs, hs, x, target)

#Karpathy code
test_dhnext = np.zeros_like(hs[0])
test_dy = np.copy(test_ps[0])
test_dy[target[0]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
test_dWhy += np.dot(test_dy, hs[0].T)
test_dby += test_dy
test_dh = np.dot(Why.T, test_dy) + test_dhnext # backprop into h
test_dhraw = (1 - hs[0] * hs[0]) * test_dh # backprop through tanh nonlinearity
test_dbh += test_dhraw

x_vec = np.zeros((vocab_size,1)) # encode in 1-of-k representation
x_vec[x] = 1

test_dWxh += np.dot(test_dhraw, x_vec.T)
test_dWhh += np.dot(test_dhraw, hs[-1].T)
test_dhnext = np.dot(Whh.T, test_dhraw)

for dparam in [test_dWxh, test_dWhh, test_dWhy, test_dbh, test_dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    
print(test_dWhy[0][0])
print(dWhy[0][0])

assert test_dWhy[0][0] == dWhy[0][0]
assert test_dby[0][0] == dby[0][0]
assert test_dWxh[0][0] == dWxh[0][0]
assert test_dWhh[0][0] == dWhh[0][0]
assert test_dhnext[0][0] == dhnext[0][0]
assert test_dbh[0][0] == dbh[0][0]

NameError: name 'softmax_probs' is not defined

In [61]:
def update_gradients(Wxh, Whh, Why, by, bh, dWxh, dWhh, dWhy, dby, dbh, learning_rate):
    """Update the gradients using stochastic gradient descent."""
    Wxh -= learning_rate * dWxh
    Whh -= learning_rate * dWhh
    Why -= learning_rate * dWhy
    bh -= learning_rate * dbh
    by -= learning_rate * dby
    
    return Wxh, Whh, Why, bh, by

In [62]:
def adagrad(Wxh, Whh, Why, by, bh, dWxh, dWhh, dWhy, dby, dbh, mWxh, mWhh, mWhy, mbh, mby, learning_rate):
    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update
        
    return Wxh, Whh, Why, by, bh, mWxh, mWhh, mWhy, mbh, mby

In [68]:
def sample(h, seed_ix, n, Wxh, Whh, Why, bh, by):
    """
    Sample a sequence of characters from the model
    h is the memory state, seed_ix is seed letter for the first time step
    """
    
    if isinstance(seed_ix, int):
        x = np.zeros((vocab_size, 1))
        x[seed_ix] = 1
        ixes = []
        for t in range(n):
            h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
            y = np.dot(Why, h) + by
            p = np.exp(y) / np.sum(np.exp(y))
            ix = np.random.choice(range(vocab_size), p=p.ravel())
            x = np.zeros((vocab_size, 1))
            x[ix] = 1
            ixes.append(ix)
    else:
        print(seed_ix)
        xs = [char_to_idx[ch] for ch in seed_ix]
        ixes = list(xs[:-1])
        
        x = np.zeros((vocab_size, 1))
        x[xs[-1]] = 1
        for t in range(n):
            h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
            y = np.dot(Why, h) + by
            p = np.exp(y) / np.sum(np.exp(y))
            ix = np.random.choice(range(vocab_size), p=p.ravel())
            x = np.zeros((vocab_size, 1))
            x[ix] = 1
            ixes.append(ix)
        
        
    return ixes

In [71]:
def RNN(data, seq_length, epochs):
    """Perform RNN over the data"""
    data_len = len(data)
    
    # Initialize weights
    Wxh = np.random.randn(hidden_size, vocab_size) * 0.01
    Whh = np.random.randn(hidden_size, hidden_size) * 0.01
    Why = np.random.randn(vocab_size, hidden_size) * 0.01
    
    # bias
    bh = np.zeros((hidden_size, 1))
    by = np.zeros((vocab_size, 1))
    
    weights = [Wxh, Whh, Why, bh, by]
    
    # Store losses
    losses = []
    smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
    
    # Memory voor Adagrad
    mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    mbh, mby  = np.zeros_like(bh), np.zeros_like(by)
    
    # Loop over the epochs
    for i in range(epochs):
        n = 0
        # Loop over the amount of sequences
        sequences_amount = int(data_len // seq_length)
        
        count = 0
        
        for j in range(sequences_amount):
            
            start_pos = seq_length * j
            # Reset and go from the start of data
            if n == 0 or start_pos + seq_length + 1 >= data_len:
                prev_hidden = np.zeros((hidden_size, 1))
             
            # Embed the inputs and targets
            xs = [char_to_idx[ch] for ch in data[start_pos:start_pos+seq_length]]
            targets = [char_to_idx[ch] for ch in data[start_pos+1:start_pos+seq_length+1]]

            # Forward pass
            hs, softmax_probs, loss, prev_hidden = forward(xs, targets, prev_hidden, *weights)
    
            #Backward
            dWxh, dWhh, dWhy, dby, dbh = backward(softmax_probs, hs, xs, targets)

            # Update gradients with adagrad
            Wxh, Whh, Why, by, bh, mWxh, mWhh, mWhy, mbh, mby = adagrad(Wxh, Whh, Why, by, bh, dWxh, dWhh, dWhy, dby, dbh, mWxh, mWhh, mWhy, mbh, mby, learning_rate)

            # Update gradients with gradient descent
            #Wxh, Whh, Why, bh, by = update_gradients(Wxh, Whh, Why, by, bh, dWxh, dWhh, dWhy, dby, dbh, learning_rate)
            
            smooth_loss = smooth_loss * 0.999 + loss * 0.001
            
            if n % 100 == 0:
                losses.append(smooth_loss)
                print(f'{i + 1}: {n} / {sequences_amount}: {smooth_loss}')
                
                
                # Print a sample
                sample_ix = sample(prev_hidden, xs[0], 200, *weights)
                txt = ''.join(idx_to_char[ix] for ix in sample_ix)
                print('----- \n' + txt + '  \n------')

            n += 1
            
        print(f'Finished epoch {i + 1}.')
        
        
    # Print gefeliciteerd bericht
    gefeliciteerd = sample(prev_hidden, 'Gefeliciteerd', 50, *weights)
    txt = ''.join(idx_to_char[ix] for ix in gefeliciteerd)
    print('----- \n' + txt + '  \n------')
        
    return losses

In [72]:
epochs = 1
losses = RNN(data, seq_length, epochs)

1: 0 / 8094: 112.21591544290557
----- 
wDL8ü3rkFsxZFòSgf mSG;:é!qs-écM3xe;T8tlMkDbeZP(Bâôm( B2â-7y
züFKâròê0lKTE/ë.Sa5j(3û8ó1eG  1oReK;4yoi
û(UJIHI?cGDM1h6(Sxi át.7Dc?ïV,ZWç(i3W4ál'./jLe)y9tKk,wIçBVwMRVgPfOC?âçxáMD3I!(N6"mO SuèIfdNRK;ô;D'6  
------
1: 100 / 8094: 111.72232367033438
----- 
,etiignisjic ceoicitHhhre 
irhhh,whcHdratnpiditihtojcnasiptrwiiti jjeircccjc
tcepeeke itchdcccn ejipte.rtesita geietd whetdijirntge
ditcDhtsneeeevnahai,cecie,estiE ngtec
cDhhteedscetnecd
u near d cnec  
------
1: 200 / 8094: 109.00269987444308
----- 
u do draidaaa
   ez tt dguD agunieork deelet etlvvucboulgieninh hd  Dz  ugmnhgnar 
tz  uud innreaggddgsd
  hnodgau tvndgn umnd "eiorg nkl  z gl
aDguGng,kizg.lg ai
 rh"uwaveuacb Duijgasggsgjtek guagDtg  
------
1: 300 / 8094: 106.24318227319259
----- 
dsv' 'dwenudjskisdlenmkzopeecirlal ed'giatenlt
athee soj. ''eeszisne i.tg me we'e ezjieszote.  op lj  tononinl''z'esrer.umj'nae  n wl  'ezeenhn .zi p' egvn den  lninsek'e eess sUl ls oe t' d an esronn  
-----

1: 3300 / 8094: 63.35675686040822
----- 
 Zatvok. , a8rt.

a8 Fr wantemooppthruchtjend zwu ds dalenddett zakmer. Zar deekladori "Zat .sV
vanlie
toBwaen ee  e daEn ketem "uikemagen sten kt haan denr Lan laaarBa pig sp haateu. Darteg.
Dan na t  
------
1: 3400 / 8094: 63.23306161231633
----- 
hik even zer pe mend een ?ader waoet, Iaan eid endt dubist ze van ilBo k een nienken
ke."
"?ig wer Jien, 't sskipt, Endael deesslet, watdindt 't taat rteenuat s Ak dazopaak tfe 
mn

k bier watiamint t  
------
1: 3500 / 8094: 62.98942644689497
----- 
ov ninie z'ers derj.
der
nadejte, es vee ent mf tt dad een nind

kap. we smget neen woed z'ns en movlen 't zot oop gen ooet hmokjveeZd
s kers amroBnerooherstelen aedeem
ge BwaezeuHschors doorie deei    
------
1: 3600 / 8094: 62.66048765436689
----- 
n ildet Jaan de rins weeind de st eek beerdaars hicht duen ap ienscelorassscheri. dadiem ele marlanoomaareufleldugen en nezsttjelils ood vatk ij, En an daalaleo
dlife:
"ozilgen. Els on weelun zitdegen  
---

1: 6600 / 8094: 56.33679174816946
----- 
 nachtenk lijeevaviet, gaar kpen mundet?
en maafelken, swoekóordeder en el gezij 't waad móek Bieft en en lede getankens vit en Nen er opd kafsstan deiker.

Eg datreen vaat Zoos on verschachupiste mee  
------
1: 6700 / 8094: 56.23461047228062
----- 
ri guori inzieri wadrer een teidalr gege deblerd gem asts eler muit me moen die, Maai waar he ret here haad oPden hond, dombrooken zs deezet-inbel-t. daartoedschan-iteden opdereuroegagegen Epkrsdeczis  
------
1: 6800 / 8094: 55.878325775986994
----- 
nhharkens bilkounag zan ken
Dooooan, zij hacht denk. donden en ul eerbschenen zij wachn nod or zon kovzeenten. oven denin schtene
 rom min pi tooe zoofblim en Hoyzegjnn, acnisten
Bin hoo st aalloi kul  
------
1: 6900 / 8094: 55.921335822347004
----- 
en ig 't 'n Hoyent."Gvat el hoen Bavit 'tehins van veebis bek, Doornat. Dat oijts st
er paar ie cn aad naprtichachter de 's Gop, ee moeken wan hond davan dellerren. ek al aalroulin im dek zos deen zej  
-

In [21]:
plt.plot(losses)

NameError: name 'losses' is not defined

In [24]:
def lossFun(xs, targets, hidden):
    y_preds = {}
    hs = {}
    softmax_probs = {}
        
    hs[-1] = np.copy(hidden)
    loss = 0
    
    for i in range(len(xs)):
        x = xs[i]
        x_vec = np.zeros((vocab_size, 1)) # vectorize the input
        x_vec[x] = 1

        # Calculate the new hidden, which is based on the input and the previous hidden layer
        hs[i] = np.tanh(np.dot(Wxh, x_vec) + np.dot(Whh, hs[i - 1]) + bh)
        # Predict y
        y_preds[i] = np.dot(Why, hs[i]) + by
        
        softmax_probs[i] = np.exp(y_preds[i]) / np.sum(np.exp(y_preds[i])) # Softmax probabilty
        loss += -np.log(softmax_probs[i][targets[i], 0]) #Negative loss likelyhood
    
    prev_hidden = hs[len(xs) - 1]
    
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dby, dbh = np.zeros_like(by), np.zeros_like(bh)
    
    # Initialize empty next hidden layer for the first backprop
    dhnext = np.zeros_like(hs[0])
    
    for i in reversed(range(len(xs))):
        # X to vector
        x = xs[i]    
        x_vec = np.zeros((vocab_size, 1))
        x_vec[x] = 1

        dy = np.copy(softmax_probs[i])
        dy[targets[i]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here

        dby += dy   
        dWhy += np.dot(dy, hs[i].T)
        dh = np.dot(Why.T, dy) + dhnext
        dhraw = (1 - hs[i] * hs[i]) * dh  
        dWxh += np.dot(dhraw, x_vec.T)
        dWhh += np.dot(dhraw, hs[i-1].T)
        dbh += dhraw
        dhnext = np.dot(Whh.T, dhraw)

    # Clip to prevent exploding gradients
    for dparam in [dWhy, dWxh, dWhh, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)
        
    return loss, dWxh, dWhh, dWhy, dbh, dby, prev_hidden

In [50]:
def lossFun_v2(xs, targets, hidden):
    # Forward pass
    hs, softmax_probs, loss, prev_hidden = forward(xs, targets, hidden)
    
    #Backward
    dWxh, dWhh, dWhy, dby, dbh = backward(softmax_probs, hs, xs, targets)

    # Update gradients with adagrad
    uWxh, uWhh, uWhy, uby, ubh, umWxh, umWhh, umWhy, umbh, umby = adagrad(Wxh, Whh, Why, by, bh, dWxh, dWhh, dWhy, dby, dbh, mWxh, mWhh, mWhy, mbh, mby, learning_rate)
    
    return loss, uWxh, uWhh, uWhy, uby, ubh, umWxh, umWhh, umWhy, umbh, umby, prev_hidden

In [51]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0: 
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  inputs = [char_to_idx[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_idx[ch] for ch in data[p+1:p+seq_length+1]]

  # sample from the model now and then
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(idx_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt, ))

  # forward seq_length characters through the net and fetch gradient
  loss, Wxh, Whh, Why, by, bh, mWxh, mWhh, mWhy, mbh, mby, prev_hidden = lossFun_v2(inputs, targets, hprev)

  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 100 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

  p += seq_length # move data pointer
  n += 1 # iteration counter

----
 PrttsstoootGooooooss'sDsoootDot ooooooosstuosssboob GoosssotoooooDooosDootoooooooooooootososooGoooooosogstooGgoooosostoooostotobssgs'osotoostoossssbsGootoootssGoooss'touoooooooooGotosoooooG'tsssGstoss 
----
iter 0, loss: 114.925547
----
  jareen zot, zlonk ot zrzlendzozbnt ztaat dtdoar zaan zap zmt kk ze kl tonek nzzd zt z'dHaande dhak zunt zin zok an elll z tun voedzzer 'm di kt zor zf hatTk zk 'nt zrat zaa  mogzkEk zal, zt zut zzbn  
----
iter 100, loss: 115.111534
----
 te nekechtijrendst 'manden wak deeeron denhaeltek
"viddet dr lsTPoddid reskht
tkbiesginen Ar.s dd, 'e dachdesvd.ens
Zdchovssgd wen taar dn pechralkecd sd
sitdendens.
tok
'ird. eng geind. den dlsst or  
----
iter 200, loss: 110.904239
----
 jez
matt de illen bie torin dirt  pt do ded sensorort e e hegeg venog teen niden der dat.in han me td dlg dord lidt g.
da dn 'auen de domiwel. breer det Nooe oelter wenwoodden dee, irigtg. hijd de ber 
----
iter 300, loss: 106.716997
----
  wen De dinf. zod,
te'lli
jaaen watd

KeyboardInterrupt: 