In [139]:
import os
# Select the GPU *before* TensorFlow creates a session: the CUDA runtime reads
# these variables when it is first initialised, so setting them after
# tf.Session(...) has been constructed is too late to affect device selection.
os.environ["CUDA_DEVICE_ORDER"]='PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"]='2'

import tensorflow as tf
#gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3333)
#sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
# Let TensorFlow grow its GPU memory allocation on demand instead of
# reserving the whole device up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

import keras
from keras.utils.data_utils import get_file
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Dense, Lambda, Flatten, Dropout
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.optimizers import Adam, sgd
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.regularizers import l1, l2
from keras.preprocessing import image, sequence
from keras.layers import SimpleRNN

import pandas as pd
import numpy as np

#if 'session' in locals() and session is not None:
#    print('Close interactive session')
#    session.close()

## Setup
1. download collected works of Nietzsche
2. get character map
3. create idx <-> char map

In [11]:
# Download the corpus (get_file is given the file name and its origin URL) and
# read it into memory.
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Use a context manager so the file handle is closed as soon as the text has
# been read instead of being left open for the lifetime of the kernel.
with open(path) as f:
    text = f.read()
print("corpus length:", len(text))

corpus length: 600901


In [12]:
# Distinct characters of the corpus, in sorted order.
chars = sorted(set(text))
# One more than the number of distinct characters; presumably this reserves
# index 0 for the "\0" padding character inserted into chars afterwards.
vocab_size = 1 + len(chars)
print("total chars:", vocab_size)

total chars: 86


In [13]:
# Sometimes it's useful to have a zero value in the dataset, e.g. for padding.
# The guard keeps this cell idempotent: re-running it must not insert a second
# "\0", which would shift every character index by one.
if chars[:1] != ["\0"]:
    chars.insert(0, "\0")
''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

In [14]:
# Two-way lookup between characters and their position in chars.
char_indices = {ch: pos for pos, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))
# The whole corpus re-encoded as a list of integer character ids.
idx = [char_indices[ch] for ch in text]
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [15]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## 3 character model
1. Create four lists, each taking every 3rd character of the corpus (starting at the 0th, 1st, 2nd and 3rd character respectively); the first three lists are the inputs and the fourth is the output
2. Define number of latent factors
3. Create inputs and embedding outputs for each of the 3 character inputs

In [16]:
cs=3
# Start position of every window: every cs-th index of the encoded corpus.
window_starts = range(0, len(idx)-1-cs, cs)
# c1..c3 hold the characters at offsets 0..2 of each window; c4 holds the
# character at offset 3, i.e. the one that follows the window.
c1_dat, c2_dat, c3_dat, c4_dat = (
    [idx[start+offset] for start in window_starts]
    for offset in range(cs+1)
)

In [17]:
# inputs: one array per character position, each without its last two entries
x1, x2, x3 = (
    np.stack(position_ids[:-2])
    for position_ids in (c1_dat, c2_dat, c3_dat)
)

In [18]:
# output: the id of the character that follows each 3-character window,
# trimmed with the same [:-2] slice as the inputs so that x1, x2, x3 and y
# all have the same length
y = np.stack(c4_dat[:-2])

In [19]:
x1.shape, y.shape

((200297,), (200297,))

In [20]:
n_fac = 42

In [21]:
def embedding_input(name, n_in, n_out):
    """Create a single-integer input and its flattened embedding.

    name  -- name given to the Input layer
    n_in  -- first argument of the Embedding layer (vocab_size at the call sites)
    n_out -- second argument of the Embedding layer (n_fac at the call sites)

    Returns the pair (input tensor, flattened embedding output).
    """
    char_input = Input(shape=(1,), dtype="int64", name=name)
    embedded = Embedding(n_in, n_out, input_length=1)(char_input)
    flattened = Flatten()(embedded)
    return char_input, flattened

In [22]:
# One (input, flattened embedding) pair per character position, created in
# the order c1, c2, c3.
(c1_in, c1), (c2_in, c2), (c3_in, c3) = [
    embedding_input(layer_name, vocab_size, n_fac)
    for layer_name in ("c1", "c2", "c3")
]

## Create and train model
1. Feed the inputs through successive hidden layers to build the model
2. Create model from inputs and output, then compile and fit

In [26]:
# Three layer objects, each created once and then applied several times in
# the following cells, so every character position goes through the same layers.
n_hidden = 256 # the size of our hidden state
dense_in = Dense(n_hidden, activation="relu") # flattened character embedding -> hidden
dense_hidden = Dense(n_hidden, activation="tanh") # hidden -> hidden; TODO: the reason for choosing tanh here is not recorded - confirm
dense_out = Dense(vocab_size, activation="softmax") # hidden -> distribution over vocab_size character ids

In [27]:
# First character: embedding -> dense_in -> dense_hidden. The second layer is
# not specific to this character: dense_hidden is the hidden-to-hidden step
# and is applied again after the second character.
c1_hidden = dense_in(c1)
hidden_2 = dense_hidden(c1_hidden)

In [29]:
# Second character: its dense_in output is merged with the hidden state
# carried over from the first character, so it does not get a separate hidden
# layer of its own. merge is called without a mode - presumably the Keras
# default (element-wise sum); TODO confirm.
c2_dense = dense_in(c2)
c2_hidden = merge([c2_dense, hidden_2])
hidden_3 = dense_hidden(c2_hidden)

In [30]:
# Third character: merged with the hidden state from the first two characters,
# then passed to the softmax output layer to predict the fourth character.
c3_dense = dense_in(c3)
c3_hidden = merge([c3_dense, hidden_3])
c4_out = dense_out(c3_hidden)

In [31]:
# Three integer inputs (one per character) -> softmax output for the next character
model = Model([c1_in, c2_in, c3_in], c4_out)

In [32]:
# Pass the learning rate to the optimizer constructor instead of rebinding
# `model.optimizer.lr` to a Python float afterwards: the rebinding replaces the
# optimizer's backend variable with a constant, which makes it impossible to
# adjust the rate later through keras.backend.set_value.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.000001))

In [33]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1624abb41d0>

In [34]:
# NOTE(review): `model.optimizer.lr` is rebound to a plain Python float here.
# Keras may not pick this up once the training function has been built by an
# earlier fit() call, in which case this run still trains at the previous
# rate; keras.backend.set_value on the lr variable is the usual way to change
# the rate - TODO confirm.
model.optimizer.lr=0.01
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1624e9287f0>

In [47]:
# NOTE(review): `model.optimizer.lr` is rebound to a plain Python float here.
# Keras may not pick this up once the training function has been built by an
# earlier fit() call, in which case this run still trains at the previous
# rate; keras.backend.set_value on the lr variable is the usual way to change
# the rate - TODO confirm.
model.optimizer.lr=0.000001
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1624bdf4e48>

In [101]:
# NOTE(review): `model.optimizer.lr` is rebound to a plain Python float here.
# Keras may not pick this up once the training function has been built by an
# earlier fit() call, in which case this run still trains at the previous
# rate; keras.backend.set_value on the lr variable is the usual way to change
# the rate - TODO confirm.
model.optimizer.lr=0.1
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x1624be0c048>

## Test model

In [95]:
def get_next(inp):
    """Predict the character that follows the string `inp`.

    Every character of `inp` is looked up in `char_indices` and wrapped in a
    one-element array, so the model receives one input per character. The
    predicted probabilities and the winning index are printed, then the
    matching entry of `chars` is returned.
    """
    arrs = []
    for ch in inp:
        arrs.append(np.array(char_indices[ch])[np.newaxis])
    p = model.predict(arrs)
    i = np.argmax(p)
    print(p, i)
    return chars[i]

In [106]:
get_next(" it")

[[  2.54865657e-08   1.49483392e-02   2.31783956e-01   3.42995889e-04
    2.17914209e-03   2.19642388e-04   2.41972826e-04   2.25413038e-04
    1.16145182e-02   4.25912021e-03   3.10039613e-03   4.65834455e-05
    1.75333931e-04   1.07757573e-04   6.08266528e-05   7.72587300e-05
    6.23737724e-05   4.75745765e-05   7.50205363e-05   5.84467343e-05
    4.39511459e-05   6.12729753e-04   5.35177591e-04   2.77365005e-04
    3.13706842e-04   8.68282572e-04   1.58230527e-04   2.96991813e-04
    2.44640629e-04   8.93161923e-04   2.26080971e-04   3.19226732e-04
    3.58226011e-04   9.42568993e-04   5.34119536e-05   6.89832304e-05
    2.47740099e-04   3.08572256e-04   4.37793642e-04   6.21995248e-04
    3.19327664e-04   3.35107143e-05   4.38114774e-04   6.61385653e-04
    1.13678875e-03   2.42030277e-04   8.68161223e-05   2.82969064e-04
    2.38959619e-05   8.88590657e-05   1.48742311e-05   6.05629393e-05
    8.13349106e-05   1.00482539e-04   4.28723432e-02   6.57961145e-03
    1.71233453e-02  

' '

In [103]:
get_next(" th")

[[  3.34416619e-08   1.48813818e-02   1.38626382e-01   3.34294862e-04
    2.02722312e-03   2.25922078e-04   2.95426260e-04   2.30270147e-04
    1.01653319e-02   4.37463820e-03   3.50177567e-03   4.24712453e-05
    1.56955153e-04   1.04906991e-04   7.10012537e-05   7.22971745e-05
    6.11220967e-05   4.40359290e-05   9.89522377e-05   6.64726831e-05
    4.57423739e-05   6.02927874e-04   5.51924924e-04   2.98252155e-04
    3.28984577e-04   7.08951324e-04   1.63710487e-04   3.19173239e-04
    2.56780593e-04   1.01145206e-03   2.79002881e-04   3.65013635e-04
    3.39378603e-04   1.00887695e-03   5.70278316e-05   6.66475025e-05
    2.38061897e-04   2.52433878e-04   5.14382380e-04   6.42251864e-04
    3.34446988e-04   3.53291070e-05   4.36303089e-04   6.84067432e-04
    9.31250397e-04   2.51056772e-04   9.88259562e-05   2.74188991e-04
    2.47217158e-05   9.68689419e-05   1.71358461e-05   5.94933445e-05
    9.07268113e-05   1.09722088e-04   6.45893738e-02   7.24847987e-03
    1.58706084e-02  

'e'

## Our first RNN
1. Create one input per character for an RNN unrolled over n characters, then proceed as in the 3 character model above

In [107]:
cs = 8
# Start position of every window: every cs-th index of the encoded corpus.
window_starts = range(0, len(idx)-1-cs, cs)
# c_in_dat[n] holds the character at offset n of each window ...
c_in_dat = [[idx[start+n] for start in window_starts] for n in range(cs)]
# ... and c_out_dat the character that immediately follows each window.
c_out_dat = [idx[start+cs] for start in window_starts]

In [109]:
xs = [np.stack(c[:-2]) for c in c_in_dat]

In [111]:
len(xs), xs[0].shape

(8, (75110,))

In [112]:
y = np.stack(c_out_dat[:-2])

In [113]:
n_fac = 42

In [114]:
def embedding_input(name, n_in, n_out):
    """Create a named single-integer input and its flattened embedding.

    name  -- prefix for the layer names: the Input is called name+'_in' and
             the Embedding name+'_emb'
    n_in  -- first argument of the Embedding layer (vocab_size at the call site)
    n_out -- second argument of the Embedding layer (n_fac at the call site)

    Returns the pair (input tensor, flattened embedding output).
    """
    char_input = Input(shape=(1,), dtype='int64', name=name+'_in')
    embedding_layer = Embedding(n_in, n_out, input_length=1, name=name+'_emb')
    flattened = Flatten()(embedding_layer(char_input))
    return char_input, flattened

In [115]:
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]

In [116]:
n_hidden = 256

In [117]:
# Shared layers of the hand-unrolled RNN: each is created once and then
# applied at every character position in the cells that follow.
dense_in = Dense(n_hidden, activation='relu')
# hidden -> hidden step; init='identity' requests an identity initialisation
# for this layer's weights
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax')

In [118]:
hidden = dense_in(c_ins[0][1])

In [119]:
# Unrolled recurrence over characters 1..cs-1: project the current character
# with dense_in, advance the carried state with dense_hidden, then merge the
# two into the new hidden state.
for i in range(1, cs):
    hidden = merge([dense_in(c_ins[i][1]), dense_hidden(hidden)])

In [120]:
c_out = dense_out(hidden)

In [122]:
# One model input per character position: the first element of each
# (input, flattened embedding) pair in c_ins.
model = Model([c[0] for c in c_ins], c_out)
# The targets in y are integer character ids, which is presumably why the
# sparse variant of categorical cross-entropy is used - confirm.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [123]:
model.fit(xs, y, batch_size=64, nb_epoch=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x1624e8beac8>

## Test model

In [130]:
def get_next(inp):
    """Predict the character that follows the string `inp`.

    Every character of `inp` is looked up in `char_indices` and wrapped in a
    one-element array, so the model receives one input per character. Returns
    the entry of `chars` at the index of the largest predicted value.
    """
    model_inputs = []
    for ch in inp:
        model_inputs.append(np.array(char_indices[ch])[np.newaxis])
    probs = model.predict(model_inputs)
    best = np.argmax(probs)
    return chars[best]

In [131]:
get_next('for thos')

' '

In [132]:
get_next('part of ')

't'

In [136]:
get_next('every on')

' '

## Our first RNN with Keras
The same as before but using Keras' SimpleRNN layer instead

In [137]:
# Hyper-parameters for the SimpleRNN model. vocab_size is hard-coded to 86,
# the value printed as "total chars" when the corpus was loaded.
n_hidden = 256
n_fac = 42
cs = 8
vocab_size = 86

In [140]:
# Same architecture as the hand-unrolled model, assembled layer by layer:
# character embedding -> SimpleRNN over the cs characters -> softmax.
model = Sequential()
model.add(Embedding(vocab_size, n_fac, input_length=cs))
model.add(SimpleRNN(n_hidden, activation='relu', inner_init='identity'))
model.add(Dense(vocab_size, activation='softmax'))

In [141]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 8, 42)         3612        embedding_input_1[0][0]          
____________________________________________________________________________________________________
simplernn_1 (SimpleRNN)          (None, 256)           76544       embedding_5[0][0]                
____________________________________________________________________________________________________
dense_9 (Dense)                  (None, 86)            22102       simplernn_1[0][0]                
Total params: 102,258
Trainable params: 102,258
Non-trainable params: 0
____________________________________________________________________________________________________


In [142]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [143]:
# Each element of xs is a 1-D array with one entry per sample (shape
# (75110,) in the check above), so the cs columns must be stacked along a
# new axis 1 to form the (samples, cs) matrix fed to the Embedding layer.
# np.concatenate(xs, axis=1) is not valid for 1-D arrays, which have no axis 1.
model.fit(np.stack(xs, axis=1), y, batch_size=64, nb_epoch=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x162666e8ef0>

In [146]:
def get_next_keras(inp):
    """Predict the character that follows the string `inp`.

    The characters of `inp` are mapped through `char_indices` into a single
    row of ids (a batch of one), the model is run on it, and the entry of
    `chars` at the index of the largest value in the first prediction row is
    returned.
    """
    batch = np.array([char_indices[ch] for ch in inp])[np.newaxis, :]
    probs = model.predict(batch)[0]
    return chars[np.argmax(probs)]

In [148]:
get_next_keras('this is ')

'a'

In [149]:
get_next_keras('part of ')

't'

In [150]:
get_next_keras('queens a')

'n'

## Predict chars 2 to n using chars 1 to n-1
Feeds each predicted character back in to improve the next character prediction. More learning going on for the same amount of computation!

1 -> 2
1 2 -> 3
1 2 3 -> 4
1 2 3 4 -> 5
etc.