In [1]:
from keras.utils.data_utils import get_file
# Fetch the Nietzsche corpus via keras' get_file; the value it returns is
# opened as a local file path below.
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Read inside a context manager so the file handle is closed as soon as the
# text is in memory instead of lingering until garbage collection.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29



('corpus length:', 600901)


In [2]:
import re

# Tokenise the corpus by splitting on runs of commas, spaces, hyphens, '!',
# '?', ':' and newlines. The pattern is a raw string so that `\-` is handed to
# the regex engine as written rather than relying on Python passing an
# unrecognised string escape through (which newer interpreters warn about).
# re.split already returns a list, so no extra list() copy is made.
words = re.split(r"[, \-!?:\n]+", text)

In [3]:
import collections
# Tally how many times each token occurs in the corpus.
counter = collections.Counter(words)
d = dict()  # not referenced by any of the visible cells
# Vocabulary ordered from most to least frequent token; only the words are
# kept, the counts are discarded.
chars = [word for word, _count in counter.most_common()]

In [4]:
# Peek at the ten most frequent tokens (chars is ordered by descending count).
chars[:10]

['the', 'of', 'and', 'to', 'in', 'is', 'a', 'that', 'as', 'it']

In [5]:
# Report the number of distinct tokens in the vocabulary list.
print("total words:", len(chars))

('total words:', 13183)


In [6]:
# Forward (word -> id) and reverse (id -> word) lookups over the vocabulary.
# Despite the "char" in the names, the entries are whole words; a word's id is
# its position in the frequency-ordered `chars` list.
char_indices = {word: position for position, word in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [7]:
# Spot-check the reverse lookup: id 5 is the sixth entry of `chars`.
indices_char[5]

'is'

In [8]:
# Encode the token sequence as integer ids, one id per token in `words`.
idx = list(map(char_indices.__getitem__, words))

In [9]:
# Show the ids of the first five encoded tokens.
idx[:5]

[5542, 552, 7, 144, 5]

In [10]:
import numpy as np
# Length, in tokens, of every training window.
maxlen = 40
# One window per start offset, stride 1. Each target window is its input
# window shifted right by one token.
# Note: for the final offset the target slice runs past the end of `idx`, so
# that last target holds maxlen - 1 ids; the next cell's `[:-2]` trim drops it.
n_windows = len(idx) - maxlen + 1
sentences = [idx[start:start + maxlen] for start in range(n_windows)]
next_chars = [idx[start + 1:start + maxlen + 1] for start in range(n_windows)]

print('total sentences:', len(sentences))

('total sentences:', 101645)


In [11]:
# Turn the lists of windows into 2-D integer arrays with one window per row.
# The last two windows are dropped, which removes the final, one-token-short
# target so every remaining row has the same length.
# A single np.array call builds the matrix directly; wrapping every row in its
# own array and concatenating them gives the same result with far more
# temporary allocations.
sentences = np.array(sentences[:-2])
next_chars = np.array(next_chars[:-2])

In [12]:
# Confirm inputs and targets have matching shapes.
sentences.shape, next_chars.shape

((101643, 40), (101643, 40))

In [13]:
# Width of the embedding vectors used by the model's Embedding layer.
n_fac = 50
# Number of distinct ids the model sees; the last id doubles as the shared
# "rare word" bucket.
vocab_size = 5000

# Collapse every id at or above the bucket id into the bucket itself, leaving
# the more frequent ids untouched.
rare_id = vocab_size - 1
sentences = np.minimum(sentences, rare_id)
next_chars = np.minimum(next_chars, rare_id)
sentences.shape, next_chars.shape

((101643, 40), (101643, 40))

In [14]:
import os
import bcolz
import pickle 

def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.

    Args:
        dataset: Dataset key such as '6B.50d'. It is used as the cached file
            name and to build the download URL (`<dataset>.tgz`).

    Returns:
        Whatever keras' `get_file` returns for the downloaded and untarred
        archive (used by the caller as the path prefix for `load_vectors`).

    Keys without an entry in the checksum table are fetched with
    `md5_hash=None`.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    # Create the cache directory with the standard library instead of the
    # `%mkdir` IPython magic: the magic only parses inside a notebook and
    # shell-interpolates the path (breaking on spaces).
    if not os.path.isdir(glove_path):
        os.makedirs(glove_path)
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

def load_vectors(loc):
    """Load a preprocessed GloVe dataset stored under the path prefix `loc`.

    Args:
        loc: Path prefix; the files `<loc>.dat`, `<loc>_words.pkl` and
            `<loc>_idx.pkl` are read.

    Returns:
        A 3-tuple `(vectors, words, wordidx)`: the object returned by
        `bcolz.open(loc + '.dat')` followed by the unpickled contents of the
        two `.pkl` files.
    """
    vectors = bcolz.open(loc+'.dat')
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at archives obtained from a trusted, checksum-verified source.
    # The files are opened with `with` so both handles are closed promptly.
    with open(loc+'_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file)
    with open(loc+'_idx.pkl', 'rb') as idx_file:
        wordidx = pickle.load(idx_file)
    return (vectors, words, wordidx)

In [15]:
# Load the '6B.50d' GloVe dataset: the vector table, its word list and the
# contents of the `_idx.pkl` file (presumably a word -> row mapping; verify).
# The word list is bound to `glove_words` rather than `words` so that it does
# not overwrite the corpus token list that the id-encoding cell iterates over.
vecs, glove_words, wordidx = load_vectors(get_glove_dataset('6B.50d'))

Untaring file...


In [16]:
import re
def create_emb():
    """Build an initial embedding matrix for the model vocabulary from GloVe.

    Reads the notebook globals `vecs` (GloVe vector table), `wordidx`
    (GloVe word -> row lookup), `indices_char` (model id -> word) and
    `vocab_size`.

    Returns:
        A `(vocab_size, n_fact)` float array, where `n_fact` is the width of
        the GloVe vectors. Row `i` holds the GloVe vector of the word with
        model id `i` when that word is purely alphanumeric/hyphenated and
        present in GloVe, and a random normal vector otherwise. The last row
        (the shared "rare word" id) is always random. All rows are scaled
        down by a factor of 3.
    """
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))
    rare_id = vocab_size - 1

    # Start at id 0: in this notebook id 0 is a real word (the most frequent
    # one), not a padding slot, so it needs a vector too.
    for i in range(rare_id):
        word = indices_char[i]
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word) and word in wordidx:
            # Look the row up in GloVe's own index; the model id `i` says
            # nothing about where the word lives in the GloVe table.
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[rare_id] = np.random.normal(scale=0.6, size=(n_fact,))
    emb /= 3
    return emb

In [17]:
# Build the GloVe-initialised embedding matrix. Note that the model cell below
# only references `emb` in a commented-out Embedding layer.
emb = create_emb()

In [25]:
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, Activation
# Word-level language model: an embedding, two stacked LSTMs that emit an
# output at every timestep, and a per-timestep softmax over the vocabulary.
model = Sequential()

# The embedding is learned from scratch. The GloVe-initialised alternative is
# kept here for reference:
#Embedding(vocab_size, n_fac, input_length=maxlen,weights=[emb]),
model.add(Embedding(vocab_size, n_fac, input_length=maxlen))

model.add(LSTM(512, input_dim=n_fac, return_sequences=True,
               dropout_U=0.2, dropout_W=0.2, consume_less='gpu'))
model.add(Dropout(0.2))

model.add(LSTM(256, return_sequences=True,
               dropout_U=0.2, dropout_W=0.2, consume_less='gpu'))
model.add(Dropout(0.2))

# One vocab_size-way prediction for every position in the window.
model.add(TimeDistributed(Dense(vocab_size)))
model.add(Activation('softmax'))

In [26]:
from keras.optimizers import Adam, RMSprop
# Compile with a default-configured Adam optimizer and the sparse categorical
# cross-entropy loss.
model.compile(optimizer=Adam(),
              loss='sparse_categorical_crossentropy')

In [30]:
import os

# Resume from a previously saved checkpoint when one exists. On a fresh
# checkout there is no checkpoint yet, so report that and keep the model's
# initial weights instead of aborting the run.
weights_path = 'models/word_rnn.h5'
if os.path.exists(weights_path):
    model.load_weights(weights_path)
else:
    print('no saved weights at %s - keeping the initial weights' % weights_path)

In [21]:
# Re-check the first five vocabulary entries.
chars[:5]

['the', 'of', 'and', 'to', 'in']

In [22]:
from numpy.random import choice

def print_example(n_words=320, seed_len=50, seed_vocab=500, context_len=None):
    """Sample text from the model and print it as a list of words.

    Uses the notebook globals `model`, `chars`, `maxlen`, `choice` and `np`.

    Args:
        n_words: Number of words to generate after the seed.
        seed_len: Number of random words used to seed the generation.
        seed_vocab: The seed words are drawn uniformly from the `seed_vocab`
            most frequent vocabulary entries.
        context_len: Number of most recent words fed to the model at each
            step; defaults to the global `maxlen`.

    Returns:
        None. The seed followed by the generated words is printed.
    """
    if context_len is None:
        context_len = maxlen

    # Work on word ids throughout; ids are positions in `chars`, so they are
    # only mapped back to words once, when printing.
    n_seed_vocab = min(seed_vocab, len(chars))
    seed_ids = [int(choice(n_seed_vocab)) for _ in range(seed_len)]

    for _ in range(n_words):
        x = np.array(seed_ids[-context_len:])[np.newaxis, :]
        # Distribution over the next word, taken at the last timestep.
        preds = model.predict(x, verbose=0)[0][-1]
        # Renormalise in float64: lower-precision model output may not sum to
        # 1 closely enough for `choice` to accept it as a probability vector.
        preds = np.asarray(preds, dtype='float64')
        preds = preds / preds.sum()
        # Sampling an id directly avoids rebuilding an array of vocabulary
        # strings on every step. The highest id is the shared "rare word"
        # bucket and is printed as the vocabulary word stored at that index.
        seed_ids.append(int(choice(len(preds), p=preds)))

    print([chars[word_id] for word_id in seed_ids])

In [22]:
# Set the optimizer's learning rate to 0.001 by rebinding the attribute.
# NOTE(review): this replaces whatever object `lr` held with a plain float;
# whether a training function that has already been built picks the new value
# up depends on the Keras backend - confirm, or update the existing variable
# through keras.backend.set_value instead.
model.optimizer.lr = 0.001

In [23]:
# Train for two epochs with mini-batches of 64 windows. The integer targets
# get a trailing unit axis before being handed to fit.
targets = np.expand_dims(next_chars, -1)
model.fit(sentences, targets, batch_size=64, nb_epoch=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff29fa148d0>

In [36]:
# Rebind the optimizer's learning rate to 0.001 before the next training run.
# NOTE(review): the model has been fitted by this point in the listing, so a
# plain attribute assignment may not reach the training function that was
# already built - confirm, or use keras.backend.set_value on the existing
# learning-rate variable.
model.optimizer.lr = 0.001

In [37]:
# One further epoch with a larger mini-batch of 256 windows. The integer
# targets get a trailing unit axis before being handed to fit.
targets = np.expand_dims(next_chars, -1)
model.fit(sentences, targets, batch_size=256, nb_epoch=1)

Epoch 1/1


<keras.callbacks.History at 0x7fe3879a8590>

# print_example()

In [38]:
import os

# Persist the trained weights. The target directory is created first so that
# saving does not fail on a checkout where `models/` does not exist yet.
weights_path = 'models/word_rnn.h5'
weights_dir = os.path.dirname(weights_path)
if weights_dir and not os.path.isdir(weights_dir):
    os.makedirs(weights_dir)
model.save_weights(weights_path)