In [164]:
from keras.utils.data_utils import get_file
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read().lower()
print('corpus length:', len(text))

('corpus length:', 600901)


In [165]:
import re

words = list(re.split("[, \-!?:\n]+",text))

In [166]:
import collections
counter = collections.Counter(words)
d = dict()
chars = []
for i, (word, freq) in enumerate(counter.most_common()):
    chars.append(word)

In [167]:
chars[:10]

['the', 'of', 'and', 'to', 'in', 'is', 'a', 'that', 'as', 'it']

In [168]:
print("total words:", len(chars))

('total words:', 13183)


In [169]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [170]:
indices_char[5]

'is'

In [171]:
idx = [char_indices[c] for c in words]

In [172]:
idx[:5]

[5542, 552, 7, 144, 5]

In [173]:
import numpy as np
maxlen = 40
sentences = []
next_chars = []
for i in range(0, len(idx)-maxlen+1):
    sentences.append(idx[i:i+maxlen])
    next_chars.append(idx[i+1:i+maxlen+1])

print('total sentences:', len(sentences))

('total sentences:', 101645)


In [174]:
sentences = np.concatenate([[np.array(o)] for o in sentences[:-2]])
next_chars = np.concatenate([[np.array(o)] for o in next_chars[:-2]])

In [175]:
sentences.shape, next_chars.shape

((101643, 40), (101643, 40))

In [177]:
n_fac = 50
vocab_size = 5000

sentences = np.where(sentences < vocab_size-1, sentences, vocab_size-1)
next_chars = np.where(next_chars < vocab_size-1,next_chars, vocab_size-1)
sentences.shape, next_chars.shape

((101643, 40), (101643, 40))

In [178]:
import os
import bcolz
import pickle 

def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    %mkdir -p $glove_path
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

def load_vectors(loc):
    return (bcolz.open(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

In [179]:
vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))

Untaring file...


In [180]:
import re
def create_emb():
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1,len(emb)):
        word = indices_char[i]
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word):
            src_idx = char_indices[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [181]:
emb = create_emb()

In [182]:
emb[:5]

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
        

In [202]:
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, Activation
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=maxlen,weights=[emb]),
        LSTM(512, input_dim=n_fac,return_sequences=True, dropout_U=0.2, dropout_W=0.2,
             consume_less='gpu'),
        Dropout(0.2),
        LSTM(256, return_sequences=True, dropout_U=0.2, dropout_W=0.2,
             consume_less='gpu'),
        Dropout(0.2),
        TimeDistributed(Dense(vocab_size)),
        Activation('softmax')
    ])  

In [203]:
from keras.optimizers import Adam, RMSprop
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [239]:
chars[:5]

['the', 'of', 'and', 'to', 'in']

In [246]:
from numpy.random import choice

def print_example():
    seed_string = []
    for i in range(50):
        seed_string.append(choice(chars[:500]))

    for i in range(320):
        x=np.array([char_indices[c] for c in seed_string[-40:]])[np.newaxis,:]
        preds = model.predict(x, verbose=0)[0][-1]
        preds = preds/np.sum(preds)
        next_char = choice(chars[:5000], p=preds)
        seed_string.append(next_char)
    print(seed_string)

In [204]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, nb_epoch=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc44f5924d0>

In [205]:
model.optimizer.lr = 0.001

In [206]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, nb_epoch=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc451182a90>

In [247]:
print_example()

['virtue', 'conditions', 'self', 'type', 'this', 'learn', 'extent', 'person', 'no', 'pain', 'taste', 'hard', 'however', 'am', 'under', 'him.', 'a', 'morals', 'living', 'reason', 'strange', 'hence', 'much', 'had', 'law', 'cause', 'he', 'this', 'course', 'purpose', 'without', 'such', 'know', 'see', 'work', 'shall', 'day', 'pleasure', 'whom', 'circumstances', '=the', 'nothing', 'music', 'evil', 'we', 'eyes', 'young', 'time', 'moral', 'until', 'he', 'are', 'have', 'with', 'the', 'acquired', 'backward', 'plato', 'frittered', 'of', 'world', 'to', 'dread', 'man', 'beneath', 'him', 'is', 'not', 'so', 'the', 'believe', 'of', 'the', 'desire', 'the', 'frittered', 'frittered', 'as', 'present', 'it.', 'if', 'the', 'finally', 'or', 'dreadful', 'than', 'among', 'frittered', 'by', 'out', 'against', 'let', 'two', 'frittered', 'instinct', 'to', 'the', 'wish', 'in', 'blessedness', 'for', 'badly', 'frittered', 'beyond', 'has', 'romantic', 'nature;', 'of', 'way', 'to', 'frittered', 'that', '=the', 'noble',

In [221]:
model.save_weights('data/word_rnn.h5')