In [18]:
import numpy as np
import keras
from keras import layers
import string
import random
import sys
from utils import log, generate_overlapping_sequences, load_short_jokes_corpus

In [19]:
def sample(preds, temperature=1.0):
    """Draw one index at random from a temperature-rescaled distribution.

    Args:
        preds: 1-D sequence of non-negative probabilities (list, tuple or
            NumPy array of any float dtype).
        temperature: positive scaling factor applied to the log-probabilities.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled index into `preds` (NumPy integer).

    Raises:
        ValueError: if `temperature` is not positive, or `preds` contains no
            positive entry to sample from.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive, got {t}'.format(t=temperature))
    # Convert first, then cast: calling .astype on the raw argument fails for
    # plain Python lists and tuples.
    preds = np.asarray(preds).astype('float64')
    # A zero probability maps to log(0) = -inf, i.e. "never sampled"; that is
    # intended, so silence the divide-by-zero warning it would trigger.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError('preds must contain at least one positive probability')
    # Subtracting the maximum leaves the softmax unchanged but keeps exp() from
    # underflowing every entry to 0 (-> 0/0 = NaN) at small temperatures.
    exp_preds = np.exp(logits - peak)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [20]:
# Load the corpus through the project helper. `jokes` is later fed to the
# tokenizer as a collection of texts and `text` is used as one long string.
# NOTE(review): limit=None presumably means "no cap on the number of jokes" --
# confirm against utils.load_short_jokes_corpus.
jokes, text = load_short_jokes_corpus(limit=None)
# Parenthesised so the cell also parses on Python 3; with a single argument
# the printed output on Python 2 is unchanged.
print('Corpus Length: {c}'.format(c=len(text)))

Corpus Length: 21787187


In [21]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the word -> integer index from every joke in the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(jokes)
# Keras' Tokenizer numbers words from 1 and never assigns 0, which stays
# available as the padding value; the +1 makes room for that extra index.
vocab_size = len(tokenizer.word_index) + 1

# Format into a single string: print('Vocab Size', vocab_size) prints a tuple
# repr on Python 2 and a space-joined line on Python 3; this prints the same
# on both.
print('Vocab Size {v}'.format(v=vocab_size))

('Vocab Size', 70649)


In [22]:
# Turn each joke into a list of word indices.
integer_encoded_docs = tokenizer.texts_to_sequences(jokes)

# Each joke yields one training pair: every token but the last is the input
# context and the last token is the word to predict. A joke needs at least two
# tokens for that; with only one, the context would be empty and the padded
# input row would carry no information, so such jokes are skipped.
usable_docs = [doc for doc in integer_encoded_docs if len(doc) >= 2]

split_encoded_docs = [doc[:-1] for doc in usable_docs]
# Left-pad with zeros so all contexts share one length and the most recent
# words sit at the end of each row.
padded_docs = pad_sequences(split_encoded_docs, padding='pre')

# Target word index for each row of padded_docs (same order).
next_words = np.asarray([doc[-1] for doc in usable_docs])

In [23]:
print('Vectorization.')
# One-hot targets: row r holds a single True in column next_words[r].
# np.bool_ is used because the bare np.bool alias was removed from NumPy.
# NOTE(review): this is a dense len(padded_docs) x vocab_size boolean matrix;
# with the sizes in the recorded outputs (about 231655 x 70649) that is roughly
# 16 GB. Integer targets with a sparse categorical loss would avoid it, but
# that requires changing the model's compile() call as well.
y = np.zeros((len(padded_docs), vocab_size), dtype=np.bool_)
# Set every (row, target) cell in one vectorised assignment instead of a
# Python-level loop over all rows.
y[np.arange(len(next_words)), next_words] = True

Vectorization.


In [34]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Bidirectional, Dropout
from keras.optimizers import RMSprop
from keras.layers.embeddings import Embedding

In [35]:
# Next-word classifier: embedded context -> two stacked bidirectional LSTMs ->
# softmax over the whole vocabulary.
model = Sequential()
# input_length is the padded context length, so the input shape is fixed.
model.add(Embedding(vocab_size, 512, input_length=padded_docs.shape[1]))
# return_sequences=True hands the full per-timestep output to the next
# recurrent layer instead of only the final state.
model.add(Bidirectional(LSTM(512, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(512)))
model.add(Dropout(0.2))
model.add(Dense(vocab_size, activation='softmax'))
# NOTE(review): lr=0.01 is kept as written; confirm it is intentional.
optimizer = RMSprop(lr=0.01)
# categorical_crossentropy expects one-hot targets.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
# summary() prints the table itself and returns None, so wrapping it in print
# only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 92, 512)           36172288  
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 92, 1024)          4198400   
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 1024)              6295552   
_________________________________________________________________
dropout_5 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 70649)             72415225  
Total params: 119,081,465
Trainable params: 119,081,465
Non-trainable params: 0
_________________________________________________________________
None


In [36]:
# Reverse lookup table: integer index -> word, inverted from the tokenizer's
# word -> index mapping.
word_index = tokenizer.word_index
index_to_word = {index: word for word, index in word_index.items()}

# Width of the padded model input, and the number of seed words used when
# generating text.
maxlen = padded_docs.shape[1]
max_words = 5

In [37]:
# NOTE: an earlier cell of this notebook defines `sample` as well; this later
# definition is the one in effect once the notebook has run top to bottom.
# Keep a single definition when tidying up.
def sample(preds, temperature=1.0):
    """Draw one index at random from a temperature-rescaled distribution.

    Args:
        preds: 1-D sequence of non-negative probabilities (list, tuple or
            NumPy array of any float dtype).
        temperature: positive scaling factor applied to the log-probabilities.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled index into `preds` (NumPy integer).

    Raises:
        ValueError: if `temperature` is not positive, or `preds` contains no
            positive entry to sample from.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive, got {t}'.format(t=temperature))
    # Convert first, then cast: calling .astype on the raw argument fails for
    # plain Python lists and tuples.
    preds = np.asarray(preds).astype('float64')
    # A zero probability maps to log(0) = -inf, i.e. "never sampled"; that is
    # intended, so silence the divide-by-zero warning it would trigger.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError('preds must contain at least one positive probability')
    # Subtracting the maximum leaves the softmax unchanged but keeps exp() from
    # underflowing every entry to 0 (-> 0/0 = NaN) at small temperatures.
    exp_preds = np.exp(logits - peak)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [38]:
# Train one epoch at a time; after each epoch, sample a few words at several
# temperatures from a random seed so progress can be judged by eye.

# Split the raw corpus once: it is tens of millions of characters long, and
# splitting it inside the loop repeated that work twice per epoch.
corpus_words = text.split(' ')
n_generated_words = 5  # words sampled per temperature setting

for epoch in range(1, 25):
    print('Epoch: {i}'.format(i=epoch))
    # `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
    model.fit(padded_docs, y, batch_size=256, epochs=1)
    # Only checkpoint from epoch 10 onwards.
    if epoch >= 10:
        model.save_weights('jokes_weights_{i}_iter'.format(i=epoch), overwrite=True)

    # Pick `max_words` consecutive corpus words as the seed; max(0, ...) keeps
    # randint valid even for a corpus shorter than the seed length.
    start_index = random.randint(0, max(0, len(corpus_words) - max_words - 1))
    seed_text = " ".join(corpus_words[start_index: start_index + max_words])
    seed_ids = tokenizer.texts_to_sequences([seed_text])[0]
    # Show the seed as the tokenizer saw it (after its own normalisation).
    readable_gen_text = " ".join(index_to_word[key] for key in seed_ids)
    print('-----  Generating with Seed: "{gen}"'.format(gen=readable_gen_text))
    print('\n')
    for temperature in [0.1, 0.2, 0.5, 1.0, 1.2]:
        print('\n------------ temperature: {t}'.format(t=temperature))
        # Restart from the printed seed for every temperature; otherwise each
        # setting would continue from the previous one's drifted window and
        # the outputs would not be comparable.
        window_ids = list(seed_ids)
        # `_` rather than `i`, so the epoch counter is not shadowed.
        for _ in range(n_generated_words):
            padded_gen_text = pad_sequences([window_ids], maxlen=maxlen, padding='pre')
            preds = model.predict(padded_gen_text, verbose=0)[0]

            # Index 0 is the padding value and has no entry in index_to_word,
            # so sample only from indices 1.. and shift the result back.
            next_index = sample(preds[1:], temperature) + 1
            next_word = index_to_word[next_index]
            # Slide the context window: append the new word and keep the most
            # recent `max_words` indices.
            window_ids = (window_ids + [next_index])[-max_words:]
            sys.stdout.write(" " + next_word)

    

Epoch: 1
Epoch 1/1
  1024/231655 [..............................] - ETA: 1168s - loss: 12.4651

KeyboardInterrupt: 