In [1]:
import numpy as np
import keras
from keras import layers
import string
import random
import sys
from utils import log, generate_overlapping_sequences, load_short_jokes_corpus

Using TensorFlow backend.


In [2]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``:
    temperatures below 1 sharpen the distribution (towards the arg-max),
    temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities (a NumPy array,
            list or tuple). Zero entries are allowed and are never sampled.
        temperature: Positive scaling factor; 1.0 leaves the distribution
            unchanged.

    Returns:
        The sampled index (NumPy integer), drawn with ``np.random.multinomial``.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(
            'temperature must be positive, got {t}'.format(t=temperature))

    # float64 keeps the re-normalised probabilities within the tolerance
    # np.random.multinomial enforces on their sum; np.asarray with a dtype
    # also accepts plain lists, which have no .astype method.
    preds = np.asarray(preds, dtype='float64')

    # log(0) == -inf is the desired logit for an impossible class, so the
    # divide-by-zero warning is noise here.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum before exponentiating: mathematically a no-op for
    # the softmax, but it prevents every term underflowing to 0 (-> 0/0 = NaN)
    # when the temperature is small.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [3]:
# Load the corpus (capped via limit=1000). The helper returns the jokes, which
# are fed to the tokenizer later, together with `text`, whose length is
# reported here.
jokes, text = load_short_jokes_corpus(limit=1000)
corpus_length_message = 'Corpus Length: {c}'.format(c=len(text))
print(corpus_length_message)

Corpus Length: 93099


In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit a word-level tokenizer on the jokes; it assigns every distinct word an
# integer index, exposed as tokenizer.word_index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(jokes)

# One slot more than the number of known words -- presumably to leave room
# for index 0, the value pad_sequences fills with (TODO confirm).
vocab_size = 1 + len(tokenizer.word_index)

# Kept in its original form so the displayed output does not change.
print('Vocab Size', vocab_size)

('Vocab Size', 3990)


In [21]:
# Turn every joke into a list of word indices.
integer_encoded_docs = tokenizer.texts_to_sequences(jokes)

# Each non-empty joke contributes exactly one training pair:
#   input  = all of its tokens except the last
#   target = its last token
split_encoded_docs, next_words = [], []
for encoded_joke in integer_encoded_docs:
    if not encoded_joke:
        # Jokes that encode to an empty sequence have no target word.
        continue
    split_encoded_docs.append(encoded_joke[:-1])
    next_words.append(encoded_joke[-1])

# Left-pad the inputs so they all share the length of the longest one.
padded_docs = pad_sequences(split_encoded_docs, padding='pre')

In [22]:
# Convert the targets to an array, then report the padded input width
# followed by the shape of the target vector.
next_words = np.asarray(next_words)
print(padded_docs.shape[1])
print(next_words.shape)

39
(1000,)


In [23]:
print('Vectorization.')
# One-hot encode the target word of every training sequence:
# y[row, next_words[row]] is True, everything else False.
# The builtin `bool` replaces `np.bool`, an alias that was deprecated in
# NumPy 1.20 and removed in 1.24.
y = np.zeros((len(padded_docs), vocab_size), dtype=bool)
# A single fancy-indexed assignment sets one cell per row, replacing the
# per-row Python loop. dtype=int keeps the index array valid even when there
# are no samples at all.
y[np.arange(len(padded_docs)), np.asarray(next_words, dtype=int)] = True

Vectorization.


In [24]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Bidirectional
from keras.optimizers import RMSprop
from keras.layers.embeddings import Embedding

In [25]:
# Next-word model: embedding -> two stacked bidirectional LSTMs -> softmax
# over the whole vocabulary.
model = Sequential()
# 256-dimensional word embeddings; input_length is the padded context width.
model.add(Embedding(vocab_size, 256, input_length=padded_docs.shape[1]))
# The first recurrent layer returns the full sequence so that the second one
# receives one vector per time step.
model.add(Bidirectional(LSTM(256, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
# The second recurrent layer collapses the sequence to a single vector.
model.add(Bidirectional(LSTM(256)))

# One probability per vocabulary slot, matching the one-hot targets.
model.add(Dense(vocab_size, activation='softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
# summary() prints the table itself and returns None, so wrapping it in
# print only appended a stray "None" to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 39, 256)           1021440   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 39, 512)           1050624   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 512)               1574912   
_________________________________________________________________
dense_3 (Dense)              (None, 3990)              2046870   
Total params: 5,693,846
Trainable params: 5,693,846
Non-trainable params: 0
_________________________________________________________________
None


In [88]:
# Inverse of the tokenizer vocabulary (index -> word), used to turn sampled
# indices back into readable words.
word_index = tokenizer.word_index
index_to_word = {idx: token for token, idx in word_index.items()}
# Number of consecutive corpus words used as the generation seed / window.
max_words = 5

In [89]:
# Train for a single pass over the data. `epochs` is the Keras 2 keyword;
# `nb_epoch` is the deprecated Keras 1 spelling.
model.fit(padded_docs, y, batch_size=1000, epochs=1)


Epoch 1/1


<keras.callbacks.History at 0x115851450>

In [91]:
# Width the model inputs were padded to during training. Assigned here so the
# cell runs on a fresh kernel instead of relying on a `maxlen` left over from
# an earlier session.
maxlen = padded_docs.shape[1]
# Split the corpus once; the word list is needed for both the bounds of the
# random start index and the seed window itself.
corpus_words = text.split(' ')

for epoch in range(1):
    print('Epoch: {i}'.format(i=epoch))
    # Seed: a random window of `max_words` consecutive corpus words.
    start_index = random.randint(0, len(corpus_words) - max_words - 1)
    seed_text = " ".join(corpus_words[start_index: start_index + max_words])
    # Show the seed the way the tokenizer sees it (after its own filtering).
    integer_encoded_gen_text = tokenizer.texts_to_sequences([seed_text])
    readable_gen_text = " ".join(index_to_word[key] for key in integer_encoded_gen_text[0])
    print('-----  Generating with Seed: "{gen}"'.format(gen=readable_gen_text))
    print('\n')
    for temperature in [0.1, 0.2, 0.5, 1.0, 1.2]:
        print('\n------------ temperature: {t}'.format(t=temperature))
        # Restart from the same seed for every temperature so the outputs are
        # comparable; previously each temperature continued from the words
        # generated at the previous one.
        generated_text = seed_text
        # `_` instead of `i`: the inner loop used to shadow the outer loop
        # variable.
        for _ in range(5):
            integer_encoded_gen_text = tokenizer.texts_to_sequences([generated_text])
            padded_gen_text = pad_sequences(integer_encoded_gen_text, maxlen=maxlen, padding='pre')
            preds = model.predict(padded_gen_text, verbose=0)[0]

            # The softmax has vocab_size outputs but word_index only has
            # vocab_size - 1 entries; assuming Keras' 1-based word indices,
            # slot 0 (padding) has no word and would raise KeyError below.
            # Sample from slots 1.. only and shift the result back.
            next_index = sample(preds[1:], temperature) + 1
            most_probable_next_word = index_to_word[next_index]
            # Slide the window: drop the oldest word, append the new one.
            generated_text = " ".join(generated_text.split(' ')[1:] + [most_probable_next_word])
            sys.stdout.write(" " + most_probable_next_word)

    

        

Epoch: 0
-----  Generating with Seed: "or alive i would choose"



------------ temperature: 0.1
 homes homes homes homes homes
------------ temperature: 0.2
 downstairs' homes rest together homes
------------ temperature: 0.5
 byrne one out blood you
------------ temperature: 1.0
 essay high sure one idaho
------------ temperature: 1.2
 in replies shkreli is sure

In [71]:
# Debug: show the integer-encoded window most recently built by the generation loop.
integer_encoded_gen_text

[[804, 57, 57, 407, 407]]

In [72]:
# Debug: show the text window left behind by the generation loop.
generated_text

'out out again again again'

In [73]:
# Debug: encode the current text window to see the indices the tokenizer assigns to it.
tokenizer.texts_to_sequences([generated_text])

[[57, 57, 407, 407, 407]]

In [92]:
# Sanity check: the padding width used at generation time; it should match
# padded_docs.shape[1] printed earlier.
maxlen


39