In [1]:
import numpy as np
import keras
from keras import layers
import string
import random
import sys
from utils import log, generate_overlapping_sequences, load_short_jokes_corpus

Using TensorFlow backend.


In [2]:
# Load the corpus. `jokes` is the collection of individual jokes and `text`
# is the corpus as a single string (it is measured with len() here and split
# on spaces later). limit=None presumably means "no cap" -- confirm in utils.
jokes, text = load_short_jokes_corpus(limit=None)
print('Corpus Length: {c}'.format(c=len(text)))

Corpus Length: 21787187


In [3]:
import re
def clean_punctuation(joke):
    """Normalize a joke's punctuation and return it as space-separated tokens.

    The joke is split into runs of word characters/apostrophes and runs of
    the punctuation characters ``. , ! ? ;``. Each punctuation run is then
    collapsed:

    * a run containing ``?`` becomes ``?``
    * otherwise a run containing ``!`` becomes ``!``
    * otherwise a run containing ``..`` becomes ``...``
    * anything else is kept unchanged

    If the last token contains none of ``.``, ``?`` or ``!``, a final ``.``
    token is appended so every non-empty result ends with a terminator.

    Args:
        joke: The raw joke text.

    Returns:
        The cleaned tokens joined by single spaces, or an empty string when
        the joke contains no word or punctuation tokens at all.
    """
    tokens = re.findall(r"[\w']+|[.,!?;]+", joke)
    if not tokens:
        # Empty or symbol-only input: there is no last token to inspect, so
        # return early instead of failing on cleaned[-1] with an IndexError.
        return ""

    cleaned = []
    for token in tokens:
        # Order matters: '?' wins over '!', which wins over an ellipsis.
        if '?' in token:
            cleaned.append('?')
        elif '!' in token:
            cleaned.append('!')
        elif '..' in token:
            cleaned.append('...')
        else:
            cleaned.append(token)

    # Guarantee a sentence terminator at the end of the joke.
    if not any(mark in cleaned[-1] for mark in ('.', '?', '!')):
        cleaned.append('.')
    return " ".join(cleaned)
            

In [4]:
# Clean every joke. The result is materialized as a list because it is
# iterated more than once further down (fit_on_texts and texts_to_sequences).
jokes = list(map(clean_punctuation, jokes))

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the word-level vocabulary from the cleaned jokes. The filter string
# does not contain '.', '!' or '?', so those terminators are not stripped and
# remain available as tokens (presumably intentional: generation stops on them).
tokenizer = Tokenizer(filters='"#$%&()*+,-/:;<=>@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(jokes)
# Keras word indices start at 1 (0 is reserved), hence the extra slot.
vocab_size = len(tokenizer.word_index) + 1

# Single-argument print: a parenthesized pair would be printed as a tuple
# under the Python 2 print statement.
print('Vocab Size: {n}'.format(n=vocab_size))

('Vocab Size', 70651)


In [6]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    The probabilities are moved to log space, divided by ``temperature`` and
    re-normalized with a softmax, then a single multinomial draw selects the
    index. Temperatures below 1 sharpen the distribution, temperatures above
    1 flatten it.

    Args:
        preds: Array-like of probabilities (NumPy array, list or tuple).
        temperature: Positive scaling factor applied in log space.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    # Convert first and cast second so that plain Python sequences are
    # accepted as well as arrays; float64 keeps the re-normalized
    # probabilities precise enough for the multinomial sum check.
    preds = np.asarray(preds, dtype='float64')
    # log(0) is -inf, which exp() maps straight back to probability 0, so the
    # divide-by-zero warning carries no information here.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

In [7]:
def generate_overlapping_encoded_sequences(jokes, maxlen, step):
    """Cut each encoded joke into overlapping windows with their next token.

    For every joke, a window of ``maxlen`` items is taken at offsets
    0, step, 2*step, ... as long as an item exists right after the window;
    that following item is the prediction target for the window.

    Args:
        jokes: Iterable of indexable sequences (one per joke).
        maxlen: Number of items in each window.
        step: Distance between the starts of consecutive windows.

    Returns:
        A pair ``(sentences, next_words)`` of equally long lists: the
        windows and the item that follows each window.
    """
    # Jokes with at most `maxlen` items yield an empty range and are skipped.
    pairs = [
        (encoded[start: start + maxlen], encoded[start + maxlen])
        for encoded in jokes
        for start in range(0, len(encoded) - maxlen, step)
    ]
    sentences = [window for window, _ in pairs]
    next_words = [target for _, target in pairs]  # holds the targets
    return sentences, next_words

In [8]:
# Encode every joke as a list of word indices, then cut the encoded jokes
# into overlapping windows of 11 tokens, advancing 3 tokens per window.
integer_encoded_docs = tokenizer.texts_to_sequences(jokes)
split_encoded_docs, next_words = generate_overlapping_encoded_sequences(integer_encoded_docs, 11, 3)
# Every window already holds exactly 11 tokens, so pad_sequences adds no
# padding here; its effect is to turn the list of windows into a 2-D array.
padded_docs = pad_sequences(split_encoded_docs, padding='post')
# The token following each window is that window's prediction target.
next_words = np.asarray(next_words)
# Single-argument print: a parenthesized pair would be printed as a tuple
# under the Python 2 print statement.
print('Number of Sequences: {n}'.format(n=len(padded_docs)))

('Number of Sequences:', 753882)


In [9]:
print('Vectorization.')
# One-hot encode the targets: row i is True only in column next_words[i].
# The builtin `bool` is used as dtype because the `np.bool` alias is
# deprecated and absent from newer NumPy releases.
# NOTE(review): y is a dense (number of sequences, vocab_size) array; with the
# sizes printed by the earlier cells (753882 x 70651) that is tens of
# gigabytes. Integer targets with a sparse categorical loss would avoid it.
y = np.zeros((len(padded_docs), vocab_size), dtype=bool)
# One vectorized assignment instead of a Python-level loop over every row.
y[np.arange(len(padded_docs)), np.asarray(next_words, dtype=int)] = True

Vectorization.


In [10]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras.optimizers import RMSprop
from keras.layers.embeddings import Embedding

In [16]:
# Next-word model: embedding -> three stacked bidirectional LSTMs -> dense
# hidden layer -> softmax over the vocabulary.
model = Sequential()
# mask_zero=True makes index 0 (the padding value) a masked time step.
model.add(Embedding(vocab_size, 100, input_length=padded_docs.shape[1], mask_zero=True))
# The first two recurrent layers return the full sequence so the next
# recurrent layer has one input per time step; the third returns only its
# final output.
model.add(Bidirectional(LSTM(128, dropout=0.1, recurrent_dropout=0.1, return_sequences=True)))
model.add(Bidirectional(LSTM(128, dropout=0.1, recurrent_dropout=0.1, return_sequences=True)))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(2048, kernel_regularizer=keras.regularizers.l2(0.001), activation='relu'))
# One output probability per vocabulary index.
model.add(Dense(vocab_size, activation='softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
# summary() prints the table itself and returns None, so wrapping it in a
# print would add a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 11, 100)           7065100   
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 11, 256)           234496    
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 11, 256)           394240    
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 256)               394240    
_________________________________________________________________
dense_5 (Dense)              (None, 2048)              526336    
_________________________________________________________________
dense_6 (Dense)              (None, 70651)             144763899 
Total params: 153,378,311
Trainable params: 153,378,311
Non-trainable params: 0
______________________________________________________________

In [17]:
# Lookup tables and constants used when generating text.
word_index = tokenizer.word_index
# Reverse mapping (index -> word) for turning sampled indices back into text.
index_to_word = {index: word for word, index in word_index.items()}
# Number of corpus words used to seed each generated sample.
max_words = 5
# Length of the model's input, taken from the training windows.
maxlen = padded_docs.shape[1]

In [18]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    The probabilities are moved to log space, divided by ``temperature`` and
    re-normalized with a softmax, then a single multinomial draw selects the
    index. Temperatures below 1 sharpen the distribution, temperatures above
    1 flatten it.

    NOTE: an earlier cell also defines ``sample``; this later definition is
    the one bound when the generation loop below runs.

    Args:
        preds: Array-like of probabilities (NumPy array, list or tuple).
        temperature: Positive scaling factor applied in log space.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    # Convert first and cast second so that plain Python sequences are
    # accepted as well as arrays; float64 keeps the re-normalized
    # probabilities precise enough for the multinomial sum check.
    preds = np.asarray(preds, dtype='float64')
    # log(0) is -inf, which exp() maps straight back to probability 0, so the
    # divide-by-zero warning carries no information here.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

In [None]:
# Train one epoch at a time (39 passes). After each pass, print one sample
# per temperature so progress can be judged by reading the output.

# The corpus does not change inside the loop, so split it once up front
# instead of twice per temperature.
corpus_words = text.split(' ')

for epoch in range(1, 40):
    print('Epoch: {i}'.format(i=epoch))
    # `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
    model.fit(padded_docs, y, batch_size=1028, epochs=1)
    if epoch >= 3:
        # The file name is constant, so each save replaces the previous one.
        model.save_weights('finalWeights', overwrite=True)
    print('\n')
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('\n------------ temperature: {t}'.format(t=temperature))
        # Seed with `max_words` consecutive corpus words chosen at random.
        start_index = random.randint(0, len(corpus_words) - max_words - 1)
        generated_text = " ".join(corpus_words[start_index: start_index + max_words])
        # Round-trip the seed through the tokenizer so the echoed seed is what
        # the model actually receives (filtered characters and words unknown
        # to the tokenizer are dropped).
        encoded_seed = tokenizer.texts_to_sequences([generated_text])
        readable_seed = " ".join(index_to_word[key] for key in encoded_seed[0])
        print('-----  Generating with Seed: "{gen}"'.format(gen=readable_seed))
        print('\n')
        sys.stdout.write(readable_seed)

        # Generate at most 35 words. The loop variable is deliberately not
        # `i`/`epoch`, so the epoch counter is never shadowed.
        for _ in range(35):
            encoded_window = tokenizer.texts_to_sequences([generated_text])
            # NOTE(review): windows shorter than `maxlen` are left-padded here,
            # while the training windows were always full length -- confirm
            # this mismatch is intended.
            padded_window = pad_sequences(encoded_window, maxlen=maxlen, padding='pre')
            preds = model.predict(padded_window, verbose=0)[0]

            next_index = sample(preds, temperature)
            if next_index not in index_to_word:
                # Index 0 is reserved and has no word; the softmax can still
                # emit it, so skip the draw rather than raise a KeyError.
                continue
            next_word = index_to_word[next_index]
            # Slide the window: append the new word, drop the oldest one.
            generated_text += " " + next_word
            generated_text = " ".join(generated_text.split(' ')[1:])
            sys.stdout.write(" " + next_word)
            # A sentence terminator ends the sample.
            if next_word in ('.', '?', '!'):
                break
    # End the last sample's line so the next epoch header starts on its own line.
    sys.stdout.write('\n')
    

Epoch: 1
Epoch 1/1



------------ temperature: 0.2
-----  Generating with Seed: "call a horny herbaceous bread"


call a horny herbaceous bread .
------------ temperature: 0.5
-----  Generating with Seed: "what do you call"


what do you call .
------------ temperature: 1.0
-----  Generating with Seed: "he was charged with shooting"


he was charged with shooting .
------------ temperature: 1.2
-----  Generating with Seed: "bleach i swear if i"


bleach i swear if i fast hammer greens' blood five time saying paper qt3 funerals lidocaine shelf jesus americas blame screaming one hey ten once road's 17 him ask been science accepted i nullpointerexception screams heee .Epoch: 2
Epoch 1/1



------------ temperature: 0.2
-----  Generating with Seed: "manchester united lost because their"


manchester united lost because their .
------------ temperature: 0.5
-----  Generating with Seed: "air i'd prob eat"


air i'd prob eat .
------------ temperature: 1.0
-----  Generating with Seed: "would

------------ temperature: 0.5
-----  Generating with Seed: "turns to the other and"


turns to the other and says it's a car .
------------ temperature: 1.0
-----  Generating with Seed: "tour de france get"


tour de france get out of you sounds is to worry what's a face .
------------ temperature: 1.2
-----  Generating with Seed: "the 1970s it was almost"


the 1970s it was almost no in everything .Epoch: 14
Epoch 1/1



------------ temperature: 0.2
-----  Generating with Seed: "nursing home old guy how"


nursing home old guy how like me .
------------ temperature: 0.5
-----  Generating with Seed: "of omeletteyoufinish stolen from"


of omeletteyoufinish stolen from a the name of the own them .
------------ temperature: 1.0
-----  Generating with Seed: "check the same three apps"


check the same three apps jimmy ran in shoe ?
------------ temperature: 1.2
-----  Generating with Seed: "the word that's changed in"


the word that's changed in the fish has there play by attention inst