In [1]:
import pandas as pd
import numpy as np

import string
import re

RANDOM = 42

Load the data

In [2]:
haikus_train_df = pd.read_pickle('./data/haikus_train_df.pickle')
haikus_test_df = pd.read_pickle('./data/haikus_test_df.pickle')

## RNN words

In [9]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Input, LSTM, Dropout
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

In [4]:
flatten = lambda l: [item for sublist in l for item in sublist]

corpuswords_raw = [item for item in flatten(list(haikus_train_df['text_withtokens_clean'])) if item != '']

words = sorted(set(corpuswords_raw))
word_to_int = dict((w, i) for i, w in enumerate(words))

n_words = len(corpuswords_raw)
n_vocab_words = len(words)
print("Total Words: ", n_words)
print("Total Vocab: ", n_vocab_words)

Total Words:  421611
Total Vocab:  24045


In [6]:
words

['<eNd>',
 '<nEXt>',
 'a',
 'aaaa',
 'aah',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abating',
 'abattoir',
 'abbess',
 'abbey',
 'abbot',
 'abbott',
 'abc',
 'abcs',
 'abduction',
 'abed',
 'abel',
 'abelard',
 'aberration',
 'abhor',
 'abhorred',
 'abide',
 'abilene',
 'abjure',
 'ablaze',
 'able',
 'ablowing',
 'aboard',
 'abode',
 'abodes',
 'abolish',
 'abominable',
 'abord',
 'abortion',
 'abound',
 'about',
 'above',
 'abraham',
 'abramoff',
 'abreast',
 'abriman',
 'abroad',
 'abrupt',
 'abruptly',
 'absalom',
 'abscond',
 'absence',
 'absent',
 'absently',
 'absinthe',
 'absolute',
 'absolutely',
 'absolution',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbs',
 'abstain',
 'abstemious',
 'abstract',
 'abstracted',
 'abstractedlyone',
 'abstraction',
 'absurd',
 'abundance',
 'abundant',
 'abuse',
 'abuses',
 'abydos',
 'abyss',
 'abysses',
 'acacia',
 'academy',
 'acadian',
 'acadians',
 'acc',
 'acceleration',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'acc

In [5]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 3
wordX = []
wordY = []
for i in range(0, n_words - seq_length, 1):
    seq_in = corpuswords_raw[i:i + seq_length]
    seq_out = corpuswords_raw[i + seq_length]
    wordX.append([word_to_int[word] if word != '' else '' for word in seq_in])
    wordY.append(word_to_int[seq_out] if seq_out != '' else '')
n_wordpatterns = len(wordX)
print("Total Patterns: ", n_wordpatterns)

Total Patterns:  421608


In [119]:
seq_in

['<nEXt>', 'the', '<nEXt>']

In [117]:
wordX

[[14233, 5760, 1],
 [5760, 1, 2],
 [1, 2, 20322],
 [2, 20322, 8895],
 [20322, 8895, 7067],
 [8895, 7067, 1],
 [7067, 1, 25939],
 [1, 25939, 5379],
 [25939, 5379, 0],
 [5379, 0, 21758],
 [0, 21758, 18188],
 [21758, 18188, 1],
 [18188, 1, 1131],
 [1, 1131, 23284],
 [1131, 23284, 6611],
 [23284, 6611, 21565],
 [6611, 21565, 1],
 [21565, 1, 11425],
 [1, 11425, 23356],
 [11425, 23356, 15642],
 [23356, 15642, 13227],
 [15642, 13227, 0],
 [13227, 0, 21758],
 [0, 21758, 14713],
 [21758, 14713, 1],
 [14713, 1, 2],
 [1, 2, 19086],
 [2, 19086, 1541],
 [19086, 1541, 8895],
 [1541, 8895, 1],
 [8895, 1, 2714],
 [1, 2714, 0],
 [2714, 0, 22563],
 [0, 22563, 366],
 [22563, 366, 1],
 [366, 1, 700],
 [1, 700, 15693],
 [700, 15693, 13835],
 [15693, 13835, 13282],
 [13835, 13282, 1],
 [13282, 1, 15152],
 [1, 15152, 23284],
 [15152, 23284, 13769],
 [23284, 13769, 0],
 [13769, 0, 4029],
 [0, 4029, 5780],
 [4029, 5780, 14106],
 [5780, 14106, 1],
 [14106, 1, 11189],
 [1, 11189, 19276],
 [11189, 19276, 1],
 [19

In [103]:
len(wordY)

528336

In [6]:
# reshape X to be [samples, time steps, features]
X = np.reshape(wordX, (n_wordpatterns, seq_length, 1))
# normalize
X = X / float(n_vocab_words)
# one hot encode the output variable
y = to_categorical(wordY)

In [9]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [11]:
model = load_model('weights/word/word_weights-cont-168-4.9039.hdf5')




In [14]:
# define the checkpoint
filepath="weights/word/word_weights-2-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [13]:
# fit the model
model.fit(X, y, epochs=200, batch_size=128, callbacks=callbacks_list)

Epoch 1/168

Epoch 00001: loss improved from inf to 5.61577, saving model to word_weights-cont-01-5.6158.hdf5
Epoch 2/168

Epoch 00002: loss improved from 5.61577 to 5.59928, saving model to word_weights-cont-02-5.5993.hdf5
Epoch 3/168

Epoch 00003: loss improved from 5.59928 to 5.58304, saving model to word_weights-cont-03-5.5830.hdf5
Epoch 4/168

Epoch 00004: loss improved from 5.58304 to 5.56915, saving model to word_weights-cont-04-5.5692.hdf5
Epoch 5/168

Epoch 00005: loss improved from 5.56915 to 5.55446, saving model to word_weights-cont-05-5.5545.hdf5
Epoch 6/168

Epoch 00006: loss improved from 5.55446 to 5.54208, saving model to word_weights-cont-06-5.5421.hdf5
Epoch 7/168

Epoch 00007: loss improved from 5.54208 to 5.52813, saving model to word_weights-cont-07-5.5281.hdf5
Epoch 8/168

Epoch 00008: loss improved from 5.52813 to 5.51655, saving model to word_weights-cont-08-5.5166.hdf5
Epoch 9/168

Epoch 00009: loss improved from 5.51655 to 5.50533, saving model to word_weight


Epoch 00043: loss improved from 5.26669 to 5.26477, saving model to word_weights-cont-43-5.2648.hdf5
Epoch 44/168

Epoch 00044: loss improved from 5.26477 to 5.26195, saving model to word_weights-cont-44-5.2619.hdf5
Epoch 45/168

Epoch 00045: loss improved from 5.26195 to 5.25599, saving model to word_weights-cont-45-5.2560.hdf5
Epoch 46/168

Epoch 00046: loss improved from 5.25599 to 5.25336, saving model to word_weights-cont-46-5.2534.hdf5
Epoch 47/168

Epoch 00047: loss improved from 5.25336 to 5.24849, saving model to word_weights-cont-47-5.2485.hdf5
Epoch 48/168

Epoch 00048: loss improved from 5.24849 to 5.24366, saving model to word_weights-cont-48-5.2437.hdf5
Epoch 49/168

Epoch 00049: loss improved from 5.24366 to 5.24081, saving model to word_weights-cont-49-5.2408.hdf5
Epoch 50/168

Epoch 00050: loss improved from 5.24081 to 5.23748, saving model to word_weights-cont-50-5.2375.hdf5
Epoch 51/168

Epoch 00051: loss improved from 5.23748 to 5.23166, saving model to word_weight


Epoch 00085: loss improved from 5.11939 to 5.11731, saving model to word_weights-cont-85-5.1173.hdf5
Epoch 86/168

Epoch 00086: loss improved from 5.11731 to 5.11635, saving model to word_weights-cont-86-5.1164.hdf5
Epoch 87/168

Epoch 00087: loss improved from 5.11635 to 5.11307, saving model to word_weights-cont-87-5.1131.hdf5
Epoch 88/168

Epoch 00088: loss improved from 5.11307 to 5.10776, saving model to word_weights-cont-88-5.1078.hdf5
Epoch 89/168

Epoch 00089: loss improved from 5.10776 to 5.10546, saving model to word_weights-cont-89-5.1055.hdf5
Epoch 90/168

Epoch 00090: loss improved from 5.10546 to 5.10282, saving model to word_weights-cont-90-5.1028.hdf5
Epoch 91/168

Epoch 00091: loss improved from 5.10282 to 5.10167, saving model to word_weights-cont-91-5.1017.hdf5
Epoch 92/168

Epoch 00092: loss improved from 5.10167 to 5.09617, saving model to word_weights-cont-92-5.0962.hdf5
Epoch 93/168

Epoch 00093: loss did not improve from 5.09617
Epoch 94/168

Epoch 00094: loss 


Epoch 00127: loss improved from 5.00410 to 5.00088, saving model to word_weights-cont-127-5.0009.hdf5
Epoch 128/168

Epoch 00128: loss did not improve from 5.00088
Epoch 129/168

Epoch 00129: loss improved from 5.00088 to 4.99632, saving model to word_weights-cont-129-4.9963.hdf5
Epoch 130/168

Epoch 00130: loss improved from 4.99632 to 4.99414, saving model to word_weights-cont-130-4.9941.hdf5
Epoch 131/168

Epoch 00131: loss improved from 4.99414 to 4.98852, saving model to word_weights-cont-131-4.9885.hdf5
Epoch 132/168

Epoch 00132: loss improved from 4.98852 to 4.98801, saving model to word_weights-cont-132-4.9880.hdf5
Epoch 133/168

Epoch 00133: loss improved from 4.98801 to 4.98671, saving model to word_weights-cont-133-4.9867.hdf5
Epoch 134/168

Epoch 00134: loss improved from 4.98671 to 4.98312, saving model to word_weights-cont-134-4.9831.hdf5
Epoch 135/168

Epoch 00135: loss improved from 4.98312 to 4.98039, saving model to word_weights-cont-135-4.9804.hdf5
Epoch 136/168

E

<keras.callbacks.callbacks.History at 0x2314f8a5f08>