In [1]:
import pandas as pd
import numpy as np

import string
import re

RANDOM = 42

Load the data

In [2]:
haikus_train_df = pd.read_pickle('./data/haikus_train_df.pickle')
haikus_test_df = pd.read_pickle('./data/haikus_test_df.pickle')

## RNN words

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Input, LSTM, Dropout
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [11]:
flatten = lambda l: [item for sublist in l for item in sublist]

corpuswords_raw = flatten(list(haikus_train_df['text_withtokens_clean']))

words = sorted(set(corpuswords_raw))
word_to_int = dict((w, i) for i, w in enumerate(words))

n_words = len(corpuswords_raw)
n_vocab_words = len(words)
print("Total Words: ", n_words)
print("Total Vocab: ", n_vocab_words)

Total Words:  447891
Total Vocab:  24046


In [18]:
n_poems = len(haikus_train_df)

n_poems

25128

In [13]:
words

['',
 '<eNd>',
 '<nEXt>',
 'a',
 'aaaa',
 'aah',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abating',
 'abattoir',
 'abbess',
 'abbey',
 'abbot',
 'abbott',
 'abc',
 'abcs',
 'abduction',
 'abed',
 'abel',
 'abelard',
 'aberration',
 'abhor',
 'abhorred',
 'abide',
 'abilene',
 'abjure',
 'ablaze',
 'able',
 'ablowing',
 'aboard',
 'abode',
 'abodes',
 'abolish',
 'abominable',
 'abord',
 'abortion',
 'abound',
 'about',
 'above',
 'abraham',
 'abramoff',
 'abreast',
 'abriman',
 'abroad',
 'abrupt',
 'abruptly',
 'absalom',
 'abscond',
 'absence',
 'absent',
 'absently',
 'absinthe',
 'absolute',
 'absolutely',
 'absolution',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbs',
 'abstain',
 'abstemious',
 'abstract',
 'abstracted',
 'abstractedlyone',
 'abstraction',
 'absurd',
 'abundance',
 'abundant',
 'abuse',
 'abuses',
 'abydos',
 'abyss',
 'abysses',
 'acacia',
 'academy',
 'acadian',
 'acadians',
 'acc',
 'acceleration',
 'accent',
 'accents',
 'accept',
 'acceptable',


In [22]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 15

poemX = []
poemY = []
n_patterns = 0

corpusX = []
corpusY = []
for poem_index in range(0, n_poems):

    wordX = []
    wordY = []
    poem = haikus_train_df['text_withtokens_clean'].iloc[poem_index]
    # add padding to poem
    poem = list(np.full(seq_length - 1, '')) + list(poem)
    for i in range(0,  len(poem) - seq_length, 1):
        seq_in = poem[i:i + seq_length]
        seq_out = poem[i + seq_length]
        wordX.append([word_to_int[word] for word in seq_in])
        wordY.append(word_to_int[seq_out])

    n_patterns = max(n_patterns, len(wordX))
    
    poemX.append(wordX)
    poemY.append(wordY)
    
    corpusX += wordX
    corpusY += wordY

n_wordpatterns = len(corpusX)
print("Total Patterns: ", n_wordpatterns)

Total Patterns:  422763


In [15]:
seq_in

['',
 '',
 '',
 '',
 'quaker',
 'meeting',
 '<nEXt>',
 'through',
 'stovepipe',
 'heated',
 'air',
 '<nEXt>',
 'the',
 'world',
 'shimmers']

In [19]:
corpusX

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 633],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 633, 13971],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 633, 13971, 2],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872],
 [0, 0, 0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771],
 [0, 0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0],
 [0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0],
 [0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0, 0],
 [0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0, 0, 2],
 [0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0, 0, 2, 254],
 [0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0, 0, 2, 254, 2244],
 [0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0, 0, 2, 254, 2244, 19965],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 609],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 609, 1

In [20]:
len(corpusY)

422763

In [23]:
# reshape X to be [samples, time steps, features]
X = np.reshape(corpusX, (n_wordpatterns, seq_length, 1))
# normalize
X = X / float(n_vocab_words)
# one hot encode the output variable
y = to_categorical(corpusY)

In [25]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True, activation='relu'))
model.add(Dropout(0.2))
model.add(LSTM(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [11]:
model = load_model('weights/word/word_weights-cont-168-4.9039.hdf5')




In [26]:
# define the checkpoint
filepath="weights/word/word-weights-new-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [None]:
# fit the model
history = model.fit(X, y, epochs=200, batch_size=128, callbacks=callbacks_list)


Epoch 1/200

Epoch 00001: loss improved from inf to 6.38391, saving model to weights/word/word-weights-new-01-6.3839.hdf5
Epoch 2/200

Epoch 00002: loss improved from 6.38391 to 6.18402, saving model to weights/word/word-weights-new-02-6.1840.hdf5
Epoch 3/200

Epoch 00003: loss improved from 6.18402 to 6.07430, saving model to weights/word/word-weights-new-03-6.0743.hdf5
Epoch 4/200

Epoch 00004: loss improved from 6.07430 to 5.99686, saving model to weights/word/word-weights-new-04-5.9969.hdf5
Epoch 5/200

Epoch 00005: loss improved from 5.99686 to 5.93595, saving model to weights/word/word-weights-new-05-5.9360.hdf5
Epoch 6/200

Epoch 00006: loss improved from 5.93595 to 5.88276, saving model to weights/word/word-weights-new-06-5.8828.hdf5
Epoch 7/200

Epoch 00007: loss improved from 5.88276 to 5.83469, saving model to weights/word/word-weights-new-07-5.8347.hdf5
Epoch 8/200

Epoch 00008: loss improved from 5.83469 to 5.78814, saving model to weights/word/word-weights-new-08-5.7881.

Epoch 40/200

Epoch 00040: loss improved from 4.54017 to 4.52037, saving model to weights/word/word-weights-new-40-4.5204.hdf5
Epoch 41/200

Epoch 00041: loss improved from 4.52037 to 4.50351, saving model to weights/word/word-weights-new-41-4.5035.hdf5
Epoch 42/200

Epoch 00042: loss improved from 4.50351 to 4.48624, saving model to weights/word/word-weights-new-42-4.4862.hdf5
Epoch 43/200

Epoch 00043: loss improved from 4.48624 to 4.46843, saving model to weights/word/word-weights-new-43-4.4684.hdf5
Epoch 44/200

Epoch 00044: loss improved from 4.46843 to 4.45437, saving model to weights/word/word-weights-new-44-4.4544.hdf5
Epoch 45/200

Epoch 00045: loss improved from 4.45437 to 4.43793, saving model to weights/word/word-weights-new-45-4.4379.hdf5
Epoch 46/200

Epoch 00046: loss improved from 4.43793 to 4.42287, saving model to weights/word/word-weights-new-46-4.4229.hdf5
Epoch 47/200

Epoch 00047: loss improved from 4.42287 to 4.40911, saving model to weights/word/word-weights-new

Epoch 80/200

Epoch 00080: loss improved from 4.12295 to 4.11441, saving model to weights/word/word-weights-new-80-4.1144.hdf5
Epoch 81/200

Epoch 00081: loss improved from 4.11441 to 4.10950, saving model to weights/word/word-weights-new-81-4.1095.hdf5
Epoch 82/200

Epoch 00082: loss improved from 4.10950 to 4.10216, saving model to weights/word/word-weights-new-82-4.1022.hdf5
Epoch 83/200

Epoch 00083: loss improved from 4.10216 to 4.09761, saving model to weights/word/word-weights-new-83-4.0976.hdf5
Epoch 84/200

Epoch 00084: loss improved from 4.09761 to 4.09005, saving model to weights/word/word-weights-new-84-4.0900.hdf5
Epoch 85/200

Epoch 00085: loss improved from 4.09005 to 4.08490, saving model to weights/word/word-weights-new-85-4.0849.hdf5
Epoch 86/200

Epoch 00086: loss improved from 4.08490 to 4.08000, saving model to weights/word/word-weights-new-86-4.0800.hdf5
Epoch 87/200

Epoch 00087: loss improved from 4.08000 to 4.07767, saving model to weights/word/word-weights-new

Epoch 120/200

Epoch 00120: loss improved from 3.95244 to 3.94920, saving model to weights/word/word-weights-new-120-3.9492.hdf5
Epoch 121/200

Epoch 00121: loss improved from 3.94920 to 3.94663, saving model to weights/word/word-weights-new-121-3.9466.hdf5
Epoch 122/200

Epoch 00122: loss improved from 3.94663 to 3.94247, saving model to weights/word/word-weights-new-122-3.9425.hdf5
Epoch 123/200

Epoch 00123: loss improved from 3.94247 to 3.94105, saving model to weights/word/word-weights-new-123-3.9410.hdf5
Epoch 124/200

Epoch 00124: loss improved from 3.94105 to 3.93755, saving model to weights/word/word-weights-new-124-3.9376.hdf5
Epoch 125/200

Epoch 00125: loss improved from 3.93755 to 3.93667, saving model to weights/word/word-weights-new-125-3.9367.hdf5
Epoch 126/200

Epoch 00126: loss improved from 3.93667 to 3.93298, saving model to weights/word/word-weights-new-126-3.9330.hdf5
Epoch 127/200

Epoch 00127: loss improved from 3.93298 to 3.93229, saving model to weights/word/w


Epoch 00162: loss did not improve from 3.85572
Epoch 163/200

Epoch 00163: loss improved from 3.85572 to 3.85455, saving model to weights/word/word-weights-new-163-3.8545.hdf5
Epoch 164/200

Epoch 00164: loss improved from 3.85455 to 3.85128, saving model to weights/word/word-weights-new-164-3.8513.hdf5
Epoch 165/200

Epoch 00165: loss improved from 3.85128 to 3.85085, saving model to weights/word/word-weights-new-165-3.8509.hdf5
Epoch 166/200

Epoch 00166: loss improved from 3.85085 to 3.85016, saving model to weights/word/word-weights-new-166-3.8502.hdf5
Epoch 167/200

Epoch 00167: loss improved from 3.85016 to 3.84825, saving model to weights/word/word-weights-new-167-3.8483.hdf5
Epoch 168/200

Epoch 00168: loss improved from 3.84825 to 3.84748, saving model to weights/word/word-weights-new-168-3.8475.hdf5
Epoch 169/200

Epoch 00169: loss improved from 3.84748 to 3.84222, saving model to weights/word/word-weights-new-169-3.8422.hdf5
Epoch 170/200

Epoch 00170: loss did not improve 