In [1]:
import pandas as pd
import numpy as np

import string
import re

RANDOM = 42

Load the data

In [2]:
haikus_train_df = pd.read_pickle('./data/haikus_train_df.pickle')
haikus_test_df = pd.read_pickle('./data/haikus_test_df.pickle')

## RNN words

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Input, LSTM, Dropout
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
flatten = lambda l: [item for sublist in l for item in sublist]

corpuswords_raw = [item for item in flatten(list(haikus_train_df['text_withtokens_clean'])) if item != '']

words = sorted(set(corpuswords_raw))
word_to_int = dict((w, i) for i, w in enumerate(words))

n_words = len(corpuswords_raw)
n_vocab_words = len(words)
print("Total Words: ", n_words)
print("Total Vocab: ", n_vocab_words)

Total Words:  421611
Total Vocab:  24045


In [6]:
words

['<eNd>',
 '<nEXt>',
 'a',
 'aaaa',
 'aah',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abating',
 'abattoir',
 'abbess',
 'abbey',
 'abbot',
 'abbott',
 'abc',
 'abcs',
 'abduction',
 'abed',
 'abel',
 'abelard',
 'aberration',
 'abhor',
 'abhorred',
 'abide',
 'abilene',
 'abjure',
 'ablaze',
 'able',
 'ablowing',
 'aboard',
 'abode',
 'abodes',
 'abolish',
 'abominable',
 'abord',
 'abortion',
 'abound',
 'about',
 'above',
 'abraham',
 'abramoff',
 'abreast',
 'abriman',
 'abroad',
 'abrupt',
 'abruptly',
 'absalom',
 'abscond',
 'absence',
 'absent',
 'absently',
 'absinthe',
 'absolute',
 'absolutely',
 'absolution',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbs',
 'abstain',
 'abstemious',
 'abstract',
 'abstracted',
 'abstractedlyone',
 'abstraction',
 'absurd',
 'abundance',
 'abundant',
 'abuse',
 'abuses',
 'abydos',
 'abyss',
 'abysses',
 'acacia',
 'academy',
 'acadian',
 'acadians',
 'acc',
 'acceleration',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'acc

In [7]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 3
wordX = []
wordY = []
for i in range(0, n_words - seq_length, 1):
    seq_in = corpuswords_raw[i:i + seq_length]
    seq_out = corpuswords_raw[i + seq_length]
    wordX.append([word_to_int[word] if word != '' else '' for word in seq_in])
    wordY.append(word_to_int[seq_out] if seq_out != '' else '')
n_wordpatterns = len(wordX)
print("Total Patterns: ", n_wordpatterns)

Total Patterns:  421608


In [119]:
seq_in

['<nEXt>', 'the', '<nEXt>']

In [117]:
wordX

[[14233, 5760, 1],
 [5760, 1, 2],
 [1, 2, 20322],
 [2, 20322, 8895],
 [20322, 8895, 7067],
 [8895, 7067, 1],
 [7067, 1, 25939],
 [1, 25939, 5379],
 [25939, 5379, 0],
 [5379, 0, 21758],
 [0, 21758, 18188],
 [21758, 18188, 1],
 [18188, 1, 1131],
 [1, 1131, 23284],
 [1131, 23284, 6611],
 [23284, 6611, 21565],
 [6611, 21565, 1],
 [21565, 1, 11425],
 [1, 11425, 23356],
 [11425, 23356, 15642],
 [23356, 15642, 13227],
 [15642, 13227, 0],
 [13227, 0, 21758],
 [0, 21758, 14713],
 [21758, 14713, 1],
 [14713, 1, 2],
 [1, 2, 19086],
 [2, 19086, 1541],
 [19086, 1541, 8895],
 [1541, 8895, 1],
 [8895, 1, 2714],
 [1, 2714, 0],
 [2714, 0, 22563],
 [0, 22563, 366],
 [22563, 366, 1],
 [366, 1, 700],
 [1, 700, 15693],
 [700, 15693, 13835],
 [15693, 13835, 13282],
 [13835, 13282, 1],
 [13282, 1, 15152],
 [1, 15152, 23284],
 [15152, 23284, 13769],
 [23284, 13769, 0],
 [13769, 0, 4029],
 [0, 4029, 5780],
 [4029, 5780, 14106],
 [5780, 14106, 1],
 [14106, 1, 11189],
 [1, 11189, 19276],
 [11189, 19276, 1],
 [19

In [103]:
len(wordY)

528336

In [8]:
# reshape X to be [samples, time steps, features]
X = np.reshape(wordX, (n_wordpatterns, seq_length, 1))
# normalize
X = X / float(n_vocab_words)
# one hot encode the output variable
y = to_categorical(wordY)

In [9]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# define the checkpoint
filepath="word_weights-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# fit the model
model.fit(X, y, epochs=200, batch_size=128, callbacks=callbacks_list)


Epoch 1/200

Epoch 00001: loss improved from inf to 6.62133, saving model to word_weights-01-6.6213.hdf5
Epoch 2/200

Epoch 00002: loss improved from 6.62133 to 6.43705, saving model to word_weights-02-6.4371.hdf5
Epoch 3/200

Epoch 00003: loss improved from 6.43705 to 6.38342, saving model to word_weights-03-6.3834.hdf5
Epoch 4/200

Epoch 00004: loss improved from 6.38342 to 6.34354, saving model to word_weights-04-6.3435.hdf5
Epoch 5/200

Epoch 00005: loss improved from 6.34354 to 6.31035, saving model to word_weights-05-6.3103.hdf5
Epoch 6/200

Epoch 00006: loss improved from 6.31035 to 6.28251, saving model to word_weights-06-6.2825.hdf5
Epoch 7/200

Epoch 00007: loss improved from 6.28251 to 6.25696, saving model to word_weights-07-6.2570.hdf5
Epoch 8/200

Epoch 00008: loss improved from 6.25696 to 6.23163, saving model to word_weights-08-6.2316.hdf5
Epoch 9/200

Epoch 00009: loss improved from 6.23163 to 6.20634, saving model to word_weights-09-6.2063.hdf5
Epoch 10/200
   128/42



   256/421608 [..............................] - ETA: 1:32:14 - loss: 6.1494




Epoch 00010: loss improved from 6.20634 to 6.17867, saving model to word_weights-10-6.1787.hdf5
Epoch 11/200

Epoch 00011: loss improved from 6.17867 to 6.15198, saving model to word_weights-11-6.1520.hdf5
Epoch 12/200

Epoch 00012: loss improved from 6.15198 to 6.12394, saving model to word_weights-12-6.1239.hdf5
Epoch 13/200

Epoch 00013: loss improved from 6.12394 to 6.09786, saving model to word_weights-13-6.0979.hdf5
Epoch 14/200

Epoch 00014: loss improved from 6.09786 to 6.07159, saving model to word_weights-14-6.0716.hdf5
Epoch 15/200