In [1]:
import pandas as pd
import numpy as np

import string
import re

RANDOM = 42

Load the data

In [2]:
# Load the pre-split train / test haiku frames.
# NOTE: pickle files execute arbitrary code on load -- only read files you produced yourself.
haikus_train_df, haikus_test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('./data/haikus_train_df.pickle', './data/haikus_test_df.pickle')
)

## RNN words

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Input, LSTM, Dropout
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


# Every token of every training poem, in corpus order.
corpuswords_raw = flatten(list(haikus_train_df['text_withtokens']))

# Vocabulary: the distinct tokens in sorted order; a token's id is its position in `words`.
words = sorted(set(corpuswords_raw))
word_to_int = {w: i for i, w in enumerate(words)}
int_to_word = dict(enumerate(words))

n_vocab_words = len(words)
n_words = len(corpuswords_raw)
print("Total Words: ", n_words)
print("Total Vocab: ", n_vocab_words)

Total Words:  419015
Total Vocab:  36780


In [5]:
# Number of training poems (one row per poem).
n_poems = haikus_train_df.shape[0]

n_poems

25128

In [6]:
word_to_int["<nEXt>"]

542

In [7]:
import os

from gensim.models import KeyedVectors
# The converter function lives inside the module of the same name; importing the
# module itself (or the misspelled package `gemsim`) would fail at import/call time.
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = './data/image_to_text/glove.840B.300d.txt'
tmp_file = './data/image_to_text/glovetmp.txt'

# The GloVe -> word2vec text conversion is expensive, so only run it when the
# converted file is not already on disk.
if not os.path.isfile(tmp_file):
    glove2word2vec(glove_file, tmp_file)

# Keyed word -> vector lookup used by the embedding helper cells.
glove_model = KeyedVectors.load_word2vec_format(tmp_file)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
def buildWordVector(text, size, model=None):
    """Average the embedding vectors of the words in ``text``.

    Parameters
    ----------
    text : iterable of str
        Words to look up.
    size : int
        Dimensionality of the embedding vectors.
    model : mapping-like, optional
        Object supporting ``model[word]`` that returns a vector of ``size``
        elements and raises ``KeyError`` for unknown words. Defaults to the
        notebook-level ``glove_model``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)`` holding the mean of the vectors that were
        found. Words missing from ``model`` are skipped; if no word is found
        (including empty ``text``) the result is all zeros.
    """
    if model is None:
        # Resolved at call time so the cell can be defined before the model is loaded.
        model = glove_model

    vec = np.zeros((1, size))
    count = 0
    for word in text:
        try:
            vec += np.asarray(model[word]).reshape((1, size))
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the average.
            continue
        count += 1
    if count:
        vec /= count
    return vec

In [10]:
# Build (context window -> next word) training pairs, encoded as vocabulary ids.
seq_length = 15

# Per-poem pattern lists, plus the size of the largest per-poem list.
poemX = []
poemY = []
n_patterns = 0

# The same patterns flattened over the whole corpus.
corpusX = []
corpusY = []
for poem_index in range(n_poems):
    poem = haikus_train_df['text_withtokens'].iloc[poem_index]
    # Left-pad with seq_length - 1 empty tokens: the first window is then the
    # padding followed by the poem's first token, and its target is the second token.
    poem = list(np.full(seq_length - 1, '')) + list(poem)

    wordX = []
    wordY = []
    # Slide a seq_length-wide window over the padded poem; the token right
    # after each window is the word to predict.
    for i in range(len(poem) - seq_length):
        seq_in = poem[i:i + seq_length]
        seq_out = poem[i + seq_length]
        wordX.append([word_to_int[word] for word in seq_in])
        wordY.append(word_to_int[seq_out])

    poemX.append(wordX)
    poemY.append(wordY)
    corpusX.extend(wordX)
    corpusY.extend(wordY)
    n_patterns = max(n_patterns, len(wordX))

n_wordpatterns = len(corpusX)
print("Total Patterns: ", n_wordpatterns)

Total Patterns:  393887


In [11]:
# associate an embedding with each sequence
EMBEDDING_DIM = 300  # vector size passed to buildWordVector (the GloVe file is the 300d set)

# Embed every vocabulary id exactly once (row i <-> word id i), then gather the
# rows for each window by integer indexing. This yields the same
# (n_patterns, seq_length, EMBEDDING_DIM) array as embedding token-by-token, but
# with one lookup per vocabulary word instead of one per token occurrence.
embedding_matrix = np.array(
    [buildWordVector([int_to_word[word_id]], EMBEDDING_DIM)[0] for word_id in range(len(int_to_word))]
)
haiku_glove_train = embedding_matrix[np.array(corpusX)]

In [12]:
haiku_glove_train.shape

(393887, 15, 300)

In [31]:
seq_in

['',
 '',
 '',
 '',
 '',
 'quaker',
 'meeting',
 '<nEXt>',
 'through',
 'stovepipe-heated',
 'air',
 '<nEXt>',
 'the',
 'world',
 'shimmers']

In [19]:
# Preview only the first windows; displaying the whole list floods the notebook output.
corpusX[:15]

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 633],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 633, 13971],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 633, 13971, 2],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872],
 [0, 0, 0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771],
 [0, 0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0],
 [0, 0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0],
 [0, 0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0, 0],
 [0, 0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0, 0, 2],
 [0, 0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0, 0, 2, 254],
 [0, 0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0, 0, 2, 254, 2244],
 [0, 633, 13971, 2, 10474, 20998, 1872, 1771, 0, 0, 0, 2, 254, 2244, 19965],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 609],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 609, 1

In [36]:
len(corpusY)

393887

In [13]:
# Model input: the embedded windows are used as-is; haiku_glove_train is already
# laid out as [samples, time steps, features], so no reshape is applied.
X = haiku_glove_train # no reshape / normalisation step
# Targets are kept as integer word ids (NOT one-hot encoded); this matches the
# sparse categorical loss the model is compiled with.
y = np.array(corpusY)

In [14]:
# Two stacked LSTM layers with dropout, followed by a softmax over the vocabulary.
model = Sequential()
# First LSTM returns the full sequence so the second LSTM receives one vector per time step.
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True, activation='relu'))
model.add(Dropout(0.2))
model.add(LSTM(256, activation='relu'))
model.add(Dropout(0.2))
# One output unit per vocabulary word. The targets in `y` are word ids, so the
# layer width must be the vocabulary size -- not y.shape[0], which is the number
# of training samples and would create a vastly oversized output layer.
model.add(Dense(n_vocab_words, activation='softmax'))
# Sparse loss/metric because `y` holds integer class ids rather than one-hot rows.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])

In [11]:
# Load a previously saved model from disk and rebind `model` to it; when the cells
# run top to bottom this replaces whatever `model` was bound to before.
# NOTE(review): this file name ("wordembed_weights-cont-...") does not follow the
# "wordembed-weights-{epoch}-{loss}" pattern used by the checkpoint callback in this
# notebook -- presumably it comes from a separate continued-training run; confirm
# the file exists before a Restart & Run All, or skip this cell to train from scratch.
model = load_model('weights/word_embedding/wordembed_weights-cont-168-4.9039.hdf5')




In [15]:
# Checkpointing: write the model to disk whenever the training loss hits a new minimum.
# The epoch number and loss value are substituted into the file name.
filepath = "weights/word_embedding/wordembed-weights-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [16]:
# Train the network; `history` keeps the object returned by fit().
history = model.fit(
    X,
    y,
    batch_size=128,
    epochs=200,
    callbacks=callbacks_list,
)


Epoch 1/200

Epoch 00001: loss improved from inf to 6.69822, saving model to weights/word_embedding/wordembed-weights-01-6.6982.hdf5
Epoch 2/200

Epoch 00002: loss improved from 6.69822 to 5.97268, saving model to weights/word_embedding/wordembed-weights-02-5.9727.hdf5
Epoch 3/200

Epoch 00003: loss improved from 5.97268 to 5.63862, saving model to weights/word_embedding/wordembed-weights-03-5.6386.hdf5
Epoch 4/200

Epoch 00004: loss improved from 5.63862 to 5.36636, saving model to weights/word_embedding/wordembed-weights-04-5.3664.hdf5
Epoch 5/200

Epoch 00005: loss improved from 5.36636 to 5.12394, saving model to weights/word_embedding/wordembed-weights-05-5.1239.hdf5
Epoch 6/200

Epoch 00006: loss improved from 5.12394 to 4.90071, saving model to weights/word_embedding/wordembed-weights-06-4.9007.hdf5
Epoch 7/200

Epoch 00007: loss improved from 4.90071 to 4.68798, saving model to weights/word_embedding/wordembed-weights-07-4.6880.hdf5
Epoch 8/200

Epoch 00008: loss improved from


Epoch 00032: loss improved from 2.93003 to 2.91003, saving model to weights/word_embedding/wordembed-weights-32-2.9100.hdf5
Epoch 33/200

Epoch 00033: loss improved from 2.91003 to 2.89407, saving model to weights/word_embedding/wordembed-weights-33-2.8941.hdf5
Epoch 34/200

Epoch 00034: loss improved from 2.89407 to 2.88158, saving model to weights/word_embedding/wordembed-weights-34-2.8816.hdf5
Epoch 35/200

Epoch 00035: loss improved from 2.88158 to 2.86229, saving model to weights/word_embedding/wordembed-weights-35-2.8623.hdf5
Epoch 36/200

Epoch 00036: loss improved from 2.86229 to 2.84656, saving model to weights/word_embedding/wordembed-weights-36-2.8466.hdf5
Epoch 37/200

Epoch 00037: loss improved from 2.84656 to 2.83421, saving model to weights/word_embedding/wordembed-weights-37-2.8342.hdf5
Epoch 38/200

Epoch 00038: loss improved from 2.83421 to 2.82106, saving model to weights/word_embedding/wordembed-weights-38-2.8211.hdf5
Epoch 39/200

Epoch 00039: loss improved from 2

Epoch 65/200

Epoch 00065: loss improved from 2.58978 to 2.58415, saving model to weights/word_embedding/wordembed-weights-65-2.5842.hdf5
Epoch 66/200

Epoch 00066: loss improved from 2.58415 to 2.58090, saving model to weights/word_embedding/wordembed-weights-66-2.5809.hdf5
Epoch 67/200

Epoch 00067: loss improved from 2.58090 to 2.57198, saving model to weights/word_embedding/wordembed-weights-67-2.5720.hdf5
Epoch 68/200

Epoch 00068: loss improved from 2.57198 to 2.56838, saving model to weights/word_embedding/wordembed-weights-68-2.5684.hdf5
Epoch 69/200

Epoch 00069: loss improved from 2.56838 to 2.56088, saving model to weights/word_embedding/wordembed-weights-69-2.5609.hdf5
Epoch 70/200

Epoch 00070: loss improved from 2.56088 to 2.55632, saving model to weights/word_embedding/wordembed-weights-70-2.5563.hdf5
Epoch 71/200

Epoch 00071: loss improved from 2.55632 to 2.55541, saving model to weights/word_embedding/wordembed-weights-71-2.5554.hdf5
Epoch 72/200

Epoch 00072: loss im


Epoch 00097: loss improved from 2.44628 to 2.44088, saving model to weights/word_embedding/wordembed-weights-97-2.4409.hdf5
Epoch 98/200

Epoch 00098: loss improved from 2.44088 to 2.43893, saving model to weights/word_embedding/wordembed-weights-98-2.4389.hdf5
Epoch 99/200

Epoch 00099: loss improved from 2.43893 to 2.43120, saving model to weights/word_embedding/wordembed-weights-99-2.4312.hdf5
Epoch 100/200

Epoch 00100: loss improved from 2.43120 to 2.43011, saving model to weights/word_embedding/wordembed-weights-100-2.4301.hdf5
Epoch 101/200

Epoch 00101: loss improved from 2.43011 to 2.42994, saving model to weights/word_embedding/wordembed-weights-101-2.4299.hdf5
Epoch 102/200

Epoch 00102: loss improved from 2.42994 to 2.42722, saving model to weights/word_embedding/wordembed-weights-102-2.4272.hdf5
Epoch 103/200

Epoch 00103: loss improved from 2.42722 to 2.42356, saving model to weights/word_embedding/wordembed-weights-103-2.4236.hdf5
Epoch 104/200

Epoch 00104: loss improv

KeyboardInterrupt: 