In [1]:
import pandas as pd
import numpy as np

import string
import re

RANDOM = 42

Load the data

In [2]:
haikus_train_df = pd.read_pickle('./data/haikus_train_df.pickle')
haikus_test_df = pd.read_pickle('./data/haikus_test_df.pickle')

## RNN letters

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Input, LSTM, Dropout
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
corpus_raw = ''.join(haikus_train_df['textchar_withtokens'])

chars = sorted(set(list(corpus_raw)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

n_chars = len(corpus_raw)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  1849446
Total Vocab:  107


In [5]:
chars

[' ',
 '!',
 '"',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '~',
 '\x85',
 '\x92',
 '\x96',
 '\x97',
 '\xa0',
 'à',
 'ä',
 'é',
 'ü',
 'ē',
 'ū',
 'ŭ',
 '\u200b',
 '‘',
 '’',
 '“',
 '”',
 '…',
 '↕',
 '◘']

In [6]:
n_poems = len(haikus_train_df)

n_poems

25128

In [7]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 5

poemX = []
poemY = []
n_patterns = 0

corpusX = []
corpusY = []
for poem_index in range(0, n_poems):

    textX = []
    textY = []
    poem = haikus_train_df['textchar_withtokens'].iloc[poem_index]
    for i in range(0,  len(poem) - seq_length, 1):
        seq_in = poem[i:i + seq_length]
        seq_out = poem[i + seq_length]
        textX.append([char_to_int[char] for char in seq_in])
        textY.append(char_to_int[seq_out])
    n_patterns = max(n_patterns, len(textX))
    
    poemX.append(textX)
    poemY.append(textY)
    
    corpusX += textX
    corpusY += textY

print("Max patterns per poem: ", n_patterns)

Max patterns per poem:  797


In [8]:
endpoem_charindex = char_to_int['◘']
newline_charindex = char_to_int['↕']

In [9]:
pd.DataFrame(corpusX)

Unnamed: 0,0,1,2,3,4
0,59,72,0,73,59
1,72,0,73,59,77
2,0,73,59,77,67
3,73,59,77,67,77
4,59,77,67,77,105
...,...,...,...,...,...
1723801,0,77,66,67,71
1723802,77,66,67,71,71
1723803,66,67,71,71,63
1723804,67,71,71,63,76


In [10]:
corpusY

[77,
 67,
 77,
 105,
 67,
 72,
 0,
 78,
 66,
 63,
 0,
 30,
 67,
 60,
 70,
 63,
 0,
 30,
 63,
 70,
 78,
 0,
 10,
 10,
 105,
 59,
 62,
 79,
 70,
 78,
 0,
 60,
 73,
 73,
 69,
 0,
 77,
 78,
 73,
 76,
 63,
 106,
 78,
 78,
 67,
 0,
 77,
 73,
 72,
 72,
 63,
 78,
 0,
 82,
 82,
 80,
 67,
 105,
 63,
 105,
 77,
 74,
 63,
 72,
 77,
 63,
 76,
 106,
 0,
 81,
 66,
 63,
 72,
 0,
 78,
 66,
 63,
 83,
 0,
 61,
 59,
 71,
 63,
 0,
 71,
 67,
 72,
 62,
 0,
 77,
 79,
 64,
 64,
 63,
 76,
 63,
 62,
 0,
 77,
 66,
 59,
 71,
 63,
 105,
 58,
 78,
 66,
 63,
 77,
 63,
 0,
 60,
 63,
 0,
 78,
 66,
 63,
 0,
 77,
 59,
 71,
 63,
 0,
 59,
 72,
 62,
 0,
 72,
 73,
 78,
 0,
 78,
 66,
 63,
 0,
 77,
 59,
 71,
 63,
 105,
 59,
 10,
 81,
 73,
 72,
 62,
 63,
 76,
 67,
 72,
 65,
 0,
 81,
 66,
 67,
 77,
 74,
 63,
 76,
 63,
 62,
 0,
 71,
 67,
 72,
 62,
 106,
 61,
 73,
 72,
 80,
 63,
 76,
 77,
 59,
 78,
 67,
 73,
 72,
 105,
 78,
 66,
 63,
 0,
 62,
 59,
 64,
 64,
 73,
 62,
 67,
 70,
 77,
 0,
 72,
 73,
 62,
 62,
 67,
 72,
 65,
 105,
 37,

In [11]:
# reshape X to be [samples, time steps, features]
#X = np.reshape(poemX, (n_patterns, seq_length, n_poems))
# normalize
X = np.array([np.array([char / float(n_chars) for char in seq]) for poem in poemX for seq in poem])
X = np.reshape(X, (X.shape[0], X.shape[1], 1))
# one hot encode the output variable
y = to_categorical([nextchar for poem in poemY for nextchar in poem])

In [12]:
# reshape X to be [samples, time steps, features]
#X = np.reshape(poemX, (n_patterns, seq_length, n_poems))
# normalize
X = np.reshape(X, (len(corpusX), seq_length, 1))
# one hot encode the output variable
y = to_categorical(corpusY)

In [13]:
X.shape

(1723806, 5, 1)

In [21]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [18]:
# define the checkpoint
filepath="letter-weights-cont-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [14]:
model = load_model('letter-weights-132-2.4074.hdf5')




In [None]:
# fit the model
model.fit(X, y, epochs=68, batch_size=128, callbacks=callbacks_list)

Epoch 1/68

Epoch 00001: loss improved from inf to 2.40626, saving model to letter-weights-cont-01-2.4063.hdf5
Epoch 2/68

Epoch 00002: loss improved from 2.40626 to 2.40516, saving model to letter-weights-cont-02-2.4052.hdf5
Epoch 3/68

Epoch 00003: loss improved from 2.40516 to 2.40253, saving model to letter-weights-cont-03-2.4025.hdf5
Epoch 4/68

Epoch 00004: loss improved from 2.40253 to 2.40108, saving model to letter-weights-cont-04-2.4011.hdf5
Epoch 5/68

Epoch 00005: loss improved from 2.40108 to 2.39877, saving model to letter-weights-cont-05-2.3988.hdf5
Epoch 6/68

Epoch 00006: loss improved from 2.39877 to 2.39739, saving model to letter-weights-cont-06-2.3974.hdf5
Epoch 7/68

Epoch 00007: loss improved from 2.39739 to 2.39518, saving model to letter-weights-cont-07-2.3952.hdf5
Epoch 8/68

Epoch 00008: loss improved from 2.39518 to 2.39436, saving model to letter-weights-cont-08-2.3944.hdf5
Epoch 9/68

Epoch 00009: loss did not improve from 2.39436
Epoch 10/68

Epoch 00010: