In [1]:
import pandas as pd
import numpy as np

import string
import re

RANDOM = 42

Load the data

In [2]:
haikus_train_df = pd.read_pickle('./data/haikus_train_df.pickle')
haikus_test_df = pd.read_pickle('./data/haikus_test_df.pickle')

## RNN letters

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Input, LSTM, Dropout
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
class PoetryGenerator():
    
    def __init__(self, engine, tokenize=):
        
        super().__init__()
        
        self.engine = engine
        self.tokenize = tokenize
    
    def load_corpus(self, train, valid, special_tokens):
        """
        Assumes that special tokens have already been put into the dataset
        
        Parameters
        ---
        train:          training corpus
        valid:          validation corpus
        special_tokens: dictionary of the form token_type: token
                        must include 'newline' and 'endpoem' tokens
                        other possible tokens include 'newstanza'
        """
        pass
    
    def transform(self, corpus, seq_len)
    
        poem_count = len(corpus)
        self.pattern_count = 0
        
        # prepare the dataset of input to output pairs encoded as integers
        self.seq_len = seq_len

        self.poemX = []
        self.poemY = []
        self.pattern_count = 0

        self.corpusX = []
        self.corpusY = []
        for poem_index in range(0, poem_count):

            textX = []
            textY = []
            
            poem = corpus[poem_index]
            # add padding to poem
            poem = list(np.full(seq_length - 1, '')) + list(poem)
            
            for i in range(0,  len(poem) - seq_len, 1):
                seq_in = poem[i:i + seq_len]
                seq_out = poem[i + seq_len]
                textX.append([self.token_to_int[token] for token in seq_in])
                textY.append(self.token_to_int[seq_out])

            self.pattern_count = max(self.pattern_count, len(textX))

            self.poemX.append(textX)
            self.poemY.append(textY)

            self.corpusX += textX
            self.corpusY += textY
    
    def create_dict(self):
        
        # create corpus_raw
        
        self.tokens = sorted(set(list(corpus_raw)))
        self.token_to_int = dict((t, i) for i, t in enumerate(self.tokens))
        self.int_to_token = dict((i, t) for i, t in enumerate(self.tokens))
        
        self.token_count = len(corpus_raw)
        self.vocab_count = len(tokens)

    
    def fit(self):
                
        self.fitted = True
    
    def generate(self, temperature=1.0):
        
        if not self.fitted:
            raise ValueError('Model not fitted')

In [None]:
class PoetryGenByWord(PoetryGenerator):
    
    def __init__(self, engine)
    
        super().__init__(engine, tokenize='word')

In [4]:
corpus_raw = ''.join(haikus_train_df['textchar_withtokens'])

chars = sorted(set(list(corpus_raw)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

n_chars = len(corpus_raw)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  1849446
Total Vocab:  107


In [5]:
char_to_int[''] = n_vocab

In [6]:
n_poems = len(haikus_train_df)

n_poems

25128

In [None]:
"""
i
love
you


hello
world

---

seq = 5

[0 0 0 0 i] -> \n
[0 0 0 i \n] -> love
[0 0 i \n love] -> \n
...
[ ... \n you] -> \end
[0 0 0 0 hello] -> \n
"""

In [7]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 150

poemX = []
poemY = []
n_patterns = 0

for poem_index in range(0, n_poems):

    textX = []
    textY = []
    poem = haikus_train_df['textchar_withtokens'].iloc[poem_index]
    # add padding to poem
    poem = list(np.full(seq_length - 1, '')) + list(poem)
    for i in range(0,  len(poem) - seq_length, 1):
        seq_in = poem[i:i + seq_length]
        seq_out = poem[i + seq_length]
        textX.append([char_to_int[char] for char in seq_in])
        textY.append(char_to_int[seq_out])

    n_patterns = max(n_patterns, len(textX))
    
    poemX.append(textX)
    poemY.append(textY)

print("Max patterns per poem: ", n_patterns)

Max patterns per poem:  801


In [9]:
endpoem_charindex = char_to_int['◘']
newline_charindex = char_to_int['↕']

In [10]:
# reshape X to be [samples, time steps, features]
#X = np.reshape(poemX, (n_patterns, seq_length, n_poems))
# normalize
X = np.array([np.array([char / float(n_chars) for char in seq]) for poem in poemX for seq in poem])
X = np.reshape(X, (X.shape[0], X.shape[1], 1))
# one hot encode the output variable
y = to_categorical([nextchar for poem in poemY for nextchar in poem])

In [16]:
X.shape

(1824318, 150, 1)

In [14]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True, activation='relu'))
model.add(Dropout(0.2))
model.add(LSTM(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [11]:
# define the checkpoint
filepath="weights/letter/letter-weights-corrected-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [14]:
model = load_model('weights/letter/letter-weights-new-25-3.0578.hdf5')

In [17]:
# fit the model
history = model.fit(X, y, epochs=200, batch_size=128, callbacks=callbacks_list)

Epoch 1/200

Epoch 00001: loss improved from inf to 6792965275621.73047, saving model to weights/letter/letter-weights-corrected-01-6792965275621.7305.hdf5
Epoch 2/200

Epoch 00002: loss improved from 6792965275621.73047 to 3.05950, saving model to weights/letter/letter-weights-corrected-02-3.0595.hdf5
Epoch 3/200

Epoch 00003: loss improved from 3.05950 to 3.05900, saving model to weights/letter/letter-weights-corrected-03-3.0590.hdf5
Epoch 4/200

Epoch 00004: loss improved from 3.05900 to 3.05878, saving model to weights/letter/letter-weights-corrected-04-3.0588.hdf5
Epoch 5/200

Epoch 00005: loss improved from 3.05878 to 3.05867, saving model to weights/letter/letter-weights-corrected-05-3.0587.hdf5
Epoch 6/200

Epoch 00006: loss improved from 3.05867 to 3.05853, saving model to weights/letter/letter-weights-corrected-06-3.0585.hdf5
Epoch 7/200

Epoch 00007: loss improved from 3.05853 to 3.05838, saving model to weights/letter/letter-weights-corrected-07-3.0584.hdf5
Epoch 8/200

Epo


Epoch 00042: loss did not improve from 3.05773
Epoch 43/200

Epoch 00043: loss did not improve from 3.05773
Epoch 44/200

Epoch 00044: loss did not improve from 3.05773
Epoch 45/200

Epoch 00045: loss did not improve from 3.05773
Epoch 46/200

Epoch 00046: loss did not improve from 3.05773
Epoch 47/200

KeyboardInterrupt: 