In [1]:
import pandas as pd
import numpy as np

import string
import re

RANDOM = 42

Load the data

In [2]:
haikus_train_df = pd.read_pickle('./data/haikus_train_df.pickle')
haikus_test_df = pd.read_pickle('./data/haikus_test_df.pickle')

## RNN letter testing

In [3]:
from keras.models import load_model

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
corpus_raw = ''.join(haikus_train_df['textchar_withtokens'])

chars = sorted(set(list(corpus_raw)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

n_chars = len(corpus_raw)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  1849446
Total Vocab:  107


In [5]:
n_poems = len(haikus_train_df)

In [6]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 5

poemX = []
poemY = []
n_patterns = 0

corpusX = []
corpusY = []
for poem_index in range(0, n_poems):

    textX = []
    textY = []
    poem = haikus_train_df['textchar_withtokens'].iloc[poem_index]
    for i in range(0,  len(poem) - seq_length, 1):
        seq_in = poem[i:i + seq_length]
        seq_out = poem[i + seq_length]
        textX.append([char_to_int[char] for char in seq_in])
        textY.append(char_to_int[seq_out])
    n_patterns = max(n_patterns, len(textX))
    
    poemX.append(textX)
    poemY.append(textY)
    
    corpusX += textX
    corpusY += textY

print("Max patterns per poem: ", n_patterns)

Max patterns per poem:  797


In [7]:
endpoem_charindex = char_to_int['◘']
newline_charindex = char_to_int['↕']

In [9]:
def poemchar(index):
    char = int_to_char[index]
    if char == '↕':
        char = '\n'
    #elif char == '◘':
    #    char = '' # represent end of poem
    return char

In [37]:
model = load_model('weights/letter/letter-weights-cont-68-2.3198.hdf5')

In [39]:
int_to_char = dict((i, c) for i, c in enumerate(chars))

start = np.random.randint(0, len(poemX)-1)
while start == endpoem_charindex: # don't start with end of the poem
    start = np.random.randint(0, len(poemX)-1)
    
pattern = (poemX[start][0]).copy()
gen_poem = (poemX[start][0]).copy()

[print(poemchar(char), end='') for char in pattern]
# generate characters
for i in range(100):
    x = np.reshape([char / float(n_vocab) for char in pattern], (1, len(pattern), 1))
    #print(x)
    prediction = model.predict(x, verbose=0) * float(n_vocab)
    #print(prediction)
    index = np.argmax(prediction)
    #print(index)
    result = poemchar(index)
    
    if result == '◘':
        break;
    
    seq_in = [int_to_char[value] for value in pattern]
    print(result, end='')
    pattern.append(index)
    gen_poem.append(index)
    pattern = pattern[1:len(pattern)]
    
print("\nDone.")
print([poemchar(char) for char in gen_poem])

novemoopooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
Done.
['n', 'o', 'v', 'e', 'm', 'o', 'o', 'p', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o']


## RNN word testing

In [15]:
flatten = lambda l: [item for sublist in l for item in sublist]

corpuswords_raw = [item for item in flatten(list(haikus_train_df['text_withtokens_clean'])) if item != '']

words = sorted(set(corpuswords_raw))
word_to_int = dict((w, i) for i, w in enumerate(words))

n_words = len(corpuswords_raw)
n_vocab_words = len(words)
print("Total Words: ", n_words)
print("Total Vocab: ", n_vocab_words)

Total Words:  421611
Total Vocab:  24045


In [16]:
words

['<eNd>',
 '<nEXt>',
 'a',
 'aaaa',
 'aah',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abating',
 'abattoir',
 'abbess',
 'abbey',
 'abbot',
 'abbott',
 'abc',
 'abcs',
 'abduction',
 'abed',
 'abel',
 'abelard',
 'aberration',
 'abhor',
 'abhorred',
 'abide',
 'abilene',
 'abjure',
 'ablaze',
 'able',
 'ablowing',
 'aboard',
 'abode',
 'abodes',
 'abolish',
 'abominable',
 'abord',
 'abortion',
 'abound',
 'about',
 'above',
 'abraham',
 'abramoff',
 'abreast',
 'abriman',
 'abroad',
 'abrupt',
 'abruptly',
 'absalom',
 'abscond',
 'absence',
 'absent',
 'absently',
 'absinthe',
 'absolute',
 'absolutely',
 'absolution',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbs',
 'abstain',
 'abstemious',
 'abstract',
 'abstracted',
 'abstractedlyone',
 'abstraction',
 'absurd',
 'abundance',
 'abundant',
 'abuse',
 'abuses',
 'abydos',
 'abyss',
 'abysses',
 'acacia',
 'academy',
 'acadian',
 'acadians',
 'acc',
 'acceleration',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'acc

In [17]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 3
wordX = []
wordY = []
for i in range(0, n_words - seq_length, 1):
    seq_in = corpuswords_raw[i:i + seq_length]
    seq_out = corpuswords_raw[i + seq_length]
    wordX.append([word_to_int[word] if word != '' else '' for word in seq_in])
    wordY.append(word_to_int[seq_out] if seq_out != '' else '')
n_wordpatterns = len(wordX)
print("Total Patterns: ", n_wordpatterns)

Total Patterns:  421608


In [115]:
wordX[start]

[10643, 1927, 23343, 1]

In [24]:
model = load_model('word_weights-26-5.7531.hdf5')

In [33]:
def print_token(token):
    
    if token == '<nEXt>':
        print()
    elif token == '<eNd>':
        print()
    else:
        print(token, end=' ')

In [36]:
int_to_word = dict((i, w) for i, w in enumerate(words))

start = np.random.randint(0, len(haikus_train_df)-1)
pattern = [word_to_int[word] for _, word in zip(range(0,seq_length), haikus_train_df['text_withtokens_clean'].iloc[start])]
gen_poem = pattern.copy()

[print_token(int_to_word[value]) for value in pattern]
# generate words
for i in range(100):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab_words)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = int_to_word[index]
    
    if result == '<eNd>':
        break;
    else:
        print_token(result)
    
    seq_in = [int_to_word[value] for value in pattern]
    
    pattern.append(index)
    gen_poem.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")
print([int_to_word[value] for value in gen_poem])

moss growing 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the the 
the 
Done.
['moss', 'growing', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 'the', 'the', '<nEXt>', 