In [1]:
import pandas as pd
import numpy as np

import string
import re

# Seed for the stochastic steps of this notebook: start selection and temperature
# sampling both draw from NumPy's global generator, so seed it once here to make
# a Restart & Run All reproducible.
RANDOM = 42
np.random.seed(RANDOM)

Load the data

In [2]:
# Load the train and test haiku DataFrames from local pickle files.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
haikus_train_df = pd.read_pickle('./data/haikus_train_df.pickle')
haikus_test_df = pd.read_pickle('./data/haikus_test_df.pickle')

In [3]:
from keras.models import load_model

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## RNN letter testing

In [629]:
# Concatenate every training poem into one string to derive the character vocabulary.
corpus_raw = ''.join(haikus_train_df['textchar_withtokens'])

# Sorting makes the character -> index mapping identical on every run.
chars = sorted(set(corpus_raw))
char_to_int = {char: position for position, char in enumerate(chars)}

n_chars, n_vocab = len(corpus_raw), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  1849446
Total Vocab:  107


In [630]:
char_to_int[''] = n_vocab

In [631]:
# Append the empty-string padding symbol to the vocabulary list. Guarded so that
# re-running this cell does not add a second copy.
if '' not in chars:
    chars = chars + ['']

In [4]:
n_poems = len(haikus_train_df)

In [633]:
# Encode each training poem as sliding windows of character ids (inputs) paired
# with the id of the character that follows each window (targets).
seq_length_char = 150

poemX, poemY = [], []
n_patterns = 0

for poem_index in range(n_poems):
    # Left-pad with the empty-string symbol so the very first window already
    # ends on the poem's first character.
    poem = [''] * (seq_length_char - 1) + list(haikus_train_df['textchar_withtokens'].iloc[poem_index])
    window_starts = range(len(poem) - seq_length_char)

    textX = [[char_to_int[char] for char in poem[i:i + seq_length_char]] for i in window_starts]
    textY = [char_to_int[poem[i + seq_length_char]] for i in window_starts]

    poemX.append(textX)
    poemY.append(textY)
    n_patterns = max(n_patterns, len(textX))

print("Max patterns per poem: ", n_patterns)

Max patterns per poem:  801


In [7]:
# Vocabulary indices of the two control characters: the generation code stops
# when it produces '◘' and renders '↕' as a line break.
endpoem_charindex = char_to_int['◘']
newline_charindex = char_to_int['↕']

In [95]:
len(haikus_train_df['textchar_withtokens'].iloc[1])

31

In [92]:
endpoem_charindex

106

In [9]:
def poemchar(index):
    """Return the printable character for a vocabulary index.

    The index is looked up in the global ``int_to_char`` mapping. The '↕'
    line-break token is rendered as a real newline; every other character,
    including the '◘' end-of-poem token, is returned unchanged.
    """
    decoded = int_to_char[index]
    return '\n' if decoded == '↕' else decoded

In [45]:
list(np.full(seq_length_char-1, char_to_int[''])) + [poemX[start][0][0]]

[107, 107, 72]

In [120]:
[[''.join([poemchar(char) for char in pattern]), poemchar(next_char)] \
 for _, next_char, pattern in zip(range(0,80,1), corpusY, corpusX)]


[['a', 'n'],
 ['an', ' '],
 ['an ', 'o'],
 ['an o', 'a'],
 ['an oa', 's'],
 ['an oas', 'i'],
 ['an oasi', 's'],
 ['an oasis', '\n'],
 ['an oasis\n', 'i'],
 ['an oasis\ni', 'n'],
 ['an oasis\nin', ' '],
 ['an oasis\nin ', 't'],
 ['an oasis\nin t', 'h'],
 ['an oasis\nin th', 'e'],
 ['an oasis\nin the', ' '],
 ['an oasis\nin the ', 'B'],
 ['an oasis\nin the B', 'i'],
 ['an oasis\nin the Bi', 'b'],
 ['an oasis\nin the Bib', 'l'],
 ['an oasis\nin the Bibl', 'e'],
 ['an oasis\nin the Bible', ' '],
 ['an oasis\nin the Bible ', 'B'],
 ['an oasis\nin the Bible B', 'e'],
 ['an oasis\nin the Bible Be', 'l'],
 ['an oasis\nin the Bible Bel', 't'],
 ['an oasis\nin the Bible Belt', ' '],
 ['an oasis\nin the Bible Belt ', '-'],
 ['an oasis\nin the Bible Belt -', '-'],
 ['an oasis\nin the Bible Belt --', '\n'],
 ['an oasis\nin the Bible Belt --\n', 'a'],
 ['an oasis\nin the Bible Belt --\na', 'd'],
 ['an oasis\nin the Bible Belt --\nad', 'u'],
 ['an oasis\nin the Bible Belt --\nadu', 'l'],
 ['an oasis\

In [651]:
model = load_model('weights/letter/letter-weights-corrected-35-3.0577.hdf5')

In [5]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Parameters
    ----------
    preds : array-like of float
        Non-negative weights, one per candidate index. They do not have to
        sum to one; the result is renormalised before sampling.
    temperature : float, default 1.0
        Values below 1 sharpen the distribution, values above 1 flatten it.
        A temperature of 0 or less is treated as greedy decoding and returns
        the index of the largest weight.

    Returns
    -------
    int
        The sampled index.

    Raises
    ------
    ValueError
        If ``preds`` is empty or contains no positive entry.
    """
    preds = np.asarray(preds).astype('float64')
    if preds.size == 0 or not np.any(preds > 0):
        raise ValueError("preds must contain at least one positive probability")
    if temperature <= 0:
        return int(np.argmax(preds))

    # log(0) legitimately yields -inf here (those entries end up with
    # probability 0), so silence the divide-by-zero RuntimeWarning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating: the softmax is unchanged, but
    # small temperatures can no longer underflow every entry to 0 (-> NaN).
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    probs = np.random.multinomial(1, preds, 1)

    return int(np.argmax(probs))

In [653]:
int_to_char = dict(enumerate(chars))

# Seed generation with the first window of a random training poem.
# The original loop compared the *poem index* with the end-of-poem *character
# index*; what matters is that the seed character itself is not the
# end-of-poem token. Poems too short to yield a window are skipped as well.
seed_candidates = [
    poem_index for poem_index, windows in enumerate(poemX)
    if windows and windows[0][-1] != endpoem_charindex
]
# randint's upper bound is exclusive, so this covers every candidate.
start = seed_candidates[np.random.randint(0, len(seed_candidates))]

# Copy the window so the stored training data is never modified.
pattern = list(poemX[start][0])
gen_poem = [pattern[-1]]

print(''.join(poemchar(char) for char in pattern), end='')
# generate characters
for i in range(200):
    x = np.reshape([char / float(n_vocab) for char in pattern], (1, len(pattern), 1))

    # sample() renormalises its input, so the network output is passed as is.
    prediction = model.predict(x, verbose=0)[0]
    index = sample(prediction, 0.3)

    result = poemchar(index)
    if result == '◘':
        # end-of-poem token: stop generating
        break

    print(result, end='')
    gen_poem.append(index)
    # Slide the input window one character to the right.
    pattern = pattern[1:] + [index]

print("\nDone.")
print([poemchar(char) for char in gen_poem])

t   t    l  drene      e  teer   h   l     nt ee e e    as e    s  n   e    elt           o    e sa ean o  e a  alt     ae eun  e e   oe  e isao e  t    ee h  roetne     e  re eeh  o  r ae t tene s    
Done.
['t', ' ', ' ', ' ', 't', ' ', ' ', ' ', ' ', 'l', ' ', ' ', 'd', 'r', 'e', 'n', 'e', ' ', ' ', ' ', ' ', ' ', ' ', 'e', ' ', ' ', 't', 'e', 'e', 'r', ' ', ' ', ' ', 'h', ' ', ' ', ' ', 'l', ' ', ' ', ' ', ' ', ' ', 'n', 't', ' ', 'e', 'e', ' ', 'e', ' ', 'e', ' ', ' ', ' ', ' ', 'a', 's', ' ', 'e', ' ', ' ', ' ', ' ', 's', ' ', ' ', 'n', ' ', ' ', ' ', 'e', ' ', ' ', ' ', ' ', 'e', 'l', 't', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'o', ' ', ' ', ' ', ' ', 'e', ' ', 's', 'a', ' ', 'e', 'a', 'n', ' ', 'o', ' ', ' ', 'e', ' ', 'a', ' ', ' ', 'a', 'l', 't', ' ', ' ', ' ', ' ', ' ', 'a', 'e', ' ', 'e', 'u', 'n', ' ', ' ', 'e', ' ', 'e', ' ', ' ', ' ', 'o', 'e', ' ', ' ', 'e', ' ', 'i', 's', 'a', 'o', ' ', 'e', ' ', ' ', 't', ' ', ' ', ' ', ' ', 'e', 'e', ' ', 'h', ' ', '

## RNN word testing

In [6]:
def flatten(l):
    """Concatenate the items of every sub-list of ``l`` into one flat list."""
    return [item for sublist in l for item in sublist]


# Every token of every training poem, in corpus order.
corpuswords_raw = flatten(list(haikus_train_df['text_withtokens_clean']))

# Sorting makes the word -> index mapping identical on every run.
words = sorted(set(corpuswords_raw))
word_to_int = {word: position for position, word in enumerate(words)}

n_words, n_vocab_words = len(corpuswords_raw), len(words)
print("Total Words: ", n_words)
print("Total Vocab: ", n_vocab_words)

Total Words:  447891
Total Vocab:  24046


In [16]:
words

['<eNd>',
 '<nEXt>',
 'a',
 'aaaa',
 'aah',
 'aback',
 'abandon',
 'abandoned',
 'abandoning',
 'abating',
 'abattoir',
 'abbess',
 'abbey',
 'abbot',
 'abbott',
 'abc',
 'abcs',
 'abduction',
 'abed',
 'abel',
 'abelard',
 'aberration',
 'abhor',
 'abhorred',
 'abide',
 'abilene',
 'abjure',
 'ablaze',
 'able',
 'ablowing',
 'aboard',
 'abode',
 'abodes',
 'abolish',
 'abominable',
 'abord',
 'abortion',
 'abound',
 'about',
 'above',
 'abraham',
 'abramoff',
 'abreast',
 'abriman',
 'abroad',
 'abrupt',
 'abruptly',
 'absalom',
 'abscond',
 'absence',
 'absent',
 'absently',
 'absinthe',
 'absolute',
 'absolutely',
 'absolution',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbs',
 'abstain',
 'abstemious',
 'abstract',
 'abstracted',
 'abstractedlyone',
 'abstraction',
 'absurd',
 'abundance',
 'abundant',
 'abuse',
 'abuses',
 'abydos',
 'abyss',
 'abysses',
 'acacia',
 'academy',
 'acadian',
 'acadians',
 'acc',
 'acceleration',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'acc

In [393]:
# Prepare the dataset of input -> output pairs encoded as integers.
# word_poemX / word_poemY keep the windows grouped per poem;
# word_corpusX / word_corpusY hold the same windows flattened over the corpus.
seq_length_word = 15

word_poemX = []
word_poemY = []

word_corpusX = []
word_corpusY = []
for poem_index in range(n_poems):
    poem = haikus_train_df['text_withtokens_clean'].iloc[poem_index]
    # Left-pad with the empty-string token so the first window already ends on
    # the poem's first word.
    poem = [''] * (seq_length_word - 1) + list(poem)
    # Encode every token once; the windows below are slices of this list
    # instead of re-encoding each word once per window.
    encoded = [word_to_int[word] for word in poem]
    n_windows = max(len(encoded) - seq_length_word, 0)

    wordX = [encoded[i:i + seq_length_word] for i in range(n_windows)]
    wordY = [encoded[i + seq_length_word] for i in range(n_windows)]

    word_poemX.append(wordX)
    word_poemY.append(wordY)

    word_corpusX.extend(wordX)
    word_corpusY.extend(wordY)

# The per-poem running maximum kept by the original was overwritten here anyway,
# so only the corpus-wide total is computed.
n_wordpatterns = len(word_corpusX)
print("Total Patterns: ", n_wordpatterns)

Total Patterns:  422763


In [394]:
wordX

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16213],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16213, 12752],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16213, 12752, 2],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16213, 12752, 2, 21167],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16213, 12752, 2, 21167, 19984],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 16213, 12752, 2, 21167, 19984, 9626],
 [0, 0, 0, 0, 0, 0, 0, 0, 16213, 12752, 2, 21167, 19984, 9626, 394],
 [0, 0, 0, 0, 0, 0, 0, 16213, 12752, 2, 21167, 19984, 9626, 394, 2],
 [0, 0, 0, 0, 0, 0, 16213, 12752, 2, 21167, 19984, 9626, 394, 2, 20998],
 [0, 0, 0, 0, 0, 16213, 12752, 2, 21167, 19984, 9626, 394, 2, 20998, 23670],
 [0,
  0,
  0,
  0,
  16213,
  12752,
  2,
  21167,
  19984,
  9626,
  394,
  2,
  20998,
  23670,
  18438]]

In [423]:
model = load_model('weights/word/word-weights-new-200-3.7964.hdf5')

In [6]:
def print_token(token):
    """Print one generated token.

    The '<nEXt>' and '<eNd>' markers are rendered as a line break, the empty
    token prints nothing, and any other token is printed followed by a space.
    """
    if token in ('<nEXt>', '<eNd>'):
        print()
        return
    if token != '':
        print(token, end=' ')

In [397]:
word_poemX[1][0]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 609]

In [626]:
int_to_word = dict(enumerate(words))

# Seed with the first window of a random training poem; poems too short to
# yield a window cannot be used as a seed.
start = np.random.randint(0, n_poems)
while len(word_poemX[start]) == 0:
    start = np.random.randint(0, n_poems)

# Copy the window: the original aliased the list stored in word_poemX, so the
# first append below silently lengthened a training window.
pattern = list(word_poemX[start][0])
gen_poem = [pattern[-1]]
# generate words
for i in range(200):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab_words)
    prediction = model.predict(x, verbose=0)[0]
    index = sample(prediction, 0.99)

    result = int_to_word[index]
    if result == '':
        # Never emit the padding token: fall back to the most likely other word.
        prediction[index] = 0
        index = np.argmax(prediction)
        result = int_to_word[index]

    if result == '<eNd>':
        break

    # gen_poem stores indices, so decode the previous entry before comparing it
    # with the line-break token (comparing the raw int never matched).
    previous_is_break = int_to_word[gen_poem[-1]] == '<nEXt>'
    if result != '' and not (previous_is_break and result == '<nEXt>'):
        gen_poem.append(index)
    # Slide the input window one word to the right.
    pattern = pattern[1:] + [index]

for value in gen_poem:
    print_token(int_to_word[value])
print()

print([int_to_word[value] for value in gen_poem])

arn your all 
i bowed she 
full on both within of plans day 

['arn', 'your', 'all', '<nEXt>', 'i', 'bowed', 'she', '<nEXt>', 'full', 'on', 'both', 'within', 'of', 'plans', 'day', '<nEXt>']


  after removing the cwd from sys.path.


# Word embeddings

In [7]:
import os

from gensim.models import KeyedVectors
# `gensim.scripts.glove2word2vec` is a module; the converter is the function of
# the same name inside it. Importing the module and calling it raises TypeError.
from gensim.scripts.glove2word2vec import glove2word2vec

# NOTE(review): the directory was spelled 'image_to_tect' here; assumed to be a
# typo because the converted file lives in 'image_to_text' -- confirm on disk.
glove_file = './data/image_to_text/glove.840B.300d.txt'
tmp_file = './data/image_to_text/glovetmp.txt'

# Optional cap on the number of vectors read from the file. None loads them all;
# set an integer if the full table does not fit in memory.
GLOVE_VOCAB_LIMIT = None

# Convert the GloVe text file to word2vec format once and reuse it afterwards.
if not os.path.isfile(tmp_file):
    _ = glove2word2vec(glove_file, tmp_file)

glove_model = KeyedVectors.load_word2vec_format(tmp_file, limit=GLOVE_VOCAB_LIMIT)

MemoryError: Unable to allocate 2.45 GiB for an array with shape (2196018, 300) and data type float32

In [None]:
def flatten(l):
    """Concatenate the items of every sub-list of ``l`` into one flat list."""
    return [item for sublist in l for item in sublist]


# Every token of every training poem, in corpus order.
corpuswords_raw = flatten(list(haikus_train_df['text_withtokens']))

# Sorted vocabulary plus lookup tables in both directions.
words = sorted(set(corpuswords_raw))
word_to_int = {word: position for position, word in enumerate(words)}
int_to_word = dict(enumerate(words))

n_words, n_vocab_words = len(corpuswords_raw), len(words)
print("Total Words: ", n_words)
print("Total Vocab: ", n_vocab_words)

In [9]:
# Prepare the dataset of input -> output pairs encoded as integers, reusing the
# cached arrays when they exist.

if os.path.isfile('./data/haiku_train_wordembed.npz'):
    # NOTE(review): allow_pickle=True unpickles Python objects -- only load a
    # cache file that was written by this notebook.
    loaded = np.load('./data/haiku_train_wordembed.npz', allow_pickle=True)

    word_poemX = loaded['X']
    word_poemY = loaded['Y']

    # word_poemX is indexed [poem][window][position]. The window length is the
    # size of one window, not the number of windows of the first poem. Falls
    # back to 15 (the value used when building) if no poem has a window.
    seq_length_word = next(
        (len(poem_windows[0]) for poem_windows in word_poemX if len(poem_windows) > 0),
        15,
    )
else:
    seq_length_word = 15

    word_poemX = []
    word_poemY = []

    word_corpusX = []
    word_corpusY = []
    for poem_index in range(0, n_poems):

        wordX = []
        wordY = []
        poem = haikus_train_df['text_withtokens'].iloc[poem_index]
        # Left-pad so the first window already ends on the poem's first word.
        poem = list(np.full(seq_length_word - 1, '')) + list(poem)
        for i in range(0, len(poem) - seq_length_word, 1):
            seq_in = poem[i:i + seq_length_word]
            seq_out = poem[i + seq_length_word]
            wordX.append([word_to_int[word] for word in seq_in])
            wordY.append(word_to_int[seq_out])

        word_poemX.append(wordX)
        word_poemY.append(wordY)

        word_corpusX += wordX
        word_corpusY += wordY

    # Poems have different numbers of windows, so the nested lists are ragged;
    # recent NumPy versions require dtype=object to build such an array.
    np.savez_compressed(
        './data/haiku_train_wordembed.npz',
        X=np.array(word_poemX, dtype=object),
        Y=np.array(word_poemY, dtype=object),
    )

    n_wordpatterns = len(word_corpusX)
    print("Total Patterns: ", n_wordpatterns)



In [10]:
def buildWordVector(text, size, embeddings=None):
    """Average the embedding vectors of the words in ``text``.

    Parameters
    ----------
    text : iterable of str
        Words to look up.
    size : int
        Dimensionality of the embedding vectors.
    embeddings : mapping, optional
        Any object supporting ``embeddings[word]`` that raises ``KeyError``
        for unknown words. Defaults to the global ``glove_model``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)`` holding the mean of the vectors that were
        found, or all zeros when none of the words is known.

    Raises
    ------
    ValueError
        If a vector cannot be reshaped to ``(1, size)``.
    """
    if embeddings is None:
        embeddings = glove_model
    vec = np.zeros((1, size))
    count = 0
    for word in text:
        try:
            word_vector = embeddings[word]
        except KeyError:
            # Out-of-vocabulary words are skipped. Any other error (for example
            # an undefined model) must surface instead of yielding zeros.
            continue
        vec += np.asarray(word_vector).reshape((1, size))
        count += 1
    if count != 0:
        vec /= count
    return vec

In [12]:
model = load_model('weights/word_embedding/wordembed-weights-70-2.5563.hdf5')




In [199]:
from sys import getsizeof

getsizeof(model) / 64

0.875

In [675]:
pattern

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6996]

In [683]:
np.array([[buildWordVector([int_to_word[word]], 300)[0] for word in pattern]]).shape

(1, 15, 300)

In [16]:
temperature = .5

# Seed with a random vocabulary word, left-padded with index 0 to the 15-word
# window fed to the model. Integer padding replaces the float zeros that
# np.zeros produced, so every entry of the window is a proper index.
n_padding = 14
start = np.random.randint(0, n_vocab_words)
pattern = [0] * n_padding + [start]
gen_poem = [start]

# generate words
for i in range(200):
    x = np.array([[buildWordVector([int_to_word[word]], 300)[0] for word in pattern]])
    # Keep only outputs that have a vocabulary entry. sample() renormalises, so
    # this draws from the same distribution as resampling until the index is
    # valid, but it cannot loop forever. It also keeps the argmax fallback
    # below inside the vocabulary.
    prediction = model.predict(x, verbose=0)[0][:len(int_to_word)]
    index = sample(prediction, temperature)

    result = int_to_word[index]

    if result == '':
        # Never emit the empty token: fall back to the most likely other word.
        prediction[index] = 0
        index = np.argmax(prediction)
        result = int_to_word[index]

    if result == '<eNd>':
        break

    if result != '':
        gen_poem.append(index)
    # Slide the input window one word to the right.
    pattern = pattern[1:] + [index]

for value in gen_poem:
    print_token(int_to_word[value])
print()

print([int_to_word[value] for value in gen_poem])

  after removing the cwd from sys.path.


satins 
the pages of the sand 
fills the air 
['satins', '<nEXt>', 'the', 'pages', 'of', 'the', 'sand', '<nEXt>', 'fills', 'the', 'air']
