## Modeling Again
Because the first round was disastrous. 

#### Imports 

In [119]:
import pandas as pd
import string
import re
import numpy

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

In [2]:
lines = pd.read_csv("../data/lines.csv")

#### Cleaning and Conglomerating Data 

In [165]:
lines_list = list(lines["line"])

In [166]:
text = " newline ".join(lines_list)

In [167]:
# strip -'s and other puncuation
# capture only words, and lowercase all of them for simplicity in vocabulary of model 

text = text.replace("-", " ")
text = text.replace("'", "")

tokens = re.findall(r"\w+", text)

tokens = [word.lower() for word in tokens]

# keep newlines in -- attempt to train model for when to create new lines 
# for i in range(len(tokens)):  
#     if tokens[i] == "newline": 
#         tokens[i] = "\n" 

word_count = len(tokens)
unique_words = len(set(tokens))

In [168]:
length = 4 + 1

sequences = list()
for i in range(length, word_count): 
    seq = tokens[i - length : i]
    line = " ".join(seq)
    sequences.append(line)

In [169]:
sequences[:5]

['from fairest creatures we desire',
 'fairest creatures we desire increase',
 'creatures we desire increase newline',
 'we desire increase newline that',
 'desire increase newline that thereby']

#### Formatting Data 

In [170]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts(sequences)

seqs_tokenizer = tokenizer.texts_to_sequences(sequences)

In [171]:
seqs_tokenizer[:5]

[[35, 513, 1367, 180, 217],
 [513, 1367, 180, 217, 515],
 [1367, 180, 217, 515, 1],
 [180, 217, 515, 1, 9],
 [217, 515, 1, 9, 873]]

In [172]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

3149

In [178]:
# separate sequences into input (four word seq) and output (following word)

seqs_arr = array(seqs_tokenizer)

# changed some grammar to make it work 
X = seqs_arr[:, :-1]
y = seqs_arr[:, -1]

y = to_categorical(y, num_classes = vocab_size)

seq_length = len(X[0])
seq_length

4

#### Modeling

In [179]:
# create model 

model =  Sequential()

model.add(Embedding(vocab_size, 
                    100, 
                    input_length = seq_length))
model.add(LSTM(128, return_sequences = True))
model.add(LSTM(128))
model.add(Dense(128, 
                activation = "relu"))
model.add(Dense(vocab_size, 
                activation = "softmax"))

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 100)            314900    
_________________________________________________________________
lstm_3 (LSTM)                (None, 4, 128)            117248    
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 3149)              406221    
Total params: 986,465
Trainable params: 986,465
Non-trainable params: 0
_________________________________________________________________


In [181]:
# compile model 

history = model.compile(loss = "categorical_crossentropy", 
                        optimizer = "adam", 
                        metrics = ["acc"])

callback = EarlyStopping(patience = 5)

model.fit(X, y, 
          batch_size = 128, 
          epochs = 70, 
          callbacks = [callback], 
          verbose = 2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/70
 - 7s - loss: 6.6086 - acc: 0.1081
Epoch 2/70




 - 5s - loss: 6.0988 - acc: 0.1091
Epoch 3/70
 - 5s - loss: 6.0647 - acc: 0.1091
Epoch 4/70
 - 6s - loss: 6.0363 - acc: 0.1091
Epoch 5/70
 - 6s - loss: 5.9409 - acc: 0.1092
Epoch 6/70
 - 6s - loss: 5.8135 - acc: 0.1121
Epoch 7/70
 - 5s - loss: 5.6800 - acc: 0.1246
Epoch 8/70
 - 5s - loss: 5.5391 - acc: 0.1300
Epoch 9/70
 - 5s - loss: 5.4051 - acc: 0.1347
Epoch 10/70
 - 5s - loss: 5.2707 - acc: 0.1398
Epoch 11/70
 - 5s - loss: 5.1461 - acc: 0.1431
Epoch 12/70
 - 5s - loss: 5.0360 - acc: 0.1447
Epoch 13/70
 - 6s - loss: 4.9355 - acc: 0.1491
Epoch 14/70
 - 5s - loss: 4.8324 - acc: 0.1530
Epoch 15/70
 - 5s - loss: 4.7265 - acc: 0.1580
Epoch 16/70
 - 5s - loss: 4.6240 - acc: 0.1623
Epoch 17/70
 - 5s - loss: 4.5018 - acc: 0.1678
Epoch 18/70
 - 5s - loss: 4.3752 - acc: 0.1730
Epoch 19/70
 - 6s - loss: 4.2406 - acc: 0.1822
Epoch 20/70
 - 5s - loss: 4.1143 - acc: 0.1900
Epoch 21/70
 - 5s - loss: 3.9729 - acc: 0.2014
Epoch 22/70
 - 5s - loss: 3.8341 - acc: 0.2113
Epoch 23/70
 - 5s - loss: 3.6952

<keras.callbacks.callbacks.History at 0x164a88710>

#### Let's make some predictions
Heck. Yes. 

In [189]:
words = ["wardrobe", "creature", "beautys", "face"]

list(map(lambda word: tokenizer.word_index[word], words))

[1970, 2649, 136, 130]

In [192]:
def word_sequence(word_list): 
    return list(map(lambda word: tokenizer.word_index[word], word_list))

In [200]:
word_list = ["the", "wardrobe", "beautys", "creature"]

tokenizer.index_word[model.predict([[word_sequence(word_list)]]).argmax()]

'of'

In [213]:
def make_a_line(word_list): 
    input = word_sequence(word_list)
    next_word = tokenizer.index_word[model.predict([[input]]).argmax()]
    word_list.append(next_word)
    
    for i in range(len(word_list)):
        if word_list[i] == "newline": 
            word_list[i] = "\n"
    
    line = " ".join(word_list)
    return line

In [218]:
def feed_input(line): 
    tokens = line.split(" ")
    for i in range(len(tokens)): 
        if tokens[i] == "\n": 
            tokens[i] = "newline"
    
    return tokens[1:5]

In [232]:
make_a_line(["thine", "wardrobe", "of", "the"])

'thine wardrobe of the heart'

In [231]:
make_a_line(["how", "hath", "you", "made"])

'how hath you made disgrace'

In [237]:
make_a_line(["when", "you", "are", "of"])

'when you are of one'

In [223]:
input = ["a", "rose", "whose", "beauty"]

line1 = make_a_line(input)

poem = [line1]

for x in range(1, 51): 
    next_line = make_a_line(feed_input(poem[x - 1]))
    poem.append(next_line)
    
    if x == 1 or x%5 == 0: 
        print(poem[x])

rose whose beauty yea i
i least wilt 
 and
hear that trees makest seek
i do of your desire

 i have shall will
my decay 
 within the
wise age of him cruel
not but new shall wish
and say all i love
ye 
 that thou your
right myself 
 that i


In [228]:
def make_a_poem(input, x):

    line1 = make_a_line(input)

    poem = [line1]

    for y in range(1, x +1): 
        next_line = make_a_line(feed_input(poem[y - 1]))
        poem.append(next_line)

        if y == 1 or y%5 == 0: 
            print(poem[y])

In [229]:
input = ["i", "hope", "thy", "head"]
x = 80

make_a_poem(input, x)

hope thy head ere his
his eyes doth smother 

my praise in me 

but hope and i of
change 
 all nimble fresh
veins 
 so is those
beauty remain 
 take in
my loves report 
 how
how what summers tears all
thy shame say so when
darkness having to my mistress
esteeming 
 the sad slave
from her make to spend

 than i niggard shall
measure should honour from thy
memory 
 he time as
love prefiguring 
 and for
