In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dropout, LSTM, Dense, Embedding

In [2]:
# Load the preprocessed corpus and keep its `text` column for tokenization.
data = pd.read_csv('../data/preprocessed_data.csv')
text = data.loc[:, 'text']

In [3]:
# text = text.sum()

In [4]:
# Fit a word-level vocabulary on the corpus and turn every document into a
# list of integer token ids (one list per entry of `text`).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
encoded_texts = tokenizer.texts_to_sequences(text)
# `word_index` maps word -> id; the reverse lookup is `tokenizer.index_word`.
word_index = tokenizer.word_index
# Keras reserves id 0 (it is never assigned to a word), hence the +1 so that
# every id in 0..len(word_index) is a valid class / embedding row.
num_words = len(word_index) + 1

In [6]:
def windowize_data(data, n_prev):
    """Cut a sequence into sliding windows and their following elements.

    Args:
        data: sequence of values (converted to a new NumPy array).
        n_prev: number of consecutive elements in each window.

    Returns:
        A tuple ``(x, y)``: row ``i`` of ``x`` is ``data[i:i + n_prev]`` and
        ``y[i]`` is ``data[i + n_prev]``, the element right after that window.
        When ``data`` has no more than ``n_prev`` elements both are empty.
    """
    sequence = np.array(data)
    window_count = len(sequence) - n_prev
    # Broadcasting a column of start positions against a row of in-window
    # offsets yields a (window_count, n_prev) matrix of gather indices.
    starts = np.arange(window_count)[:, np.newaxis]
    offsets = np.arange(n_prev)
    windows = sequence[starts + offsets]
    targets = sequence[n_prev:]
    return windows, targets

In [44]:
# Number of encoded documents (one list of token ids per entry of `text`).
len(encoded_texts)

TypeError: unhashable type: 'list'

In [8]:
# Build (window, next-token) training pairs for each document separately, so
# no window spans the boundary between two texts.
n_prev = 20
window_parts = []
target_parts = []
# The loop variable is deliberately not called `text`: that name holds the
# corpus Series that the tokenizer cell reads.
for encoded_text in encoded_texts:
    part_X, part_y = windowize_data(encoded_text, n_prev)
    window_parts.append(part_X)
    target_parts.append(part_y)
# Concatenate once at the end instead of re-copying the growing arrays on
# every iteration.
X = np.concatenate(window_parts, axis=0)
y = np.concatenate(target_parts, axis=0)

In [9]:
# Keep only the first `max_samples` training pairs.
# NOTE(review): presumably a memory cap, since `y` is one-hot encoded over the
# whole vocabulary in a later cell - confirm.
max_samples = 25000
X, y = X[:max_samples, :], y[:max_samples]

In [10]:
# One-hot encode the next-token ids over the `num_words` classes.
# NOTE: this overwrites `y`; re-running the cell without rebuilding `y` first
# would one-hot encode the already encoded matrix.
y = to_categorical(y, num_classes=num_words)

In [11]:
# Sanity check: X should be (n_samples, n_prev) and y (n_samples, num_words).
X.shape, y.shape

((25000, 20), (25000, 24775))

In [12]:
# Next-word model: token ids -> 128-d embeddings -> two stacked LSTMs ->
# softmax over the vocabulary.
model = keras.Sequential()
model.add(Embedding(num_words, 128, input_length=n_prev))
# Only the first layer of a Sequential model defines the input shape; this
# LSTM receives the embedding output, so no `input_shape` is given here.
# return_sequences=True passes the full sequence on to the second LSTM.
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(num_words, activation='softmax'))
# `metrics` is passed as a list, the form documented for compile().
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# Keep only the best full model (not just the weights) on disk.
# NOTE: 'val_accuracy' is only reported when fit() is given validation data
# (validation_split / validation_data); without it nothing is saved.
checkpoint = ModelCheckpoint(
    '../models/model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=False,
)
callbacks = [checkpoint]

In [17]:
# Print the layers with their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 128)           3171200   
_________________________________________________________________
lstm (LSTM)                  (None, 20, 128)           131584    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 24775)             3195975   
Total params: 6,630,343
Trainable params: 6,630,343
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Hold out the last 10% of the samples for validation so that Keras reports
# `val_accuracy`, the metric the ModelCheckpoint callback monitors; without
# validation data that callback never saves a model.
model.fit(X, y, epochs=100, batch_size=32, validation_split=0.1, callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100


Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f6a8454fbb0>

In [19]:
poetry_length = 10
def generate_poetry(seed_text, n_lines):
    """Print `n_lines` lines of generated text, `poetry_length` words each.

    Words are produced one at a time: the running text is tokenized, padded or
    truncated on the left to `n_prev` tokens, and the most probable next word
    (argmax of the model output) is appended to it. After a line is printed,
    the seed for the next line is reset to that line's last word.

    Args:
        seed_text: text used to prime the model for the first line.
        n_lines: number of lines to generate and print.

    Returns:
        None. The generated lines are printed.

    Relies on the notebook globals `tokenizer`, `model`, `n_prev` and
    `poetry_length`.
    """
    for _line in range(n_lines):
        line_words = []
        for _word in range(poetry_length):
            encoded = tokenizer.texts_to_sequences([seed_text])
            # Use the same window length the model was built with.
            encoded = pad_sequences(encoded, maxlen=n_prev, padding='pre')

            # A single sequence is predicted, so take the one resulting id as
            # a plain int. verbose=0 keeps progress bars out of the output.
            probabilities = model.predict(encoded, verbose=0)
            predicted_id = int(np.argmax(probabilities, axis=-1)[0])

            # Direct id -> word lookup; an id without a word (e.g. 0) yields
            # an empty string.
            predicted_word = tokenizer.index_word.get(predicted_id, "")

            seed_text = seed_text + ' ' + predicted_word
            line_words.append(predicted_word)

        # The next line is primed only with the last word of this one.
        seed_text = line_words[-1]
        print(' '.join(line_words))

In [31]:
# Generate three lines from a custom seed sentence.
generate_poetry('I tried so hard, and got so far, but in the end it', 3)

was a hyena “the island in the water is far
until it had burst upon us but i could easily
in motion and sending forth to the bolt and went


In [25]:
# Save the trained model under ../models/first_model.
model.save('../models/first_model')

In [40]:
# Second seed sentence used to prime the generator.
seed_2 = "For arms there dangled from the upper portion of the carcass two tolerably long bottles, with the necks outward for hands."

In [41]:
# Generate two lines primed with `seed_2`.
generate_poetry(seed_2, 2)

and this vortex the whole surface of the household—of a
remembrance of aerial forms—of spiritual and meaning eyes—of sounds musical


In [37]:
# Show the raw text stored under index label 25 of the corpus.
data.text[25]

'AN EXTRAVAGANZA.     IT was a chilly November afternoon. I had just consummated an unusually hearty dinner, of which the dyspeptic truffe formed not the least important item, and was sitting alone in the dining-room, with my feet upon the fender, and at my elbow a small table which I had rolled up to the fire, and upon which were some apologies for dessert, with some miscellaneous bottles of wine, spirit and liqueur. In the morning I had been reading Glover’s “Leonidas,” Wilkie’s “Epigoniad,” Lamartine’s “Pilgrimage,” Barlow’s “Columbiad,” Tuckermann’s “Sicily,” and Griswold’s “Curiosities”; I am willing to confess, therefore, that I now felt a little stupid. I made effort to arouse myself by aid of frequent Lafitte, and, all failing, I betook myself to a stray newspaper in despair. Having carefully perused the column of “houses to let,” and the column of “dogs lost,” and then the two columns of “wives and apprentices runaway,” I attacked with great resolution the editorial matter, an

In [43]:
# Save the trained model again, this time under ../models/whole_first_model.
model.save('../models/whole_first_model')



INFO:tensorflow:Assets written to: ../models/whole_first_model/assets


INFO:tensorflow:Assets written to: ../models/whole_first_model/assets
