In [1]:
import glob

# read in every line from all books
# glob.glob() returns paths in arbitrary directory order; sorting keeps the
# line order stable, so everything derived from it downstream (the corpus
# preview, the seeded train/test split) is reproducible between runs.
all_lines = []
for filename in sorted(glob.glob('./data/*.txt')):
    # the context manager closes each file instead of leaking the handle
    with open(filename, 'r') as book:
        all_lines.extend(book.read().split('\n'))

In [2]:
import string

def clean_line(line):
    """Normalise one line of text before tokenization.

    Every character listed in ``string.punctuation`` is removed, the rest is
    lower-cased, and finally any non-ASCII character is dropped.
    """
    # translation table whose only job is deleting ASCII punctuation
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = line.translate(strip_punctuation).lower()
    # round-trip through bytes: decoding as ASCII with 'ignore' silently
    # discards everything outside the ASCII range
    as_bytes = lowered.encode('utf8')
    return as_bytes.decode('ascii', 'ignore')

# build a corpus by preprocessing all lines
# (one cleaned string per raw line, in the same order as all_lines)
corpus = [clean_line(x) for x in all_lines]
# preview the first five cleaned lines
corpus[:5]

['the regular early morning yell of horror was the sound of arthur dent waking up and suddenly remembering where he was',
 'it wasnt just that the cave was cold it wasnt just that it was damp and smelly it was the fact that the cave was in the middle of islington and there wasnt a bus due for two million years',
 'time is the worst place so to speak to get lost in as arthur dent could testify having been lost in both time and space a good deal at least being lost in space kept you busy',
 'he was stranded in prehistoric earth as the result of a complex sequence of events which had involved him being alternately blown up and insulted in more bizarre regions of the galaxy than he ever dreamt existed and though his life had now turned very very very quiet he was still feeling jumpy',
 'he hadnt been blown up now for five years']

In [4]:
from keras.preprocessing.text import Tokenizer

# Shared tokenizer: get_tokens() fits it, and it is reused afterwards to
# encode seed text during generation.
tokenizer = Tokenizer()

def get_tokens(corpus):
    """Fit the shared tokenizer on ``corpus`` and build n-gram training sequences.

    Each line is encoded to token ids and expanded into all of its prefixes of
    length two or more, e.g. ``[a, b, c]`` yields ``[a, b]`` and ``[a, b, c]``.
    Lines that encode to fewer than two tokens contribute nothing.

    Returns:
        A tuple ``(sequences, total_words)`` where ``sequences`` is the list of
        prefix lists and ``total_words`` is ``len(tokenizer.word_index) + 1``.
    """
    # the actual tokenization (this mutates the module-level tokenizer)
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    # encode line by line, then emit every prefix with at least two tokens
    encoded_lines = (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    ngrams = [
        encoded[:end]
        for encoded in encoded_lines
        for end in range(2, len(encoded) + 1)
    ]
    return ngrams, vocab_size

# fit the tokenizer on the corpus and expand every line into its n-gram prefixes
input_sequences, total_words = get_tokens(corpus)
# preview the first five n-gram sequences
input_sequences[:5]

Using TensorFlow backend.


[[1, 2318],
 [1, 2318, 1174],
 [1, 2318, 1174, 559],
 [1, 2318, 1174, 559, 4185],
 [1, 2318, 1174, 559, 4185, 3]]

In [5]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def gen_padded_sequences(sequences, num_classes=None):
    """Pad the n-gram sequences and split them into model inputs and labels.

    Args:
        sequences: non-empty list of token-id lists.
        num_classes: width of the one-hot label vectors. When omitted, the
            notebook-level ``total_words`` is used, which keeps existing
            one-argument calls working.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: ``predictors`` holds
        every column of the padded array except the last, ``label`` is the
        one-hot encoding of the last column, and ``max_sequence_len`` is the
        length of the longest input sequence.

    Raises:
        ValueError: if ``sequences`` is empty.
    """
    if len(sequences) == 0:
        raise ValueError('gen_padded_sequences() needs at least one sequence')
    if num_classes is None:
        # fall back to the vocabulary size computed by get_tokens()
        num_classes = total_words

    max_sequence_len = max(len(x) for x in sequences)
    # pad on the left so the token to predict is always the last column;
    # pad_sequences already returns an ndarray, so no extra copy is needed
    padded = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')

    predictors, label = padded[:, :-1], padded[:, -1]
    # NOTE(review): this builds a dense (n_samples, num_classes) matrix, which
    # gets large quickly for big vocabularies
    label = to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# predictors: padded sequences without their last token; label: one-hot of that
# last token; max_sequence_len: length of the longest n-gram sequence
predictors, label, max_sequence_len = gen_padded_sequences(input_sequences)

In [11]:
from sklearn.model_selection import train_test_split

# hold out 30% of the samples for validation; the fixed seed keeps the split repeatable
split = train_test_split(predictors, label, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split

print(f'{len(X_train)} train samples and {len(X_test)} test samples.')

133553 train samples and 57238 test samples.


In [12]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense

# the last column of each padded sequence is the label, so inputs are one token shorter
input_len = max_sequence_len - 1

# embedding -> single LSTM -> dropout -> softmax over the whole vocabulary
model = Sequential([
    Embedding(total_words, 10, input_length=input_len),
    LSTM(100),
    Dropout(0.2),
    Dense(total_words, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 506, 10)           133950    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 13395)             1352895   
Total params: 1,531,245
Trainable params: 1,531,245
Non-trainable params: 0
_________________________________________________________________


In [13]:
import os
from keras.callbacks import ModelCheckpoint

import matplotlib.pyplot as plt
%matplotlib inline

# weights are checkpointed here and picked up again on the next run
model_path = './adams-model.hdf5'
# keep only the weights from the epoch with the lowest training loss
checkpoint = ModelCheckpoint(
    model_path,
    monitor='loss',
    verbose=0,
    save_best_only=True,
    mode='min'
)

# resume from a previous run when a checkpoint file is present
if os.path.exists(model_path):
    model.load_weights(model_path)
    print('Loaded existing weights.')

# set should_train to False to skip fitting and use the loaded weights only
should_train = True
train_epochs = 5

if should_train:
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=train_epochs, callbacks=[checkpoint])
    # The training log reports the metric as 'accuracy'; older Keras releases
    # stored it under 'acc'. Resolve the key at runtime so the plot does not
    # fail with a KeyError on either naming.
    accuracy_key = 'accuracy' if 'accuracy' in history.history else 'acc'
    plt.plot(history.history[accuracy_key])
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.title('Predictor-Label Accuracy')
    plt.show()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 133553 samples, validate on 57238 samples
Epoch 1/5
  1408/133553 [..............................] - ETA: 23:32 - loss: 8.7706 - accuracy: 0.0433

KeyboardInterrupt: 

In [None]:
def gen_text(seed_text, gen_count):
    """Extend ``seed_text`` with ``gen_count`` words predicted by the model.

    On every step the text generated so far is tokenized, left-padded to the
    model's input length, and the most probable next word is appended.

    Args:
        seed_text: starting phrase.
        gen_count: number of words to append.

    Returns:
        The seed text plus the generated words, title-cased.
    """
    # invert the vocabulary once instead of scanning word_index for every word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(gen_count):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # argmax over the softmax output selects the same class that
        # predict_classes() returned; that method is gone from newer Keras
        probabilities = model.predict(token_list, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        # an index with no word attached (such as the padding index 0)
        # contributes an empty string, as the original lookup did
        seed_text += " " + index_to_word.get(predicted_index, "")
    return seed_text.title()

# generate 15 more words starting from the seed phrase
gen_text('Ford was eating', 15)