In [1]:
import numpy as np

from keras.models import Sequential
from keras import backend as K
from keras.layers import Dense, Lambda
from keras.layers import Embedding
from keras.layers import LSTM
from keras.utils import to_categorical
from keras import regularizers

  from ._conv import register_converters as _register_converters
Using plaidml.keras.backend backend.


# Load Dataset

In [2]:
import glob
from nltk.tokenize import word_tokenize

NEWLINE_TOKEN = ' __newline__ '
UNK_TOKEN = '__unk__'

# Read and collect text
train_text = ""
dev_text = ""
test_text = ""
texts = [train_text, dev_text, test_text]

for text_idx, file in enumerate(['./data/shakespeare/train.txt', './data/shakespeare/val.txt', './data/shakespeare/test.txt']):
    with open(file, 'r') as fp:
        texts[text_idx] += NEWLINE_TOKEN.join([l.strip() for l in fp.readlines()]) + NEWLINE_TOKEN

train_text, dev_text, test_text = texts

print("Total characters:")
print("Train: %d"%(len(train_text)))
print("Dev: %d"%(len(dev_text)))
print("Test: %d"%(len(test_text)))
print(train_text[:100])

Total characters:
Train: 431408
Dev: 124528
Test: 136864
﻿ __newline__ Project Gutenberg’s The Complete Works of William Shakespeare, by William __newline__ 


# Preprocess text
We usually preprocess the text to remove casing information, separate out punctuations etc to make our data cleaner

In [8]:
tokens = [None, None, None]
for text_idx in range(len(texts)):
    tokens[text_idx] = word_tokenize(texts[text_idx].lower())

train_tokens, dev_tokens, test_tokens = tokens

print("Total tokens:")
print("Train: %d"%(len(train_tokens)))
print("Dev: %d"%(len(dev_tokens)))
print("Test: %d"%(len(test_tokens)))

Total tokens:
Train: 81356
Dev: 23823
Test: 27236


# Build vocabulary

In [9]:
VOCAB_SIZE = 5000
full_vocab = dict()
for token in train_tokens:
    full_vocab[token] = full_vocab.get(token, 0) + 1

# Sort vocabulary by occurence
sorted_vocab = sorted(full_vocab.keys(), key=lambda word: -full_vocab[word])

# Print some samples
print("Vocabulary size: %d"%(len(sorted_vocab)))
print("Most frequent tokens")
for i in range(10):
    print("\t%s: %d"%(sorted_vocab[i], full_vocab[sorted_vocab[i]]))
print("Least frequent tokens")
for i in range(1,11):
    print("\t%s: %d"%(sorted_vocab[-i], full_vocab[sorted_vocab[-i]]))

# Create final vocab
word2idx = {w: idx for idx, w in enumerate(sorted_vocab[:VOCAB_SIZE])}
idx2word = {idx: w for idx, w in enumerate(sorted_vocab[:VOCAB_SIZE])}

word2idx[UNK_TOKEN] = VOCAB_SIZE
idx2word[VOCAB_SIZE] = UNK_TOKEN
VOCAB_SIZE = VOCAB_SIZE + 1

Vocabulary size: 6650
Most frequent tokens
	__newline__: 10000
	,: 5218
	.: 4361
	the: 1658
	and: 1456
	i: 1414
	to: 1254
	’: 1186
	of: 1111
	my: 906
Least frequent tokens
	impossible-: 1
	descried: 1
	approaching: 1
	full-mann: 1
	sixty: 1
	security: 1
	assurance: 1
	forgo: 1
	renowned: 1
	unexecuted: 1


# Filter text based on vocabulary
We will now have to replace words we do not have in the vocabulary with a special token, `__unk__` in this case

In [10]:
for tokens_idx in range(len(tokens)):
    tokens[tokens_idx] = [t if t in word2idx else UNK_TOKEN for t in tokens[tokens_idx]]

train_tokens, dev_tokens, test_tokens = tokens
print("Number of tokens filtered out as unknown:")
print("Train: %d/%d"%(len([1 for t in train_tokens if t == UNK_TOKEN]), len(train_tokens)))
print("Dev: %d/%d"%(len([1 for t in dev_tokens if t == UNK_TOKEN]), len(dev_tokens)))
print("Test: %d/%d"%(len([1 for t in test_tokens if t == UNK_TOKEN]), len(test_tokens)))

Number of tokens filtered out as unknown:
Train: 1650/81356
Dev: 1946/23823
Test: 2752/27236


# Prepare data in tensor form
Our keras models finally take tensors as input and labels, so we need to modify our data to fit this form

In [11]:
X_train = np.array([word2idx[t] for t in train_tokens])
X_dev = np.array([word2idx[t] for t in dev_tokens])
X_test = np.array([word2idx[t] for t in test_tokens])

Our labels in this exercise are just the next words. Hence, for

>   `X_train = ['hello', 'how', 'are', 'you', '?']`

we will have:

>    `y_train = ['how, 'are', you', '?']`

Which is just `X_train[1:]`
We will also remove the last element of `X_train`, since we do not have any label for it

# Helper functions

In [12]:
NUM_EPOCHS = 2
def build_bag_of_words(X, context_size=1, vocab_size=VOCAB_SIZE):
    num_examples = X.shape[0]-context_size
    X_bow = np.zeros((num_examples, vocab_size))
    
    y_bow = np.zeros((num_examples, vocab_size))
    
    for idx in range(num_examples):
        for context_idx in range(context_size):
            X_bow[idx, X[idx+context_idx]] = 1
        y_bow[idx, X[idx + context_size]] = 1
    
    return X_bow, y_bow
            
def get_next_predicted_word(model, input_words, context_size=1):
    if not isinstance(input_words, list):
        input_words = [input_words]
    input_words = input_words + ["__unk__"]
    input_array = np.array([word2idx[w] for w in input_words])
    input_bow, _ = build_bag_of_words(input_array, context_size=context_size)
    scores = model.predict(input_bow)
    output_word = idx2word[np.argmax(scores)]
    
    return output_word

def get_sentence(model, start_words, context_size=1):
    if not isinstance(start_words, list):
        start_words = [start_words]

    output = [] + start_words
    while output[-1] != '__newline__' and len(output) < 100:
        prev_word = get_next_predicted_word(model, output[-context_size:], context_size=context_size)
        output.append(prev_word)
    return " ".join(output)

# Define model

In [13]:
X_train_bigram, y_train_bigram = build_bag_of_words(X_train, context_size=1)
X_dev_bigram, y_dev_bigram = build_bag_of_words(X_dev, context_size=1)
X_test_bigram, y_test_bigram = build_bag_of_words(X_test, context_size=1)

In [14]:
print(X_train_bigram.shape)
print(X_dev_bigram.shape)
print(X_test_bigram.shape)

(81355, 5001)
(23822, 5001)
(27235, 5001)


In [15]:
model = Sequential()
model.add(Dense(100, input_shape=(VOCAB_SIZE,)))
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(VOCAB_SIZE, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

INFO:plaidml:Opening device "metal_amd_radeon_pro_560.0"


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               500200    
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 5001)              505101    
Total params: 1,025,501
Trainable params: 1,025,501
Non-trainable params: 0
_________________________________________________________________


In [16]:
for epoch in range(NUM_EPOCHS):
    model.fit(X_train_bigram, y_train_bigram, batch_size=128, epochs=epoch+1, initial_epoch=epoch, validation_data=(X_dev_bigram, y_dev_bigram))
    print(get_sentence(model, ['i']))

Train on 81355 samples, validate on 23822 samples
Epoch 1/1
i , __newline__
Train on 81355 samples, validate on 23822 samples
Epoch 2/2
i have , __newline__


In [17]:
print(get_sentence(model, ['think']))
print(get_sentence(model, ['well']))
print(get_sentence(model, ['i']))
print(get_sentence(model, ['who']))

think __newline__
well , __newline__
i have , __newline__
who , __newline__


#### Add context
The above data uses only _one_ previous word as context, but we can change our data to include more words

In [18]:
X_train_trigram, y_train_trigram = build_bag_of_words(X_train, context_size=2)
X_dev_trigram, y_dev_trigram = build_bag_of_words(X_dev, context_size=2)
X_test_trigram, y_tes_trigram = build_bag_of_words(X_test, context_size=2)

In [19]:
print(X_train_trigram.shape)
print(y_train_trigram.shape)

(81354, 5001)
(81354, 5001)


In [20]:
model_trigram = Sequential()
model_trigram.add(Dense(100, input_shape=(VOCAB_SIZE,)))
model_trigram.add(Dense(100, activation='relu'))
model_trigram.add(Dense(100, activation='relu'))
model_trigram.add(Dense(VOCAB_SIZE, activation='softmax'))

model_trigram.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model_trigram.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 100)               500200    
_________________________________________________________________
dense_6 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_7 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_8 (Dense)              (None, 5001)              505101    
Total params: 1,025,501
Trainable params: 1,025,501
Non-trainable params: 0
_________________________________________________________________


In [21]:
for epoch in range(NUM_EPOCHS):
    model_trigram.fit(X_train_trigram, y_train_trigram, batch_size=128, epochs=epoch+1, initial_epoch=epoch, validation_data=(X_dev_trigram, y_dev_trigram))
    print(get_sentence(model_trigram, ['i', 'have'], context_size=2))

Train on 81354 samples, validate on 23821 samples
Epoch 1/1
i have , __newline__
Train on 81354 samples, validate on 23821 samples
Epoch 2/2
i have have , __newline__


In [22]:
print(get_sentence(model_trigram, ['think', 'of'], context_size=2))
print(get_sentence(model_trigram, ['well', 'we'], context_size=2))
print(get_sentence(model_trigram, ['i', 'have'], context_size=2))
print(get_sentence(model_trigram, ['who', 'will'], context_size=2))

think of , __newline__
well we , the __unk__ of , __newline__
i have have , __newline__
who will the __unk__ of , __newline__
