In [1]:
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import tensorflow as tf
import requests
import regex

In [2]:
!pip install -q -U "tensorflow-text==2.8.*"
from tensorflow_text.tools.wordpiece_vocab.bert_vocab_from_dataset import bert_vocab_from_dataset
from tensorflow_text import BertTokenizer
from tensorflow_text import pad_model_inputs
from tensorflow_text import sliding_window

In [3]:
'''
Pride and Prejudice, 1342
Adventures in Wonderland, 11
Adventures of Sherlock Holmes, 1661
Frankenstein, 84
Moby Dick, 2701
Dracula, 345
Ulysses, 4300
The Picture of Dorian Gray, 174
A Tale of Two Cities, 98
The Great Gatsby, 64317
Great Expectations, 1400
Crime and Punishment, 2554
Metamorphosis, 5200
Iliad, 6130
The Prince, 1232
Heart of Darkness, 219
Les Misérables, 135
'''

def get_text(text_id):
    text_id = str(text_id)
    url = f'https://www.gutenberg.org/files/{text_id}/{text_id}-h/{text_id}-h.htm'
    text = requests.get(url).content.decode('utf-8')
    soup = BeautifulSoup(text, 'html.parser')
    text = [p.get_text() for p in soup.find_all('p')]
    text = ' '.join(text)
    for comma in "\"‘’‚‛“„‟":
        text = text.replace(comma, "'")
    text = ' '.join(text.split())
    text = ''.join(regex.findall("[A-Za-z .,()!?']", text))
    return text

# Training on Great Expectations, Crime and Punishment and Les Misérables
# All texts from the 1860s
texts = [get_text(i) for i in [1400, 2554, 135]]

In [4]:
vocab_size = 512

bert_vocab_args = dict(vocab_size = vocab_size,
                    reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"],
                    bert_tokenizer_params={'lower_case':False},
                    learn_params={})
data = tf.data.Dataset.from_tensor_slices(texts)
vocab = bert_vocab_from_dataset(data, **bert_vocab_args)

with open('vocab.txt', 'w') as f:
    for token in vocab:
        f.write(token + '\n')

tokenizer = BertTokenizer('vocab.txt', lower_case = False)

In [5]:
def encode(prompts):
    return tokenizer.tokenize(prompts).merge_dims(1, 2)

def decode(tokens):
    out = []
    for t in tokens:
        out.append(tf.strings.join(tokenizer.detokenize([t])[0], ' ').numpy().decode('utf8'))
    return out

In [6]:
seq_len = 128
tokenized_texts = encode(texts)

In [7]:
prompts = tf.concat([sliding_window(text, seq_len + 1) for text in tokenized_texts], axis=0)

In [8]:
prompts = tf.random.shuffle(prompts)
split = int(prompts.shape[0]*0.8)
x_train, y_train = prompts[:split][:,:-1], prompts[:split][:,-1]
x_test, y_test = prompts[split:][:,:-1], prompts[split:][:,-1]

In [9]:
print(x_train.shape, x_test.shape)

(1755128, 128) (438783, 128)


In [10]:
import numpy as np

# From Tensorflow Guide

def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

  pos_encoding = angle_rads[np.newaxis, ...]

  return tf.cast(pos_encoding, dtype=tf.float32)

In [11]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim):
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6, axis = -1)(inputs)

    y = inputs + tf.keras.layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(x, x)
    y = tf.keras.layers.LayerNormalization(epsilon=1e-6, axis = -1)(y)

    y = tf.keras.layers.Dense(ff_dim, activation='relu')(y)
    y = tf.keras.layers.Dense(inputs.shape[-1])(y)
    
    return inputs + y

def build_model(input_shape, layers, num_heads, head_size, ff_dim):
    
    inputs = tf.keras.Input(shape=input_shape)
    x = inputs

    x = tf.keras.layers.Embedding(vocab_size, num_heads*head_size, input_length=seq_len)(x)
    x *= tf.math.sqrt(tf.cast(num_heads*head_size, tf.float32))
    x += positional_encoding(seq_len, num_heads*head_size)[:, :seq_len, :]

    for _ in range(layers):
        x = transformer_encoder(x, head_size, num_heads, ff_dim)

    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.Dense(ff_dim, activation='relu')(x)
    outputs = tf.keras.layers.Dense(vocab_size, activation='softmax')(x)

    return tf.keras.Model(inputs, outputs)

In [12]:
model = build_model(input_shape=(seq_len,),
                    layers=4, num_heads=4,
                    head_size=16, ff_dim=256)

In [13]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 128, 64)      32768       ['input_1[0][0]']                
                                                                                                  
 tf.math.multiply (TFOpLambda)  (None, 128, 64)      0           ['embedding[0][0]']              
                                                                                                  
 tf.__operators__.add (TFOpLamb  (None, 128, 64)     0           ['tf.math.multiply[0][0]']       
 da)                                                                                          

In [14]:
# From Tensorflow Guide

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(64)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [15]:
scce = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer, loss=scce, metrics=['sparse_categorical_accuracy'])

In [None]:
history = model.fit(x_train, y_train, epochs=10, batch_size=256)

Epoch 1/10
Epoch 2/10

In [None]:
plt.plot(history.history['loss'])

In [None]:
model.evaluate(x_test, y_test)

In [None]:
def regress(inputs, n):
    x = inputs

    for _ in range(n):
        y = tf.reshape(tf.math.argmax(model(x[:,-seq_len:]), axis=-1), (-1, 1))
        x = tf.concat((x, y), axis=1)
    
    for i in range(len(inputs)):
        print(decode(inputs[i:i+1])[0] + ' (begin new text) ' + decode(x[i:i+1][:,seq_len:])[0])
        print('')

In [None]:
regress(x_test[:10], 30)