In [152]:
# Install tensorflow_text, if executed in google colab
if 'google.colab' in str(get_ipython()):
  !pip install -q -U tensorflow --upgrade # We need a newer tensorflow version to use the causal mask of the multi head attention layer
  !pip install -q -U tensorflow-text
  !pip install -q -U sentencepiece

In [153]:
# disable compiler warnings
import os

# imports 
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.python.keras.layers import Dense
from typing import List
import datetime
from tqdm.notebook import tqdm
import sentencepiece as sp
import math
import tqdm


from tensorflow.python.client import device_lib
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'  # FATAL
print("Num GPUs Available: ", tf.config.list_physical_devices('GPU'))
print("TensorFlow Version: ", tf.__version__)

Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow Version:  2.11.0


# Constants / Hyperparameter

In [154]:
PREPROCESSED_BIBLE_FILE_NAME = "bible_preprocessed.txt"
VOCABULARY_SIZE = 3000 # 2000 to 7000
SEQUENCE_LENGTH = 64 # 32 to 256
BATCH_SIZE = 64
EMBEDDING_OUT = 128 # 64 to 256
ATTENTION_HEADS = 4 # 2 to 4
TRANSFORMER_DENSE_SIZE = 128 # 32 to 256
EPOCHS = 100 # 100 to 600
TRAIN_SPLIT = 0.8
TEST_SPLIT = 1 - TRAIN_SPLIT

# Load data

In [155]:
import requests

# Load file from remote, if notebook is executed inside google colab, otherwise it gets loaded from the local file system
if 'google.colab' in str(get_ipython()):
  bible_url = "https://raw.githubusercontent.com/IANNwTF-Group-3/homework11/main/bible.txt"
  response = requests.get(bible_url)
  text = response.text
else:
  file_path = "bible.txt"
  with open(file_path, "r") as f:
      text = f.read()

# Preprocessing

In [156]:
import re

# Lowercase the text
text = text.lower()

# Remove sentence numeration
text = re.sub('[0-9]+:[0-9]+ ', '', text)

# Remove special characters
for c in "!'()*,-.0123456789:;?":
  text = text.replace(c, '')

# Replace multiple spaces with a single space
text = re.sub(' +', ' ', text)

sentence_separator = "sentence-separator-placeholder"
# Remember double line breaks
text = re.sub('\n\n+', sentence_separator, text)
# Remove line breaks
text = text.replace('\n', '')
# Substitute sentence line breaks back into text
text = text.replace(sentence_separator, '\n')

Write text to file for later processing

In [157]:
if os.path.exists(PREPROCESSED_BIBLE_FILE_NAME):
  os.remove(PREPROCESSED_BIBLE_FILE_NAME)

bible_file = open(PREPROCESSED_BIBLE_FILE_NAME, 'xb')
bible_file.write(text.encode(encoding='UTF-8'))
bible_file.close()

# Tokenize

Create tokenizer model

In [158]:
# Pretrained model
# sp_model_url = "https://github.com/tensorflow/text/blob/master/tensorflow_text/python/ops/test_data/fast_sentencepiece.model?raw=true"
# sp_model = requests.get(sp_model_url).content

# Self trained model
sp_model_name = "sp_tokenizer"
sp.SentencePieceTrainer.train(input=PREPROCESSED_BIBLE_FILE_NAME, model_prefix=sp_model_name, model_type="unigram", vocab_size=VOCABULARY_SIZE)
sp_model = tf.io.gfile.GFile(f"{sp_model_name}.model", "rb").read()

Create tokenizer and bible tokens

In [159]:
sp_tokenizer = tf_text.SentencepieceTokenizer(sp_model)
sp_tokens = sp_tokenizer.tokenize(text)

Test tokenizer

In [160]:
test_tokens = sp_tokenizer.tokenize("I Jesus have sent mine angel".lower())
for t in test_tokens:
  print(sp_tokenizer.detokenize([t]))

tf.Tensor(b'i', shape=(), dtype=string)
tf.Tensor(b'jesus', shape=(), dtype=string)
tf.Tensor(b'have', shape=(), dtype=string)
tf.Tensor(b'sent', shape=(), dtype=string)
tf.Tensor(b'mine', shape=(), dtype=string)
tf.Tensor(b'angel', shape=(), dtype=string)


# Prepare dataset

Create training data using sliding window

In [161]:
sliding_window = tf_text.sliding_window(data=sp_tokens, width=SEQUENCE_LENGTH + 1, axis=0)

# Visualize sliding window
print(sliding_window)

tf.Tensor(
[[   3  292  562 ...    4   32  237]
 [ 292  562    5 ...   32  237    3]
 [ 562    5  172 ...  237    3  392]
 ...
 [ 562    5   51 ...   19   26   57]
 [   5   51 2700 ...   26   57   28]
 [  51 2700   32 ...   57   28 1339]], shape=(902189, 65), dtype=int32)


Create dataset

In [162]:
full_dataset = tf.data.Dataset.from_tensor_slices((sliding_window[:,:-1], sliding_window[:,1:]))
#full_dataset = tf.data.Dataset.from_tensor_slices(sliding_window)
full_dataset = full_dataset.shuffle(4096)
full_dataset = full_dataset.batch(BATCH_SIZE)
full_dataset = full_dataset.prefetch(tf.data.AUTOTUNE)

train_size = math.floor(len(full_dataset) * TRAIN_SPLIT)
test_size = math.floor(len(full_dataset) * TEST_SPLIT)
train_dataset = full_dataset.take(train_size)
test_dataset = full_dataset.skip(train_size).take(test_size)

# Embedding layer

In [163]:
class BibleEmbedding(tf.keras.layers.Layer):
  def __init__(self):
    super(BibleEmbedding, self).__init__()

    self.token_embedding = tf.keras.layers.Embedding(VOCABULARY_SIZE, EMBEDDING_OUT)
    self.position_embedding = tf.keras.layers.Embedding(SEQUENCE_LENGTH, EMBEDDING_OUT)

  def call(self, input_sequence):
    print(input_sequence.shape)
    input_range = tf.range(0, input_sequence.shape[1])

    return tf.math.add(self.token_embedding(input_sequence), self.position_embedding(input_range))

# Transformer block

In [164]:
class TransformerBlock(tf.keras.layers.Layer):
  def __init__(self):
    super(TransformerBlock, self).__init__()
    self.head_attention_layer = tf.keras.layers.MultiHeadAttention(ATTENTION_HEADS, EMBEDDING_OUT)
    self.dense1 = tf.keras.layers.Dense(TRANSFORMER_DENSE_SIZE, activation='relu')
    self.dense2 = tf.keras.layers.Dense(EMBEDDING_OUT, activation=None)
    self.dropout1 = tf.keras.layers.Dropout(0.1)
    self.dropout2 = tf.keras.layers.Dropout(0.1)
    self.layer_normalization1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layer_normalization2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, data, training):
    x = self.head_attention_layer(data, data, use_causal_mask=True)
    x = self.dropout1(x, training=training)
    x = tf.math.add(x, data)
    x = self.layer_normalization1(x)
    y = self.dense1(x)
    y = self.dense2(y)
    y = self.dropout2(y)
    x = tf.math.add(x, y)
    return self.layer_normalization2(x)

# Bible model

In [165]:
class BibleModel(tf.keras.Model):
  def __init__(self, tokenizer, optimizer=tf.keras.optimizers.Adam(), loss_function=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)):
    super(BibleModel, self).__init__()
    self.tokenizer = tokenizer
    self.optimizer = optimizer
    self.loss_function = loss_function
    self.metrics_list = [
      tf.keras.metrics.Mean(name="loss"),
      tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
      #tf.keras.metrics.TopKCategoricalAccuracy(3,name="top-3-acc")
    ]

    self.embedding_layer = BibleEmbedding()
    self.transformer_layer = TransformerBlock()
    self.out_layer = tf.keras.layers.Dense(VOCABULARY_SIZE, activation=None)

  def call(self, data, training=True):
    x = self.embedding_layer(data)
    x = self.transformer_layer(x)
    return self.out_layer(x)

  def reset_metrics(self):
    for metric in self.metrics:
        metric.reset_states()

  @tf.function
  def train_step(self, data):
    x, targets = data
      
    # compute output and loss, train the variables
    with tf.GradientTape() as tape:
      predictions = self(x, training=True)
      loss = self.loss_function(targets, predictions) + tf.reduce_sum(self.losses)
          
    # update trainable variables
    gradients = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    # update metrics
    self.metrics_list[0].update_state(loss)

    for metric in self.metrics_list[1:]:
        metric.update_state(targets, predictions)
      
    # return a dict with metric information
    return {m.name : m.result() for m in self.metrics_list}

  def generate_text(self, prompt, output_length, top_k):
    generated = self.tokenizer.detokenize(self.tokenizer.tokenize(prompt))
    while len(generated) < len(prompt) + output_length:
        tokenized_prompt = self.tokenizer(generated)
        output = self(tf.expand_dims(tokenized_prompt, 0), training=False)
        logits = output[:, -1, :]  # select the last token's logits
        filtered_logits, top_indices = tf.math.top_k(logits, k=top_k, sorted=True)
        chosen_index = tf.random.categorical(filtered_logits, num_samples=1)[-1, 0].numpy()
        generated += self.tokenizer.detokenize([top_indices[0][chosen_index].numpy()])
        tokenized_prompt = self.tokenizer(generated)[-self.max_len:]
    return ' '.join(self.tokenizer.detokenize(self.tokenizer.tokenize(generated)[len(prompt):]))



# Training loop

In [166]:
def training_loop(model, train_ds, train_summary_writer):
  for epoch in range(EPOCHS):
    print(f"Epoch {epoch}:")
    
    for data in tqdm.tqdm(train_ds, position=0, leave=True):
        metrics = model.train_step(data)
        
    with train_summary_writer.as_default():
        for metric in model.metrics:
            tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)

    # print metrics
    for metric in model.metrics:
      print(f"{metric.name}: {metric.result()}")

    # reset all metrics (requires a reset_metrics method in the model)
    model.reset_metrics()    
    
    # Validation: text generation
    #prediction = model.generate_text("What is", 10, 3)
    #print("Prediction: ", prediction)

    model.reset_metrics()
    print("\n")

In [167]:
model = BibleModel(sp_tokenizer)

current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/' + current_time + '/train'    
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

training_loop(model, train_dataset, train_summary_writer)


Epoch 0:


  0%|          | 0/11277 [00:00<?, ?it/s]

(64, 64)
(64, 64)


100%|██████████| 11277/11277 [03:37<00:00, 51.80it/s]


loss: 2.345102071762085
accuracy: 0.00946571584790945


Epoch 1:


100%|██████████| 11277/11277 [02:51<00:00, 65.59it/s]


loss: 2.6411609649658203
accuracy: 0.009846053086221218


Epoch 2:


100%|██████████| 11277/11277 [02:53<00:00, 65.01it/s]


loss: 2.844372034072876
accuracy: 0.010191361419856548


Epoch 3:


100%|██████████| 11277/11277 [02:57<00:00, 63.66it/s]


loss: 2.9216980934143066
accuracy: 0.010348665527999401


Epoch 4:


  4%|▍         | 433/11277 [00:07<02:57, 61.20it/s]


KeyboardInterrupt: ignored