In [1]:
# Use this if the packages are not installed yet
!pip install -q tensorflow_text
!pip install -q sentencepiece

[K     |████████████████████████████████| 4.9 MB 12.0 MB/s 
[K     |████████████████████████████████| 497.5 MB 28 kB/s 
[K     |████████████████████████████████| 15.7 MB 30.3 MB/s 
[K     |████████████████████████████████| 5.8 MB 37.0 MB/s 
[K     |████████████████████████████████| 462 kB 49.0 MB/s 
[K     |████████████████████████████████| 1.4 MB 46.0 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.3.post1 requires numpy<1.20,>=1.16.0, but you have numpy 1.21.5 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
[K     |████████████████████████████████| 1.2 MB 12.4 MB/s 
[?25h

In [1]:
import tensorflow as tf
import tensorflow_text as tf_txt
import tqdm.notebook as note
import sentencepiece as sp
import io
import datetime

In [2]:
# HYPERPARAMS
# Number of sub-word pieces the SentencePiece tokenizer is trained with; also the size of
# the token-embedding table and of the model's output layer.
VOCAB_SIZE = 2000
# Number of input tokens per training example (each window holds SEQ_LEN inputs + 1 target).
SEQ_LEN = 32
# Buffer size passed to Dataset.shuffle.
SHUFFLE_SIZE = 1000
# Number of windows per training batch.
BATCH_SIZE = 50
# Width of the token/position embeddings; also used as key_dim of the attention layer.
D_EMBEDDINGS = 64
# Number of attention heads in the transformer block.
NUM_HEADS = 3
# Text generation samples the next token from the TOP_K highest-scoring candidates.
TOP_K = 5

In [3]:
# Download "Beyond Good and Evil" and keep the path of the local copy
path = tf.keras.utils.get_file("nietzsche.txt", origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Load the txt into a str; the context manager closes the file handle again and the
# explicit encoding keeps the result independent of the platform default.
with open(path, encoding="utf-8") as text_file:
    text = text_file.read()

# Train the tokenizer (writes tokenizer_model.model / tokenizer_model.vocab to the working directory)
sp.SentencePieceTrainer.train(
    input=path, model_prefix='tokenizer_model', model_type="unigram", vocab_size=VOCAB_SIZE)
# Read the serialized model proto as bytes, which is the format the TF tokenizer expects
with tf.io.gfile.GFile('tokenizer_model.model', "rb") as model_file:
    trained_tokenizer_model = model_file.read()
# Load the model as a tokenizer that can be used inside a tensorflow model
# NOTE(review): nbest_size=-1 together with alpha=1 presumably enables sampled
# (non-deterministic) segmentation — confirm this is intended.
tokenizer = tf_txt.SentencepieceTokenizer(
    model=trained_tokenizer_model, out_type=tf.int32, nbest_size=-1, alpha=1, reverse=False,
    add_bos=False, add_eos=False, return_nbest=False, name=None
)

# Tokenize the whole text into one sequence of token ids
tokens = tokenizer.tokenize(text)

In [4]:
# Slide a window of SEQ_LEN + 1 ids over the token stream: the first SEQ_LEN ids of a
# window are the model input, the last id is the token to predict.
token_windows = tf_txt.sliding_window(data=tokens, width=SEQ_LEN + 1)

# Build the training pipeline in one chain: slice into examples, shuffle, batch.
token_ds = (
    tf.data.Dataset.from_tensor_slices(
        {
            "input": token_windows[:, :-1],
            "target": token_windows[:, -1],
        }
    )
    .shuffle(SHUFFLE_SIZE)
    .batch(BATCH_SIZE)
)

In [5]:
class Embedding_Layer(tf.keras.layers.Layer):
    """Sum of a learned token embedding and a learned positional embedding.

    The positional table has SEQ_LEN rows, so inputs may hold at most SEQ_LEN
    tokens along their last axis.
    """

    def __init__(self):
        super(Embedding_Layer, self).__init__()

        # Attribute names are kept unchanged; renaming them could break loading of
        # weights that were saved with the previous names.
        # embedding_1: position index -> vector, embedding_2: token id -> vector
        self.embedding_1 = tf.keras.layers.Embedding(input_dim=SEQ_LEN, output_dim=D_EMBEDDINGS)
        self.embedding_2 = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=D_EMBEDDINGS)

    def call(self, x):
        """Embed a batch of token ids.

        Args:
            x: integer tensor of token ids whose last axis is the sequence axis.

        Returns:
            Tensor with one D_EMBEDDINGS-wide vector per token: token embedding
            plus the embedding of the token's position.
        """
        # Derive the positions from the actual input length instead of assuming SEQ_LEN,
        # so shorter sequences work as well (identical result for SEQ_LEN-long inputs).
        seq_len = tf.shape(x)[-1]
        position_embeddings = self.embedding_1(tf.range(start=0, limit=seq_len))

        # (seq, dim) broadcasts over the leading batch axis of the token embeddings
        return position_embeddings + self.embedding_2(x)

In [6]:
class TransformerBlock(tf.keras.layers.Layer):
    """Single transformer encoder block: self-attention and a two-layer feed-forward
    network, each followed by dropout, a residual connection and layer normalization."""

    def __init__(self):
        super(TransformerBlock, self).__init__()

        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=D_EMBEDDINGS)
        self.dense_1 = tf.keras.layers.Dense(units=32, activation="relu")
        # Projects back to D_EMBEDDINGS so the residual addition in call() is shape-compatible
        self.dense_2 = tf.keras.layers.Dense(units=D_EMBEDDINGS)
        self.dropout_1 = tf.keras.layers.Dropout(rate=0.1)
        self.dropout_2 = tf.keras.layers.Dropout(rate=0.1)
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, x, training=None):
        """Apply the block to a sequence of embeddings.

        Args:
            x: embedded sequence; its last axis must be D_EMBEDDINGS wide for the
                residual additions to work.
            training: whether dropout is active. None lets Keras decide from the
                calling context.

        Returns:
            Tensor of the same shape as `x`.
        """
        # Self-attention: the sequence attends to itself (query = value = x)
        att_out = self.mha(x, x, training=training)
        # dropout_1 belongs to the attention branch (it was created but never used before;
        # dropout_2 was applied to both branches instead)
        att_out = self.dropout_1(att_out, training=training)
        ln_out = self.norm_1(x + att_out)

        # Position-wise feed-forward network
        ffn_out = self.dense_1(ln_out)
        ffn_out = self.dense_2(ffn_out)
        ffn_out = self.dropout_2(ffn_out, training=training)

        return self.norm_2(ln_out + ffn_out)

In [7]:
class Transformer(tf.keras.Model):
    """Next-token prediction model.

    Token ids are embedded (token + position), passed through one TransformerBlock,
    averaged over the sequence axis and projected to VOCAB_SIZE logits.
    The model owns its optimizer, loss and metrics and is trained through
    `train_step`; `generate_text` samples a continuation for a text prompt.
    """

    def __init__(self, tokenizer):
        """
        Args:
            tokenizer: object providing `tokenize(str)` and `detokenize(ids)`;
                used by `generate_text` only.
        """
        super(Transformer, self).__init__()

        self.tokenizer = tokenizer
        self.optimizer = tf.keras.optimizers.Adam()
        self.loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        self.pos_embedding = Embedding_Layer()
        self.block = TransformerBlock()
        self.pool = tf.keras.layers.GlobalAvgPool1D()
        self.dense = tf.keras.layers.Dense(units=VOCAB_SIZE)

        self.metrics_list = [
            tf.keras.metrics.Mean(name="loss"),
            # Targets are integer token ids (not one-hot vectors), so the sparse
            # variant of the accuracy metric is required.
            tf.keras.metrics.SparseCategoricalAccuracy(name="acc"),
        ]

    def call(self, x, training=None):
        """Compute next-token logits.

        Args:
            x: integer tensor of token ids, one row per sequence.
            training: forwarded to the transformer block to switch dropout on/off.

        Returns:
            Float tensor with VOCAB_SIZE unnormalized scores (logits) per sequence.
        """
        embedded = self.pos_embedding(x)
        embedded = self.block(embedded, training=training)
        # Average over the sequence axis -> one vector per sequence
        embedded = self.pool(embedded)

        return self.dense(embedded)

    def reset_metrics(self):
        """Reset every metric in `metrics_list` (called at the end of an epoch)."""
        for metric in self.metrics_list:
            metric.reset_states()

    @tf.function
    def train_step(self, data):
        """Run one optimization step on a batch.

        Args:
            data: dict with the keys "input" (token id windows) and "target"
                (the id following each window).

        Returns:
            Dict mapping each metric name to its current (running) value.
        """
        x, targets = data["input"], data["target"]

        with tf.GradientTape() as tape:
            # training=True so that dropout is actually active while training
            predictions = self(x, training=True)
            # self.losses holds additional (e.g. regularization) losses of the layers
            loss = self.loss_func(targets, predictions) + tf.reduce_sum(self.losses)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        # update metrics
        self.metrics_list[0].update_state(loss)  # loss
        self.metrics_list[1].update_state(targets, predictions)  # acc

        # Return a dictionary mapping metric names to current value
        return {m.name: m.result() for m in self.metrics_list}

    def generate_text(self, prompt, sample_size=5):
        """Continue `prompt` by sampling `sample_size` additional tokens.

        Each new token is drawn from the TOP_K most likely candidates. Must be
        called eagerly (it converts the result to a Python string).

        Args:
            prompt: text to continue.
            sample_size: number of tokens to append.

        Returns:
            A str containing a short header followed by the prompt and its
            generated continuation.
        """
        tokens = self.tokenizer.tokenize(prompt)  # 1-D tensor of token ids

        for _ in range(sample_size):
            # The model context is the last SEQ_LEN tokens, left-padded when shorter.
            context = tokens[-SEQ_LEN:]
            pad_len = SEQ_LEN - tf.shape(context)[0]
            # Pad with id 0, a valid embedding index (-1 is not).
            # NOTE(review): assumes id 0 is SentencePiece's <unk> (trainer default) — confirm.
            context = tf.pad(context, [[pad_len, 0]], "CONSTANT", constant_values=0)

            # Add a batch axis in front: (SEQ_LEN,) -> (1, SEQ_LEN)
            logits = self(tf.expand_dims(context, axis=0), training=False)
            top_logits, top_ids = tf.math.top_k(logits, k=TOP_K, sorted=True)

            # categorical() returns a position inside the top-k list, which then has
            # to be mapped back to the actual vocabulary id.
            choice = tf.random.categorical(top_logits, 1)
            next_token = tf.gather(top_ids, choice, batch_dims=1)

            tokens = tf.concat([tokens, tf.reshape(next_token, [-1])], axis=0)

        generated = self.tokenizer.detokenize(tokens).numpy().decode("utf-8")
        return f"Generated the last {sample_size} words of the following text:\n{generated}\n"

In [8]:
# Register the TensorBoard notebook extension so that the %tensorboard magic is available
%load_ext tensorboard

In [9]:
model = Transformer(tokenizer)

# Define where to save the log
# Built from the constants themselves so the run name cannot drift from the
# hyperparameters that were actually used.
hyperparameter_string = (
    f"VOCAB_SIZE-{VOCAB_SIZE}__SEQ_LEN-{SEQ_LEN}__SHUFFLE_SIZE-{SHUFFLE_SIZE}"
    f"__BATCH_SIZE-{BATCH_SIZE}__D_EMBEDDINGS-{D_EMBEDDINGS}__NUM_HEADS-{NUM_HEADS}__TOP_K-{TOP_K}"
)
# Timestamp keeps repeated runs with identical hyperparameters in separate directories
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

log_path = f"logs/{hyperparameter_string}/{current_time}/train"
summary_writer = tf.summary.create_file_writer(log_path)

In [10]:
# NOTE(review): clear_session() runs after `model` and `summary_writer` have already been
# created; presumably it was meant to run before the model is built — confirm.
tf.keras.backend.clear_session()

sample_size = 5

for epoch in range(5):

    print(f"Epoch {epoch}:")

    # Training: one optimization step per batch; the metrics accumulate inside the model
    for data in note.tqdm(token_ds, position=0, leave=True):
        model.train_step(data)

    # Sample text once per epoch (not once per batch): only the final sample of an
    # epoch was ever printed/logged, so generating it for every batch was wasted work.
    generated_text = model.generate_text("prompt", sample_size)

    # print the metrics (read from the model so this also works for an empty dataset)
    print([f"{metric.name}: {float(metric.result()):.4f}" for metric in model.metrics_list])
    # print generated text; generate_text already returns it with a descriptive header
    print(generated_text)

    with summary_writer.as_default():
        # logging the metrics to the log file which is used by tensorboard
        for metric in model.metrics_list:
            tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)
        # logging generated text
        tf.summary.text(f"sample_size-{sample_size}", generated_text, step=epoch)

    # reset all metrics (requires a reset_metrics method in the model)
    model.reset_metrics()

    print("\n")

Epoch 0:


  0%|          | 0/3429 [00:00<?, ?it/s]

NotFoundError: ignored

In [None]:
# Open the TensorBoard UI inside the notebook, reading the summaries written below logs/
%tensorboard --logdir logs/

In [None]:
# Save the model weights under a name that encodes the hyperparameters of this run,
# so that weights of differently configured runs do not overwrite each other.
model.save_weights(f"saved_model_{hyperparameter_string}", save_format="tf")