In [1]:
import tensorflow as tf
import numpy as np

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (TextVectorization, Dense, MultiHeadAttention, LayerNormalization, 
                                     Layer, Embedding, Input, Dropout)
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall, AUC
from tensorflow.keras.callbacks import EarlyStopping

2023-02-28 16:04:48.601966: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-28 16:04:48.753568: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-28 16:04:48.791377: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-02-28 16:04:49.464308: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

# Build dataset

In [2]:
FULL_VOCAB = 'abcdefghijklmnopqrstuvwxyz'  # alphabet the dataset builders slice their vocabulary from
SEQ_LEN = 10  # number of characters in every generated example

In [3]:
def dataset1(vocab_size=2, dataset_size=10000, seq_len=10, full_vocab=FULL_VOCAB):
  """
  Random strings labelled 1 at every occurrence of the first vocabulary character.

  Characters are drawn with `np.random.choice` from the first `vocab_size`
  symbols of `full_vocab`. A label of 1 at a position means "insert a space
  after this character"; 0 means no space.

  Returns a tuple `(texts, labels)`: `texts` holds `dataset_size` strings of
  length `seq_len`, `labels` is a float32 array of shape
  `(dataset_size, seq_len)`.
  """
  assert vocab_size > 1
  alphabet = list(full_vocab[:vocab_size])
  space_char = alphabet[0]
  chars = np.random.choice(alphabet, size=(dataset_size, seq_len))
  labels = (chars == space_char).astype(np.float32)  # 1 = space, 0 = no space
  texts = np.array([''.join(sample) for sample in chars])
  return texts, labels

def dataset2(vocab_size=2, dataset_size=10000, seq_len=10, full_vocab=FULL_VOCAB):
  """
  Random strings labelled 1 wherever the 1st vocabulary character is
  immediately followed by the 2nd one.

  The label sits on the second character of the pair, i.e. a space is
  inserted after the combination 1st->2nd character and nowhere else. The
  first position of a string can therefore never be labelled 1.

  Returns a tuple `(texts, labels)`: `texts` holds `dataset_size` strings of
  length `seq_len`, `labels` is a float32 array of shape
  `(dataset_size, seq_len)`.
  """
  assert vocab_size > 1
  vocab = list(full_vocab[:vocab_size])
  inputs = np.random.choice(vocab, size=(dataset_size, seq_len))
  outputs = np.zeros_like(inputs, dtype=np.float32)
  # Compare every character with its left neighbour in one vectorized step
  # instead of looping over each example and position in Python.
  completes_pair = (inputs[:, :-1] == vocab[0]) & (inputs[:, 1:] == vocab[1])
  outputs[:, 1:] = completes_pair  # 1 = space, 0 = no space
  concatenated_inputs = np.array([''.join(row) for row in inputs])
  return concatenated_inputs, outputs

def dataset3(vocab_size=2, dataset_size=10000, seq_len=10, insert_space_every=3, full_vocab=FULL_VOCAB):
  """
  Random strings labelled 1 at every `insert_space_every`-th position,
  regardless of which characters occur.

  Returns a tuple `(texts, labels)`: `texts` holds `dataset_size` strings of
  length `seq_len`, `labels` is a float32 array of shape
  `(dataset_size, seq_len)` with 1 = space, 0 = no space.
  """
  assert vocab_size > 1
  alphabet = list(full_vocab[:vocab_size])
  chars = np.random.choice(alphabet, size=(dataset_size, seq_len))
  labels = np.zeros_like(chars, dtype=np.float32)
  # Zero-based columns insert_space_every-1, 2*insert_space_every-1, ...
  space_columns = np.arange(insert_space_every-1, labels.shape[1], insert_space_every)
  labels[:, space_columns] = 1.
  texts = np.array([''.join(sample) for sample in chars])
  return texts, labels

In [4]:
DATASET_FN = dataset2  # swap in dataset1 / dataset3 to train on a different labelling rule

# Three independent random draws serve as the train / validation / test splits.
# NOTE(review): the NumPy RNG is never seeded, so every run generates different
# data. Also, with vocab_size=2 and SEQ_LEN=10 there are only 2**10 distinct
# strings, so the 10000-example splits necessarily share examples.
train_ds = tf.data.Dataset.from_tensor_slices(DATASET_FN(vocab_size=2, seq_len=SEQ_LEN))
valid_ds = tf.data.Dataset.from_tensor_slices(DATASET_FN(vocab_size=2, seq_len=SEQ_LEN))
test_ds = tf.data.Dataset.from_tensor_slices(DATASET_FN(vocab_size=2, seq_len=SEQ_LEN))
train_ds.element_spec

2023-02-28 16:04:51.002379: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-28 16:04:53.936762: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22296 MB memory:  -> device: 0, name: GeForce RTX 3090, pci bus id: 0000:3b:00.0, compute capability: 8.6
2023-02-28 16:04:53.938113: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 22296 MB memory:  -> device: 1, name: GeForce RTX 3090, pci bus id: 0000:5e:00.0, compute capability: 8.6
2023-02-28 16:04:53.939365: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/t

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(10,), dtype=tf.float32, name=None))

In [5]:
# Only the training split is shuffled; all three splits use batches of 128.
# NOTE(review): this cell rebinds train_ds / valid_ds / test_ds, so running it
# twice batches the already-batched datasets. Re-run the dataset cell first.
train_ds = train_ds.shuffle(1000).batch(128)
valid_ds = valid_ds.batch(128)
test_ds = test_ds.batch(128)

In [6]:
# Pull a single training batch to use as a fixture for the layer checks below.
test_inputs, test_outputs = next(iter(train_ds.take(1)))
print(test_inputs)
tf.print(test_outputs, summarize=-1)

tf.Tensor(
[b'babababaaa' b'abbaabbaba' b'abaababbab' b'bbabbbbabb' b'bbbbaabaab'
 b'baaaababba' b'abaaabaaba' b'aabbbaaaaa' b'bbabbbaaba' b'aababbaaaa'
 b'bbbbbabbba' b'aababbbbbb' b'abaabaaaba' b'babaaaabaa' b'abbbbabbbb'
 b'babbbabbba' b'bbbbbaabba' b'aabbaabaab' b'abbabbbbbb' b'babaabbbba'
 b'bbabbbbaab' b'baabaaaabb' b'aaabaaaaaa' b'baaabaaaaa' b'ababbabbab'
 b'aaaaaababb' b'ababbbabba' b'aaababbaaa' b'baabaaabaa' b'babbbbbaab'
 b'babbbababa' b'babbbabaaa' b'babbabbaaa' b'baaabbbaba' b'aabbbabbba'
 b'abaaabbabb' b'aaabbaabbb' b'aabbaabbaa' b'babaaaabab' b'bbbbabbbaa'
 b'aabbaababb' b'aaabaaabaa' b'bbbbbaaabb' b'abbbbaabba' b'babbabbaaa'
 b'bbaabbbbab' b'aabbabbaaa' b'baaabbaabb' b'bbbbbaaaab' b'aabbbaabba'
 b'aabbbbbbaa' b'abbbbbbabb' b'bbabaabaaa' b'ababbbabbb' b'bbbbbaaabb'
 b'bbaababbab' b'abbbbbabba' b'abbabbbbaa' b'aaaaaaabab' b'abbaababaa'
 b'aaabababbb' b'babbabbbbb' b'bbbaaabaab' b'abbabaaaba' b'bbaabaaabb'
 b'babbabbbbb' b'babbabaabb' b'baaaaaaaab' b'aababbabaa' b'baaabba

# Build layers

In [7]:
# Character-level tokenizer: the input string is split into single characters.
textvectorization = TextVectorization(split='character')
# adapt() only needs the input strings, so drop the labels from each (x, y) batch.
textvectorization.adapt(train_ds.map(lambda x, y: x))

In [8]:
# Inspect the learned vocabulary (it also contains the padding and UNK entries).
textvectorization.get_vocabulary()

['', '[UNK]', 'a', 'b']

In [9]:
# Test: vectorize the sample batch; expect one integer id per character.

tv_out = textvectorization(test_inputs)
tv_out

<tf.Tensor: shape=(128, 10), dtype=int64, numpy=
array([[3, 2, 3, ..., 2, 2, 2],
       [2, 3, 3, ..., 2, 3, 2],
       [2, 3, 2, ..., 3, 2, 3],
       ...,
       [3, 2, 3, ..., 2, 2, 3],
       [3, 3, 3, ..., 3, 3, 3],
       [3, 3, 2, ..., 3, 2, 2]])>

In [10]:
def positional_encodings(length, depth):
    r"""
    Builds the fixed sinusoidal positional-encoding matrix.

    For position $pos$ (counted absolutely, from 0) and frequency index
    $i \in [0, d/2)$, where $d$ is ``depth``:
    $$
        PE_{pos,\,i} = \sin\left(\frac{pos}{10000^{2i/d}}\right), \qquad
        PE_{pos,\,i + d/2} = \cos\left(\frac{pos}{10000^{2i/d}}\right)
    $$
    The sine and cosine halves are concatenated, not interleaved.

    Args:
        length: number of positions (rows of the result).
        depth: dimensionality $d$ of the output embedding (columns); must be even.

    Returns:
        A float32 tensor of shape (length, depth).

    Raises:
        ValueError: if ``depth`` is odd. The two halves could then not add up
            to exactly ``depth`` columns.
    """
    # With the former float `depth/2`, an odd depth silently produced
    # depth + 1 columns; reject it up front instead.
    if depth % 2 != 0:
        raise ValueError(f"depth must be even, got {depth}")
    per_trig_d_model = depth // 2

    positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
    depths = np.arange(per_trig_d_model)[np.newaxis, :]/per_trig_d_model   # (1, depth/2)
    angle_rates = 1 / (10000**depths)         # (1, depth/2)
    angle_rads = positions * angle_rates      # (seq, depth/2)

    pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)], # (seq, depth)
      axis=-1) 

    return tf.cast(pos_encoding, dtype=tf.float32)

In [11]:
D_MODEL = 32  # embedding dimensionality passed to the layers below
MAX_TOKENS = textvectorization.vocabulary_size()  # includes padding and UNK tokens

In [12]:
class InputEmbeddings(Layer):
    """
    Token embedding plus fixed positional encodings.

    Args:
        d_model: width of the token embedding.
        pos_encodings: precomputed encoding table indexed as
            (position, d_model), e.g. the result of `positional_encodings`.
        max_tokens: vocabulary size handed to the embedding layer.

    Token id 0 is treated as padding (`mask_zero=True`), and the resulting
    mask is exposed through `compute_mask`.
    """
    
    def __init__(self, d_model, pos_encodings, max_tokens, name='input_embeddings', **kwargs):
        super().__init__(name=name, **kwargs)
        # Keep the constructor arguments on the instance, as EncoderBlock and
        # ClassifierHead do.
        self.d_model = d_model
        self.max_tokens = max_tokens
        self.pos_encodings = pos_encodings
        self.embedding = Embedding(max_tokens, d_model, mask_zero=True)
        
    def compute_mask(self, inputs, mask=None):
        # Delegate to the embedding so the padding mask reaches later layers.
        return self.embedding.compute_mask(inputs)
        
    def call(self, inputs):
        # Use only as many encoding rows as the batch has positions.
        # NOTE(review): inputs longer than the encoding table are not handled;
        # the addition below would then fail to broadcast.
        n = tf.shape(inputs)[-1]
        pos_encodings = self.pos_encodings[:n, :]
        h = self.embedding(inputs)
        return h + pos_encodings

In [13]:
# Test: embed the vectorized batch; expect shape (batch, seq, D_MODEL).

input_embeddings = InputEmbeddings(D_MODEL, positional_encodings(SEQ_LEN, D_MODEL), MAX_TOKENS)
emb_out = input_embeddings(tv_out)
emb_out.shape

TensorShape([128, 10, 32])

In [14]:
def get_attention_mask(mask=None):
    """
    Expands a per-token mask into a pairwise attention mask.

    mask: boolean array/tensor indexed as (batch, seq), or None.

    Returns None when no mask is given. Otherwise returns a boolean
    (batch, seq, seq) mask whose entry [b, i, j] is True only when tokens i
    and j of example b are both unmasked.
    """
    if mask is not None:
        as_rows = mask[:, :, None]  # (batch, seq, 1)
        as_cols = mask[:, None, :]  # (batch, 1, seq)
        return as_rows & as_cols
    return None

In [15]:
class EncoderBlock(Layer):
    """
    One Transformer encoder block: multi-head self-attention followed by a
    two-layer feed-forward network. Each sub-layer is wrapped in a residual
    connection and then layer-normalized.

    Args:
        num_heads, key_dim: configuration of the attention layer.
        d_model: output width of the feed-forward network, so that it can be
            added back onto the residual stream.
        ff_dim: hidden width of the feed-forward network.
    """

    def __init__(self, num_heads, key_dim, d_model, ff_dim, name='encoder_block', **kwargs):
        super().__init__(name=name, **kwargs)
        self.supports_masking = True  # hand any incoming mask on to the next layer
        self.num_heads, self.key_dim = num_heads, key_dim
        self.d_model, self.ff_dim = d_model, ff_dim
        self.multihead_attention = MultiHeadAttention(num_heads, key_dim)
        feed_forward_layers = [Dense(ff_dim, activation='relu'), Dense(d_model)]
        self.ff = Sequential(feed_forward_layers)
        self.layernorm1 = LayerNormalization()
        self.layernorm2 = LayerNormalization()

    def call(self, inputs, mask=None):
        # Self-attention: the inputs serve as both query and value.
        pairwise_mask = get_attention_mask(mask)
        attended = self.multihead_attention(inputs, inputs, attention_mask=pairwise_mask)
        residual = self.layernorm1(inputs + attended)

        return self.layernorm2(residual + self.ff(residual))

In [16]:
# Test: the encoder block should keep the (batch, seq, D_MODEL) shape.

encoder_block = EncoderBlock(num_heads=2, key_dim=16, d_model=D_MODEL, ff_dim=32)
enc_block_out = encoder_block(emb_out)
enc_block_out.shape

2023-02-28 16:04:57.226914: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-02-28 16:04:58.090576: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100


TensorShape([128, 10, 32])

In [17]:
# The mask produced by the embedding layer should still be attached after the encoder block.
enc_block_out._keras_mask

<tf.Tensor: shape=(128, 10), dtype=bool, numpy=
array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])>

In [18]:
class ClassifierHead(Layer):
  """
  Per-position binary classifier: Dense(units, relu) -> Dropout -> Dense(1).

  The trailing size-1 axis of the last Dense layer is removed, so the layer
  returns one raw logit per position with shape (batch, seq).

  Args:
    d_model: stored on the instance; not otherwise used by this layer.
    dropout_rate: rate of the Dropout between the two Dense layers.
    units: width of the hidden Dense layer.
  """

  def __init__(self, d_model, dropout_rate, units, name='classifier_head', **kwargs):
    super().__init__(name=name, **kwargs)
    self.supports_masking = True  # hand any incoming mask on unchanged
    self.d_model, self.dropout_rate, self.units = d_model, dropout_rate, units
    self.dense1 = Dense(units, activation='relu')
    self.dropout = Dropout(dropout_rate)
    self.dense2 = Dense(1)

  def call(self, inputs):
    input_shape = tf.shape(inputs)
    logits = self.dense2(self.dropout(self.dense1(inputs)))  # (batch, seq, 1)
    # Drop the size-1 axis by reshaping to the input's leading two dimensions.
    return tf.reshape(logits, (input_shape[0], input_shape[1]))

In [19]:
# Test: the head should emit one logit per position and keep the mask attached.

classifier_head = ClassifierHead(D_MODEL, dropout_rate=0.1, units=32)
head_out = classifier_head(enc_block_out)
print(head_out._keras_mask)
head_out.shape

tf.Tensor(
[[ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 ...
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]], shape=(128, 10), dtype=bool)


TensorShape([128, 10])

# Transformer Model

In [20]:
class Transformer(Model):
  """
  Character-level tagger: raw strings in, one logit per character out.

  The forward pass chains the text vectorizer, `InputEmbeddings`, a single
  `EncoderBlock` and a `ClassifierHead`.

  Args:
    d_model: embedding width.
    seq_len: number of positions to precompute positional encodings for.
    max_tokens: vocabulary size handed to the embedding layer.
    num_heads, key_dim: forwarded to the encoder block's attention layer.
    ff_dim: hidden width of the encoder block's feed-forward network.
    dropout_rate, units: forwarded to the classifier head.
    textvectorization: layer mapping strings to token ids. It is used as
      given; this class never calls `adapt` on it.
  """

  def __init__(self, d_model, seq_len, max_tokens, num_heads, key_dim, ff_dim, dropout_rate, units,
               textvectorization, name='transformer', **kwargs):
    super().__init__(name=name, **kwargs)
    self.d_model = d_model
    self.seq_len = seq_len
    self.max_tokens = max_tokens
    self.num_heads = num_heads
    self.key_dim = key_dim
    self.ff_dim = ff_dim
    self.dropout_rate = dropout_rate
    self.units = units
    self.textvectorization = textvectorization
    self.input_embeddings = InputEmbeddings(d_model, positional_encodings(seq_len, d_model),
                                            max_tokens)
    self.encoder_block = EncoderBlock(num_heads=num_heads, key_dim=key_dim, d_model=d_model, ff_dim=ff_dim)
    self.classifier_head = ClassifierHead(d_model, dropout_rate=dropout_rate, units=units)

  def train_step(self, data):
    """Runs one optimization step on a batch of (inputs, y_true) and returns the metric results."""
    inputs, y_true = data
    with tf.GradientTape() as tape:
      # training=True is required here: without it the Dropout in the
      # classifier head runs in inference mode and is never applied.
      y_pred = self(inputs, training=True)
      loss = self.compiled_loss(y_true, y_pred, regularization_losses=self.losses)
    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
    self.compiled_metrics.update_state(y_true, y_pred)
    return {m.name: m.result() for m in self.metrics} 

  def test_step(self, data):
    """Evaluates one batch of (inputs, y_true) in inference mode and returns the metric results."""
    inputs, y_true = data
    y_pred = self(inputs, training=False)
    # Called for its side effect only: it updates the tracked loss metric.
    self.compiled_loss(y_true, y_pred, regularization_losses=self.losses)
    self.compiled_metrics.update_state(y_true, y_pred)
    return {m.name: m.result() for m in self.metrics} 

  def call(self, inputs, training=None):
    """
    Maps a batch of strings to per-character logits of shape (batch, seq).

    `training` is not forwarded by hand: Keras passes the outer call's
    training mode on to the nested layer calls.
    """
    h = self.textvectorization(inputs)
    h = self.input_embeddings(h)
    h = self.encoder_block(h)
    h = self.classifier_head(h)
    return h

In [21]:
# Model hyperparameters.
NUM_HEADS = 2
KEY_DIM = 16
FF_DIM = 32
DROPOUT_RATE = 0.1
UNITS = 20

# Sequential assembly of the same layers. Commented out, not executed; the
# subclassed Transformer below is what gets built and trained.
# transformer = Sequential([
#     textvectorization,
#     InputEmbeddings(D_MODEL, positional_encodings(SEQ_LEN, D_MODEL), MAX_TOKENS, input_shape=(SEQ_LEN,)),
#     EncoderBlock(num_heads=2, key_dim=16, d_model=D_MODEL, ff_dim=32),
#     ClassifierHead(D_MODEL, dropout_rate=0.1, units=20)
# ])
transformer = Transformer(D_MODEL, SEQ_LEN, MAX_TOKENS, NUM_HEADS, KEY_DIM, FF_DIM,
                          DROPOUT_RATE, UNITS, textvectorization)
# Run one batch through the model before asking for the summary.
_ = transformer(test_inputs)
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  multiple                 0         
 torization)                                                     
                                                                 
 input_embeddings (InputEmbe  multiple                 128       
 ddings)                                                         
                                                                 
 encoder_block (EncoderBlock  multiple                 6464      
 )                                                               
                                                                 
 classifier_head (Classifier  multiple                 681       
 Head)                                                           
                                                                 
Total params: 7,273
Trainable params: 7,273
Non-trainab

In [22]:
# Test: end-to-end forward pass from raw strings to per-character logits.

transformer_out = transformer(test_inputs)
transformer_out.shape

TensorShape([128, 10])

In [23]:
# Check that the mask is still attached to the model's output.
transformer_out._keras_mask

<tf.Tensor: shape=(128, 10), dtype=bool, numpy=
array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])>

In [24]:
def masked_binary_crossentropy(y_true, y_pred):
  """
  Mean binary cross-entropy between per-character labels and logits.

  y_true: shape (batch_size, seq_len). 0. = no space, 1. = space
  y_pred: shape (batch_size, seq_len). Logits (pre-sigmoid scores), same
    dtype as y_true

  Returns a scalar: the mean over every position of every example. Despite
  the name, no padding mask is applied inside this function.
  """
  # Work from the logits directly. Applying sigmoid and then log yields
  # log(0) = -inf once a logit saturates, and 0 * -inf turns the mean into NaN.
  bce = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=y_pred)

  return tf.reduce_mean(bce)

In [25]:
# Test: loss on the sample batch, computed before the model has been fitted.

masked_binary_crossentropy(test_outputs, transformer_out)

<tf.Tensor: shape=(), dtype=float32, numpy=0.59307015>

In [26]:
# Same check on a newly drawn training batch.
# NOTE: this rebinds test_inputs / test_outputs to that batch.
for test_inputs, test_outputs in train_ds.take(1):
  y_pred = transformer(test_inputs)
  loss = masked_binary_crossentropy(test_outputs, y_pred)

loss

<tf.Tensor: shape=(), dtype=float32, numpy=0.58844477>

In [27]:
# Stop once validation accuracy has gone 2 epochs without improving.
# mode="max" states explicitly that higher is better for this metric.
earlystopping = EarlyStopping(patience=2, monitor="val_binary_accuracy", mode="max")

# The model outputs raw logits, so the metrics must decide on the logit scale:
# a logit above 0 corresponds to a probability above 0.5. The default 0.5
# thresholds (and a probability-space AUC) would be applied to the wrong scale.
transformer.compile(loss=masked_binary_crossentropy, optimizer='adam',
                    metrics=[BinaryAccuracy(threshold=0.0),
                             Precision(thresholds=0.0),
                             Recall(thresholds=0.0),
                             AUC(curve='PR', from_logits=True)])

history = transformer.fit(train_ds, validation_data=valid_ds, epochs=20, callbacks=[earlystopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [28]:
# Final loss and metrics on the test split.
transformer.evaluate(test_ds, return_dict=True)



{'loss': 0.0003283872501924634,
 'binary_accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'auc': 1.0}

In [29]:
# Sanity check on a hand-written string. The outputs are logits, so positive
# values mark "space after this character". With the dataset2 rule they are
# expected at indices 1, 3 and 5 (each 'b' that completes an 'ab' pair).
transformer(['ababab'+'b'*(SEQ_LEN-6)])

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[-10.983806 ,   9.081677 , -11.516211 ,  10.193266 , -11.684399 ,
          7.5911064,  -8.679304 , -10.9479685, -11.096028 , -10.427702 ]],
      dtype=float32)>