In [1]:
import tensorflow as tf
import numpy as np

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (TextVectorization, Dense, MultiHeadAttention, LayerNormalization, 
                                     Layer, Embedding, Input, Dropout)
from tensorflow.keras.callbacks import EarlyStopping

import logging

GPU_FROM = 0
GPU_TO = 1  

visible_devices = tf.config.get_visible_devices('GPU')
logging.info(f"Num GPUs visible:{len(visible_devices)}")
tf.config.set_visible_devices(visible_devices[GPU_FROM:GPU_TO],'GPU')

visible_devices = tf.config.get_visible_devices('GPU')
logging.info(f"Num GPUs to be used: {len(visible_devices)}")

2023-02-28 15:44:38.900775: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-28 15:44:39.053421: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-28 15:44:39.092262: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-02-28 15:44:39.791971: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

# Build dataset

In [55]:
SEQ_LEN = 10

In [56]:

from segmentation.model import SpaceSegmentationTransformer
from segmentation.model import LossWithVoids
data = tf.random.stateless_binomial(shape=(10000, SEQ_LEN), counts=1, probs=0.8, seed=[1997,1997])+1
train_frac = 3/4 
val_frac = 1/8

train_idx = int(data.shape[0]*train_frac)
val_idx = train_idx + int(data.shape[0]*val_frac)

train_ds = tf.data.Dataset.from_tensor_slices(data[:train_idx])
valid_ds = tf.data.Dataset.from_tensor_slices(data[train_idx:val_idx])
test_ds = tf.data.Dataset.from_tensor_slices(data[val_idx:])


def mapper(y):
    x = tf.strings.as_string(y+3)
    x = tf.strings.reduce_join(x, axis=-1)
    return tf.strings.regex_replace(x, "(45)", r"\1 ")

train_ds = train_ds.map(mapper)
valid_ds = valid_ds.map(mapper)
test_ds = test_ds.map(mapper)

for f in train_ds.take(5):
    print(f)

tf.Tensor(b'5555545 555', shape=(), dtype=string)
tf.Tensor(b'5555445 555', shape=(), dtype=string)
tf.Tensor(b'5445 545 555', shape=(), dtype=string)
tf.Tensor(b'5545 555545 ', shape=(), dtype=string)
tf.Tensor(b'55555445 54', shape=(), dtype=string)


In [57]:
def generate_labels(text):
    max_chars = SEQ_LEN
    last_char = tf.strings.substr(text, max_chars, max_chars+1)
    text = tf.strings.substr(text,0, max_chars)
    text = tf.strings.lower(text)
    split_on_whitespace = tf.strings.strip(tf.strings.split(text))
    encoder_in = tf.strings.reduce_join(split_on_whitespace, axis=-1)
    
    space_indices = tf.math.cumsum(tf.strings.length(split_on_whitespace)) - 1 # subtract 1 to index from 0
    seq_len = space_indices[-1] + 1
    if not tf.strings.regex_full_match(last_char, " "):
        space_indices = space_indices[:-1]
    encoder_out = tf.ones((max_chars,))
    encoder_out = tf.concat([
        tf.ones((seq_len,)),
        tf.zeros((max_chars - seq_len,))
    ], axis=-1)
    encoder_out = tf.tensor_scatter_nd_update(
        encoder_out, 
        space_indices[...,tf.newaxis], # Expand dims to create a 'list' of indices
        tf.ones_like(space_indices, dtype=encoder_out.dtype)*2) 
    
    return (encoder_in), tf.cast(encoder_out-1, "float32")

train_ds = train_ds.map(generate_labels)
valid_ds = valid_ds.map(generate_labels)
test_ds = test_ds.map(generate_labels)

for f in train_ds.take(1):
    print(f)

(<tf.Tensor: shape=(), dtype=string, numpy=b'555554555'>, <tf.Tensor: shape=(10,), dtype=float32, numpy=array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0., -1.], dtype=float32)>)


In [58]:
train_ds = train_ds.shuffle(1000).batch(128)
valid_ds = valid_ds.batch(128)
test_ds = test_ds.batch(128)

# Build layers

In [59]:
test_inputs, test_outputs = next(iter(train_ds.take(1)))

In [68]:
textvectorization = TextVectorization(split='character', output_sequence_length=SEQ_LEN)
textvectorization.adapt(train_ds.map(lambda x, y: x))

In [69]:
textvectorization.get_vocabulary()

['', '[UNK]', '5', '4']

In [70]:
# Test

tv_out = textvectorization(test_inputs)
tv_out

<tf.Tensor: shape=(128, 10), dtype=int64, numpy=
array([[2, 3, 3, ..., 2, 2, 0],
       [2, 2, 2, ..., 2, 2, 0],
       [3, 3, 3, ..., 3, 2, 0],
       ...,
       [2, 2, 2, ..., 2, 2, 0],
       [2, 2, 2, ..., 2, 2, 3],
       [2, 2, 3, ..., 2, 0, 0]])>

In [71]:
def positional_encodings(seq_len, d_model):
    max_wavelength = 10000.

    pos = np.arange(seq_len)
    inx = np.arange(d_model)

    I, P = np.meshgrid(inx, pos)
    pe_even = np.sin(P / max_wavelength**(I/d_model))
    pe_odd = np.cos(P / max_wavelength**(I/d_model))
        
    pe = np.zeros((seq_len, d_model))
    pe[:, ::2] = pe_even[:, ::2]
    pe[:, 1::2] = pe_odd[:, ::2]
    return tf.constant(pe, dtype=tf.float32)

In [72]:
D_MODEL = 32
MAX_TOKENS = textvectorization.vocabulary_size()  # includes padding and UNK tokens

In [73]:
class InputEmbeddings(Layer):
    
    def __init__(self, d_model, pos_encodings, max_tokens, name='input_embeddings', **kwargs):
        super().__init__(name=name, **kwargs)
        self.pos_encodings = pos_encodings
        self.embedding = Embedding(max_tokens, d_model, mask_zero=True)
        
    def compute_mask(self, inputs, mask=None):
        return self.embedding.compute_mask(inputs)
        
    def call(self, inputs):
        n = tf.shape(inputs)[-1]
        pos_encodings = self.pos_encodings[:n, :]
        h = self.embedding(inputs)
        return h + pos_encodings

In [74]:
# Test

input_embeddings = InputEmbeddings(D_MODEL, positional_encodings(SEQ_LEN, D_MODEL), MAX_TOKENS)
emb_out = input_embeddings(tv_out)
emb_out.shape

TensorShape([128, 10, 32])

In [75]:
def get_attention_mask(mask=None):
    if mask is None:
        return None
    mask1 = mask[:, :, None]
    mask2 = mask[:, None, :]
    return mask1 & mask2

In [76]:
class EncoderBlock(Layer):
    
    def __init__(self, num_heads, key_dim, d_model, ff_dim, name='encoder_block', **kwargs):
        super().__init__(name=name, **kwargs)
        self.supports_masking = True  # This will pass on any incoming mask
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.d_model = d_model
        self.ff_dim = ff_dim
        self.multihead_attention = MultiHeadAttention(num_heads, key_dim)
        self.ff = Sequential([
            Dense(ff_dim, activation='relu'),
            Dense(d_model)
        ])
        self.layernorm1 = LayerNormalization()
        self.layernorm2 = LayerNormalization()
        
    def call(self, inputs, mask=None):
        attention_mask = get_attention_mask(mask)
        h = self.multihead_attention(inputs, inputs, attention_mask=attention_mask)
        h = self.layernorm1(inputs + h)
        
        h_ff = self.ff(h)
        return self.layernorm2(h + h_ff)

In [77]:
# Test

encoder_block = EncoderBlock(num_heads=2, key_dim=16, d_model=D_MODEL, ff_dim=32)
enc_block_out = encoder_block(emb_out)
enc_block_out.shape

TensorShape([128, 10, 32])

In [78]:
enc_block_out._keras_mask

<tf.Tensor: shape=(128, 10), dtype=bool, numpy=
array([[ True,  True,  True, ...,  True,  True, False],
       [ True,  True,  True, ...,  True,  True, False],
       [ True,  True,  True, ...,  True,  True, False],
       ...,
       [ True,  True,  True, ...,  True,  True, False],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True, False, False]])>

In [79]:
class ClassifierHead(Layer):

  def __init__(self, d_model, dropout_rate, units, name='classifier_head', **kwargs):
    super().__init__(name=name, **kwargs)
    self.supports_masking = True
    self.d_model = d_model
    self.dropout_rate = dropout_rate
    self.units = units
    self.dense1 = Dense(units, activation='relu')
    self.dropout = Dropout(dropout_rate)
    self.dense2 = Dense(1)

  def call(self, inputs):
    batch_size = tf.shape(inputs)[0]
    seq_len = tf.shape(inputs)[1]
    h = self.dense1(inputs)
    h = self.dropout(h)
    h = self.dense2(h)
    return tf.reshape(h, (batch_size, seq_len))

In [80]:
# Test

classifier_head = ClassifierHead(D_MODEL, dropout_rate=0.1, units=32)
head_out = classifier_head(enc_block_out)
print(head_out._keras_mask)
head_out.shape

tf.Tensor(
[[ True  True  True ...  True  True False]
 [ True  True  True ...  True  True False]
 [ True  True  True ...  True  True False]
 ...
 [ True  True  True ...  True  True False]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True False False]], shape=(128, 10), dtype=bool)


TensorShape([128, 10])

# Transformer Model

In [81]:
class Transformer(Model):

  def __init__(self, d_model, seq_len, max_tokens, num_heads, key_dim, ff_dim, dropout_rate, units,
               textvectorization, name='transformer', **kwargs):
    super().__init__(name=name, **kwargs)
    self.d_model = d_model
    self.seq_len = seq_len
    self.max_tokens = max_tokens
    self.num_heads = num_heads
    self.key_dim = key_dim
    self.ff_dim = ff_dim
    self.dropout_rate = dropout_rate
    self.units = units
    self.textvectorization = textvectorization
    self.input_embeddings = InputEmbeddings(d_model, positional_encodings(seq_len, d_model),
                                            max_tokens)
    self.encoder_block = EncoderBlock(num_heads=num_heads, key_dim=key_dim, d_model=d_model, ff_dim=ff_dim)
    self.classifier_head = ClassifierHead(d_model, dropout_rate=dropout_rate, units=units)

  def train_step(self, data):
    inputs, y_true = data
    with tf.GradientTape() as tape:
      y_pred = self(inputs)
      loss = self.compiled_loss(y_true, y_pred, regularization_losses=self.losses)
    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
    self.compiled_metrics.update_state(y_true, y_pred)
    return {m.name: m.result() for m in self.metrics} 

  def test_step(self, data):
    inputs, y_true = data
    y_pred = self(inputs)
    loss = self.compiled_loss(y_true, y_pred, regularization_losses=self.losses)
    self.compiled_metrics.update_state(y_true, y_pred)
    return {m.name: m.result() for m in self.metrics} 

  def call(self, inputs):
    h = self.textvectorization(inputs)
    h = self.input_embeddings(h)
    h = self.encoder_block(h)
    h = self.classifier_head(h)
    return h

In [82]:
NUM_HEADS = 2
KEY_DIM = 16
FF_DIM = 32
DROPOUT_RATE = 0.1
UNITS = 20

# transformer = Sequential([
#     textvectorization,
#     InputEmbeddings(D_MODEL, positional_encodings(SEQ_LEN, D_MODEL), MAX_TOKENS, input_shape=(SEQ_LEN,)),
#     EncoderBlock(num_heads=2, key_dim=16, d_model=D_MODEL, ff_dim=32),
#     ClassifierHead(D_MODEL, dropout_rate=0.1, units=20)
# ])
transformer = Transformer(D_MODEL, SEQ_LEN, MAX_TOKENS, NUM_HEADS, KEY_DIM, FF_DIM,
                          DROPOUT_RATE, UNITS, textvectorization)
_ = transformer(test_inputs)
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_4 (TextV  multiple                 0         
 ectorization)                                                   
                                                                 
 input_embeddings (InputEmbe  multiple                 128       
 ddings)                                                         
                                                                 
 encoder_block (EncoderBlock  multiple                 6464      
 )                                                               
                                                                 
 classifier_head (Classifier  multiple                 681       
 Head)                                                           
                                                                 
Total params: 7,273
Trainable params: 7,273
Non-trainab

In [83]:
# Test

transformer_out = transformer(test_inputs)
transformer_out.shape

TensorShape([128, 10])

In [84]:
transformer_out._keras_mask

<tf.Tensor: shape=(128, 10), dtype=bool, numpy=
array([[ True,  True,  True, ...,  True,  True, False],
       [ True,  True,  True, ...,  True,  True, False],
       [ True,  True,  True, ...,  True,  True, False],
       ...,
       [ True,  True,  True, ...,  True,  True, False],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True, False, False]])>

In [85]:
def masked_binary_crossentropy(y_true, y_pred):
  """
  y_true: shape (batch_size, seq_len). 1. = no space, 2. = space
  y_pred: shape (batch_size, seq_len, 1). Logits
  """
  labels = y_true  # 0 = no space, 1 = space
  mask = tf.not_equal(labels, -1)
  labels = tf.ragged.boolean_mask(labels, mask)
  # Deal with Keras 'feature' that squeezes out the last dimension silently (WTF)
  # if tf.shape(y_pred)[-1] == 1:  
  #   logits = tf.squeeze(y_pred, axis=-1)  # (batch_size, seq_len)
  # else:
  logits = y_pred
  logits = tf.ragged.boolean_mask(logits, mask)
  probs = tf.nn.sigmoid(logits)
  bce = - labels * tf.math.log(probs) - ((1 - labels) * tf.math.log(1 - probs))

  return tf.reduce_mean(tf.reduce_mean(bce, axis=-1))

In [89]:
mask = tf.not_equal(test_outputs, -1)
masked_labels = tf.ragged.boolean_mask(test_outputs, mask)
masked_preds = tf.ragged.boolean_mask(transformer_out, mask)
tf.size(masked_preds), tf.size(masked_labels), tf.math.count_nonzero(test_outputs+1)

(<tf.Tensor: shape=(), dtype=int32, numpy=1126>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1126>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1126>)

In [90]:
# Test

masked_binary_crossentropy(test_outputs, transformer_out)

<tf.Tensor: shape=(), dtype=float32, numpy=0.5204264>

In [91]:
for test_inputs, test_outputs in train_ds.take(1):
  y_pred = transformer(test_inputs)
  loss = masked_binary_crossentropy(test_outputs, y_pred)

loss

<tf.Tensor: shape=(), dtype=float32, numpy=0.5161548>

In [92]:
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall, AUC

earlystopping = EarlyStopping(patience=2, monitor="val_binary_accuracy")
transformer.compile(loss=masked_binary_crossentropy, optimizer='adam',
                    metrics=[BinaryAccuracy(), Precision(), Recall(), AUC(curve='PR')])

history = transformer.fit(train_ds, validation_data=valid_ds, epochs=20, callbacks=[earlystopping])

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


In [93]:
transformer.evaluate(test_ds, return_dict=True)



{'loss': 0.000361427606549114,
 'binary_accuracy': 0.999818742275238,
 'precision_2': 1.0,
 'recall_2': 0.9984732866287231,
 'auc_2': 1.0}

In [96]:
transformer(['454545'+' '*(SEQ_LEN-6)])

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[-16.101921 ,   8.424156 , -16.31965  ,   7.079254 , -15.455793 ,
          7.912379 , -14.113033 , -15.557617 , -14.983434 ,   1.1651477]],
      dtype=float32)>