In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_text as text

from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [2]:
# Show which physical devices (CPU/GPU) TensorFlow can see in this kernel.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Read the corpus CSV and cast the `corpus` column to str in one chained step,
# then display the resulting frame.
df = pd.read_csv('./datasets/dataset.csv').astype({'corpus': str})
df

Unnamed: 0,corpus
0,"A more precise measurement, such as a urine al..."
1,What are the symptoms of Fraser like syndrome ...
2,Most cases of shingles occur in adults. Only a...
3,Is Bilateral perisylvian polymicrogyria inheri...
4,""",GARD,Coccygodynia What are the treatments fo..."
...,...
77802,"In the fatal cases (4/5 patients), a transient..."
77803,This chemical robs your blood of oxygen and tr...
77804,Physical Activity Being physically active and ...
77805,""",GARD,Faciocardiorenal syndrome What is (are)..."


In [4]:
# Pull the text column out of the frame as an array; this is what gets
# sliced into train/validation sets and fed to the tokenizer later on.
samples = df['corpus'].values
# Display one entry (12th from the end) as a quick sanity check.
samples[-12]

'chromosome territories are also dynamic structures, with genes able to relocate from the periphery towards the interior once they have been switched on. in other cases, genes may move in the opposite direction, or simply maintain their position. chromosome territories can reposition in diseases, which might provide novel insights into disease mechanisms and why genes are incorrectly expressed in diseases.'

In [5]:
# Options forwarded to the BERT tokenizer; `lower_case=True` is the only one set.
bert_tokenizer_params = {'lower_case': True}

# Word-piece tokenizer backed by the vocabulary stored in 'vocab.txt'.
tokenizer = text.BertTokenizer('vocab.txt', **bert_tokenizer_params)

In [6]:
# Pipeline constants.
BUFFER_SIZE = 20000   # shuffle buffer size
BATCH_SIZE = 32       # samples per batch
MAX_TOKENS = 129      # tokens kept per sample; yields 128 input/label positions


def prepare_data(token):
    """Turn one token sequence into a next-token-prediction (input, label) pair.

    The sequence is first cut to at most MAX_TOKENS entries. The input is that
    window without its last token and the label is the same window without its
    first token, so label[i] is the token that follows input[i].

    Args:
        token: A sliceable sequence of token ids.

    Returns:
        A tuple (inputs, labels) of two slices of `token`.
    """
    window = token[:MAX_TOKENS]
    return window[:-1], window[1:]

# The first NUM_VAL_SAMPLES rows are held out for validation; the rest train.
NUM_VAL_SAMPLES = 15561


def _make_dataset(texts, shuffle):
    """Tokenize `texts` and build a batched (input, label) tf.data pipeline.

    Args:
        texts: Array of raw strings to tokenize.
        shuffle: Whether to shuffle samples (with a BUFFER_SIZE buffer)
            before batching.

    Returns:
        A `tf.data.Dataset` yielding (input, label) batches of BATCH_SIZE.
    """
    # Collapse the (word, word-piece) axes into a single token axis per sample.
    tokens = tokenizer.tokenize(texts).merge_dims(-2, -1)
    # Truncate while the data is still ragged: densifying first would pad every
    # sample to the length of the longest document, only for prepare_data to
    # throw nearly all of that padding away again.
    tokens = tokens[:, :MAX_TOKENS].to_tensor()

    dataset = tf.data.Dataset.from_tensor_slices(tokens).map(prepare_data)
    if shuffle:
        dataset = dataset.shuffle(BUFFER_SIZE)
    return dataset.batch(BATCH_SIZE).prefetch(buffer_size=tf.data.AUTOTUNE)


train_dataset = _make_dataset(samples[NUM_VAL_SAMPLES:], shuffle=True)

# Validation data is not shuffled: the order does not matter for evaluation,
# and fixed batches keep the monitored metric comparable between epochs.
val_dataset = _make_dataset(samples[:NUM_VAL_SAMPLES], shuffle=False)

# Grab a single training batch to inspect shapes and the one-token shift
# between inputs and labels.
for x_train, y_train in train_dataset.take(1):
    break

print(x_train.shape, y_train.shape, sep='\n')

print('Input:', x_train[0, :10].numpy())
print('Output:', y_train[0, :10].numpy())


(32, 128)
(32, 128)
Input: [222  86 104 119 297 113 320  15  42 304]
Output: [ 86 104 119 297 113 320  15  42 304 161]


In [7]:
def positional_encoding(length, depth):
    """Build a fixed sinusoidal position table.

    The first half of the feature axis holds sines and the second half holds
    cosines (concatenated, not interleaved).

    Args:
        length: Number of positions (rows of the table).
        depth: Feature width (columns of the table).

    Returns:
        A float32 tensor of shape (length, depth).
    """
    width = int(depth)
    half_depth = depth / 2

    positions = np.arange(length)[:, np.newaxis]                # (length, 1)
    # np.arange rounds a fractional stop up, so an odd depth yields
    # ceil(depth / 2) frequencies here.
    depths = np.arange(half_depth)[np.newaxis, :] / half_depth  # (1, ceil(depth/2))

    angle_rates = 1 / (10000**depths)                           # (1, ceil(depth/2))
    angle_rads = positions * angle_rates                        # (length, ceil(depth/2))

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1)

    # For odd depths the concatenation is one column wider than requested,
    # which would not broadcast against a `depth`-wide embedding. Trim it;
    # this is a no-op for even depths.
    pos_encoding = pos_encoding[:, :width]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [8]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding scaled by sqrt(d_model) plus a fixed positional table.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding width.
        max_length: Number of positions precomputed in the positional table.
            Inputs longer than this cannot be encoded. Defaults to 2048, the
            value that used to be hard-coded.
    """

    def __init__(self, vocab_size, d_model, max_length=2048):
        super().__init__()
        self.d_model = d_model
        # mask_zero=True makes token id 0 act as padding for downstream masking.
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=max_length, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the wrapped embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        """Embed token ids `x` of shape (batch, seq) and add position information."""
        length = tf.shape(x)[1]
        x = self.embedding(x)
        # This factor sets the relative scale of the embedding and positional_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # Use only the first `length` positions, broadcast over the batch axis.
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

In [9]:
class BaseAttention(tf.keras.layers.Layer):
    """Shared building blocks for attention sub-layers.

    Holds a multi-head attention layer, a layer normalization and an Add
    layer; subclasses define `call` to wire them together.
    """

    def __init__(self, **kwargs):
        # Every keyword argument goes to MultiHeadAttention; none are passed
        # on to the base Layer constructor.
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [10]:
class CausalSelfAttention(BaseAttention):
    """Self-attention with a causal mask, followed by residual add and layer norm."""

    def call(self, x):
        # Query, key and value are all `x`; the causal mask is requested from
        # the attention layer itself.
        attended = self.mha(query=x, value=x, key=x, use_causal_mask=True)
        residual = self.add([x, attended])
        return self.layernorm(residual)

In [11]:
class FeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward block with dropout, residual add and layer norm.

    Args:
        d_model: Output width of the block.
        dff: Width of the hidden ReLU layer.
        dropout_rate: Dropout rate applied after the second Dense layer.
    """

    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        hidden = tf.keras.layers.Dense(dff, activation='relu')
        projection = tf.keras.layers.Dense(d_model)
        dropout = tf.keras.layers.Dropout(dropout_rate)

        self.seq = tf.keras.Sequential([hidden, projection, dropout])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        # Residual connection around the feed-forward stack, then normalize.
        transformed = self.seq(x)
        return self.layer_norm(self.add([x, transformed]))

In [12]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention followed by a feed-forward block.

    Args:
        d_model: Model width.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward block.
        dropout_rate: Dropout rate used by both the attention and the
            feed-forward sub-layers.
    """

    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        # NOTE: in Keras MultiHeadAttention `key_dim` is the per-head size, so
        # each head here is d_model wide. Kept as-is because changing it would
        # change the weight shapes and break loading of saved checkpoints.
        self.causal_self_attention = CausalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        # Forward the configured rate; previously the feed-forward block
        # ignored `dropout_rate` and always used its own default.
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x):
        x = self.causal_self_attention(x=x)
        x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
        return x

In [13]:
class Decoder(tf.keras.layers.Layer):
    """Stack of decoder layers on top of a positional token embedding.

    Args:
        num_layers: Number of DecoderLayer blocks.
        d_model: Model width.
        num_heads: Number of attention heads per block.
        dff: Hidden width of each feed-forward block.
        vocab_size: Size of the token vocabulary for the embedding.
        dropout_rate: Dropout rate for the embedding output and the blocks.
    """

    def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size, dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)
        self.dec_layers = [DecoderLayer(**layer_kwargs) for _ in range(num_layers)]

        # Initialised to None; nothing in this class assigns it afterwards.
        self.last_attn_scores = None

    def call(self, x):
        # `x` arrives as token ids of shape (batch, target_seq_len) and leaves
        # the embedding as (batch_size, target_seq_len, d_model).
        hidden = self.dropout(self.pos_embedding(x))

        for layer in self.dec_layers:
            hidden = layer(hidden)

        # Still (batch_size, target_seq_len, d_model).
        return hidden

In [14]:
class Transformer(tf.keras.Model):
    """Decoder-only language model: a Decoder followed by a vocabulary projection.

    Args:
        num_layers: Number of decoder blocks.
        d_model: Model width.
        num_heads: Number of attention heads per block.
        dff: Hidden width of each feed-forward block.
        target_vocab_size: Vocabulary size, used for both the embedding and
            the output projection.
        dropout_rate: Dropout rate passed to the decoder.
    """

    def __init__(self, *, num_layers, d_model, num_heads, dff, target_vocab_size, dropout_rate=0.1):
        super().__init__()

        self.decoder = Decoder(
            num_layers=num_layers,
            d_model=d_model,
            num_heads=num_heads,
            dff=dff,
            vocab_size=target_vocab_size,
            dropout_rate=dropout_rate,
        )
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, x):
        # Keras `.fit` requires all model inputs in the first argument; here
        # that is just the token ids.
        decoded = self.decoder(x)  # (batch_size, target_len, d_model)

        # Project to per-token vocabulary logits:
        # (batch_size, target_len, target_vocab_size). Only the logits are
        # returned; no attention weights are exposed.
        return self.final_layer(decoded)

In [15]:
# Model hyperparameters, passed to Transformer(...) below.
# Number of stacked decoder blocks.
num_layers = 4
# Model / embedding width.
d_model = 128
# Hidden width of each feed-forward block.
dff = 512
# Attention heads per block.
num_heads = 8
# Dropout rate.
dropout_rate = 0.1
# Size of the embedding table and of the output projection.
# NOTE(review): presumably the number of entries in 'vocab.txt' — confirm.
vocab_size = 7951

In [16]:
# Instantiate the model from the hyperparameters defined above.
transformer = Transformer(
    num_layers=num_layers, d_model=d_model, num_heads=num_heads,
    dff=dff, target_vocab_size=vocab_size, dropout_rate=dropout_rate,
)

# Run the sample batch through the model and compare input/output shapes.
output = transformer(x_train)

print(x_train.shape, output.shape, sep='\n')

(32, 128)
(32, 128, 7951)


In [17]:
# Print the layer / parameter-count overview of the model.
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 decoder (Decoder)           multiple                  3656576   
                                                                 
 dense_8 (Dense)             multiple                  1025679   
                                                                 
Total params: 4,682,255
Trainable params: 4,682,255
Non-trainable params: 0
_________________________________________________________________


In [18]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warm-up followed by inverse-square-root decay.

    lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

    Args:
        d_model: Model width; scales the whole schedule by d_model**-0.5.
        warmup_steps: Step at which the two branches of the minimum meet.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        # Keep the constructor arguments as given so get_config() can hand
        # them back without having to convert a tensor.
        self._config = {'d_model': d_model, 'warmup_steps': warmup_steps}

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)                    # decay branch
        arg2 = step * (self.warmup_steps ** -1.5)     # warm-up branch

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return dict(self._config)

In [19]:
# Learning-rate schedule scaled by the model width (default warm-up length).
learning_rate = CustomSchedule(d_model)

# Adam driven by the schedule above, with non-default beta_2 and epsilon.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [20]:
def masked_loss(label, pred):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Args:
        label: Integer token ids; positions equal to 0 are treated as padding.
        pred: Logits whose last axis spans the vocabulary.

    Returns:
        A scalar: the summed loss over non-padding positions divided by their
        count, or 0 when there are no non-padding positions.
    """
    mask = label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = loss_object(label, pred)

    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask

    # divide_no_nan yields 0 instead of NaN for a batch made entirely of
    # padding, so one degenerate batch cannot poison training.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))


def masked_accuracy(label, pred):
    """Fraction of non-padding positions whose arg-max prediction is correct.

    Args:
        label: Integer token ids; positions equal to 0 are treated as padding.
        pred: Scores with the vocabulary on axis 2.

    Returns:
        A float32 scalar: correct non-padding predictions divided by the
        number of non-padding positions, or 0 when there are none.
    """
    pred = tf.argmax(pred, axis=2)
    label = tf.cast(label, pred.dtype)
    match = label == pred

    mask = label != 0

    # Only count matches at positions that are not padding.
    match = match & mask

    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    # divide_no_nan yields 0 instead of NaN when every position is padding.
    return tf.math.divide_no_nan(tf.reduce_sum(match), tf.reduce_sum(mask))

In [21]:
# Early stopping watches the validation value of the `masked_accuracy` metric
# registered in compile() below, with a patience of 3 epochs.
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_masked_accuracy', patience=3)
# Train with the padding-aware loss and accuracy defined above.
transformer.compile(loss=masked_loss, optimizer=optimizer, metrics=[masked_accuracy])

In [22]:
# Restore previously saved weights from './model/checkpoints' so training
# resumes from them rather than from a fresh initialization.
transformer.load_weights('./model/checkpoints')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1818b7ec700>

In [35]:
# Train for up to 5 epochs, validating each epoch and stopping early via es_callback.
# NOTE(review): this cell's execution count (35) is out of sequence with the
# cells around it, so its recorded output likely comes from a different run.
transformer.fit(train_dataset, epochs=5, validation_data=val_dataset, callbacks=[es_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2ab57361c70>

In [36]:
# Write the current weights back to the same checkpoint path they were loaded from.
transformer.save_weights('./model/checkpoints')

In [23]:
# Reload the weights from './model/checkpoints' before running generation below.
transformer.load_weights('./model/checkpoints')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x181aad6aee0>

In [24]:
def generate_text(sentence, maxlen=MAX_TOKENS, end_token_id=3):
    """Greedily extend `sentence` using the trained transformer.

    Args:
        sentence: Prompt text to continue.
        maxlen: Maximum number of tokens appended to the prompt.
        end_token_id: Token id that stops generation as soon as it is
            predicted. Defaults to 3, the value that used to be hard-coded
            (presumably an end marker in the vocabulary — verify against
            'vocab.txt').

    Returns:
        The prompt plus the generated tokens of the first batch row,
        detokenized into words joined by single spaces.
    """
    output_array = tokenizer.tokenize(sentence).merge_dims(-2, -1).to_tensor()

    for _ in range(maxlen):
        # The whole sequence so far is re-run through the model on every step.
        logits = transformer(output_array, training=False)
        # Greedy decoding: take the most likely token at the last position,
        # keeping the sequence axis so it can be concatenated directly.
        next_token = tf.argmax(logits[:, -1:, :], axis=-1)
        output_array = tf.concat([output_array, next_token], axis=1)

        # Only the first batch row decides when to stop.
        if next_token[0][0].numpy() == end_token_id:
            break

    words = tokenizer.detokenize(output_array).to_tensor()
    return ' '.join(word.decode('utf-8') for word in words.numpy()[0])

In [25]:
# Sample generation: continue a question prompt with the trained model.
generate_text('what is type 2 diabetes?')

"what is type 2 diabetes ? type 2 diabetes is a disease in which the body is unable to use insulin effectively . the body is unable to use insulin effectively , which means that it is unable to use insulin effectively . type 2 diabetes develops when the body doesnt make enough insulin or is not able to use insulin effectively , or both . type 2 diabetes is a disease in which the body doesnt use insulin effectively . type 2 diabetes develops when the body doesnt make enough insulin or is not able to use insulin effectively , or both . type 2 diabetes is a disease in which the body doesnt make enough insulin or doesnt use insulin effectively . type 2 diabetes is a disease in which the body '"

In [27]:
# Second sample generation with a different prompt.
generate_text('what is insulin?')

'what is insulin ? insulin is a hormone that helps the body make insulin . insulin is a hormone that helps the body use for energy . insulin helps the body make insulin and helps the body make energy . insulin is a hormone that helps the body use for energy . insulin helps the body make insulin . insulin helps the body make insulin and helps the body make insulin . insulin is a hormone that helps the body use glucose for energy . insulin helps cells make glucose for energy . insulin helps the body make glucose for energy . insulin is a hormone that helps the body use glucose for energy . insulin is a hormone that helps the body use glucose for energy . insulin is a hormone that'