In [1]:
import os
import pandas as pd
import random
import matplotlib.pyplot as plt
import tensorflow as tf
import string
import re
import numpy as np



In [2]:
# Report the installed TensorFlow release and whether TensorFlow can see a GPU.
print(f"Tensor Flow Version: {tf.__version__}\n")

# An empty device list is falsy, so bool() gives the same answer as len(...) > 0.
gpu = bool(tf.config.list_physical_devices('GPU'))
if gpu:
    print("GPU is", "AVAILABLE")
else:
    print("GPU is", "NOT AVAILABLE")

Tensor Flow Version: 2.13.0

GPU is AVAILABLE


In [3]:
# Load the English/French sentence pairs and expose them as `source` / `target`.
# Every French sentence is wrapped in "[start] ... [end]" markers.
df = (
    pd.read_csv("/kaggle/input/language-translation-englishfrench/eng_-french.csv")
    .assign(
        source=lambda frame: frame['English words/sentences'],
        target=lambda frame: frame['French words/sentences'].apply(
            lambda sentence: '[start] ' + sentence + ' [end]'
        ),
    )
    .drop(['English words/sentences', 'French words/sentences'], axis=1)
)

# Show a handful of random rows as a sanity check.
df.sample(5)

Unnamed: 0,source,target
13222,You're after me.,[start] Tu es après moi. [end]
143493,He arrived at the station out of breath.,[start] Il est arrivé à la gare à bout de souf...
60515,How are we feeling today?,"[start] Comment vous sentez-vous, aujourd'hui ..."
19655,I'm bringing wine.,[start] J'apporte du vin. [end]
75177,How much money do you want?,[start] Combien d'argent veux-tu ? [end]


In [4]:
# Shuffle with a fixed seed so that a fresh "Restart & Run All" reproduces the same
# train / validation / test partition (and therefore the same adapted vocabularies).
SPLIT_SEED = 42
df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# 70 % train and 20 % validation; the test set takes every remaining row, so
# test_size is derived from the remainder and always matches len(test_df).
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.2)
test_size = len(df) - train_size - val_size

train_df = df[:train_size]
val_df = df[train_size: train_size + val_size]
test_df = df[train_size + val_size:]

# The three slices must partition the frame exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(df)
assert len(test_df) == test_size

In [5]:
# Vocabulary cap and sequence length used by the vectorization layers and model below.
max_tokens = 25000
sequence_length = 30

# Characters removed from the target text: all ASCII punctuation except the square
# brackets, which must survive so the "[start]" / "[end]" markers stay intact.
strip_chars = "".join(ch for ch in string.punctuation if ch not in "[]")

def custom_standardization(input_string):
    """Lower-case the text and delete every character listed in ``strip_chars``.

    Args:
        input_string: string tensor to standardize.

    Returns:
        The lower-cased string tensor with the stripped characters removed.
    """
    # Character class built from the escaped punctuation set.
    strip_pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, strip_pattern, "")

In [6]:
# English side: no `standardize` argument is given, so the layer's built-in
# standardization is used.
source_vectorization = tf.keras.layers.TextVectorization(
    output_sequence_length=sequence_length,
    output_mode='int',
    max_tokens=max_tokens,
)

# French side: one extra position, because the decoder input and the labels are both
# cut from the same sequence, offset by one token (see format_dataset).
# custom_standardization keeps the "[start]" / "[end]" markers intact.
target_vectorization = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    output_sequence_length=sequence_length + 1,
    output_mode='int',
    max_tokens=max_tokens,
)

# Build both vocabularies from the training split only.
train_source_texts = train_df['source'].values
source_vectorization.adapt(train_source_texts)

train_target_texts = train_df['target'].values
target_vectorization.adapt(train_target_texts)

In [7]:
# Pick one training example to inspect in the following cells.
# random.randrange excludes the upper bound; random.randint(0, n) is inclusive and
# could return n, which would index one past the end of the arrays.
random_sample = random.randrange(len(train_df))

print(random_sample)
train_source_texts[random_sample], train_target_texts[random_sample]

118745


('She took my joke seriously.',
 '[start] Elle a pris ma plaisanterie au sérieux. [end]')

In [8]:
# Vectorize the chosen sentence pair and show the resulting token ids.
source_sample_ids = source_vectorization(train_source_texts[random_sample])
target_sample_ids = target_vectorization(train_target_texts[random_sample])

print("Train source vectors: ", source_sample_ids, '\n')
print("Train target vectors", target_sample_ids)

Train source vectors:  tf.Tensor(
[  30  253   20  917 1129    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0], shape=(30,), dtype=int64) 

Train target vectors tf.Tensor(
[   2   30   19  208   38 3736   45 1049    3    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0], shape=(31,), dtype=int64)


In [9]:
# Map the sample's token ids back to vocabulary strings. The vocabulary and the
# vectorized ids are computed once per side instead of re-running the layer and
# re-fetching the vocabulary for every single token.
source_vocabulary = source_vectorization.get_vocabulary()
source_token_ids = source_vectorization(train_source_texts[random_sample]).numpy()
# Each token is followed by one space, padding included.
decoded_text = ''.join(source_vocabulary[token_id] + " " for token_id in source_token_ids)

target_vocabulary = target_vectorization.get_vocabulary()
target_token_ids = target_vectorization(train_target_texts[random_sample]).numpy()
decoded_text_target = ''.join(target_vocabulary[token_id] + " " for token_id in target_token_ids)

decoded_text, decoded_text_target

('she took my joke seriously                          ',
 '[start] elle a pris ma plaisanterie au sérieux [end]                       ')

In [10]:
# Vectorize the whole training split at once; used by the embedding and attention
# demos in the following cells.
train_source_vectors, train_target_vectors = (
    source_vectorization(train_source_texts),
    target_vectorization(train_target_texts),
)

print(train_source_vectors.shape, train_target_vectors.shape)

(122934, 30) (122934, 31)


### Positional Encoding

In [11]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        sequence_length: number of rows in the position-embedding table. Inputs
            whose last dimension is longer than this would look up positions
            outside the table.
        input_dim: number of rows in the token-embedding table (vocabulary size).
        output_dim: width of both embeddings.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = tf.keras.layers.Embedding(
            input_dim=input_dim,
            output_dim=output_dim
        )
        self.position_embeddings = tf.keras.layers.Embedding(
            input_dim=sequence_length,
            output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        """Embed token ids and add the embedding of each position 0..length-1."""
        embedded_tokens = self.token_embeddings(inputs)
        # The length is read from the input at run time, so any sequence length
        # up to the size of the position table is accepted.
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)

        # The position embeddings broadcast over the leading (batch) dimension.
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is False wherever the token id is 0."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        # Use the values stored on the instance. The bare names used previously
        # are not defined inside this method and resolved to notebook globals
        # (or raised NameError).
        config.update({
            "input_dim": self.input_dim,
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length
        })
        return config

In [12]:
# Demo: embed the vectorized training data with freshly initialised (untrained) layers.
embed_dim = 256

train_source_embedded = PositionalEncoding(
    sequence_length=sequence_length,
    input_dim=max_tokens,
    output_dim=embed_dim,
    name='source_embedding'
)(train_source_vectors)

# Embed the target ids here, not the source ids a second time.
# The target vectors hold sequence_length + 1 ids while the position table has only
# sequence_length rows, so the last id is dropped -- the same slice format_dataset
# uses to build the decoder input.
train_target_embedded = PositionalEncoding(
    sequence_length=sequence_length,
    input_dim=max_tokens,
    output_dim=embed_dim,
    name='target_embedding'
)(train_target_vectors[:, :-1])

In [13]:
# Show the embedded representation of the sample chosen earlier, for both sides.
source_sample_embedding = train_source_embedded[random_sample]
target_sample_embedding = train_target_embedded[random_sample]

print(f"Positional Embedding Source: {source_sample_embedding} Shape: {source_sample_embedding.shape}\n\n")
print(f"Positional Embedding Target: {target_sample_embedding} Shape: {target_sample_embedding.shape}")

Positional Embedding Source: [[ 0.01388215 -0.03985804  0.01897886 ... -0.03714791 -0.02725856
  -0.0389606 ]
 [ 0.09197158  0.01963572 -0.013709   ...  0.05422709  0.05822042
  -0.02503014]
 [ 0.06591959  0.0872011   0.0118544  ...  0.05518    -0.02813543
   0.03634074]
 ...
 [ 0.04573713 -0.02952985 -0.0172715  ... -0.01451364 -0.01371461
   0.01822332]
 [-0.00430495 -0.02107967 -0.06187977 ...  0.07393841 -0.07606362
  -0.05466762]
 [-0.00090297 -0.0597273  -0.03502066 ... -0.00805563 -0.02768622
  -0.02803839]] Shape: (30, 256)


Positional Embedding Target: [[ 0.04752221  0.03651984 -0.07720277 ... -0.0132517  -0.03106666
   0.0063069 ]
 [ 0.01721491  0.06733374  0.06381149 ... -0.01127194 -0.00780546
  -0.00838776]
 [-0.02882295  0.06677619 -0.0080108  ... -0.00462546  0.02924903
   0.05477956]
 ...
 [ 0.03499118 -0.04711753  0.02189797 ...  0.02352849 -0.04209637
   0.01105725]
 [ 0.05176985 -0.00769563  0.03059796 ...  0.00303662  0.04237538
  -0.06414807]
 [-0.0157359  -0.0285

In [14]:
def shape_list(x):
    """Return the shape of ``x`` as a list, one entry per axis.

    Axes whose size is known statically are returned as plain Python ints; axes
    whose static size is ``None`` are returned as scalars taken from ``tf.shape(x)``.
    """
    static_dims = x.shape.as_list()
    runtime_shape = tf.shape(x)

    dims = []
    for axis, size in enumerate(static_dims):
        # Fall back to the run-time value only when the static size is unknown.
        dims.append(runtime_shape[axis] if size is None else size)
    return dims

def attention_mask(nd, ns, *, dtype):
    """Build an ``(nd, ns)`` mask whose entry (i, j) is 1 when ``i >= j - ns + nd``.

    For ``nd == ns`` this is a lower-triangular matrix including the diagonal.

    Args:
        nd: number of rows.
        ns: number of columns.
        dtype: dtype the boolean mask is cast to.
    """
    row_index = tf.range(nd)[:, None]      # column vector, shape (nd, 1)
    col_index = tf.range(ns)               # row vector, shape (ns,)
    visible = row_index >= col_index - ns + nd

    return tf.cast(visible, dtype)

def mask_attn_weights(w):
    """Replace the masked-out entries of a rank-4 score tensor with -1e10.

    Positions where ``attention_mask`` is 1 keep their value; positions where it
    is 0 become ``-1e10`` (cast to ``w.dtype``).

    Args:
        w: score tensor with four axes; the mask is built from the last two.
    """
    _, _, nd, ns = shape_list(w)

    # The leading singleton axes let the mask broadcast over the first two axes of w.
    keep = tf.reshape(attention_mask(nd, ns, dtype=w.dtype), [1, 1, nd, ns])
    large_negative = tf.cast(1e10, w.dtype)

    return w * keep - large_negative * (1 - keep)

In [15]:
# Demo: mask a random 4x4 score matrix; entries above the diagonal become -1e10.
random_tensor = tf.random.uniform(shape=(1, 1, 4, 4), minval=0, maxval=1, dtype=tf.float32)
masked_demo_weights = mask_attn_weights(random_tensor)
print("Masked attention weights:", masked_demo_weights)

Masked attention weights: tf.Tensor(
[[[[ 3.1123352e-01 -1.0000000e+10 -1.0000000e+10 -1.0000000e+10]
   [ 2.9396057e-02  9.6611273e-01 -1.0000000e+10 -1.0000000e+10]
   [ 8.6119354e-01  1.4040947e-01  9.9359155e-02 -1.0000000e+10]
   [ 7.3288751e-01  6.8425548e-01  9.5318747e-01  6.9767725e-01]]]], shape=(1, 1, 4, 4), dtype=float32)


In [16]:
def scaled_dot_product_attention(q, k, v, use_causal_mask=False):
    """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Args:
        q: query tensor.
        k: key tensor; the size of its last axis is the scaling depth ``d_k``.
        v: value tensor.
        use_causal_mask: when true, the scaled scores go through
            ``mask_attn_weights`` before the softmax.

    Returns:
        The attention-weighted combination of ``v``.
    """
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

    if use_causal_mask:
        logits = mask_attn_weights(logits)

    # Normalise over the key axis, then mix the values.
    attention_weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(attention_weights, v)

In [17]:
# Demo: causal self-attention over the embedded source sentences. An axis of size 1
# is inserted at position 1 so the tensor has the four axes mask_attn_weights expects.
data = tf.expand_dims(train_source_embedded, axis=1)

demo_attention = scaled_dot_product_attention(data, data, data, use_causal_mask=True)
print(f"Scaled Dot Product Attention: {demo_attention.shape}")

Scaled Dot Product Attention: (122934, 1, 30, 256)


In [18]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Queries, keys and values are each projected by a Dense layer, split into
    ``heads`` slices of width ``embed_dim // heads``, attended independently,
    concatenated again and passed through a final Dense layer.

    Args:
        embed_dim: width of the projections and of the output.
        heads: number of attention heads; must divide ``embed_dim``.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.

    Raises:
        ValueError: if ``embed_dim`` is not divisible by ``heads``.
    """

    def __init__(self, embed_dim, heads, **kwargs):
        super().__init__(**kwargs)

        if embed_dim % heads != 0:
            raise ValueError(
                f"Embedding dimension {embed_dim} must be divisible by number of heads {heads}"
            )

        self.embed_dim = embed_dim
        self.heads = heads

        self.q_linear = tf.keras.layers.Dense(embed_dim)
        self.k_linear = tf.keras.layers.Dense(embed_dim)
        self.v_linear = tf.keras.layers.Dense(embed_dim)
        self.concat_linear = tf.keras.layers.Dense(embed_dim)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq, embed_dim)`` to ``(batch, heads, seq, embed_dim // heads)``."""
        x = tf.reshape(x, shape=(batch_size, -1, self.heads, self.embed_dim // self.heads))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def concat_heads(self, x, batch_size):
        """Inverse of ``split_heads``: merge the heads back into ``(batch, seq, embed_dim)``."""
        x = tf.transpose(x, perm=[0, 2, 1, 3])
        return tf.reshape(x, shape=(batch_size, -1, self.embed_dim))

    def call(self, q, k, v, use_casual_mask=False):
        """Apply multi-head attention.

        Args:
            q: query tensor.
            k: key tensor; its leading dimension is taken as the batch size.
            v: value tensor.
            use_casual_mask: forwarded to ``scaled_dot_product_attention`` as its
                causal-mask flag. The spelling is kept because callers pass this
                argument by keyword.

        Returns:
            Tensor of width ``embed_dim`` after the output projection.
        """
        batch_size = tf.shape(k)[0]
        q = self.q_linear(q)
        k = self.k_linear(k)
        v = self.v_linear(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        attention = scaled_dot_product_attention(q, k, v, use_casual_mask)
        concat = self.concat_heads(attention, batch_size)
        concat = self.concat_linear(concat)
        return concat

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "heads": self.heads
        })
        # Without this return the method yields None and the layer cannot be serialized.
        return config

In [19]:
class TransformerEncoder(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network, each wrapped
    in a residual connection followed by layer normalization.

    Args:
        embed_dim: width of the block's input and output.
        dense_dim: width of the hidden feed-forward layer.
        num_heads: number of attention heads.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads

        # Sub-layers are created in a fixed order; do not reorder.
        self.layer_norm1 = tf.keras.layers.LayerNormalization()
        self.layer_norm2 = tf.keras.layers.LayerNormalization()
        self.global_self_attention = MultiHeadAttention(embed_dim=embed_dim, heads=num_heads)

        # Expand to dense_dim with ReLU, then project back to embed_dim.
        expand = tf.keras.layers.Dense(dense_dim, activation='relu')
        project = tf.keras.layers.Dense(embed_dim)
        self.feed_forward = tf.keras.Sequential([expand, project])

    def call(self, x):
        """Run the block on ``x`` and return a tensor of the same width."""
        # Unmasked self-attention, residual add, normalize.
        attended = self.global_self_attention(q=x, k=x, v=x)
        x = self.layer_norm1(x + attended)

        # Feed-forward, residual add, normalize.
        transformed = self.feed_forward(x)
        return self.layer_norm2(x + transformed)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        }

In [20]:
class TransformerDecoder(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, cross-attention over the encoder
    output and a feed-forward network, each wrapped in a residual connection
    followed by layer normalization.

    Args:
        embed_dim: width of the block's input and output.
        dense_dim: width of the hidden feed-forward layer.
        num_heads: number of attention heads.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads

        # Sub-layers are created in a fixed order; do not reorder.
        self.casual_self_attention = MultiHeadAttention(embed_dim=embed_dim, heads=num_heads)
        self.cross_attention = MultiHeadAttention(embed_dim=embed_dim, heads=num_heads)

        # Expand to dense_dim with ReLU, then project back to embed_dim.
        expand = tf.keras.layers.Dense(dense_dim, activation='relu')
        project = tf.keras.layers.Dense(embed_dim)
        self.feed_forward = tf.keras.Sequential([expand, project])

        self.layer_norm_1 = tf.keras.layers.LayerNormalization()
        self.layer_norm_2 = tf.keras.layers.LayerNormalization()
        self.layer_norm_3 = tf.keras.layers.LayerNormalization()

    def call(self, x, context):
        """Run the block.

        Args:
            x: decoder-side sequence.
            context: encoder output used as keys and values in cross-attention.
        """
        # 1) Self-attention over the decoder sequence with the causal mask enabled.
        self_attended = self.casual_self_attention(q=x, k=x, v=x, use_casual_mask=True)
        x = self.layer_norm_1(x + self_attended)

        # 2) Queries from the decoder, keys and values from the encoder output.
        cross_attended = self.cross_attention(q=x, k=context, v=context)
        x = self.layer_norm_2(x + cross_attended)

        # 3) Position-wise feed-forward network.
        transformed = self.feed_forward(x)
        return self.layer_norm_3(x + transformed)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        }

In [21]:
batch_size = 64

def format_dataset(source, target):
    """Vectorize a batch of sentence pairs into model inputs and labels.

    Args:
        source: batch of source-language strings.
        target: batch of target-language strings.

    Returns:
        ``(features, labels)`` where ``features`` is a dict with keys
        ``"source"`` (encoder ids) and ``"target"`` (decoder ids, i.e. the
        target sequence without its last position) and ``labels`` is the target
        sequence without its first position.
    """
    encoder_tokens = source_vectorization(source)
    target_tokens = target_vectorization(target)

    # Decoder input and labels are the same sequence offset by one step.
    decoder_tokens = target_tokens[:, :-1]
    labels = target_tokens[:, 1:]

    features = {"source": encoder_tokens, "target": decoder_tokens}
    return features, labels

def make_dataset(df):
    """Build a batched, vectorized ``tf.data`` pipeline from a sentence-pair frame.

    Args:
        df: DataFrame with string columns ``"source"`` and ``"target"``.

    Returns:
        A dataset yielding ``(features, labels)`` batches as produced by
        ``format_dataset``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((df["source"].values, df["target"].values))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Order matters: cache the vectorized batches first, then shuffle, then prefetch.
    # Caching after the shuffle stores the first epoch's order and replays it on every
    # later epoch. The shuffle operates on whole batches, since it runs after batch().
    return dataset.cache().shuffle(2048).prefetch(16)

train_ds = make_dataset(train_df)
val_ds = make_dataset(val_df)

In [22]:
# Inspect the first batch: encoder ids, decoder ids and the shifted labels.
for features, labels in train_ds.take(1):
    print("Encoder Inputs:", features["source"])
    print("Decoder Inputs:", features["target"])
    print("Decoder Outputs:", labels)

Encoder Inputs: tf.Tensor(
[[ 175    2   17 ...    0    0    0]
 [  17    3  202 ...    0    0    0]
 [ 477   23  241 ...    0    0    0]
 ...
 [  24 4802    0 ...    0    0    0]
 [  26 1384    4 ...    0    0    0]
 [   8  263   86 ...    0    0    0]], shape=(64, 30), dtype=int64)
Decoder Inputs: tf.Tensor(
[[   2    4   66 ...    0    0    0]
 [   2  110   69 ...    0    0    0]
 [   2 3874  171 ...    0    0    0]
 ...
 [   2    4   26 ...    0    0    0]
 [   2   92   15 ...    0    0    0]
 [   2   14   19 ...    0    0    0]], shape=(64, 30), dtype=int64)
Decoder Outputs: tf.Tensor(
[[    4    66   124 ...     0     0     0]
 [  110    69  1743 ...     0     0     0]
 [ 3874   171   196 ...     0     0     0]
 ...
 [    4    26 22318 ...     0     0     0]
 [   92    15  2741 ...     0     0     0]
 [   14    19   292 ...     0     0     0]], shape=(64, 30), dtype=int64)


In [23]:
# Model hyper-parameters (embed_dim is redefined here; the earlier demo used 256).
embed_dim = 512
dense_dim = 2048
num_heads = 8

# Encoder: token ids -> embeddings -> one encoder block.
encoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="source")
encoder_embedded = PositionalEncoding(
    sequence_length=sequence_length, input_dim=max_tokens, output_dim=embed_dim
)(encoder_inputs)
encoder_outputs = TransformerEncoder(
    embed_dim=embed_dim, dense_dim=dense_dim, num_heads=num_heads
)(encoder_embedded)

# Decoder: token ids -> embeddings -> one decoder block attending to the encoder output.
decoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="target")
decoder_embedded = PositionalEncoding(
    sequence_length=sequence_length, input_dim=max_tokens, output_dim=embed_dim
)(decoder_inputs)
decoder_hidden = TransformerDecoder(
    embed_dim=embed_dim, dense_dim=dense_dim, num_heads=num_heads
)(decoder_embedded, encoder_outputs)

# Dropout, then a softmax over max_tokens vocabulary entries at every position.
decoder_regularized = tf.keras.layers.Dropout(0.5)(decoder_hidden)
decoder_outputs = tf.keras.layers.Dense(max_tokens, activation="softmax")(decoder_regularized)

transformer = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

In [24]:
# Quick look at the first five rows of the (shuffled) training split.
train_df.head(n=5)

Unnamed: 0,source,target
0,I'm sorry if I frightened you.,[start] Je suis désolé si je vous ai effrayées...
1,Leave him alone.,[start] Laissez-le tranquille. [end]
2,That's clever.,[start] C'est astucieux. [end]
3,He was in prison on a charge of robbery.,[start] Il était en prison pour vol. [end]
4,You need to call Tom immediately.,[start] Tu dois appeler Tom immédiatement. [end]


In [25]:
# Labels are integer token ids, hence the sparse categorical cross-entropy.
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])

EPOCHS = 50
checkpoint_filepath = '/tmp/checkpoint/'
callbacks_list = [
    # Cut the learning rate by 10x after 3 epochs without val_loss improvement.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=3,
    ),
    # Stop training after 6 epochs without val_loss improvement.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=6,
    ),
    # Keep only the weights of the epoch with the lowest val_loss.
    tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True
    ),
]

# summary() prints the table itself and returns None, so wrapping it in print()
# only appends a stray "None" line.
transformer.summary()

transformer.fit(train_ds,
                epochs=EPOCHS,
                callbacks=callbacks_list,
                validation_data=val_ds)

# Restore the best checkpoint written during training.
transformer.load_weights(checkpoint_filepath)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 source (InputLayer)         [(None, None)]               0         []                            
                                                                                                  
 target (InputLayer)         [(None, None)]               0         []                            
                                                                                                  
 positional_encoding (Posit  (None, None, 512)            1281536   ['source[0][0]']              
 ionalEncoding)                                           0                                       
                                                                                                  
 positional_encoding_1 (Pos  (None, None, 512)            1281536   ['target[0][0]']          

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7a539c2e7760>

In [28]:
# Lookup table from token id to token string for the target language, plus the
# maximum number of decoding steps.
target_vocab = target_vectorization.get_vocabulary()
target_index_lookup = dict(enumerate(target_vocab))
max_decoded_sentence_length = 30

def decode_sequence(input_sentence):
    """Greedily translate one source sentence.

    Starting from "[start]", the model is run on the sentence decoded so far and
    the highest-scoring token at the current step is appended. Decoding stops
    when "[end]" is produced or after ``max_decoded_sentence_length`` steps.

    Args:
        input_sentence: source-language sentence as a Python string.

    Returns:
        The decoded sentence, including the "[start]" marker (and "[end]" if it
        was produced).
    """
    encoder_tokens = source_vectorization([input_sentence])
    decoded_sentence = "[start]"

    for step in range(max_decoded_sentence_length):
        # Re-vectorize the partial translation and drop the last position.
        decoder_tokens = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([encoder_tokens, decoder_tokens])

        # Highest-scoring vocabulary entry at the current step.
        next_index = np.argmax(predictions[0, step, :])
        next_token = target_index_lookup[next_index]
        decoded_sentence = decoded_sentence + " " + next_token

        if next_token == "[end]":
            break

    return decoded_sentence


# Translate 500 test sentences drawn at random (with replacement, so repeats are
# possible) and print each source sentence followed by its translation.
num_translations = 500
for sample_number in range(num_translations):
    random_index = np.random.randint(0, len(test_df))
    input_sentence = test_df["source"].iloc[random_index]
    translation = decode_sequence(input_sentence)
    print(input_sentence)
    print(translation)
    print()

Do you like to live in the country?
[start] aimezvous vivre à la campagne [end]

Is Tom absent today?
[start] tom estil absent aujourdhui [end]

I only feed my dog dry dog food.
[start] je ne nourris que mon chien de la nourriture pour chien [end]

I may swim in the river.
[start] je sais nager dans la rivière [end]

We met on a blind date.
[start] nous nous sommes rencontrées à une rencontre surprise [end]

It couldn't have been better.
[start] Ça na pas pu être meilleur [end]

I like her very much.
[start] jaime beaucoup son [end]

I put the money into the safe.
[start] jai mis largent dans le coffrefort [end]

Probably the reason Tom can't go is because he doesn't have any money.
[start] probablement la raison pour laquelle tom ne peut pas aller il ne peut pas dargent [end]

I've never seen such a large whale.
[start] je nai jamais vu une aussi grande baleine [end]

Promise me that you'll help Tom.
[start] promettezmoi daider tom [end]

Did you call him up yesterday?
[start] lavezvo

In [30]:
# Persist the trained weights (the best checkpoint was loaded after training).
transformer.save_weights(filepath="model.h5")

### Credits: https://www.kaggle.com/code/renaudmathieu/transformer-from-scratch/notebook
### https://www.tensorflow.org/text/tutorials/transformer