# Transfomer for translation (https://www.tensorflow.org/text/tutorials/transformer)

- train tokenizer for occitan
- create transformer architecture
- try inference with one examples, even untrained
- tokenize examples (fr - occ)
- make batches
- train transformer
- inference
- eval
- visualisation

In [29]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [30]:
# Check available GPUs
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    # Limit TensorFlow to use only the first GPU
    tf.config.set_visible_devices(physical_devices[0], 'GPU')
    # Allow memory growth to avoid allocating all GPU memory upfront
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print('GPU is available and configured for TensorFlow.')
else:
    print('No GPU available. TensorFlow will use CPU.')

No GPU available. TensorFlow will use CPU.


In [31]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
dataset_folder = 'traduction_occitan/'
filename = 'trainNormBig.csv'
drivepath = '/content/drive/MyDrive/Colab/occitan/'
# filepath = dataset_folder + filename
filepath = drivepath + dataset_folder + filename

In [33]:
print(df)

                                                       0  \
0      Stir Fry/Padelada ! Quò es un mejan perfièch p...   
1      Coleslaw/Ensalada de chaul ! Sètz en trin de v...   
2      Maki Roll/Rotlèus Maki ! La sabor delicata de ...   
3      Lucky Lunch/Repais jauvent ! Una vièlha legend...   
4      Roasted Hazelnuts/Nosilhas grilhadas ! Ai un b...   
...                                                  ...   
40964  Un brave nombre d'amainatjaments fuguèren fach...   
40965  Un suènh, una armonia e una rigor particularas...   
40966  Dins lo cas que quauques mots vos eschaparián,...   
40967  Podètz quand volètz far ‘na pausa dins un dial...   
40968  Stardew Valley coma tots los elements grafics ...   

                                                       1        2  
0      Stir Fry/Sauté de légumes! C’est le moyen idéa...  lemosin  
1                            Pancakes/Pancakes ! Parfois  lemosin  
2      Complete Breakfast/Petit-déjeuner complet ! La...  lemosin  
3      

In [34]:
occitan_sentences = df.iloc[:, 0].tolist()
print(occitan_sentences[:5], len(occitan_sentences))

["Stir Fry/Padelada ! Quò es un mejan perfièch per botar un pauc de sana verdura dins vòstra sieta. Fasètz sautar dins un pauc d'òli de sesame e apondètz totplen de gengibre fresche e de gòuça. Aura bufatz prigondament... Ò quò es bon./Padelada", "Coleslaw/Ensalada de chaul ! Sètz en trin de vos imaginar daus modelons deissaborits de chauls molards ? Parièr. Mas una bona ensalada de chaul quò pòt èsser plan mai que quò. Asseguratz-vos d'aver dau chaul fresche e aigós per quela-qui. Boiratz emb un pauc de vinagre e de la maionesa e quò es prèste. ARadish Salad/Ensalada de rafes ! Ren vau un rafe fresche e suavament pebrat. Quò me recòrda la fin de la prima... Ma mair copava daus rafes fresches e los serviá sus dau pan de granas emb un pauc de sau. AOmelet/Moleta ! Quò es un plat tot simpleBaked Fish/Peisson au forn ! Quand fau quò chas ieuPancakes/Crèspas ! Daus uns còps me laisse emportar per daus plats sofisticats e complicats... Mas i a quauquarren de conortant dins una simpla crèspa

# Train a tokenizer for occitan


In [35]:
# Write the Occitan sentences to a text file
with open("occitan_sentences_for_tokenizer.txt", "w", encoding="utf-8") as f:
    for sentence in occitan_sentences:
        f.write(sentence + "\n")

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [36]:
from tokenizers import Tokenizer, trainers, normalizers, pre_tokenizers, decoders
from tokenizers.models import BPE
from tokenizers.normalizers import NFD, StripAccents

# Define the tokenizer
occ_tokenizer = Tokenizer(BPE())

# Customize normalization and pre-tokenization if needed
occ_tokenizer.normalizer = normalizers.Sequence([
    NFD(),   # Normalize to decomposed unicode (e.g., é -> e +  ́)
])
occ_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Train the tokenizer on your Occitan sentences
trainer = trainers.BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
occ_tokenizer.train(files=["occitan_sentences_for_tokenizer.txt"], trainer=trainer)

from tokenizers.processors import TemplateProcessing
occ_tokenizer.post_processor = TemplateProcessing(
    single="[BOS] $A [EOS]",
    special_tokens=[("[BOS]", 1), ("[EOS]", 2)],
)

# Save the trained tokenizer
occ_tokenizer.save("occitan_tokenizer.json")

## test tokenizer

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace

# Load the tokenizer
occ_tokenizer = Tokenizer.from_file("occitan_tokenizer.json")

# Tokenize a sample Occitan sentence
sentence = occitan_sentences[1]
tokens = occ_tokenizer.encode(sentence)

# Print the tokens
print(sentence)
print(tokens.tokens)
print(tokens.ids)
# vocab size
print("Number of tokens in vocab:", occ_tokenizer.get_vocab_size())
print(occ_tokenizer.encode('').ids)

Que'u seguirà a quate òras deu vrèspe l'espectacle Contes d'un Doman Acabat de Gric de Prat.
['[BOS]', 'Que', "'", 'u', 'seguirà', 'a', 'quate', 'òras', 'deu', 'vrèspe', 'l', "'", 'espectacle', 'Contes', 'd', "'", 'un', 'Doman', 'Aca', 'bat', 'de', 'Gric', 'de', 'Prat', '.', '[EOS]']
[1, 276, 11, 88, 8370, 68, 1482, 1016, 194, 1719, 79, 11, 1199, 2688, 71, 11, 165, 10442, 1979, 695, 138, 5094, 138, 4590, 18, 2]
Number of tokens in vocab: 30000
[1, 2]


## instantiate french tokenizer

In [None]:
from transformers import AutoTokenizer

# Load the pre-trained French tokenizer
fr_tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-french-europeana-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/227k [00:00<?, ?B/s]

In [None]:
# Tokenize a French sentence
french_sentence = "la proposition de ce chose préfait possède quelque chose de puissant et indemontable"
tokens = fr_tokenizer.tokenize(french_sentence)

# Print the tokens
print(tokens, len(fr_tokenizer.get_vocab()))

['la', 'proposition', 'de', 'ce', 'chose', 'préf', '##ait', 'possède', 'quelque', 'chose', 'de', 'puissant', 'et', 'indem', '##ont', '##able'] 32000


# Transformer architecture

In [None]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(pos_encoding, dtype=tf.float32)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x


In [None]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [None]:
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

In [None]:
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [None]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [None]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x)
    return x


In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [None]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [None]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x

In [None]:
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    context, x  = inputs

    context = self.encoder(context)  # (batch_size, context_len, d_model)

    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits

# Testing Architecture on a single example

## Instantiating model and loading all example sentences

In [None]:
# instantiating model
num_layers = 4
d_model = 64 # must correspond to output of tokenizer
dff = 512
num_heads = 8
dropout_rate = 0.1

transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=len(fr_tokenizer.get_vocab()),
    target_vocab_size=occ_tokenizer.get_vocab_size(),
    dropout_rate=dropout_rate)

In [None]:
# "gascon", "lemosin" or "lengadoc"
#dialect_df = df[df.iloc[:, 2] == 'gascon']

occitan_sentences = dialect_df.iloc[:, 0].tolist()
french_sentences = dialect_df.iloc[:, 1].tolist()

occitan_sentences = np.array(occitan_sentences)
french_sentences = np.array(french_sentences)
print(occitan_sentences.shape, french_sentences.shape)

(23325,) (23325,)


In [None]:
occitan_sentences = np.array(df.iloc[:, 0].tolist())
french_sentences = np.array(df.iloc[:, 1].tolist())
print(occitan_sentences.shape, french_sentences.shape)

(40969,) (40969,)


## Passing an example through the model

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Choose an index
index = 0  # You can choose any index you want to test
PADDING = 64

# Select a single pair of sentences
occitan_sentence = occitan_sentences[index]
french_sentence = french_sentences[index]

# Tokenize the sentences
occitan_token_ids = occ_tokenizer.encode(occitan_sentence).ids
french_token_ids = fr_tokenizer.encode(french_sentence)

# Reshape the inputs to add a batch dimension
occitan_token_ids = np.expand_dims(occitan_token_ids, axis=0)
french_token_ids = np.expand_dims(french_token_ids, axis=0)
# Add padding to the sequences
occitan_token_ids = pad_sequences(occitan_token_ids, maxlen=PADDING, padding='post')
french_token_ids = pad_sequences(french_token_ids, maxlen=PADDING, padding='post')

print(occitan_token_ids.shape, french_token_ids.shape)
print(occitan_token_ids, french_token_ids)

(1, 64) (1, 64)
[[   1  361   30 1880  194 1376  476   30  958   72 1006  138  885  529
   503   30  882   12 1057   13  977   30 5860   71   11  370  272   11
    81  268  239    2    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]] [[    2 21413   339    30    46 12046   212   378  4254  2875  2693  3043
     30   884    72    87 24360  1988 12999   283  8836    30  3256    12
   1612    13 12758    30 14076    71    11    37 19980   623   728  2464
    467     3     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]]


In [None]:
# input : ((batch_size, source_sequence_length), (batch_size, target_sequence_length))
print(occitan_token_ids.shape, french_token_ids.shape)
out = transformer((occitan_token_ids, french_token_ids))

(1, 64) (1, 64)


In [None]:
print(out.shape)

(1, 64, 30000)


In [None]:
attn_scores = transformer.decoder.dec_layers[-1].last_attn_scores
print(attn_scores.shape)  # (batch, heads, target_seq, input_seq)

(1, 8, 64, 64)


In [None]:
transformer.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           multiple                  2844160   
                                                                 
 decoder (Decoder)           multiple                  3247360   
                                                                 
 dense_16 (Dense)            multiple                  1950000   
                                                                 
Total params: 8041520 (30.68 MB)
Trainable params: 8041520 (30.68 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# Creating dataset for training

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Define a function to preprocess data into batches
def create_batches(occitan_sentences, french_sentences, occ_tokenizer, fr_tokenizer, batch_size, padding_length, validation_split=0.2):
    # Split data into training and validation sets
    occitan_train, occitan_val, french_train, french_val = train_test_split(occitan_sentences, french_sentences, test_size=validation_split, random_state=42)

    # Tokenize and pad sequences for training data
    train_dataset = _create_dataset(occitan_train, french_train, occ_tokenizer, fr_tokenizer, batch_size, padding_length)

    # Tokenize and pad sequences for validation data
    val_dataset = _create_dataset(occitan_val, french_val, occ_tokenizer, fr_tokenizer, batch_size, padding_length)

    return train_dataset, val_dataset

def _create_dataset(occitan_sentences, french_sentences, occ_tokenizer, fr_tokenizer, batch_size, padding_length):
    inputs_tokenized = []
    #french_target_tokenized = []
    occitan_target_tokenized = []

    # Tokenize and pad sequences
    for occitan_sentence, french_sentence in tqdm(zip(occitan_sentences, french_sentences)):
        occitan_token_ids = occ_tokenizer.encode(occitan_sentence).ids
        french_token_ids = fr_tokenizer.encode(french_sentence)

        # Shift French sequences for labels
        #french_input = french_token_ids[:-1]
        #french_target = french_token_ids[1:]
        occitan_input = occitan_token_ids[:-1]
        occitan_target = occitan_token_ids[1:]

        french_token_ids = pad_sequences([french_token_ids], maxlen=padding_length, padding='post')[0]
        occitan_input = pad_sequences([occitan_input], maxlen=padding_length, padding='post')[0]
        occitan_target = pad_sequences([occitan_target], maxlen=padding_length, padding='post')[0]

        #inputs_tokenized.append((occitan_token_ids, french_input))
        #french_target_tokenized.append(french_target)
        inputs_tokenized.append((french_token_ids, occitan_input))
        occitan_target_tokenized.append(occitan_target)

    # Convert lists to numpy arrays
    french_inputs = np.array([pair[0] for pair in inputs_tokenized])
    occitan_inputs = np.array([pair[1] for pair in inputs_tokenized])
    occitan_targets = np.array(occitan_target_tokenized)

    # Create dataset
    dataset = tf.data.Dataset.from_tensor_slices(((french_inputs, occitan_inputs), occitan_targets))

    # Batch the dataset
    dataset = dataset.batch(batch_size, drop_remainder=True)

    return dataset

# CREATING DATASET
batch_size = 32
padding_length = 64
validation_split = 0.1  # 20% of the data will be used for validation

train_dataset, val_dataset = create_batches(occitan_sentences, french_sentences, occ_tokenizer, fr_tokenizer, batch_size, padding_length, validation_split)


2120it [00:00, 3377.00it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (787 > 512). Running this sequence through the model will result in indexing errors
36872it [00:15, 2427.85it/s]
4097it [00:06, 660.29it/s] 


In [None]:

# Iterate through batches of training dataset
for (batch, (inputs, labels)) in enumerate(train_dataset.take(1)):
    print(f"Training Batch {batch + 1}:")
    print("Occitan Inputs shape:", inputs[0])
    print("French Inputs shape:", inputs[1])
    print("French Labels shape:", labels.shape)
    break

# Iterate through batches of validation dataset
for (batch, (inputs, labels)) in enumerate(val_dataset.take(1)):
    print(f"Validation Batch {batch + 1}:")
    print("Occitan Inputs shape:", inputs[0].shape)
    print("French Inputs shape:", inputs[1].shape)
    print("French Labels shape:", labels.shape)
    break


Training Batch 1:
Occitan Inputs shape: tf.Tensor(
[[    2  2579  6013 ...     0     0     0]
 [    2 21413   339 ...     0     0     0]
 [    2   447 30962 ...     0     0     0]
 ...
 [    2   646   399 ...     0     0     0]
 [    2   667 10563 ...     0     0     0]
 [    2   528   401 ...     0     0     0]], shape=(32, 64), dtype=int32)
French Inputs shape: tf.Tensor(
[[    1 20077 12752 ...     0     0     0]
 [    1  3169    30 ...     0     0     0]
 [    1   230  7324 ...     0     0     0]
 ...
 [    1    41  4231 ...     0     0     0]
 [    1   276 17836 ...     0     0     0]
 [    1   197    11 ...     0     0     0]], shape=(32, 64), dtype=int32)
French Labels shape: (32, 64)
Validation Batch 1:
Occitan Inputs shape: (32, 64)
French Inputs shape: (32, 64)
French Labels shape: (32, 64)


# training


In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint_path = drivepath + "checkpoints/fr_occ_base/"
epochs_per_checkpoint = 2

# Define the ModelCheckpoint callback
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,  # Save only the model's weights
    save_freq='epoch',  # Save after each epoch
    period=epochs_per_checkpoint,  # Save every `epochs_per_checkpoint` epochs
    verbose=1  # Print messages about saving checkpoints
)





In [None]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy]
)

# Load the latest checkpoint if it exists
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
if latest_checkpoint is not None:
    transformer.load_weights(latest_checkpoint)
    print('Latest checkpoint loaded:', latest_checkpoint)
else:
    print('No checkpoint found, starting training from scratch.')

No checkpoint found, starting training from scratch.


In [None]:
transformer.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=len(train_dataset),
    validation_data=val_dataset,
    validation_steps=len(val_dataset),
    callbacks=[checkpoint_callback]
)

Epoch 1/10
Epoch 2/10
Epoch 2: saving model to /content/drive/MyDrive/Colab/occitan/checkpoints/fr_occ_base/
Epoch 3/10
Epoch 4/10
Epoch 4: saving model to /content/drive/MyDrive/Colab/occitan/checkpoints/fr_occ_base/
Epoch 5/10
Epoch 6/10
Epoch 6: saving model to /content/drive/MyDrive/Colab/occitan/checkpoints/fr_occ_base/
Epoch 7/10
Epoch 8/10
Epoch 8: saving model to /content/drive/MyDrive/Colab/occitan/checkpoints/fr_occ_base/
Epoch 9/10
Epoch 10/10
Epoch 10: saving model to /content/drive/MyDrive/Colab/occitan/checkpoints/fr_occ_base/


<keras.src.callbacks.History at 0x7d065019e3e0>

# Inference

In [None]:
token_ids = fr_tokenizer.encode('a')
decoded_text = fr_tokenizer.decode(token_ids, skip_special_tokens=True)
print(token_ids)
print(decoded_text)

[2, 68, 3]
a


In [None]:
a = occ_tokenizer.encode('').ids
print(a)

[1, 2]


In [None]:
class Translator(tf.Module):
  def __init__(self, occ_tokenizer, fr_tokenizer, transformer):
    self.fr_tokenizer = fr_tokenizer
    self.occ_tokenizer = occ_tokenizer
    self.transformer = transformer

  def __call__(self, sentence, max_length=64):
    # The input sentence is Portuguese, hence adding the `[START]` and `[END]` tokens.
    # assert isinstance(sentence, tf.Tensor)
    #if len(sentence.shape) == 0:
    #  sentence = sentence[tf.newaxis]

    # sentence = self.tokenizers.pt.tokenize(sentence).to_tensor()
    # print(sentence, self.occ_tokenizer.encode(sentence))
    sentence = self.fr_tokenizer.encode(sentence)
    # print(len(sentence))
    sentence = np.expand_dims(sentence, axis=0)
    sentence = pad_sequences(sentence, maxlen=max_length, padding='post')
    # print(sentence)

    encoder_input = sentence
    print(encoder_input.shape)

    # As the output language is English, initialize the output with the
    # English `[START]` token.
    start_end = self.occ_tokenizer.encode('').ids
    print(start_end)
    # print(start_end)
    start = start_end[0]
    end = start_end[1]

    # `tf.TensorArray` is required here (instead of a Python list), so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())
      output = tf.expand_dims(output, axis=0)

      predictions = self.transformer((encoder_input, output), training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)
      #print(predictions)
      print(predicted_id)

      # Concatenate the `predicted_id` to the output which is given to the
      # decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0, 0])

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`.
    print(output.shape, output)
    text = self.occ_tokenizer.decode(output, skip_special_tokens=True)  # Shape: `()`.

    tokens = output

    # `tf.function` prevents us from using the attention_weights that were
    # calculated on the last iteration of the loop.
    # So, recalculate them outside the loop.
    output = tf.transpose(output_array.stack())
    output = tf.expand_dims(output, axis=0)
    self.transformer((encoder_input, output), training=False)
    attention_weights = self.transformer.decoder.last_attn_scores

    return text, tokens, attention_weights

In [None]:
translator = Translator(occ_tokenizer, fr_tokenizer, transformer)

In [None]:
# sentence = 'Títol : Jornadas deu Patrimòni Quan : 16 e 17 de seteme 2017 On : Bordèu (33) Adreça : Musèu d\'Aquitània Tà\'n saber mei'
# sentence = 'Bonjour à tous les amis de mon pays'
# sentence = 'Jornadas deu Patrimòni Quan'
sentence = 'Elle sera suivie à 16 h du spectacle Contes d\'un Doman Acabat de Gric de Prat.'
ground_truth = 'Que\'u seguirà a quate òras deu vrèspe l\'espectacle Contes d\'un Doman Acabat de Gric de Prat.'

translated_text, translated_tokens, attn_weights = translator(sentence)

(1, 64)
[1, 2]
tf.Tensor([[276]], shape=(1, 1), dtype=int64)
tf.Tensor([[629]], shape=(1, 1), dtype=int64)
tf.Tensor([[1641]], shape=(1, 1), dtype=int64)
tf.Tensor([[68]], shape=(1, 1), dtype=int64)
tf.Tensor([[958]], shape=(1, 1), dtype=int64)
tf.Tensor([[75]], shape=(1, 1), dtype=int64)
tf.Tensor([[194]], shape=(1, 1), dtype=int64)
tf.Tensor([[1199]], shape=(1, 1), dtype=int64)
tf.Tensor([[2688]], shape=(1, 1), dtype=int64)
tf.Tensor([[71]], shape=(1, 1), dtype=int64)
tf.Tensor([[11]], shape=(1, 1), dtype=int64)
tf.Tensor([[165]], shape=(1, 1), dtype=int64)
tf.Tensor([[1347]], shape=(1, 1), dtype=int64)
tf.Tensor([[138]], shape=(1, 1), dtype=int64)
tf.Tensor([[5094]], shape=(1, 1), dtype=int64)
tf.Tensor([[138]], shape=(1, 1), dtype=int64)
tf.Tensor([[4590]], shape=(1, 1), dtype=int64)
tf.Tensor([[18]], shape=(1, 1), dtype=int64)
tf.Tensor([[2]], shape=(1, 1), dtype=int64)
(20,) tf.Tensor(
[   1  276  629 1641   68  958   75  194 1199 2688   71   11  165 1347
  138 5094  138 4590   1

In [None]:
def print_translation(sentence, translated_text, tokens, ground_truth):
  print(f'{"Input:":15s}: {sentence}')
  print(f'{"Translation":15s}: {translated_text}')
  print(f'{"output tokens":15s}: {tokens}')
  print(f'{"Ground truth":15s}: {ground_truth}')
print_translation(sentence, translated_text, translated_tokens, ground_truth)

Input:         : Elle sera suivie à 16 h du spectacle Contes d'un Doman Acabat de Gric de Prat.
Translation    : Que serà seguida a 16 h deu espectacle Contes d ' un utís de Gric de Prat .
output tokens  : [   1  276  629 1641   68  958   75  194 1199 2688   71   11  165 1347
  138 5094  138 4590   18    2]
Ground truth   : Que'u seguirà a quate òras deu vrèspe l'espectacle Contes d'un Doman Acabat de Gric de Prat.


# Attention visualization

In [None]:
print(attn_weights.shape)

(1, 8, 20, 64)


In [None]:
!pip install bertviz

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/157.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from bertviz)
  Downloading boto3-1.34.67-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0->bertviz)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0->bertviz)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
print(sentence)
sentence_tokens = occ_tokenizer.encode(sentence).ids
print(sentence_tokens, len(sentence_tokens))
print(attn_weights.shape)
print(len(attn_weights.shape))
print(len(tokens))

#enc_tokens = occ_tokenizer.encode(sentence).tokens
#dec_tokens = fr_tokenizer.tokenize(translated_text)
enc_tokens = fr_tokenizer.tokenize(sentence)
enc_tokens = ['[BOS]'] + enc_tokens + ['[EOS]']
dec_tokens = occ_tokenizer.encode(translated_text).tokens
dec_tokens = dec_tokens[1:-2]
enc_tokens_padded = enc_tokens + ['pad' for i in range(64 - len(enc_tokens))]
dec_tokens_padded = dec_tokens + ['pad' for j in range(attn_weights.shape[2] - len(dec_tokens))]

print(len(enc_tokens_padded), len(dec_tokens_padded))
print(enc_tokens_padded)

Elle sera suivie à 16 h du spectacle Contes d'un Doman Acabat de Gric de Prat.
[1, 18459, 4526, 386, 76, 3286, 238, 958, 75, 274, 9572, 3913, 2688, 71, 11, 165, 10442, 1979, 695, 138, 5094, 138, 4590, 18, 2] 25
(1, 8, 20, 64)
4
16
64 20
['[BOS]', 'Elle', 'sera', 'suivie', 'à', '16', 'h', 'du', 'spectacle', 'Contes', 'd', "'", 'un', 'Dom', '##an', 'Aca', '##bat', 'de', 'Gri', '##c', 'de', 'Prat', '.', '[EOS]', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad']


In [None]:
from bertviz import head_view, model_view
attn_weights_formatted = tf.expand_dims(attn_weights, axis=0) # to mimic list of layers

import torch
attn_weights_torch = torch.tensor(attn_weights_formatted.numpy())

head_view(cross_attention=attn_weights_torch, encoder_tokens=enc_tokens_padded, decoder_tokens=dec_tokens_padded)

<IPython.core.display.Javascript object>