In [1]:
import numpy
import random
import json

import re
import string
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import load_model

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
title = 'eng_niv_kal'

project_folder = '/content/drive/MyDrive/ENG-KAL_translation'
# version = 'niv'
# eng_folder = 'English texts/' + version
# kal_folder = 'Kalenjin texts'
eng_kal_txt = project_folder + '/engnivkal_202306081459.txt'

In [4]:
with open(eng_kal_txt, 'r') as fjson:
    data = json.load(fjson)
len_data = len(data)

In [5]:
data[0]

['in the beginning god created the heavens and the earth',
 '[start] eng  taunet  ko ki toi kamuktaindet koyai kipsengwet ak ng wony [end]']

In [6]:
train_len = round(.7 * len(data))
val_len = round(.2 * len(data))

random.shuffle(data)
train_pairs = data[:train_len]
val_pairs = data[train_len:train_len + val_len]
test_pairs = data[train_len + val_len:]
print(len(train_pairs), len(val_pairs), len(test_pairs))
print(train_pairs[15])
print(val_pairs[10])
print(test_pairs[14])

21275 6079 3039
['let me tell you a riddle   samson said to them   if you can give me the answer within the seven days of the feast  i will give you thirty linen garments and thirty sets of clothes', '[start] ki mwachi  samson icheek koleenji   ogany amwaiwok tangochet  ngot omuuchi omwaiwo eng  peetuusyek tisap che po igoorto  ak onai kiit ne ibooru  agoonok anyun ngoroik sosom che po katanit ak ngoroik sosom che kiwalawali [end]']
['is it not to share your food with the hungryand to provide the poor wanderer with shelter—when you see the naked  to clothe them and not to turn away from your own flesh and blood', '[start] tos ma kebae amitwogiguk che amei rubeet  ak imutu kibananook che kigetimda  kobwa koong ung   ye igeer ne mi aach ngor  ituch  ameung egei piik che po oreng ung [end]']
['whoever robs their father and drives out their motheris a child who brings shame and disgrace', '[start] ibu kalilanet ak teweernateet weeriit ne iseeri tuguugap kwandanyi ak koon kamennyi [end]']


Define the train pairs

In [7]:
strip_chars = string.punctuation
strip_chars = strip_chars.replace('(', '').replace(')', '')

# def custom_standardization(input_string):
#     lowercase = tf.strings.lower(input_string)
#     return tf.strings.regex_replace(
#         lowercase, f'[{re.escape(strip_chars)}]', ''
#     )

vocab_size = 15000
sequence_length = 20

source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length + 1,
    standardize='lower_and_strip_punctuation'
)

train_eng_texts = [pair[0] for pair in train_pairs]
train_kal_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_eng_texts)
target_vectorization.adapt(train_kal_texts)

Prepare the datasets for the translation task

In [8]:
batch_size = 32

def format_dataset(eng, kal):
    eng = source_vectorization(eng)
    kal = target_vectorization(kal)
    return(
        {
            'english': eng,
            'kalenjin': kal[:, :-1]
        }, kal[:, 1:]
    )

def make_dataset(pairs):
    eng_texts, kal_texts = zip(*pairs)
    eng_texts, kal_texts = list(eng_texts), list(kal_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, kal_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.shuffle(3).prefetch(1).cache()

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [9]:
for inputs, targets, in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['kalenjin'].shape: {inputs['kalenjin'].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (32, 20)
inputs['kalenjin'].shape: (32, 20)
targets.shape: (32, 20)


TRANSFORMER NETWORK

In [10]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim                          # Size of the input token vectors
        self.dense_dim = dense_dim                          # Size of the inner dense layer
        self.num_heads = num_heads                          # Number of attention heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
             layers.Dense(dense_dim, activation='relu'),
             layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):                      # Computation comes here
        if mask is not None:
            mask = mask[:, tf.newaxis, :]                   # Expand the mask to 3D
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):                                   # Implement serialization so the model can be saved
        config = super().get_config()
        config.update(
            {
                'embed_dim': self.embed_dim,
                'num_heads': self.num_heads,
                'dense_dim': self.dense_dim             
            }
        )
        return config

POSITIONAL EMBEDDING

In [11]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(                   # Prepare and Embedding layer for the token indices
            input_dim=input_dim, output_dim=output_dim
        )
        self.position_embeddings = layers.Embedding(                # Prepare and Embedding layer for the token positions
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions                 # Add both embedding vectors together

    def compute_mask(self, inputs, mask=None):                      # Generate a mask so we can ignore padding 0s in the inputs
        return tf.math.not_equal(inputs, 0)

    def get_config(self):                                           # Implement serialization so we can save the model
        config = super().get_config()
        config.update(
            {
                'output_dim': self.output_dim,
                'sequence_length': self.sequence_length,
                'input_dim': self.input_dim
            }
        )
        return config

DECODER NETWORK

In [12]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
             layers.Dense(dense_dim, activation='relu'),
             layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.support_masking = True
    
    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype='int32')
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0
        )
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype='int32'
            )
            padding_mask = tf.minimum(padding_mask, causal_mask)
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask
        )
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2
        )
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                'embed_dim': self.embed_dim,
                'num_heads': self.num_heads,
                'dense_dim': self.dense_dim
            }
        )
        return config

PUTTING ALL TOGETHER

In [38]:
embed_dim = 256
dense_dim = 2048
num_heads = 18

encoder_inputs = keras.Input(shape=(None,), dtype='int64', name='english')
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

decoder_inputs = keras.Input(shape=(None,), dtype='int64', name='kalenjin')
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation='softmax')(x)

transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

Compile the Transformer Network

In [39]:
import datetime

In [40]:
now = datetime.datetime.now()
running_time = now.strftime("%Y%m%d%H%M")

In [41]:
transformer.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics='accuracy'
)

callbacks = keras.callbacks.ModelCheckpoint(f'{project_folder}/models/eng_to_kal_transformer_{running_time}.keras', save_best_only=True)

In [42]:
epochs = 5
transformer.fit(train_ds,
                epochs=epochs,
                validation_data=val_ds,
                callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fde583cc790>

Test translation accuracy

In [30]:
len(kal_vocab)

15000

In [43]:
kal_vocab = target_vectorization.get_vocabulary()
kal_index_lookup = dict(zip(range(len(kal_vocab)), kal_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = '[start]'
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization(
            [decoded_sentence]
        )[:, :-1]
        predictions = transformer(
            [tokenized_input_sentence, tokenized_target_sentence]
        )
        sampled_token_index = numpy.argmax(predictions[0, i, :])
        sampled_token = kal_index_lookup[sampled_token_index]
        decoded_sentence += ' ' + sampled_token
        if sampled_token == '[end]':
            break
    return decoded_sentence

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(6):
    input_sentence = random.choice(test_eng_texts)
    print('*******************************************************************')
    print(input_sentence)
    print(decode_sequence(input_sentence))

*******************************************************************
each of us should please our neighbors for their good  to build them up
[start] ak ki [UNK] chii age tugul eng icheek kole [UNK] end         
*******************************************************************
you are my war club my weapon for battle—with you i shatter nations with you i destroy kingdoms
[start] ki [UNK] anyun ak [UNK] ak [UNK] amu ki [UNK] ak [UNK] end       
*******************************************************************
‘so do not be afraid  jacob my servant do not be dismayed  israel ’declares the lord ‘i will surely save you out of a distant place your descendants from the land of their exile jacob will again have peace and security and no one will make him afraid
[start] amu ma uu nooto kamuktaindet ne toroor jakobo ak jakobo ak ki le israel amu ma [UNK] israel end 
*******************************************************************
remove your scourge from me i am overcome by the blow of you

Save the training data and metadata for later use

In [46]:
test_sentences = [
'where are you going',
'come home today',
'what is your name',
'how much love',
'God is calling you to himself'
 ]

test_sentences = ['[start] ' + item + ' [end]' for item in test_sentences]

In [47]:
for i in range(len(test_sentences)):
    input_sentence = test_sentences[i]
    print('*******************************************************************')
    print(input_sentence)
    print(decode_sequence(input_sentence))

*******************************************************************
[start] where are you going [end]
[start] [UNK] end                  
*******************************************************************
[start] come home today [end]
[start] [UNK] anyun daudi ak [UNK] end              
*******************************************************************
[start] what is your name [end]
[start] [UNK] end                  
*******************************************************************
[start] how much love [end]
[start] [UNK] [UNK] [UNK] end                
*******************************************************************
[start] God is calling you to himself [end]
[start] [UNK] kamuktaindet end                 


In [None]:
hist = transformer.history.history

time_and_date = running_time
number_of_text_pairs =  len(text_pairs)
training_loss = hist['loss']
training_accuracy = hist['accuracy']
val_loss = hist['val_loss']
val_accuracy = hist['val_accuracy']

train_history = {
    'time_and_date': time_and_date,
    'number_of_text_pairs': number_of_text_pairs,
    'epochs': epochs,
    'training_loss': training_loss,
    'training_accuracy': training_accuracy,
    'val_loss': val_loss, 
    'val_accuracy': val_accuracy
}

In [None]:
import json

In [None]:
project_folder = '/content/drive/MyDrive/Colab Notebooks/TensorFlow for Beginners/Natural Language Processing/ENG-KAL_translation/train_history'
history_file_name = title + '_' + f'train_history_{time_and_date}.json'
history_file = project_folder + '/' + history_file_name

with open(history_file, 'w') as file_object:  #open the file in write mode
 json.dump(train_history, file_object)

In [None]:
# # Uncoment this code to read the json file of interest. Rememeber to add the link

# with open(history_file, 'r') as f:
#   data = json.load(f)
# data

In [31]:
model_path = "/content/drive/MyDrive/ENG-KAL_translation/models/eng_to_kal_transformer_202306081320.keras"
model = load_model(model_path)

ValueError: ignored