In [None]:
# Mount Google Drive (Colab only) so that the paths under /content/drive used
# by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from keras import callbacks
from nltk import word_tokenize
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import TextVectorization
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
# The cell only imports names *from* nltk, which does not bind the name `nltk`
# itself; import the package here so this call works on a fresh kernel.
import nltk

# Tokenizer data needed by nltk.word_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Location of the parallel corpus on the mounted Drive. The next cell reads it
# as one "english<TAB>arabic" pair per line.
text_file = pathlib.Path('/content/drive/MyDrive/NLP Project/Dataset/Txt/ara_eng.txt')

In [None]:
# Parse the tab-separated corpus into (english, arabic) tuples.
# The encoding is stated explicitly: the file holds Arabic text and the
# platform default encoding is not guaranteed to be UTF-8 everywhere.
with open(text_file, encoding="utf-8") as f:
    # Drop empty lines instead of unconditionally discarding the last element,
    # so the final pair is kept when the file has no trailing newline.
    lines = [line for line in f.read().split("\n") if line]

text_pairs = []
for line in lines:
    # Exactly two tab-separated fields are expected; anything else raises.
    eng, ar = line.split("\t")
    # Wrap the target sentence in explicit start/end markers for the decoder.
    ar = "[start] " + ar + " [end]"
    text_pairs.append((eng, ar))

Here's what our sentence pairs look like:

In [None]:
# Print a handful of randomly chosen (english, arabic) pairs as a sanity check.
num_examples = 5
for _ in range(num_examples):
    example_pair = random.choice(text_pairs)
    print(example_pair)

('Tom and Mary have a small farm.', '[start] لتوم وماري مزرعة صغيرة. [end]')
('but before we go into bolivar s time in peru let s get to know his personality journalist alvaro vargas llosa writes in an article that bolivar was a better warlord than all the other latin american leaders from that time but that warlordism itself is still the heart of the latin problem.', '[start] قبل ان نعرف ما فعله بوليفار اثناء بقايه في البيرو دعونا نتعرف على شخصيته يكتب الصحفي الفارو فاراكاس يوسا في احدى مقالاته ان بوليفار كان سيد حرب وافضل من جميع قادة امريكا اللاتينية في ذلك الزمن لكن كونه سيد حرب كان قلب المشكة اللاتينية [end]')
('humans of costa rica a page created in july of has more than likes.', '[start] اناس من كوستاريكا صفحة تم انشاوها في شهر يوليو تموز من عام لديها اكثر من معجب [end]')
('This play has three acts.', '[start] لهذه المسرحية ثلاثة فصول. [end]')
('on july the hackers were hacked resulting in the release of a gb trove of documents demonstrating among other things that hacking team 

Now, let's split the sentence pairs into a training set, a validation set,
and a test set.

In [None]:
# Reproducible 70/15/15 split.
# A dedicated seeded RNG makes the split identical on every run, and shuffling
# a copy keeps this cell idempotent (re-running it does not reshuffle an
# already shuffled list). `text_pairs` itself stays in file order.
SPLIT_SEED = 42
shuffled_pairs = list(text_pairs)
random.Random(SPLIT_SEED).shuffle(shuffled_pairs)

num_val_samples = int(0.15 * len(shuffled_pairs))
# Validation and test sets have the same size; the remainder is for training.
num_train_samples = len(shuffled_pairs) - 2 * num_val_samples
train_pairs = shuffled_pairs[:num_train_samples]
val_pairs = shuffled_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = shuffled_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

24638 total pairs
17248 training pairs
3695 validation pairs
3695 test pairs


In [None]:
# Characters deleted from the Arabic text during standardization: ASCII
# punctuation, the Arabic question mark and the Arabic-Indic digits.
# Square brackets are filtered out of the set so that the "[start]" / "[end]"
# markers are not stripped.
_strip_candidates = string.punctuation + "؟" + "[٠١٢٣٤٥٦٧٨٩]"
strip_chars = "".join(ch for ch in _strip_candidates if ch not in ("[", "]"))

# Vectorization and batching hyper-parameters.
vocab_size, sequence_length, batch_size = 20000, 150, 128


def custom_standardization(input_string):
    """Lowercase `input_string` and delete every character in `strip_chars`.

    Used as the `standardize` callable of the Arabic TextVectorization layer.
    """
    # Character class matching any single character to be removed.
    removal_pattern = "[%s]" % re.escape(strip_chars)
    lowered = tf.strings.lower(input_string)
    cleaned = tf.strings.regex_replace(lowered, removal_pattern, "")
    return cleaned


# Source-side vectorizer; no `standardize` argument, so the layer's default
# standardization is used.
eng_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Target-side vectorizer; one extra position because the sequence is later
# split into decoder inputs (all but last) and targets (all but first).
ar_vectorization = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
)

# Vocabularies are learned from the training split only.
train_eng_texts = [eng_text for eng_text, _ in train_pairs]
train_ar_texts = [ar_text for _, ar_text in train_pairs]
eng_vectorization.adapt(train_eng_texts)
ar_vectorization.adapt(train_ar_texts)

In [None]:
def format_dataset(eng, ar):
    """Vectorize a batch of sentence pairs into model inputs and targets.

    Returns a `(features, targets)` tuple. `features` holds the encoder token
    ids and the decoder token ids without their last position; `targets` is
    the decoder sequence shifted one step ahead (without its first position).
    """
    eng_ids = eng_vectorization(eng)
    ar_ids = ar_vectorization(ar)
    decoder_in = ar_ids[:, :-1]
    targets = ar_ids[:, 1:]
    features = {"encoder_inputs": eng_ids, "decoder_inputs": decoder_in}
    return (features, targets)


def make_dataset(pairs):
    """Build a batched, vectorized `tf.data` pipeline from sentence pairs.

    Args:
        pairs: non-empty sequence of `(english, arabic)` string tuples.

    Returns:
        A `tf.data.Dataset` yielding `(features, targets)` batches as produced
        by `format_dataset`.
    """
    eng_texts, ar_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(ar_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    # Cache the deterministic (vectorized) batches first and shuffle afterwards.
    # Caching after the shuffle would store the first epoch's order and replay
    # that same order in every later epoch. Because batching happens before the
    # shuffle, whole batches are reordered, not individual sentences.
    return dataset.cache().shuffle(2048).prefetch(16)


# Batched, vectorized pipelines for the training and validation splits.
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [None]:
# Peek at a single batch to confirm the tensor shapes fed to the model.
for inputs, targets in train_ds.take(1):
    encoder_shape = inputs["encoder_inputs"].shape
    decoder_shape = inputs["decoder_inputs"].shape
    target_shape = targets.shape
    print(f'inputs["encoder_inputs"].shape: {encoder_shape}')
    print(f'inputs["decoder_inputs"].shape: {decoder_shape}')
    print(f"targets.shape: {target_shape}")

inputs["encoder_inputs"].shape: (128, 150)
inputs["decoder_inputs"].shape: (128, 150)
targets.shape: (128, 150)


L7ad hena ("up to here"): data preparation ends at this point; the model definition follows.

In [None]:
class TransformerEncoder(layers.Layer):
    """Single Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer feed-forward
    projection, each wrapped in a residual connection and layer normalization.

    Args:
        embed_dim: size of the input/output feature dimension.
        dense_dim: number of units in the hidden feed-forward layer.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Run the block on `inputs`; `mask` is an optional per-token padding mask."""
        # Default to "no masking" so the layer also works when no mask is
        # propagated (previously `padding_mask` was unbound in that case).
        padding_mask = None
        if mask is not None:
            # Add a query axis so the mask broadcasts over all query positions.
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "dense_dim": self.dense_dim,
                "num_heads": self.num_heads,
            }
        )
        return config


class PositionalEmbedding(layers.Layer):
    """Token embedding plus a learned position embedding.

    Args:
        sequence_length: number of rows in the position-embedding table; inputs
            longer than this index past the end of that table.
        vocab_size: number of rows in the token-embedding table.
        embed_dim: size of each embedding vector.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        """Return token embeddings summed with the embeddings of positions 0..length-1."""
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        # Position embeddings broadcast across the batch dimension.
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Mark token id 0 as padding so downstream layers can ignore it."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config


class TransformerDecoder(layers.Layer):
    """Single Transformer decoder block.

    Applies causal self-attention over the target sequence, cross-attention
    over the encoder outputs and a two-layer feed-forward projection, each
    wrapped in a residual connection and layer normalization.

    Args:
        embed_dim: size of the input/output feature dimension.
        latent_dim: number of units in the hidden feed-forward layer.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None, encoder_padding_mask=None):
        """Run the block.

        Args:
            inputs: embedded target sequence.
            encoder_outputs: encoder output sequence attended to by the
                cross-attention.
            mask: optional padding mask of the target sequence.
            encoder_padding_mask: optional padding mask of the source sequence
                (one entry per encoder position). When omitted, the
                cross-attention attends to every encoder position.
        """
        causal_mask = self.get_causal_attention_mask(inputs)

        # Self-attention: each target position sees only itself and earlier
        # positions, and never padded target positions.
        self_attention_mask = causal_mask
        if mask is not None:
            target_padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            self_attention_mask = tf.minimum(target_padding_mask, causal_mask)

        # Cross-attention: the key axis indexes *source* positions, so the
        # target-side causal/padding mask must not be applied here (it would
        # restrict target step i to source positions <= i). Only a source
        # padding mask is meaningful.
        cross_attention_mask = None
        if encoder_padding_mask is not None:
            cross_attention_mask = tf.cast(
                encoder_padding_mask[:, tf.newaxis, :], dtype="int32"
            )

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=self_attention_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=cross_attention_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        """Return a (batch, seq, seq) int32 lower-triangular mask for `inputs`.

        Entry [b, i, j] is 1 when query position i may attend to key position j
        (that is, j <= i) and 0 otherwise.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single (1, seq, seq) mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created on load."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "latent_dim": self.latent_dim,
                "num_heads": self.num_heads,
            }
        )
        return config

In [None]:
# Model hyper-parameters.
embed_dim = 256  # width of token/position embeddings and of each block's output
latent_dim = 2048  # hidden units of the feed-forward layer inside each block
num_heads = 8  # attention heads per MultiHeadAttention layer

# Encoder: token ids -> positional embedding -> one encoder block.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

# Decoder: built as its own model that takes the target token ids plus an
# already-encoded source sequence, and outputs a softmax over the vocabulary
# at each position.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

# End-to-end model: feed the encoder's output into the decoder model.
# Note that `decoder_outputs` is rebound here to the end-to-end output tensor.
decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [None]:
# Print the layer-by-layer structure and parameter counts of the full model.
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   5158400     ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   3155456     ['positional_embedding[

In [None]:

# Stop once validation accuracy has not improved for 5 epochs.
# `restore_best_weights=True` rolls the in-memory model back to its best epoch
# when training stops; without it the model keeps the weights of the last
# (worse) epoch, and any evaluation that uses `transformer` directly would
# score those instead of the best ones.
early_stopping = callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    verbose=True,
    restore_best_weights=True,
)
# Persist the best model (by validation accuracy) to Drive.
checkpoint = callbacks.ModelCheckpoint(
    '/content/drive/MyDrive/NLP Project/FinalTransformer.h5',
    monitor='val_accuracy',
    verbose=True,
    save_best_only=True,
)

In [None]:
# Upper bound on training epochs; the early-stopping callback may end the run sooner.
epochs = 40  # This should be at least 30 for convergence

# Targets are integer token ids, hence the sparse cross-entropy loss.
transformer.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
transformer.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[checkpoint, early_stopping],
)

Epoch 1/40
Epoch 1: val_accuracy improved from -inf to 0.23924, saving model to /content/drive/MyDrive/NLP Project/FinalTransformer.h5
Epoch 2/40
Epoch 2: val_accuracy improved from 0.23924 to 0.25785, saving model to /content/drive/MyDrive/NLP Project/FinalTransformer.h5
Epoch 3/40
Epoch 3: val_accuracy improved from 0.25785 to 0.26933, saving model to /content/drive/MyDrive/NLP Project/FinalTransformer.h5
Epoch 4/40
Epoch 4: val_accuracy improved from 0.26933 to 0.27448, saving model to /content/drive/MyDrive/NLP Project/FinalTransformer.h5
Epoch 5/40
Epoch 5: val_accuracy did not improve from 0.27448
Epoch 6/40
Epoch 6: val_accuracy did not improve from 0.27448
Epoch 7/40
Epoch 7: val_accuracy did not improve from 0.27448
Epoch 8/40
Epoch 8: val_accuracy did not improve from 0.27448
Epoch 9/40
Epoch 9: val_accuracy did not improve from 0.27448
Epoch 9: early stopping


<keras.callbacks.History at 0x7f3c71630790>

In [None]:

# Maximum number of tokens generated per translation.
max_decoded_sentence_length = 20

# Map each integer token id back to its Arabic vocabulary string.
ar_vocab = ar_vectorization.get_vocabulary()
ar_index_lookup = dict(enumerate(ar_vocab))

def decode_sequence(input_sentence, reference_translation):
    """Greedily translate `input_sentence` and score it against a reference.

    Args:
        input_sentence: English source sentence.
        reference_translation: reference Arabic sentence; it may still carry
            the "[start]" / "[end]" markers, which are removed before scoring.

    Returns:
        A `(decoded_tokens, bleu_score, bleu_score1)` tuple: the generated
        tokens (without start/end markers), BLEU with NLTK's default weights,
        and unigram-only BLEU. Both scores are rounded to 4 decimals.
    """
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_tokens = []
    for i in range(max_decoded_sentence_length):
        # Re-feed everything generated so far, prefixed with the start marker.
        decoded_sentence = " ".join(["[start]"] + decoded_tokens)
        tokenized_target_sentence = ar_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        # Greedy decoding: take the most probable token at the current step.
        sampled_token_index = int(np.argmax(predictions[0, i, :]))
        sampled_token = ar_index_lookup[sampled_token_index]
        if sampled_token == "[end]":
            break
        decoded_tokens.append(sampled_token)

    # Score only real words. The markers are removed as whole tokens from the
    # reference; slicing characters off the decoded string (as done before)
    # left "start", "]", "[" and "end" in the hypothesis and distorted BLEU.
    # The hypothesis is kept as vocabulary tokens so "[UNK]" stays one token
    # instead of being split into "[", "UNK", "]".
    reference_text = reference_translation.replace("[start]", " ").replace("[end]", " ")
    reference_tokens = word_tokenize(reference_text)
    if not decoded_tokens or not reference_tokens:
        # Nothing to compare: report zero instead of scoring empty sequences.
        return decoded_tokens, 0.0, 0.0

    smoothing = SmoothingFunction().method1
    bleu_score = round(
        sentence_bleu([reference_tokens], decoded_tokens, smoothing_function=smoothing),
        4,
    )
    bleu_score1 = round(
        sentence_bleu(
            [reference_tokens],
            decoded_tokens,
            weights=(1, 0, 0, 0),
            smoothing_function=smoothing,
        ),
        4,
    )
    return decoded_tokens, bleu_score, bleu_score1

# Number of held-out sentence pairs to translate and score.
NUM_EVAL_SAMPLES = 50

test_eng_texts = [pair[0] for pair in test_pairs]

# Sample source and reference together. Looking the reference up afterwards
# with list.index() is O(n) per sample and, for duplicated English sentences,
# always returns the first match's translation. Sampling without replacement
# also avoids scoring the same pair twice.
eval_pairs = random.sample(test_pairs, min(NUM_EVAL_SAMPLES, len(test_pairs)))

total_bleu_score = 0.0
total_bleu1_score = 0.0
for input_sentence, reference_translation in eval_pairs:
    decoded, bleu_score, bleu1_score = decode_sequence(input_sentence, reference_translation)
    total_bleu_score += bleu_score
    total_bleu1_score += bleu1_score
    print(f"Input sentence: {input_sentence}")
    print(f"Reference translation: {reference_translation}")
    print(f"Decoded sentence: {' '.join(decoded)}\n")
    print(f"BLEU score: {bleu_score}")
    print(f"BLEU 1-gram score: {bleu1_score}\n")

# Average over the pairs actually evaluated (guarding against an empty test set).
num_evaluated = max(len(eval_pairs), 1)
average_bleu_score = total_bleu_score / num_evaluated
average_bleu1_score = total_bleu1_score / num_evaluated
print(f"Average BLEU score: {round(average_bleu_score,4)}")
print(f"Average BLEU 1-gram score: {round(average_bleu1_score,4)}")


Input sentence: join us for a gvmeetup in tunis on november global voices.
Reference translation: [start] الاعلان عن ملتقى الاصوات العالمية في تونس يوم نوفمبر الاصوات العالمية [end]
Decoded sentence: start ] الكويت يوم من سبتمبر ايلول في الاصوات العالمية [ end

BLEU score: 0.1786
BLEU 1-gram score: 0.4395

Input sentence: Pass me the butter, please.
Reference translation: [start] من فضلك ناولني الزبدة. [end]
Decoded sentence: start ] [ UNK ] إلى هنا [ end

BLEU score: 0.0636
BLEU 1-gram score: 0.5338

Input sentence: erkan s field diary discusses turkey s lifting of the headscarf hijab ban in his country.
Reference translation: [start] مدونة حقول اركان انكليزي تناقش رفع الحظر عن الحجاب في تركيا [end]
Decoded sentence: start ] [ UNK ] [ UNK ] [ UNK ] في لبنان من الاردن رسالة الى الولايات المتحدة الامريكية [ UNK ] في مدينة [ UNK ] [ end

BLEU score: 0.0215
BLEU 1-gram score: 0.2333

Input sentence: The boy ran away.
Reference translation: [start] هرب الولد. [end]
Decoded sentence: start 