In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# #############################################################
# # @Open A TSV file and store the content into a list of     #
# #############################################################
def open_tsv_return_list(filename= "./English-French.tsv", preview_length= 5):
    """
    Open a Tsv file and return a list -- 
    The return type is a list of strings mapping source language to target language
    eg:("English --> French")
    Params : 
            filename = the filename of the file to open
            preview_length = the length of the elements to see
    """
    with open(filename, "r", encoding="utf-8") as f:
        content = f.read().replace("\u202f", "").lower().encode("utf-8")

    content = content.decode()
    content = content.split("\n")
    print(content[:preview_length])
    return content

In [3]:
# #########################################################
# #@ Map the data and ignore the index of for each subset #
# #########################################################
def pair_value(content):
    """
    Take the return value of tsv pair values and return 
    a list in the right format.
    """
    trad = list()
    for sentences in content:
        pair = sentences.split("\t")
        pair_content = list()
        i = 1
        for sent in pair:
            if (i==1) or (i==3):
                pass
            else:
                pair_content.append(sent)
            i = i+1
        trad.append(pair_content)
    return trad


In [4]:
# ################################# 
# # @Clean the dataset            #
# #################################
def clean_pair(mapper) :
    sign = '!"#$%&()*+,./:;<=>?@[\\]_`{|}~'
    table = str.maketrans('', '', sign)

    clean_doc = list()

    for pair_sent in mapper:
        sub_sentence = list()
        for sentence in pair_sent:
            sentence = sentence.split()
            words = [w.translate(table) for w in sentence]
            sub_sentence.append(" ".join(words))
        clean_doc.append(sub_sentence)
    return clean_doc

In [5]:
#
# # @Apply the above functions 
#
content = open_tsv_return_list()
translations = pair_value(content)
clean_doc = clean_pair(translations)

["\ufeff1276\tlet's try something.\t3091\tessayons quelque chose!", "1276\tlet's try something.\t456963\ttentons quelque chose!", '1277\ti have to go to sleep.\t373908\tje dois aller dormir.', "1280\ttoday is june 18th and it is muiriel's birthday!\t3095\taujourd'hui nous sommes le 18 juin et c'est l'anniversaire de muiriel!", "1280\ttoday is june 18th and it is muiriel's birthday!\t696081\taujourd'hui c'est le 18 juin, et c'est l'anniversaire de muiriel."]


In [6]:
# #########################################################
# # @Split the content into to English sentences onnly    #
# #########################################################
def split_to_eng_content(clean_doc):
    english_sentences = []
    for sentence_pair in clean_doc:
        en = sentence_pair[0] if len(sentence_pair)>1 else ""
        english_sentences.append(en)
        
    return english_sentences
# #########################################################
# # @Split the content into to French sentences onnly     #
# #########################################################
def split_to_fr_content(clean_doc):
    french_sentences = []        
    for sentence_pair in clean_doc:
        fr = sentence_pair[-1] if len(sentence_pair)>1 else ""
        french_sentences.append(fr)
        
    return french_sentences

# ##### @Apply the Functions
english_sentences = split_to_eng_content(clean_doc)
french_sentences = split_to_fr_content(clean_doc)

In [7]:
# ###################################################
#   # @Save content to English text only -->en_file #
# ###################################################
def sentences_saver_to_eng(en_file= "english_sentences.txt",
                        english_sentences= split_to_eng_content(clean_doc)):
    file = open(en_file, "w")
    for line in english_sentences:
        file.write(line+"\n") 
    file.close()
    return file.closed
#####################################################   
# # @Save content to French text only -->fr_file    #
#####################################################
def sentences_saver_to_fr(fr_file= "french_sentences.txt",
                       french_sentences= split_to_eng_content(clean_doc) ):
    file = open(fr_file, "w")
    for line in french_sentences:
        file.write(line+"\n")
    file.close()
    return file.closed


In [11]:
#
# # @Split the dataset into 80% for training, 10% for test and 10% for validation
#
np.random.shuffle(clean_doc)
dataset = clean_doc
train_dataset                = dataset[:(len(dataset)*80)//100] 
test_validation_dataset      = dataset[(len(dataset)*80)//100:]
test_dataset                 = test_validation_dataset[:(len(test_validation_dataset)*50)//100] 
validation_dataset           = test_validation_dataset[(len(test_validation_dataset)*50)//100:]
del dataset
del test_validation_dataset

In [14]:
# ###########################
# # @Create a tokenizer     #
# ###########################
def create_tokenizer(senteces_list):
    tknz = Tokenizer()
    tknz.fit_on_texts(senteces_list)
    return tknz

#
# # @Find the longest sequence in the list
#

def max_length(lines):
    return max(len(line.split()) for line in lines)

In [15]:
# ######################################
#   #####  English Part Now #####      #
# ######################################
eng_tokenizer   = create_tokenizer(english_sentences)
eng_max_seq     = max_length(english_sentences)
eng_vocab_size  = len(eng_tokenizer.word_index)+1
print(f"The english vocab size is {eng_vocab_size}")
print(f"The english longest sentence is of size: {eng_max_seq}")
# ######################################
#   #####  French Part Now #####       #
# ######################################
fr_tokenizer    = create_tokenizer(french_sentences)
fr_max_seq      = max_length(french_sentences)
fr_vocab_size   = len(fr_tokenizer.word_index)+1
print(f"The french vocab size is {fr_vocab_size}")
print(f"The french longest sentence is of size: {fr_max_seq}")

The english vocab size is 30421
The english longest sentence is of size: 262
The french vocab size is 47506
The french longest sentence is of size: 237


In [16]:
######################################
# # @Feature Engeniering             #
# ####################################
def encoded_sequences(sentences_list, tokenizer, length):
    encoded = tokenizer.texts_to_sequences(sentences_list)
    encoded = pad_sequences(encoded, maxlen=length)
    return encoded
    

In [17]:
fr_encoded_sequences = encoded_sequences(french_sentences, fr_tokenizer, fr_max_seq)
eng_encoded_sequences = encoded_sequences(english_sentences, eng_tokenizer, eng_max_seq)

In [19]:
eng_encoded_sequences[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   