In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

2021-12-20 11:11:28.724335: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-20 11:11:28.724416: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# #############################################################
# # @Open a TSV file and store its lines in a list            #
# #############################################################
def open_tsv_return_list(filename : str = "./English-French.tsv", max_lines = 10000):
    """
    Open a TSV file and return its lines as a list of strings.

    Every returned element is one lower-cased line of the file, still
    tab-separated, mapping the source language to the target language
    eg:("English --> French").

    Params :
            filename  = the filename of the file to open
            max_lines = how many lines to keep, counted from the END of the
                        file (default 10000); pass None to keep every line
    Returns :
            list of str -- at most ``max_lines`` lines

    The total number of lines found in the file is printed before the
    list is truncated.
    """
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()

    # U+202F (narrow no-break space) is removed outright, then everything is
    # lower-cased. The text is already a str, so no encode/decode is needed.
    content = text.replace("\u202f", "").lower().split("\n")
    print("Lenght of the dataset is : ", len(content))

    if max_lines is None:
        return content
    if max_lines <= 0:
        # content[-0:] would return the whole list, so handle it explicitly.
        return []
    return content[-max_lines:]
    

In [3]:
# #########################################################
# #@ Map the data and drop the index columns of each row  #
# #########################################################
def pair_value(content):
    """
    Turn raw tab-separated lines into lists of sentence fields.

    Each line is split on tabs; the 1st and 3rd fields are dropped and every
    other field is kept in its original order. A line without any tab
    therefore yields an empty list.

    Params :
            content = iterable of tab-separated strings
    Returns :
            list of lists of str, one inner list per input line
    """
    dropped_positions = (0, 2)  # zero-based positions of the 1st and 3rd fields
    return [
        [field
         for position, field in enumerate(line.split("\t"))
         if position not in dropped_positions]
        for line in content
    ]


In [4]:
# ################################# 
# # @Clean the dataset            #
# #################################
def clean_pair(mapper) :
    """
    Strip punctuation from every sentence of every pair.

    Each sentence is split on whitespace, the characters
    !"#$%&()*+,./:;<=>?@[\\]_`{|}~ are deleted from every word, and the words
    are joined back with single spaces. A word made only of those characters
    becomes an empty string, so it still contributes a separator.

    Params :
            mapper = iterable of iterables of sentences (str)
    Returns :
            list of lists of cleaned sentences, same nesting as the input
    """
    strip_punct = str.maketrans('', '', '!"#$%&()*+,./:;<=>?@[\\]_`{|}~')
    return [
        [" ".join(token.translate(strip_punct) for token in text.split())
         for text in pair]
        for pair in mapper
    ]

In [5]:
#
# # @Apply the above functions 
#
# Pipeline: read the raw lines -> keep the sentence fields of each line ->
# strip punctuation. The result, clean_doc, is a list of sentence lists.
content = open_tsv_return_list()
translations = pair_value(content)
clean_doc = clean_pair(translations)
# The intermediate lists are not used again below; unbind their names.
del content
del translations

Lenght of the dataset is :  310290


### Split into train, test and validation sets, and work only with the training dataset

In [6]:
# Number of entries (one per loaded line) kept after cleaning.
len(clean_doc)

10000

In [7]:
#
# # @Split the dataset into 80% for training, 10% for test and 10% for validation
#
# Shuffle in place with a fixed seed so the train/test/validation membership
# is identical on every fresh "Restart & Run All".
SPLIT_SEED = 42
np.random.default_rng(SPLIT_SEED).shuffle(clean_doc)

# 80% of the pairs go to training; the remaining 20% is halved into test and
# validation. The bounds are computed once and reused for the three slices.
train_end                    = (len(clean_doc)*80)//100
holdout_size                 = len(clean_doc) - train_end
test_end                     = train_end + (holdout_size*50)//100

train_dataset                = clean_doc[:train_end]
test_dataset                 = clean_doc[train_end:test_end]
validation_dataset           = clean_doc[test_end:]

In [8]:
# #########################################################
# # @Extract only the English sentences from the content  #
# #########################################################
def split_to_eng_content(clean_doc):
    """
    Return the English side of every sentence pair.

    The English sentence is the FIRST element of a pair. Entries holding
    fewer than two elements are treated as incomplete and contribute an
    empty string, so the output has exactly one item per input entry.
    """
    return [pair[0] if len(pair) > 1 else "" for pair in clean_doc]
# #########################################################
# # @Extract only the French sentences from the content   #
# #########################################################
def split_to_fr_content(clean_doc):
    """
    Return the French side of every sentence pair.

    The French sentence is the LAST element of a pair. Entries holding
    fewer than two elements are treated as incomplete and contribute an
    empty string, so the output has exactly one item per input entry.
    """
    return [pair[-1] if len(pair) > 1 else "" for pair in clean_doc]

# ######## @Apply the Functions
# Only the training split is used from here on; both lists have one item per
# training entry and stay index-aligned (item i of each comes from pair i).
english_sentences = split_to_eng_content(train_dataset)
french_sentences = split_to_fr_content(train_dataset)

In [9]:
# ###################################################
#   # @Save content to English text only -->en_file #
# ###################################################
# def sentences_saver_to_eng(en_file= "english_train_sentences.txt",
#                         english_sentences= split_to_eng_content(clean_doc)):
#     file = open(en_file, "w")
#     for line in english_sentences:
#         file.write(line+"\n") 
#     file.close()
#     return file.closed
# #####################################################   
# # # @Save content to French text only -->fr_file    #
# #####################################################
# def sentences_saver_to_fr(fr_file= "french_train_sentences.txt",
#                        french_sentences= split_to_fr_content(clean_doc) ):
#     file = open(fr_file, "w")
#     for line in french_sentences:
#         file.write(line+"\n")
#     file.close()
#     return file.closed


In [9]:
# ###########################
# # @Create a tokenizer     #
# ###########################
def create_tokenizer(sentences_list):
    """
    Build a Keras ``Tokenizer`` with default settings and fit it on
    ``sentences_list``.

    Params :
            sentences_list = list of sentences (str) used to build the vocabulary
    Returns :
            the fitted tokenizer
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(sentences_list)
    return fitted

#
# # @Find the longest sequence in the list
#

def max_length(lines):
    """
    Return the largest number of whitespace-separated words found in any
    line of ``lines``.

    Params :
            lines = iterable of strings
    Returns :
            int -- the longest word count, or 0 when ``lines`` is empty
    """
    # default=0 keeps an empty dataset from raising ValueError inside max().
    return max((len(line.split()) for line in lines), default=0)

In [10]:
# ######################################
#   #####  English Part Now #####      #
# ######################################
# Fit one tokenizer per language on the training sentences only.
# *_max_seq is the longest sentence measured in whitespace-separated words.
# *_vocab_size is the number of distinct words plus one.
# NOTE(review): the +1 presumably reserves id 0 for padding (the model below
# uses mask_zero=True) -- confirm against the Keras Tokenizer documentation.
eng_tokenizer   = create_tokenizer(english_sentences)
eng_max_seq     = max_length(english_sentences)
eng_vocab_size  = len(eng_tokenizer.word_index)+1
print(f"The english vocab size is {eng_vocab_size}")
print(f"The english longest sentence is of size: {eng_max_seq}")
# ######################################
#   #####  French Part Now #####       #
# ######################################
fr_tokenizer    = create_tokenizer(french_sentences)
fr_max_seq      = max_length(french_sentences)
fr_vocab_size   = len(fr_tokenizer.word_index)+1
print(f"The french vocab size is {fr_vocab_size}")
print(f"The french longest sentence is of size: {fr_max_seq}")

The english vocab size is 6750
The english longest sentence is of size: 76
The french vocab size is 8607
The french longest sentence is of size: 81


In [23]:
######################################
# # @Feature Engineering             #
# ####################################
def encoded_sequences(sentences_list, tokenizer, length):
    """
    Convert sentences to integer id sequences and pad them to one length.

    Params :
            sentences_list = list of sentences (str) to encode
            tokenizer      = fitted tokenizer providing ``texts_to_sequences``
            length         = ``maxlen`` handed to ``pad_sequences``
    Returns :
            the padded sequences as returned by ``pad_sequences``
    """
    return pad_sequences(tokenizer.texts_to_sequences(sentences_list), maxlen=length)


def encoded_output(sequences, vocab_size, tokenizer=None, length=None):
    """
    One-hot encode sentences so they can be used as training targets.

    Params :
            sequences  = list of sentences (str) to encode
            vocab_size = number of classes of every one-hot vector
            tokenizer  = fitted tokenizer used to map words to ids; when
                         omitted, the module-level ``fr_tokenizer`` is looked
                         up at call time
            length     = number of time steps each sentence is padded or
                         truncated to; when omitted, the largest
                         whitespace-separated word count in ``sequences``
    Returns :
            array of shape (len(sequences), length, vocab_size)

    The shape of the result is printed before it is returned. The array holds
    len(sequences) * length * vocab_size values, so it grows quickly with the
    vocabulary size.
    """
    if tokenizer is None:
        # Resolved here rather than in the signature, so the function can be
        # defined before fr_tokenizer exists and always sees its current value.
        tokenizer = fr_tokenizer
    if length is None:
        length = max((len(sentence.split()) for sentence in sequences), default=0)

    # Sentences have different word counts; pad them to a common length first
    # so the one-hot result is a regular array instead of a ragged list.
    token_ids = pad_sequences(tokenizer.texts_to_sequences(sequences), maxlen=length)
    y = to_categorical(token_ids, num_classes=vocab_size)
    # Force the (samples, time steps, classes) layout regardless of how
    # to_categorical shaped its output.
    y = y.reshape(token_ids.shape[0], token_ids.shape[1], vocab_size)
    print(y.shape)
    return y

In [24]:
# French side: passed as the target (y) to model.fit further down.
# NOTE(review): this assignment is only useful if encoded_output returns the
# encoded array -- verify.
fr_encoded_sequences = encoded_output(french_sentences, fr_vocab_size)
# English side: padded integer ids, passed as the input (x) to model.fit.
eng_encoded_sequences = encoded_sequences(english_sentences, eng_tokenizer, eng_max_seq)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """
    Build (without compiling) the sequence-to-sequence translation network.

    Layers, in order: an Embedding over the source vocabulary (id 0 masked),
    an LSTM that returns only its final output, a RepeatVector that repeats
    that output ``tar_timesteps`` times, an LSTM returning the full sequence,
    and a time-distributed softmax over the target vocabulary.

    Params :
            src_vocab     = size of the source (input) vocabulary
            tar_vocab     = size of the target (output) vocabulary
            src_timesteps = length of the padded input sequences
            tar_timesteps = length of the output sequences
            n_units       = embedding size and number of units of both LSTMs
    Returns :
            the un-compiled Sequential model
    """
    encoder_decoder = Sequential([
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ])
    return encoder_decoder
 
# define model
# The network is fitted with the English ids as input and the French encoding
# as target (see the model.fit call), so English is the source side and French
# is the target side of define_model.
model = define_model(eng_vocab_size, fr_vocab_size, eng_max_seq, fr_max_seq, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
# summary() prints the table itself; wrapping it in print() only adds "None".
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)

Error: Session cannot generate requests

In [None]:
# fit model
# The checkpoint writes the model to `filename` only when the monitored
# val_loss improves (save_best_only=True, mode='min').
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# English ids are the inputs, the French encoding is the target.
# validation_split=0.25 takes the validation data out of these training arrays.
# NOTE(review): the validation_dataset built during the 80/10/10 split is not
# used anywhere in the visible notebook -- confirm whether it should be passed
# here via validation_data instead.
model.fit(eng_encoded_sequences, fr_encoded_sequences, epochs=30, 
          batch_size=128, validation_split=0.25, callbacks=[checkpoint], verbose=2)