# Translation system to translate English to french


In [1]:
!pip install nltk




In [2]:
import nltk
from nltk.corpus import comtrans

# nltk.download('comtrans')
print(comtrans.aligned_sents("alignment-en-fr.txt")[54])
words = comtrans.words("alignment-en-fr.txt")

for word in words[:6]:
    print(word)


<AlignedSent: 'In any event , this ...' -> 'En tout cas , cette ...'>
Resumption
of
the
session
I
declare


In [3]:
print(comtrans.aligned_sents("alignment-en-fr.txt")[0].words)
print(comtrans.aligned_sents("alignment-en-fr.txt")[0].mots)


['Resumption', 'of', 'the', 'session']
['Reprise', 'de', 'la', 'session']


## Preprocessing the corpora


In [4]:
import pickle
import re
from collections import Counter
from nltk.corpus import comtrans


### Function to retrieve the corpora


In [5]:
# Function takes an argument for the languages
def retrieve_corpora(translated_sentences_l1_l2="alignment-en-fr.txt"):
    print("Retrieving corpora: {}".format(translated_sentences_l1_l2))
    als = comtrans.aligned_sents(translated_sentences_l1_l2)
    sentences_l1 = [sent.words for sent in als]  # store the english sentences
    sentences_l2 = [sent.mots for sent in als]  # store the french sentences
    return sentences_l1, sentences_l2  # return the list of both languages


In [6]:
## Testing function
sen_l1, sen_l2 = retrieve_corpora()
print("# A sentence in the two languages English and French")
print("English: ", sen_l1[:5])  # first 5 Sentences in English
print("French: ", sen_l2[:5])  # first 5 Sentences in French

print("# Corpora Length (Number of Sentences)")
print(len(sen_l1))
print(len(sen_l2))

assert len(sen_l1) == len(sen_l2)  # check if the number of sentences are the same


Retrieving corpora: alignment-en-fr.txt
# A sentence in the two languages English and French
English:  [['Resumption', 'of', 'the', 'session'], ['I', 'declare', 'resumed', 'the', 'session', 'of', 'the', 'European', 'Parliament', 'adjourned', 'on', 'Friday', '17', 'December', '1999', ',', 'and', 'I', 'would', 'like', 'once', 'again', 'to', 'wish', 'you', 'a', 'happy', 'new', 'year', 'in', 'the', 'hope', 'that', 'you', 'enjoyed', 'a', 'pleasant', 'festive', 'period', '.'], ['You', 'have', 'requested', 'a', 'debate', 'on', 'this', 'subject', 'in', 'the', 'course', 'of', 'the', 'next', 'few', 'days', ',', 'during', 'this', 'part-session', '.'], ['Please', 'rise', ',', 'then', ',', 'for', 'this', 'minute', "'", 's', 'silence', '.'], ['(', 'The', 'House', 'rose', 'and', 'observed', 'a', 'minute', "'", 's', 'silence', ')']]
French:  [['Reprise', 'de', 'la', 'session'], ['Je', 'déclare', 'reprise', 'la', 'session', 'du', 'Parlement', 'européen', 'qui', 'avait', 'été', 'interrompue', 'le', 'ven

# clean up tokens


In [7]:
def clean_sentences(sentence):
    regex_splitter = re.compile("([!?.,:;$\"')( ])")
    clean_words = [re.split(regex_splitter, word.lower()) for word in sentence]
    return [w for words in clean_words for w in words if words if w]


# testing function
clean_sen_l1 = [clean_sentences(s) for s in sen_l1]
clean_sen_l2 = [clean_sentences(s) for s in sen_l2]

print("English: ", clean_sen_l1[0])
print("French: ", clean_sen_l2[0])


English:  ['resumption', 'of', 'the', 'session']
French:  ['reprise', 'de', 'la', 'session']


### Filtering the sentences that are too long to be processed due to limited resources


In [8]:
def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):
    filtered_sentences_l1 = []
    filtered_sentences_l2 = []
    for i in range(len(sentences_l1)):
        if (
            min_len <= len(sentences_l1[i]) <= max_len
            and min_len <= len(sentences_l2[i]) <= max_len
        ):
            filtered_sentences_l1.append(sentences_l1[i])
            filtered_sentences_l2.append(sentences_l2[i])

    return filtered_sentences_l1, filtered_sentences_l2


# Testing
filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(
    clean_sen_l1, clean_sen_l2
)

# checking how many sentences made it through the filter (13,405 survived)
print("# Filterd Corpora length")
print(len(filt_clean_sen_l1))

assert len(filt_clean_sen_l1) == len(filt_clean_sen_l2)


# Filterd Corpora length
13405


# Move text to numbers


### Create a dictionary of words for each language


In [9]:
_PAD = "_PAD"  # For padding
_GO = "_GO"  # to divide 2 sentences
_EOS = "_EOS"  # to indicate where sentence stops
_UNK = "_UNK"  # for unknown words
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
OP_DICT_IDS = [PAD_ID, GO_ID, EOS_ID, UNK_ID]


In [10]:
def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):
    count_words = Counter()
    dict_words = {}
    opt_dict_size = len(OP_DICT_IDS)
    for sen in sentences:
        for word in sen:
            count_words[word] += 1

    dict_words[_PAD] = PAD_ID
    dict_words[_GO] = GO_ID
    dict_words[_EOS] = EOS_ID
    dict_words[_UNK] = UNK_ID
    # print(count_words.most_common(dict_size))

    for idx, item in enumerate(count_words.most_common(dict_size)):
        dict_words[item[0]] = idx + opt_dict_size
    if storage_path:
        pickle.dump(dict_words, open(storage_path, "wb"))
    return dict_words


### Look up tokens and substitute them with their token ID


In [11]:
def sentences_to_indexes(sentences, indexed_dictionary):
    indexed_sentences = []
    not_found_counter = 0
    for sent in sentences:
        idx_sent = []
    for word in sent:
        try:
            idx_sent.append(indexed_dictionary[word])
        except KeyError:
            idx_sent.append(UNK_ID)
            not_found_counter += 1
    indexed_sentences.append(idx_sent)
    print("[sentences_to_indexes] Did not find {} words".format(not_found_counter))
    return indexed_sentences


### testing


### subsititute tokens with their ID and if Token is not in the dictionary, the ID of unknown is used


In [12]:
dict_l1 = create_indexed_dictionary(
    filt_clean_sen_l1, dict_size=15000, storage_path="/tmp/l1_dict.p"
)
dict_l2 = create_indexed_dictionary(
    filt_clean_sen_l2, dict_size=10000, storage_path="/tmp/l2_dict.p"
)
idx_sentences_l1 = sentences_to_indexes(filt_clean_sen_l1, dict_l1)
idx_sentences_l2 = sentences_to_indexes(filt_clean_sen_l2, dict_l2)
print("# Same sentences as before, with their dictionary ID")
print("English:", list(zip(filt_clean_sen_l1[0], idx_sentences_l1[0])))
print("French:", list(zip(filt_clean_sen_l2[0], idx_sentences_l2[0])))


[sentences_to_indexes] Did not find 0 words
[sentences_to_indexes] Did not find 0 words
# Same sentences as before, with their dictionary ID
English: [('resumption', 168), ('of', 1308), ('the', 1239), ('session', 5)]
French: [('reprise', 21), ('de', 472), ('la', 20), ('session', 5)]


In [13]:
# function to get the maximum size
def extract_max_length(corpora):
    return max([len(sentence) for sentence in corpora])


max_length_l1 = extract_max_length(idx_sentences_l1)
max_length_l2 = extract_max_length(idx_sentences_l2)
print("# Max sentence lengths:")
print("English", max_length_l1)
print("French", max_length_l2)


# Max sentence lengthsL:
English 7
French 9


## pad the sequences to be the same length


### padd input to be 20 symbols long and output to be 20 symbols long

### insert \_GO t the beginning of the output sentence and \_EOS at the end to position the start and the end of the translation


In [14]:
def prepare_sentences(sentences_l1, sentences_l2, len_l1, len_l2):
    assert len(sentences_l1) == len(sentences_l2)
    data_set = []
    for i in range(len(sentences_l1)):
        padding_l1 = len_l1 - len(sentences_l1[i])
        pad_sentence_l1 = ([PAD_ID] * padding_l1) + sentences_l1[i]
        padding_l2 = len_l2 - len(sentences_l2[i])
        pad_sentence_l2 = [GO_ID] + sentences_l2[i] + [EOS_ID] + ([PAD_ID] * padding_l2)

        data_set.append([pad_sentence_l1, pad_sentence_l2])
    return data_set


In [15]:
data_set = prepare_sentences(
    idx_sentences_l1, idx_sentences_l2, max_length_l1, max_length_l2
)
print("# Prepared minibatch with paddings and extra stuff")
print("En:", data_set[0][0])
print("Fr:", data_set[0][1])
print("# The sentence pass from X to Y tokens")
print("English:", len(idx_sentences_l1[0]), "->", len(data_set[0][0]))
print("French:", len(idx_sentences_l2[0]), "->", len(data_set[0][1]))


# Prepared minibatch with paddings and extra stuff
En: [168, 1308, 1239, 5, 556, 955, 4]
Fr: [1, 21, 472, 20, 5, 1378, 11, 489, 1643, 4, 2]
# The sentence pass from X to Y tokens
English: 7 -> 7
French: 9 -> 11


## training the Translator


In [16]:
import time
import math
import sys
import pickle
import glob
import os
import tensorflow as tf

from seq2seq_model import Seq2SeqModel

# from corpora_tools import *

path_l1_dict = "/tmp/l1_dict.p"
path_l2_dict = "/tmp/l2_dict.p"
model_dir = "/tmp/translate "
model_checkpoints = model_dir + "/translate.ckpt"


2024-09-21 12:17:22.021173: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-21 12:17:22.029296: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-21 12:17:22.037703: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-21 12:17:22.040136: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-21 12:17:22.048283: I tensorflow/core/platform/cpu_feature_guar

This function returns the cleaned sentences, the dataset, the maximum length
of the sentences, and the lengths of the dictionaries.


In [17]:
def build_dataset(use_stored_dictionary=False):
    sen_l1, senl2 = retrieve_corpora()
    clean_sen_l1 = [clean_sentences(s) for s in sen_l1]
    clean_sent_l2 = [clean_sentences(s) for s in sen_l2]
    filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(
        clean_sen_l1, clean_sen_l2
    )

    if not use_stored_dictionary:
        dict_l1 = create_indexed_dictionary(
            filt_clean_sen_l1, dict_size=15000, storage_path=path_l1_dict
        )
        dict_l2 = create_indexed_dictionary(
            filt_clean_sen_l1, dict_size=10000, storage_path=path_l2_dict
        )

    else:
        dict_l1 = pickle.load(open(path_l1_dict, "rb"))
        dict_l2 = pickle.load(open(path_l2_dict, "rb"))

    dict_l1_length = len(dict_l1)
    dict_l2_length = len(dict_l2)

    data_set = prepare_sentences(
        idx_sentences_l1, idx_sentences_l2, max_length_l1, max_length_l2
    )
    return (
        (filt_clean_sen_l1, filt_clean_sen_l2),
        data_set,
        (max_length_l1, max_length_l2),
        (dict_l1_length, dict_l2_length),
    )


In [18]:
def cleanup_checkpoints(model_dir, model_checkpoints):
    for f in glob.glob(model_checkpoints + "*"):
        os.remove(f)
    try:
        os.mkdir(model_dir)
    except FileExistsError:
        pass
