## This is the base Notebook for Neural Machine Translation (En-Fr translation)

In [1]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences

###  Step 1 : Data Fetching

In [2]:
def fetch_data(data_path):
    """Load a tab-separated parallel corpus into two aligned sentence lists.

    Every non-blank line must hold at least two tab-separated fields: the
    English sentence followed by the French sentence. Any additional fields
    (for example a trailing attribution column) are ignored. Blank lines,
    including the empty "line" produced by a trailing newline at the end of
    the file, are skipped instead of aborting the load.

    Args:
        data_path: Path of the UTF-8 encoded corpus file.

    Returns:
        A tuple ``(en_text, fr_text)`` of two equally long lists, where
        ``en_text[i]`` and ``fr_text[i]`` come from the same line.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    en_text = []
    fr_text = []
    with open(data_path, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of reading the whole file into one string.
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip('\n')
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    f"{data_path}: line {line_no}: expected at least 2 "
                    f"tab-separated fields, got {len(fields)}"
                )
            en_text.append(fields[0])
            fr_text.append(fields[1])

    return en_text, fr_text

In [3]:
# Relative path of the tab-separated English/French sentence-pair file.
DATA_PATH = r'datasets/French-English/fra.txt'
# Parallel lists: en_text[i] and fr_text[i] come from the same line of the file.
en_text, fr_text = fetch_data(DATA_PATH)

In [4]:
# en_text[:10]

In [5]:
# fr_text[:10]

###  Step 2 : Data Cleaning

In [6]:
def text_clean(text):
    """Normalise one sentence before it is used for training.

    The sentence is lower-cased, two short forms are expanded, the listed
    punctuation characters and the ASCII digits are deleted, and leading and
    trailing whitespace is trimmed. Apostrophes are kept.
    """
    # (pattern, replacement) rules, applied one after another in this order.
    rules = (
        ("i'm", "i am"),                      # expand the contraction
        ("&", "and"),                         # spell out the ampersand
        (r"[-{}\"#/@;:<>()+=|.?,%$!]", ""),   # drop non-essential characters
        (r"[0-9]", ""),                       # drop digits
    )

    cleaned = text.lower()
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Remove whitespace left at either end of the sentence.
    return cleaned.strip()

In [7]:
# Run the cleaning routine over every English sentence, keeping the order.
clean_en_text = list(map(text_clean, en_text))

In [8]:
# Run the same cleaning routine over every French sentence, keeping the order.
clean_fr_text = list(map(text_clean, fr_text))

###  Step 3 : Data Formatting

In [9]:
# Special markers. The order matters: later cells read tokens[0] as the
# start-of-sequence marker and tokens[2] as the end-of-sequence marker.
tokens = ['<SOS>', '<PAD>', '<EOS>']

In [10]:
# Sorted list of the distinct whitespace-separated words of the cleaned
# English corpus.
en_vocab = sorted({word for sentence in clean_en_text for word in sentence.split()})

In [11]:
# Append the special markers after sorting so they take the last indices.
# Guarded so that re-running this cell does not append the markers a second
# time, which would register each marker under two different indices.
en_vocab.extend([tok for tok in tokens if tok not in en_vocab])

In [12]:
# Sorted list of the distinct whitespace-separated words of the cleaned
# French corpus.
fr_vocab = sorted({word for sentence in clean_fr_text for word in sentence.split()})

In [13]:
# Append the special markers after sorting so they take the last indices.
# Guarded so that re-running this cell does not append the markers a second
# time, which would register each marker under two different indices.
fr_vocab.extend([tok for tok in tokens if tok not in fr_vocab])

In [14]:
# Forward (word -> index) and reverse (index -> word) lookups for English;
# the index is the word's position in en_vocab.
en_word_idx = {word: idx for idx, word in enumerate(en_vocab)}
en_idx_word = dict(enumerate(en_vocab))

In [15]:
# Forward (word -> index) and reverse (index -> word) lookups for French;
# the index is the word's position in fr_vocab.
fr_word_idx = {word: idx for idx, word in enumerate(fr_vocab)}
fr_idx_word = dict(enumerate(fr_vocab))

In [16]:
# Wrap every cleaned French sentence in the start (tokens[0]) and end
# (tokens[2]) markers, separated from the sentence by single spaces.
tokenise_fr_text = [
    tokens[0] + " " + sentence + " " + tokens[2] for sentence in clean_fr_text
]

In [17]:
# Final text used by the following cells. These are aliases, not copies:
# French carries the <SOS>/<EOS> markers, English is used as cleaned.
complete_fr_text = tokenise_fr_text
complete_en_text = clean_en_text

In [18]:
# Length of the longest English sentence, in whitespace-separated tokens.
max_en_seq_length = max(len(sentence.split()) for sentence in complete_en_text)
max_en_seq_length

44

In [19]:
# Length of the longest French sentence, in whitespace-separated tokens;
# the <SOS> and <EOS> markers are counted.
max_fr_seq_length = max(len(sentence.split()) for sentence in complete_fr_text)
max_fr_seq_length

57

In [20]:
# Preview the first five English sentences.
complete_en_text[:5]

['go', 'hi', 'hi', 'run', 'run']

In [21]:
# Preview the first five French sentences, including their markers.
complete_fr_text[:5]

['<SOS> va <EOS>',
 '<SOS> salut <EOS>',
 '<SOS> salut <EOS>',
 '<SOS> cours <EOS>',
 '<SOS> courez <EOS>']

#### Encoding data with indexes

In [22]:
# Replace every token by its vocabulary index, one list of indices per
# sentence. A token missing from the lookup raises KeyError.
enc_en_text = [
    list(map(en_word_idx.__getitem__, sentence.split()))
    for sentence in complete_en_text
]
enc_fr_text = [
    list(map(fr_word_idx.__getitem__, sentence.split()))
    for sentence in complete_fr_text
]

####  Padding of sequences

In [23]:
# Right-pad every index sequence with the language's <PAD> index up to the
# longest sentence length. Because maxlen is that maximum, nothing is
# actually truncated.
pad_en_text = pad_sequences(
    sequences=enc_en_text,
    maxlen=max_en_seq_length,
    padding='post',
    truncating='post',
    value=en_word_idx['<PAD>'],
)
pad_fr_text = pad_sequences(
    sequences=enc_fr_text,
    maxlen=max_fr_seq_length,
    padding='post',
    truncating='post',
    value=fr_word_idx['<PAD>'],
)

In [24]:
# Display the padded English index matrix.
pad_en_text

array([[ 5603, 14726, 14726, ..., 14726, 14726, 14726],
       [ 6157, 14726, 14726, ..., 14726, 14726, 14726],
       [ 6157, 14726, 14726, ..., 14726, 14726, 14726],
       ...,
       [ 3336,  6975, 12015, ..., 14726, 14726, 14726],
       [11699, 13072,   663, ...,  7443,  6973, 14726],
       [ 6496, 12010, 14395, ...,     5,  8471, 12093]])

In [25]:
# Shape check: one row per sentence, max_en_seq_length columns.
pad_en_text.shape

(177210, 44)

In [26]:
# Display the padded French index matrix.
pad_fr_text

array([[29594, 27919, 29596, ..., 29595, 29595, 29595],
       [29594, 24489, 29596, ..., 29595, 29595, 29595],
       [29594, 24489, 29596, ..., 29595, 29595, 29595],
       ...,
       [29594, 14864, 17084, ..., 29595, 29595, 29595],
       [29594, 21192, 28742, ..., 29595, 29595, 29595],
       [29594, 24969, 21542, ..., 15256, 17950, 29596]])

In [27]:
# Shape check: one row per sentence, max_fr_seq_length columns.
pad_fr_text.shape

(177210, 57)