## This is the base Notebook for Neural Machine Translation (En-Fr translation)

In [1]:
import re
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Embedding, LSTM


In [2]:
#To check whether the Tensorflow is using or identifying the GPUs or not",
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Number of available GPUs : {}".format(len(physical_devices)))
tf.config.experimental.set_memory_growth(physical_devices[0], True)

Number of available GPUs : 1


###  Step 1 : Data Fetching

In [3]:
def fetch_data(data_path):
    """ 
    This function will fetch the dataset with 'utf-8' encoding, 
    separate the source (en) and target (fr) language.
    
    input: path of the dataset txt file
    output: list of all English text, list of corresponding French text
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    en_text = []
    fr_text = []
    for line in lines:
        en, fr, _ = line.split('\t')
        en_text.append(en)
        fr_text.append(fr)
        
    return en_text, fr_text

In [4]:
DATA_PATH = r'datasets/French-English/fra.txt'
en_text, fr_text = fetch_data(DATA_PATH)

In [5]:
# fr_text[:10]

In [6]:
en_text = en_text[:50000]
fr_text = fr_text[:50000]

###  Step 2 : Data Cleaning

In [7]:
def text_clean(text):
    """ 
    Function to clean the text before training.
    input: text single line
    output: cleaned text line
    """
    
    text = text.lower()
    
    # Replace the short words in there expanded forms
    text = re.sub("i'm", "i am", text)
    text = re.sub("&", "and", text)
    
    # remove all non essential charachters
    text = re.sub(r"[-{}\"#/@;:<>()+=|.?,%$!]","", text)
    text = re.sub(r"[0-9]","", text)
    
    # Remove outside spaces
    text = text.strip()
    
    return text

In [8]:
clean_en_text = [text_clean(text) for text in en_text]
clean_fr_text = [text_clean(text) for text in fr_text]

### Step 3. Data Formating

In [9]:
# Adding tokens to identify the start and end of TARGET language
tokens = ['<SOS>', '<PAD>', '<EOS>']

In [10]:
en_vocab = sorted(set((' '.join(clean_en_text)).split()))

In [11]:
en_vocab.extend(tokens)

In [12]:
fr_vocab = sorted(set((' '.join(clean_fr_text)).split()))

In [13]:
fr_vocab.extend(tokens)

In [14]:
en_word_idx = dict([(word, i) for i, word in enumerate(en_vocab)])
en_idx_word = dict([(i, word) for i, word in enumerate(en_vocab)])

In [15]:
fr_word_idx = dict([(word, i) for i, word in enumerate(fr_vocab)])
fr_idx_word = dict([(i, word) for i, word in enumerate(fr_vocab)])

In [16]:
tokenise_fr_text = []
for line in clean_fr_text:
    tokenise_fr_text.append(tokens[0] + " " + line + " " + tokens[2])

In [17]:
complete_fr_text = tokenise_fr_text
complete_en_text = clean_en_text

In [18]:
max_en_seq_length = max([len(text.split()) for text in complete_en_text])
max_en_seq_length

8

In [19]:
max_fr_seq_length = max([len(text.split()) for text in complete_fr_text])
max_fr_seq_length

16

In [20]:
complete_en_text[:5]

['go', 'hi', 'hi', 'run', 'run']

In [21]:
complete_fr_text[:5]

['<SOS> va <EOS>',
 '<SOS> salut <EOS>',
 '<SOS> salut <EOS>',
 '<SOS> cours <EOS>',
 '<SOS> courez <EOS>']

#### Encoding data with indexes

In [22]:
enc_en_text = [[en_word_idx[word] for word in line.split()] for line in complete_en_text]
enc_fr_text = [[fr_word_idx[word] for word in line.split()] for line in complete_fr_text]

####  Padding of sequences

In [23]:
pad_en_text = pad_sequences(sequences=enc_en_text, maxlen=max_en_seq_length, padding='post', truncating='post', value=en_word_idx['<PAD>'])
pad_fr_text = pad_sequences(sequences=enc_fr_text, maxlen=max_fr_seq_length, padding='post', truncating='post', value=fr_word_idx['<PAD>'])

In [24]:
pad_en_text

array([[2242, 5980, 5980, ..., 5980, 5980, 5980],
       [2495, 5980, 5980, ..., 5980, 5980, 5980],
       [2495, 5980, 5980, ..., 5980, 5980, 5980],
       ...,
       [3195, 2617, 2429, ..., 5980, 5980, 5980],
       [3195, 2617, 2759, ..., 5980, 5980, 5980],
       [3195, 2617, 3083, ..., 3232, 5980, 5980]])

In [25]:
pad_en_text.shape

(50000, 8)

In [26]:
pad_fr_text

array([[12549, 11811, 12551, ..., 12550, 12550, 12550],
       [12549, 10301, 12551, ..., 12550, 12550, 12550],
       [12549, 10301, 12551, ..., 12550, 12550, 12550],
       ...,
       [12549,  9031,  3045, ..., 12550, 12550, 12550],
       [12549,  9031,  6953, ..., 12550, 12550, 12550],
       [12549,  9031,  5705, ..., 12550, 12550, 12550]])

In [27]:
pad_fr_text.shape

(50000, 16)

### Step 4. Data preperation for model

In [75]:
X_train, X_test, y_train, y_test = train_test_split(pad_en_text, pad_fr_text, test_size=0.1, random_state=42)

In [76]:
X_train.shape, X_test.shape

((45000, 8), (5000, 8))

In [77]:
# def data_batch_generator(X,y):
max_en_seq_length

8

### Step 5. Model Building

In [78]:
embedding_dim = 50
en_vocab_len = len(en_vocab)
fr_vocab_len = len(fr_vocab)


#### Encoder States

In [79]:
# LAYERS
encoder_input = Input(shape =(X_train.shape[1]))
encoder_embedding_layer = Embedding(en_vocab_len , embedding_dim,input_length= max_en_seq_length)
encoder_lstm_layer = LSTM(50, return_state = True )

In [80]:
# OUTPUTS
encoder_embedding_output = encoder_embedding_layer(encoder_input)

In [81]:
encoder_seq_output, encoder_memory_state, encoder_carry_state = encoder_lstm_layer(encoder_embedding_output)

#### Encoder Model

In [82]:
model = Model(encoder_input, encoder_seq_output)

In [83]:
model.summary()

Model: "functional_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 8, 50)             299100    
_________________________________________________________________
lstm_4 (LSTM)                [(None, 50), (None, 50),  20200     
Total params: 319,300
Trainable params: 319,300
Non-trainable params: 0
_________________________________________________________________


####  Decoder States

In [84]:
decoder_input = Input(shape=(y_train.shape[1]))

In [85]:
decoder_embedding_layer = Embedding(fr_vocab_len , embedding_dim,input_length= max_fr_seq_length)

In [86]:
decoder_lstm_layer = LSTM(50, return_sequences=True, return_state=True)

In [87]:
decoder_embedding_output = decoder_embedding_layer(decoder_input)

In [88]:
decoder_seq_output, _, _ = decoder_lstm_layer(decoder_embedding_output, initial_state=[encoder_memory_state, encoder_carry_state])

In [89]:
decoder_dense = Dense(fr_vocab_len, activation='softmax')

In [90]:
decoder_output = decoder_dense(decoder_seq_output)

####  Decoder model

In [91]:
model = Model([encoder_input, decoder_input], decoder_output)

In [92]:
model.summary()

Model: "functional_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 8)]          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 16)]         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 8, 50)        299100      input_5[0][0]                    
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 16, 50)       627600      input_6[0][0]                    
______________________________________________________________________________________

#### Compile Model

In [93]:
model.compile(optimizer='adam', loss='categorical_crossentropy',  metrics=['accuracy'])

In [94]:
# prepare decoder input and target data format using a generator

In [95]:
# prepare decoder input and target data format using a generator
def batch_data_generator(X, y, batch_size=64):
    while True:
        for batch in range (0, X_train.shape[0], batch_size):
            encoder_input_data = np.zeros((batch_size, X_train.shape[1]), dtype = 'int32')
            decoder_input_data = np.zeros((batch_size, y_train.shape[1]), dtype = 'int32')
            decoder_target_data = np.zeros((batch_size, y_train.shape[1] ,fr_vocab_len) ,dtype = 'int32')

            for seq_index, (input_seq, target_seq) in enumerate(zip(X[batch:batch+batch_size], y[batch:batch+batch_size])):
                    
                    for word_index, word in enumerate(input_seq):
                        encoder_input_data[seq_index, word_index] = word 
                    for word_index, word in enumerate(target_seq):
                        if word_index<len(target_seq)-1: 
                            decoder_input_data[seq_index, word_index] = word # decoder input seq
                        if word_index>0: 
                            decoder_target_data[seq_index, word_index - 1, word] = 1.

            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [96]:
batch_size = 64
epochs = 10
steps_per_epoch = (X_train.shape[0]/batch_size)
val_steps = (X_test.shape[0]/batch_size)

In [97]:
history = model.fit(batch_data_generator(X_train, y_train,batch_size), steps_per_epoch = steps_per_epoch, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [98]:
model.save('models/en_fr_vanialla_LSTM_model.h5')

###  Inference

In [100]:
# encoder_model = Model(encoder_input, [encoder_memory_state, encoder_carry_state])

In [103]:
# Encode the input sequence to get the "Context vectors"
encoder_model = Model(encoder_input , [encoder_memory_state, encoder_carry_state])

# Decoder setup

# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]

# Get the embeddings of the decoder sequence
dec_emb2 = decoder_embedding_layer(decoder_input)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm_layer(dec_emb2, initial_state=decoder_state_input)
decoder_states2 = [state_h2, state_c2]

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Final decoder model
decoder_model = Model(
    [decoder_input] + decoder_state_input,
    [decoder_outputs2] + decoder_states2)

In [113]:
def decode_sequence(input_seq):
        # Encode the input as state vectors.
        states_value = encoder_model.predict(input_seq)
        
        # Generate empty target sequence of length 1.
        target_seq = np.zeros((1,1))
        
        # Populate the first character of 
        #target sequence with the start character.
        target_seq[0, 0] = fr_word_idx['<SOS>']
        
        # Sampling loop for a batch of sequences
        # (to simplify, here we assume a batch of size 1).
        stop_condition = False
        decoded_sentence = ''
        
        while not stop_condition:
            output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
            
            # Sample a token
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_word =fr_idx_word[sampled_token_index]
            decoded_sentence += ' '+ sampled_word
            
            # Exit condition: either hit max length
            # or find stop character.
            if (sampled_word == '<EOS>' or len(decoded_sentence.split()) > 25):
                stop_condition = True
        
            # Update the target sequence (of length 1).
            target_seq = np.zeros((1,1))
            target_seq[0, 0] = sampled_token_index
            
            # Update states
            states_value = [h, c]
        return decoded_sentence

In [159]:
# Create a batch generator for batch size 1
train_gen = batch_data_generator(X_test, y_test, batch_size = 1)
k=-1

In [164]:
# Predict the target sentence and compare with the actual target sentence given a source sentence
k+=1000
(input_seq, actual_output), target_output = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
# print(input_seq)
print(en_idx_word[input_seq[0][0]],en_idx_word[input_seq[0][1]], en_idx_word[input_seq[0][2]], en_idx_word[input_seq[0][3]], en_idx_word[input_seq[0][4]])
print('Input Source sentence:', clean_en_text[k:k+1][0])
print('Actual Target Sentence:', clean_fr_text[k:k+1][0])
print('Predicted Target Sentence:', decoded_sentence[:-5].strip())

[[5960 1545 2429 5364 2242 5980 5980 5980]]
you don't have to go
Input Source sentence: do it at once
Actual Target Sentence: faitesle immédiatement
Predicted Target Sentence: vous ne vous n'êtes pas
