## This is the base Notebook for Neural Machine Translation (En-Fr translation)

In [1]:
import re

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, Embedding, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences


###  Step 1 : Data Fetching

In [2]:
def fetch_data(data_path):
    """
    Read a tab-separated parallel corpus and split it into source and target lists.

    The file is opened with 'utf-8' encoding. Every non-blank line must hold the
    English text in its first tab-separated column and the French text in its
    second column; any further columns are ignored.

    input: path of the dataset txt file
    output: list of all English text, list of corresponding French text
    raises: ValueError if a non-blank line has fewer than two tab-separated columns
    """
    en_text = []
    fr_text = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip('\n')
            # A trailing newline or a blank separator line carries no sentence pair.
            if not line.strip():
                continue
            columns = line.split('\t')
            if len(columns) < 2:
                raise ValueError(
                    "line %d of %r has %d tab-separated column(s); expected at least 2"
                    % (line_number, data_path, len(columns))
                )
            # Only the first two columns are used; extra columns are dropped.
            en_text.append(columns[0])
            fr_text.append(columns[1])

    return en_text, fr_text

In [3]:
# Location of the tab-separated English/French sentence-pair file.
DATA_PATH = r'datasets/French-English/fra.txt'
# Parallel lists: en_text[i] is the English side of fr_text[i].
en_text, fr_text = fetch_data(DATA_PATH)

In [4]:
# fr_text[:10]

###  Step 2 : Data Cleaning

In [5]:
def text_clean(text):
    """
    Normalise one line of text before it is used for training.

    input: text single line
    output: the lower-cased line after the substitutions below, with
            leading/trailing whitespace removed
    """
    # Ordered (pattern, replacement) pairs, applied one after another.
    substitutions = (
        ("i'm", "i am"),                         # expand the contraction
        ("&", "and"),                            # spell out the ampersand
        (r"[-{}\"#/@;:<>()+=|.?,%$!]", ""),      # drop non-essential characters
        (r"[0-9]", ""),                          # drop digits
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Remove outside spaces
    return cleaned.strip()

In [6]:
# Apply text_clean to every sentence of both languages, keeping the pairs aligned by index.
clean_en_text = list(map(text_clean, en_text))
clean_fr_text = list(map(text_clean, fr_text))

### Step 3. Data Formatting

In [7]:
# Special tokens added to both vocabularies. <SOS>/<EOS> mark the start and end of
# each TARGET (French) sentence; <PAD> is the fill value used when padding sequences.
# The order matters: later cells refer to tokens[0] (<SOS>) and tokens[2] (<EOS>).
tokens = ['<SOS>', '<PAD>', '<EOS>']

In [8]:
# Alphabetically sorted list of the distinct whitespace-separated English words.
en_vocab = sorted({word for sentence in clean_en_text for word in sentence.split()})

In [9]:
# Append the special tokens after the real words. Only tokens that are not yet
# present are added, so re-running this cell cannot introduce duplicate entries
# (duplicates would make en_word_idx and en_idx_word disagree).
en_vocab.extend([token for token in tokens if token not in en_vocab])

In [10]:
# Alphabetically sorted list of the distinct whitespace-separated French words.
fr_vocab = sorted({word for sentence in clean_fr_text for word in sentence.split()})

In [11]:
# Append the special tokens after the real words. Only tokens that are not yet
# present are added, so re-running this cell cannot introduce duplicate entries
# (duplicates would make fr_word_idx and fr_idx_word disagree).
fr_vocab.extend([token for token in tokens if token not in fr_vocab])

In [12]:
# Two-way lookup between English words and their position in en_vocab.
en_word_idx = {word: position for position, word in enumerate(en_vocab)}
en_idx_word = dict(enumerate(en_vocab))

In [13]:
# Two-way lookup between French words and their position in fr_vocab.
fr_word_idx = {word: position for position, word in enumerate(fr_vocab)}
fr_idx_word = dict(enumerate(fr_vocab))

In [14]:
# Wrap every cleaned French sentence as "<SOS> sentence <EOS>".
tokenise_fr_text = [
    " ".join((tokens[0], sentence, tokens[2]))
    for sentence in clean_fr_text
]

In [15]:
# Final text used for encoding: French carries the <SOS>/<EOS> markers,
# English is the cleaned text as-is. These are aliases, not copies.
complete_fr_text = tokenise_fr_text
complete_en_text = clean_en_text

In [16]:
# Length, in whitespace-separated tokens, of the longest English sentence.
max_en_seq_length = max(len(sentence.split()) for sentence in complete_en_text)
max_en_seq_length

44

In [17]:
# Length, in whitespace-separated tokens, of the longest French sentence
# (the <SOS> and <EOS> markers are counted).
max_fr_seq_length = max(len(sentence.split()) for sentence in complete_fr_text)
max_fr_seq_length

57

In [18]:
# Preview the first five cleaned English sentences.
complete_en_text[:5]

['go', 'hi', 'hi', 'run', 'run']

In [19]:
# Preview the first five French sentences with their <SOS>/<EOS> markers.
complete_fr_text[:5]

['<SOS> va <EOS>',
 '<SOS> salut <EOS>',
 '<SOS> salut <EOS>',
 '<SOS> cours <EOS>',
 '<SOS> courez <EOS>']

#### Encoding data with indexes

In [20]:
# Replace every token by its vocabulary index; one list of ints per sentence.
enc_en_text = [
    [en_word_idx[token] for token in sentence.split()]
    for sentence in complete_en_text
]
enc_fr_text = [
    [fr_word_idx[token] for token in sentence.split()]
    for sentence in complete_fr_text
]

####  Padding of sequences

In [21]:
# Bring every index sequence to the length of the longest sentence of its language.
# Padding (and any truncation) happens at the end of the sequence, and the fill
# value is the index of <PAD> in the corresponding vocabulary.
pad_en_text = pad_sequences(
    sequences=enc_en_text,
    maxlen=max_en_seq_length,
    padding='post',
    truncating='post',
    value=en_word_idx['<PAD>'],
)
pad_fr_text = pad_sequences(
    sequences=enc_fr_text,
    maxlen=max_fr_seq_length,
    padding='post',
    truncating='post',
    value=fr_word_idx['<PAD>'],
)

In [22]:
# Inspect the padded English index matrix; trailing positions hold the <PAD> index.
pad_en_text

array([[ 5603, 14726, 14726, ..., 14726, 14726, 14726],
       [ 6157, 14726, 14726, ..., 14726, 14726, 14726],
       [ 6157, 14726, 14726, ..., 14726, 14726, 14726],
       ...,
       [ 3336,  6975, 12015, ..., 14726, 14726, 14726],
       [11699, 13072,   663, ...,  7443,  6973, 14726],
       [ 6496, 12010, 14395, ...,     5,  8471, 12093]])

In [23]:
# (number of sentences, max_en_seq_length)
pad_en_text.shape

(177210, 44)

In [24]:
# Inspect the padded French index matrix; trailing positions hold the <PAD> index.
pad_fr_text

array([[29594, 27919, 29596, ..., 29595, 29595, 29595],
       [29594, 24489, 29596, ..., 29595, 29595, 29595],
       [29594, 24489, 29596, ..., 29595, 29595, 29595],
       ...,
       [29594, 14864, 17084, ..., 29595, 29595, 29595],
       [29594, 21192, 28742, ..., 29595, 29595, 29595],
       [29594, 24969, 21542, ..., 15256, 17950, 29596]])

In [25]:
# (number of sentences, max_fr_seq_length)
pad_fr_text.shape

(177210, 57)

### Step 4. Data preparation for model

In [26]:
# Hold out 20% of the sentence pairs for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    pad_en_text,
    pad_fr_text,
    test_size=0.20,
    random_state=42,
)

In [27]:
# Sizes of the training and test portions of the English (encoder) input.
X_train.shape, X_test.shape

((141768, 44), (35442, 44))

In [28]:
# def data_batch_generator(X,y):  (leftover stub; the generator is defined further down as batch_data_generator)
# Re-display the padded English sequence length.
max_en_seq_length

44

### Step 5. Model Building

In [29]:
# Width of the word vectors produced by both Embedding layers.
embedding_dim = 50
# Vocabulary sizes (real words plus the special tokens); used as the Embedding
# input dimension and, for French, as the size of the decoder's softmax output.
en_vocab_len = len(en_vocab)
fr_vocab_len = len(fr_vocab)


#### Encoder States

In [30]:
# LAYERS
# `shape` must be a tuple: `(n)` is just the integer n, `(n,)` is the
# one-element tuple describing a sequence of n token indices per sample.
encoder_input = Input(shape=(X_train.shape[1],))
# Maps each English token index to an embedding_dim-wide vector.
encoder_embedding_layer = Embedding(en_vocab_len, embedding_dim, input_length=max_en_seq_length)
# return_state=True makes the layer also return its final states, which are
# later handed to the decoder LSTM as its initial state.
encoder_lstm_layer = LSTM(50, return_state=True)

In [31]:
# OUTPUTS
# Embed the English token indices fed to the encoder.
encoder_embedding_output = encoder_embedding_layer(encoder_input)

In [32]:
# Because the encoder LSTM was created with return_state=True it returns three
# tensors: its output plus the two final states that will seed the decoder.
encoder_seq_output, encoder_memory_state, encoder_carry_state = encoder_lstm_layer(encoder_embedding_output)

#### Encoder Model

In [33]:
# Encoder-only model, built here to inspect the encoder with summary().
# NOTE: the name `model` is rebound to the full encoder-decoder model further down.
model = Model(encoder_input, encoder_seq_output)

In [34]:
# Layer-by-layer overview of the encoder-only model.
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 44)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 44, 50)            736400    
_________________________________________________________________
lstm (LSTM)                  [(None, 50), (None, 50),  20200     
Total params: 756,600
Trainable params: 756,600
Non-trainable params: 0
_________________________________________________________________


####  Decoder States

In [35]:
# `shape` must be a tuple: `(n)` is just the integer n, `(n,)` is the
# one-element tuple describing a sequence of n French token indices per sample.
decoder_input = Input(shape=(y_train.shape[1],))

In [36]:
# Maps each French token index to an embedding_dim-wide vector.
decoder_embedding_layer = Embedding(fr_vocab_len , embedding_dim,input_length= max_fr_seq_length)

In [37]:
# Same number of units (50) as the encoder LSTM, whose final states are used as
# this layer's initial state. return_sequences=True yields one output per time step.
decoder_lstm_layer = LSTM(50, return_sequences=True, return_state=True)

In [38]:
# Embed the French token indices fed to the decoder.
decoder_embedding_output = decoder_embedding_layer(decoder_input)

In [39]:
# Run the decoder LSTM starting from the encoder's final states; the decoder's
# own returned states are discarded here.
decoder_seq_output, _, _ = decoder_lstm_layer(decoder_embedding_output, initial_state=[encoder_memory_state, encoder_carry_state])

In [40]:
# Softmax over the French vocabulary: one probability per vocabulary entry.
decoder_dense = Dense(fr_vocab_len, activation='softmax')

In [41]:
# Apply the softmax layer to the decoder LSTM's per-time-step output.
decoder_output = decoder_dense(decoder_seq_output)

####  Decoder model

In [42]:
# Full training model: takes [English indices, French decoder-input indices] and
# outputs the decoder's softmax predictions. This rebinds the name `model`,
# which previously referred to the encoder-only model.
model = Model([encoder_input, decoder_input], decoder_output)

In [43]:
# Layer-by-layer overview of the full encoder-decoder model.
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 44)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 57)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 44, 50)       736400      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 57, 50)       1479850     input_2[0][0]                    
_______________________________________________________________________________________

#### Compile Model

In [44]:
# categorical_crossentropy matches the one-hot targets produced by
# batch_data_generator. `metrics` is given as a list of recognised metric
# identifiers; the previous bare string 'f1' is not a metric name Keras resolves.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [45]:
# prepare decoder input and target data format using a generator

In [46]:
# prepare decoder input and target data format using a generator
def batch_data_generator(X, y, batch_size=64, num_target_tokens=None):
    """
    Endlessly yield teacher-forcing batches for the encoder-decoder model.

    input:
        X: 2-D integer array-like of padded source sequences, one row per sample
        y: 2-D integer array-like of padded target sequences, one row per sample
        batch_size: maximum number of samples per batch (the last batch of a
            pass may be smaller)
        num_target_tokens: length of the one-hot axis of the target array; when
            None, the notebook-level `fr_vocab_len` is used
    output (per yield): ([encoder_input_data, decoder_input_data], decoder_target_data)
        encoder_input_data:  (rows, X.shape[1]) int32 copy of the source rows
        decoder_input_data:  (rows, y.shape[1]) int32, the target row with its
            last position left at 0
        decoder_target_data: (rows, y.shape[1], num_target_tokens) int32, where
            step t is the one-hot encoding of target token t + 1 and the last
            step is all zeros
    raises: ValueError if batch_size < 1, if X and y differ in length, or if
        they are empty (the generator could otherwise loop forever without yielding)
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    # Use the arrays that were passed in rather than the notebook-level training
    # arrays, so the generator also works for other splits.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError("X and y must have the same number of rows (%d != %d)" % (len(X), len(y)))
    if len(X) == 0:
        raise ValueError("X and y must contain at least one sample")

    if num_target_tokens is None:
        num_target_tokens = fr_vocab_len

    while True:
        for start in range(0, len(X), batch_size):
            source_batch = X[start:start + batch_size]
            target_batch = y[start:start + batch_size]
            # Size the arrays by the rows actually present so the final partial
            # batch is not filled up with all-zero samples.
            n_rows, target_len = target_batch.shape

            encoder_input_data = source_batch.astype('int32')

            # Decoder input: every target token except the last one.
            decoder_input_data = np.zeros((n_rows, target_len), dtype='int32')
            decoder_input_data[:, :-1] = target_batch[:, :-1]

            # Decoder target: the same sequence shifted one step ahead, one-hot encoded.
            decoder_target_data = np.zeros((n_rows, target_len, num_target_tokens), dtype='int32')
            if target_len > 1:
                row_idx = np.arange(n_rows)[:, None]
                step_idx = np.arange(target_len - 1)[None, :]
                decoder_target_data[row_idx, step_idx, target_batch[:, 1:]] = 1

            yield ([encoder_input_data, decoder_input_data], decoder_target_data)

In [47]:
# Number of samples per training batch and number of passes over the training set.
batch_size = 64
epochs = 2
# Integer ceiling division: one step per batch, including the final partial
# batch. batch_data_generator walks range(0, n, batch_size), which produces
# exactly this many batches per pass; plain `/` would give a float.
steps_per_epoch = (X_train.shape[0] + batch_size - 1) // batch_size

In [48]:
# history = model.fit(batch_data_generator(X_train, y_train,batch_size), 
#                     steps_per_epoch = steps_per_epoch,
#                     epochs=epochs,
#                     verbose = 2)