# Machine Translation (NMT)
Objective:
- Build a Neural Machine Translation (NMT) model that translates human-readable dates ("25th of June, 2009") into machine-readable dates ("2009-06-25").


Example:

```
9 may 1998 -> 1998-05-09
10.11.19 -> 2019-11-10
9/10/70 -> 1970-09-10
saturday april 28 1990 -> 1990-04-28
thursday january 26 1995 -> 1995-01-26
monday march 7 1983 -> 1983-03-07
```

In [1]:
import pandas as pd
import tensorflow.keras.utils as ku
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer

# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Looping over the detected devices keeps this a no-op on CPU-only machines
# (indexing [0] would raise IndexError there) and applies the same setting to
# every GPU when more than one is present.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [2]:
# Load the date pairs. The CSV has no header row: column 0 is fed to the input
# tokenizer and column 1 to the output tokenizer below.
url_data = 'https://raw.githubusercontent.com/Haller-x/Data-Science/main/data/dates_dataset.csv'
data = pd.read_csv(url_data, header=None)

# First 90% of the rows are used for training, the remaining 10% are held out.
train_index = int(0.9 * len(data))
train_data = data[:train_index]
test_data = data[train_index:]

# Character-level tokenizers: one vocabulary for the source strings and a
# separate one for the target strings.
token_origin = Tokenizer(char_level=True)
token_end = Tokenizer(char_level=True)

# Vocabularies are learned from the training split only.
token_origin.fit_on_texts(train_data.loc[:, 0])
token_end.fit_on_texts(train_data.loc[:, 1])

# Source strings -> lists of character ids (variable length, padded in a later cell).
seq_train = token_origin.texts_to_sequences(train_data.loc[:, 0])
seq_test = token_origin.texts_to_sequences(test_data.loc[:, 0])

# Target strings -> 2-D integer array.
# NOTE(review): np.array only yields a rectangular array if every target string
# has the same length -- confirm against the dataset.
label_train = np.array(token_end.texts_to_sequences(train_data.loc[:, 1]))
label_test = np.array(token_end.texts_to_sequences(test_data.loc[:, 1]))

# One-hot encode each target character; +1 because tokenizer ids start at 1.
label_train = ku.to_categorical(label_train, num_classes=len(token_end.word_index) + 1)
label_test = ku.to_categorical(label_test, num_classes=len(token_end.word_index) + 1)

In [3]:
# Fixed sequence lengths: source strings are padded/truncated to 30 ids,
# and 10 is the length reserved for the destination side.
max_size_orig = 30
max_size_dest = 10

# Pad and truncate at the end of each sequence.
padding_type = 'post'
truncating_type = 'post'

# Both splits share exactly the same padding configuration.
pad_options = dict(maxlen=max_size_orig, padding=padding_type, truncating=truncating_type)
padded_train = pad_sequences(seq_train, **pad_options)
padded_test = pad_sequences(seq_test, **pad_options)

In [4]:
from tensorflow.keras.layers import Reshape, Lambda, Dropout, TimeDistributed, RepeatVector
import tensorflow.keras.backend as K

# Vocabulary sizes; +1 because tokenizer ids start at 1 and 0 is used for padding.
src_vocab_size = len(token_origin.word_index) + 1
tgt_vocab_size = len(token_end.word_index) + 1

# Encoder-decoder built as a plain Sequential stack:
#   Embedding -> LSTM (summarises the input into one vector)
#   RepeatVector -> LSTM -> per-step softmax (emits one character per output step)
model = Sequential()
model.add(Embedding(src_vocab_size, 50, input_length=max_size_orig))
model.add(LSTM(50))
# One copy of the encoded vector per output character; tied to max_size_dest
# instead of a hard-coded 10 so the two cannot drift apart.
model.add(RepeatVector(max_size_dest))
model.add(LSTM(50, return_sequences=True))
model.add(TimeDistributed(Dense(tgt_vocab_size, activation='softmax')))

optimizer = 'adam'
loss = 'categorical_crossentropy'

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 50)            1800      
_________________________________________________________________
lstm (LSTM)                  (None, 50)                20200     
_________________________________________________________________
repeat_vector (RepeatVector) (None, 10, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 50)            20200     
_________________________________________________________________
time_distributed (TimeDistri (None, 10, 12)            612       
Total params: 42,812
Trainable params: 42,812
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
monitor = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Write TensorBoard event files under ./logs.
board = tf.keras.callbacks.TensorBoard(log_dir='logs')

In [6]:
# Upper bound on epochs; the early-stopping callback may end training sooner.
num_epochs = 60

# verbose=2 prints one summary line per epoch, which is the format of the log
# recorded below this cell (verbose=0 would print nothing at all).
model.fit(
    padded_train,
    label_train,
    epochs=num_epochs,
    validation_data=(padded_test, label_test),
    verbose=2,
    callbacks=[monitor, board],
)

Epoch 1/60
282/282 - 4s - loss: 1.8568 - accuracy: 0.3448 - val_loss: 1.3719 - val_accuracy: 0.4842
Epoch 2/60
282/282 - 3s - loss: 1.2314 - accuracy: 0.4810 - val_loss: 1.1912 - val_accuracy: 0.4927
Epoch 3/60
282/282 - 3s - loss: 1.1844 - accuracy: 0.4853 - val_loss: 1.1798 - val_accuracy: 0.4910
Epoch 4/60
282/282 - 3s - loss: 1.1761 - accuracy: 0.4854 - val_loss: 1.1684 - val_accuracy: 0.4960
Epoch 5/60
282/282 - 3s - loss: 1.0290 - accuracy: 0.5777 - val_loss: 0.9586 - val_accuracy: 0.6048
Epoch 6/60
282/282 - 4s - loss: 0.8973 - accuracy: 0.6269 - val_loss: 0.8480 - val_accuracy: 0.6457
Epoch 7/60
282/282 - 3s - loss: 0.8005 - accuracy: 0.6632 - val_loss: 0.7604 - val_accuracy: 0.6842
Epoch 8/60
282/282 - 2s - loss: 0.7237 - accuracy: 0.7001 - val_loss: 0.6989 - val_accuracy: 0.7096
Epoch 9/60
282/282 - 3s - loss: 0.6553 - accuracy: 0.7330 - val_loss: 0.6246 - val_accuracy: 0.7342
Epoch 10/60
282/282 - 2s - loss: 0.5756 - accuracy: 0.7600 - val_loss: 0.5483 - val_accuracy: 0.7723

<tensorflow.python.keras.callbacks.History at 0x7f9d8c1a6290>

In [13]:
# Score the model on the held-out split and report both returned values.
test_scores = model.evaluate(padded_test, label_test)
loss, acc = test_scores
print('Loss:', loss)
print('Acc:', round(acc, 3))

Loss: 0.002300913678482175
Acc: 0.999


In [7]:
# Sanity check on one example: encode -> pad -> predict -> decode.
seq = token_origin.texts_to_sequences(['august 26 1975'])
padded_seq = pad_sequences(seq, maxlen=max_size_orig, padding=padding_type, truncating=truncating_type)

# Take the most likely character id at every output step, then map the ids
# back to text (the decoded characters come back separated by spaces).
char_ids = np.argmax(model.predict(padded_seq), axis=-1)
token_end.sequences_to_texts(char_ids)

['1 9 7 5 - 0 8 - 2 6']

In [8]:
def predict_dates(dates):
    """Translate human-readable dates with the trained model.

    Parameters
    ----------
    dates : str or iterable of str
        One date string, or several. A bare string is treated as a single
        date rather than as a sequence of one-character texts.

    Returns
    -------
    list of str
        One predicted string per input, in input order, with the spaces that
        the tokenizer inserts between characters removed
        (e.g. ``'1975-08-26'``). Empty input yields an empty list.
    """
    # Normalise the input: wrap a lone string, materialise any other iterable.
    if isinstance(dates, str):
        dates = [dates]
    else:
        dates = list(dates)

    # Nothing to translate; avoid calling the model on an empty batch.
    if not dates:
        return []

    seq = token_origin.texts_to_sequences(dates)
    padded_seq = pad_sequences(seq, maxlen=max_size_orig, padding=padding_type, truncating=truncating_type)
    # Most likely character id at each output position.
    char_ids = np.argmax(model.predict(padded_seq), axis=-1)
    converted = token_end.sequences_to_texts(char_ids)
    return [text.replace(' ', '') for text in converted]
                           
# Try the helper on three differently formatted dates.
sample_dates = ['august 26 1975', 'friday july 12 1991', '10 sep 1975']
predict_dates(sample_dates)


['1975-08-26', '1991-07-12', '1975-09-10']