In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense


In [2]:
# Training hyperparameters and the cap on how much of the corpus is used.
batch_size = 64       # passed to model.fit as the mini-batch size
epochs = 50           # passed to model.fit as the number of training epochs
latent_dim = 256      # number of units in the encoder and decoder LSTM layers
num_samples = 10_000  # upper bound on the number of corpus lines that are parsed

In [3]:
# Mount Google Drive under /content/drive/ so the corpus file stored there can be opened.
# NOTE(review): relies on the google.colab package, so this cell presumably only runs inside Colab.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
# Read the whole corpus file as UTF-8 text and cut it into one string per line.
# Splitting on '\n' leaves a trailing empty string when the file ends with a newline.
data_path = "/content/drive/MyDrive/project/NMT/fre/fra.txt"
with open(data_path, mode='r', encoding='utf-8') as corpus_file:
  lines = corpus_file.read().split('\n')

In [5]:
# Build parallel lists of source/target sentences and collect the set of
# characters that occurs on each side.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
# At most `num_samples` lines are parsed; the `len(lines) - 1` bound skips the
# last element of `lines`.
for line in lines[: min(num_samples, len(lines) - 1)]:
  fields = line.split('\t')
  if len(fields) < 2:
    # Blank or malformed line: there is no source/target pair to extract.
    continue
  # Only the first two tab-separated columns are used; any additional columns
  # are ignored, so the loop no longer fails on lines without exactly three.
  input_text, target_text = fields[0], fields[1]
  # Wrap the target in a leading '\t' and a trailing '\n'; these act as the
  # start and end markers of every target sequence.
  target_text = '\t' + target_text + '\n'
  input_texts.append(input_text)
  target_texts.append(target_text)
  # set.update adds every character of the string; duplicates are ignored, so
  # no membership pre-check is needed.
  input_characters.update(input_text)
  target_characters.update(target_text)

In [6]:
# Spot-check: the second parsed source sentence.
input_texts[1]

'Go.'

In [7]:
# Spot-check: the matching target sentence, including the '\t' prefix and '\n' suffix added during parsing.
target_texts[1]

'\tMarche.\n'

In [8]:
# Turn the character collections into sorted lists so each character has a fixed position.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
# Vocabulary size on each side.
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)
# Length of the longest sentence on each side.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [9]:
# Lookup tables mapping each character to its position in the sorted vocabulary list.
input_token_index = {char: idx for idx, char in enumerate(input_characters)}
target_token_index = {char: idx for idx, char in enumerate(target_characters)}

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Split the sentence pairs into train and test parts, holding out 30% for testing.
# random_state is fixed so the split is the same on every run.
(train_text, test_text,
 train_labels, test_labels) = train_test_split(
    input_texts,
    target_texts,
    test_size=0.3,
    random_state=2018,
)

In [12]:
# Spot-check: a source sentence from the training split.
train_text[1]

'He is in Tokyo.'

In [13]:
# Spot-check: the target sentence paired with train_text[1].
train_labels[1]

'\tIl est à Tokyo.\n'

In [14]:
# Spot-check: a source sentence from the test split.
test_text[1]

'I shouted.'

In [15]:
# Spot-check: the target sentence paired with test_text[1].
test_labels[1]

"\tJ'ai crié.\n"

In [16]:
# Zero-filled float32 arrays for the training split, indexed as
# (sample, position in sentence, character index).
num_train_samples = len(train_text)
encoder_shape_train = (num_train_samples, max_encoder_seq_length, num_encoder_tokens)
decoder_shape_train = (num_train_samples, max_decoder_seq_length, num_decoder_tokens)
encoder_input_data_train = np.zeros(encoder_shape_train, dtype='float32')
decoder_input_data_train = np.zeros(decoder_shape_train, dtype='float32')
decoder_target_data_train = np.zeros(decoder_shape_train, dtype='float32')

In [17]:
# One-hot encode the training pairs into the pre-allocated arrays.
# Positions past the end of a sentence are filled with the one-hot vector of ' '.
for i, (input_text, target_text) in enumerate(zip(train_text, train_labels)):
  for t, char in enumerate(input_text):
    encoder_input_data_train[i, t, input_token_index[char]] = 1.
  # Padding offsets are computed from the sentence length instead of reusing
  # the loop variable `t` after its loop: for an empty sentence `t` would still
  # hold the value left by the previous sample (or be undefined on the first).
  encoder_input_data_train[i, len(input_text):, input_token_index[' ']] = 1.
  for t, char in enumerate(target_text):
    # Decoder input holds the full target sequence.
    decoder_input_data_train[i, t, target_token_index[char]] = 1.
    if t > 0:
      # Decoder target is the same sequence shifted one step earlier, so it
      # does not contain the first character.
      decoder_target_data_train[i, t - 1, target_token_index[char]] = 1.
  decoder_input_data_train[i, len(target_text):, target_token_index[' ']] = 1.
  # The shifted target is one step shorter, so its padding starts one position
  # earlier; max() keeps the offset non-negative for an empty target.
  decoder_target_data_train[i, max(len(target_text) - 1, 0):, target_token_index[' ']] = 1.

In [18]:
# Shape of one encoded training sample: (max_encoder_seq_length, num_encoder_tokens).
encoder_input_data_train[0].shape

(15, 71)

In [19]:
# Zero-filled float32 arrays for the test split, indexed as
# (sample, position in sentence, character index).
num_test_samples = len(test_text)
encoder_shape_test = (num_test_samples, max_encoder_seq_length, num_encoder_tokens)
decoder_shape_test = (num_test_samples, max_decoder_seq_length, num_decoder_tokens)
encoder_input_data_test = np.zeros(encoder_shape_test, dtype='float32')
decoder_input_data_test = np.zeros(decoder_shape_test, dtype='float32')
decoder_target_data_test = np.zeros(decoder_shape_test, dtype='float32')

In [20]:
# One-hot encode the test pairs into the pre-allocated arrays.
# Positions past the end of a sentence are filled with the one-hot vector of ' '.
for i, (input_text, target_text) in enumerate(zip(test_text, test_labels)):
  for t, char in enumerate(input_text):
    encoder_input_data_test[i, t, input_token_index[char]] = 1.
  # Padding offsets are computed from the sentence length instead of reusing
  # the loop variable `t` after its loop: for an empty sentence `t` would still
  # hold the value left by the previous sample (or be undefined on the first).
  encoder_input_data_test[i, len(input_text):, input_token_index[' ']] = 1.
  for t, char in enumerate(target_text):
    # Decoder input holds the full target sequence.
    decoder_input_data_test[i, t, target_token_index[char]] = 1.
    if t > 0:
      # Decoder target is the same sequence shifted one step earlier, so it
      # does not contain the first character.
      decoder_target_data_test[i, t - 1, target_token_index[char]] = 1.
  decoder_input_data_test[i, len(target_text):, target_token_index[' ']] = 1.
  # The shifted target is one step shorter, so its padding starts one position
  # earlier; max() keeps the offset non-negative for an empty target.
  decoder_target_data_test[i, max(len(target_text) - 1, 0):, target_token_index[' ']] = 1.

In [21]:
# Shape of one encoded test sample: (max_encoder_seq_length, num_encoder_tokens).
encoder_input_data_test[0].shape

(15, 71)

In [22]:
# Encoder: reads a one-hot source sequence and exposes its final LSTM states.
# The time dimension is None, so the input length is not fixed by the model.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# return_state=True makes the layer call return the output plus two state tensors.
encoder = LSTM(latent_dim, return_state = True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# Only the two states are kept here; they are what the decoder is initialised with.
encoder_states = [state_h, state_c]

In [23]:
# Decoder: an LSTM over the one-hot target sequence, started from the encoder states.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# return_sequences=True yields an output for every time step; the two returned
# states are discarded in this cell.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state = encoder_states)
# Softmax layer with one unit per target character.
decoder_dense = Dense(num_decoder_tokens, activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [24]:
# Training model: takes the encoder input and the decoder input and produces
# the decoder's softmax output over target characters.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Keep the History object returned by fit so the per-epoch metrics remain
# available (history.history) instead of only showing its repr as cell output.
history = model.fit(
    [encoder_input_data_train, decoder_input_data_train],
    decoder_target_data_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,  # fraction of the training arrays held out for validation
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f6fc0089e50>

In [25]:
# Score the trained model on the test arrays; the result lists the loss followed by the accuracy metric set in compile.
model.evaluate([encoder_input_data_test, decoder_input_data_test], decoder_target_data_test)



[0.36262035369873047, 0.8940847516059875]