# Here we will write a program to translate English Language to French (LSTM)

In [1]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, LSTM

In [2]:
batch_size = 64       ## batch size for training
epochs = 50            ## number of epoch to train
latent_dims = 256      ## latent dinmensionality of encoding space
num_sample = 10000       ## number of sample to train on

data = 'french.txt'

In [3]:
## vectorize the data

input_texts =[]
target_texts = []
input_characters = set()
target_charecters =set()

with open(data, 'r' ,encoding ='utf-8') as f:
    lines = f.read().split('\n')

In [4]:
lines

['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)',
 'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)',
 'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)',
 'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)',
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)',
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)',
 'Run!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)',
 'Run!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)',
 'Run!\tPrenez vos jambes à vos cous !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077449 (sacredceltic)',
 'Run!\tFile !\tCC-BY 2.0 (France) Attribution: tatoeba.org #90

In [5]:
for line in lines[: min(num_sample, len(lines)-1)]:
    
    input_text, target_text, _ = line.split('\t')

    target_text ='\t' + target_text +'\n'
    
    input_texts.append(input_text)
    target_texts.append(target_text)
    
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    
    for char in target_text:
        if char not in target_charecters:
            target_charecters.add(char)

In [6]:
input_characters     ## try with (target_charecters)

{' ',
 '!',
 '"',
 '$',
 '%',
 '&',
 "'",
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '5',
 '7',
 '8',
 '9',
 ':',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Y',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'é'}

In [7]:
input_texts            
                                ## try with (target_text)

['Go.',
 'Go.',
 'Go.',
 'Go.',
 'Hi.',
 'Hi.',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Who?',
 'Wow!',
 'Wow!',
 'Wow!',
 'Duck!',
 'Duck!',
 'Duck!',
 'Fire!',
 'Help!',
 'Hide.',
 'Hide.',
 'Jump!',
 'Jump.',
 'Stop!',
 'Stop!',
 'Stop!',
 'Wait!',
 'Wait!',
 'Wait!',
 'Wait.',
 'Wait.',
 'Wait.',
 'Wait.',
 'Begin.',
 'Begin.',
 'Go on.',
 'Go on.',
 'Go on.',
 'Hello!',
 'Hello!',
 'I see.',
 'I see.',
 'I try.',
 'I won!',
 'I won!',
 'I won.',
 'Oh no!',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Smile.',
 'Smile.',
 'Smile.',
 'Sorry?',
 'Attack!',
 'Attack!',
 'Attack!',
 'Attack!',
 'Buy it.',
 'Buy it.',
 'Buy it.',
 'Buy it.',
 'Cheers!',
 'Cheers!',
 'Cheers!',
 'Cheers!',
 'Eat it.',
 'Eat it.',
 'Get up.',
 'Get up.',
 'Get up.',
 'Go now.',
 'Go now.',
 'Go now.',
 'Got it!',


In [8]:
input_characters = sorted(list(input_characters))
target_charecters = sorted(list(target_charecters))

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_charecters)

max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

In [9]:
print('Number of sample: ', len(input_texts))
print('Number of unique input tokens: ', num_encoder_tokens)
print('Number of unique output tokens: ', num_decoder_tokens)
print('Max sequence length for inputs: ', max_encoder_seq_length)
print('Max sequence length for outputs: ', max_decoder_seq_length)

Number of sample:  10000
Number of unique input tokens:  71
Number of unique output tokens:  93
Max sequence length for inputs:  15
Max sequence length for outputs:  59


In [10]:
input_token_index = dict([(char, i) for i,char in enumerate(input_characters)])

target_token_index = dict([(char, i) for i,char in enumerate(target_charecters)])

In [11]:
input_token_index

{' ': 0,
 '!': 1,
 '"': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '5': 14,
 '7': 15,
 '8': 16,
 '9': 17,
 ':': 18,
 '?': 19,
 'A': 20,
 'B': 21,
 'C': 22,
 'D': 23,
 'E': 24,
 'F': 25,
 'G': 26,
 'H': 27,
 'I': 28,
 'J': 29,
 'K': 30,
 'L': 31,
 'M': 32,
 'N': 33,
 'O': 34,
 'P': 35,
 'Q': 36,
 'R': 37,
 'S': 38,
 'T': 39,
 'U': 40,
 'V': 41,
 'W': 42,
 'Y': 43,
 'a': 44,
 'b': 45,
 'c': 46,
 'd': 47,
 'e': 48,
 'f': 49,
 'g': 50,
 'h': 51,
 'i': 52,
 'j': 53,
 'k': 54,
 'l': 55,
 'm': 56,
 'n': 57,
 'o': 58,
 'p': 59,
 'q': 60,
 'r': 61,
 's': 62,
 't': 63,
 'u': 64,
 'v': 65,
 'w': 66,
 'x': 67,
 'y': 68,
 'z': 69,
 'é': 70}

In [12]:
target_token_index

{'\t': 0,
 '\n': 1,
 ' ': 2,
 '!': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 ',': 9,
 '-': 10,
 '.': 11,
 '0': 12,
 '1': 13,
 '2': 14,
 '3': 15,
 '5': 16,
 '8': 17,
 '9': 18,
 ':': 19,
 '?': 20,
 'A': 21,
 'B': 22,
 'C': 23,
 'D': 24,
 'E': 25,
 'F': 26,
 'G': 27,
 'H': 28,
 'I': 29,
 'J': 30,
 'K': 31,
 'L': 32,
 'M': 33,
 'N': 34,
 'O': 35,
 'P': 36,
 'Q': 37,
 'R': 38,
 'S': 39,
 'T': 40,
 'U': 41,
 'V': 42,
 'W': 43,
 'Y': 44,
 'a': 45,
 'b': 46,
 'c': 47,
 'd': 48,
 'e': 49,
 'f': 50,
 'g': 51,
 'h': 52,
 'i': 53,
 'j': 54,
 'k': 55,
 'l': 56,
 'm': 57,
 'n': 58,
 'o': 59,
 'p': 60,
 'q': 61,
 'r': 62,
 's': 63,
 't': 64,
 'u': 65,
 'v': 66,
 'w': 67,
 'x': 68,
 'y': 69,
 'z': 70,
 '\xa0': 71,
 '«': 72,
 '»': 73,
 'À': 74,
 'Ç': 75,
 'É': 76,
 'Ê': 77,
 'à': 78,
 'â': 79,
 'ç': 80,
 'è': 81,
 'é': 82,
 'ê': 83,
 'î': 84,
 'ï': 85,
 'ô': 86,
 'ù': 87,
 'û': 88,
 'œ': 89,
 '\u2009': 90,
 '’': 91,
 '\u202f': 92}

In [13]:
encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype ='float32')

decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype ='float32')

decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype ='float32')

In [14]:
encoder_input_data              ## try with ...decoder_input_data  &  decoder_target_data

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [15]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1
        
    encoder_input_data[i, t+1:, input_token_index[' ']] = 1
    
    for t, char in enumerate(target_text):
        ## decoder_target_data  is  ahead of decoder_input_data  by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1
        
        if t > 0:
            ## decoder target data will be ahead by one timestep
            ## and will not include the start character
            
            decoder_target_data[i, t-1, target_token_index[char]] = 1
            
    decoder_input_data[i, t+1:, target_token_index[' ']] = 1
    
    decoder_target_data[i, t:, target_token_index[' ']] = 1

In [16]:
encoder_input_data[0].shape

(15, 71)

In [17]:
## define an input sequence and process it...
encoder_inputs = Input(shape =(None, num_encoder_tokens))

encoder = LSTM(latent_dims, return_state= True)

encoder_outputs, state_h, state_c = encoder(encoder_inputs)

## we discard encoder_outputs and only keep the states...
encoder_states = [state_h, state_c]

In [18]:
## set up the decoder, using "encoder_states" as initial state
decoder_inputs = Input(shape =(None, num_decoder_tokens))

## we set up our decoder to return full output sequences, and to return internal states as well, we don't use the
## return states in the training model, but we will use them in reference
decoder_lstm = LSTM(latent_dims, return_sequences= True, return_state= True)

decoder_outputs, _,_ = decoder_lstm(decoder_inputs, initial_state= encoder_states)


decoder_dense = Dense(num_decoder_tokens, activation= 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [19]:
## define the model that will turn encoder_input_data  & decoder_input_data  into decoder_target_data

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [20]:
## run training
model.compile(optimizer= 'rmsprop', loss= 'categorical_crossentropy', metrics= ['accuracy'])

In [21]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size= batch_size, 
          epochs= epochs, validation_split= 0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x18ff59aea00>

In [32]:
## define sampling models

encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape= (latent_dims,))
decoder_state_input_c =Input(shape =(latent_dims,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state = decoder_states_inputs)

decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

reverse_input_char_index = dict((i,char) for char,i in input_token_index.items())

reverse_target_char_index = dict((i,char) for char,i in target_token_index.items())


def decode_sequence(input_seq):
    ## encode the input as state vectors
    states_value = encoder_model.predict(input_seq)
    
    ## generate empty target sequence of length i
    target_seq = np.zeros((1,1,num_decoder_tokens))
    
    ## populate the first character of target sequence with the start character
    target_seq[0,0, target_token_index['\t']] = 1
    
    
    ## sampling loop for a batch of sequence (to simplify, here we assume a batch of size 1)
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_token, h, c = decoder_model.predict([target_seq] + states_value)
        
        ## sample a taken
        sampled_token_index = np.argmax(output_token[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence = decoded_sentence + sampled_char
        
        
        ## exit condition either hit max length or find stop characte
        if (sampled_char == '\n' or len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True
            
        ## update the target sequence (of length 1)
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] =1
        
        ## update states
        states_value = [h, c]
        
        
    return decoded_sentence


for seq_index in range(100):
    ## take one sequence (part of the training set) for trying out decoding
    input_seq = encoder_input_data[seq_index: seq_index +1]
    
    decoded_sentence = decode_sequence(input_seq)
    
    print('-')
    print('Input sentence: ', input_texts[seq_index])
    print('Decoded sentence: ', decoded_sentence)

-
Input sentence:  Go.
Decoded sentence:  Bouge !

-
Input sentence:  Go.
Decoded sentence:  Bouge !

-
Input sentence:  Go.
Decoded sentence:  Bouge !

-
Input sentence:  Go.
Decoded sentence:  Bouge !

-
Input sentence:  Hi.
Decoded sentence:  Salut.

-
Input sentence:  Hi.
Decoded sentence:  Salut.

-
Input sentence:  Run!
Decoded sentence:  Filez !

-
Input sentence:  Run!
Decoded sentence:  Filez !

-
Input sentence:  Run!
Decoded sentence:  Filez !

-
Input sentence:  Run!
Decoded sentence:  Filez !

-
Input sentence:  Run!
Decoded sentence:  Filez !

-
Input sentence:  Run!
Decoded sentence:  Filez !

-
Input sentence:  Run!
Decoded sentence:  Filez !

-
Input sentence:  Run!
Decoded sentence:  Filez !

-
Input sentence:  Run.
Decoded sentence:  Filez !

-
Input sentence:  Run.
Decoded sentence:  Filez !



-
Input sentence:  Run.
Decoded sentence:  Filez !

-
Input sentence:  Run.
Decoded sentence:  Filez !

-
Input sentence:  Run.
Decoded sentence:  Filez !

-
Input sentence:  Run.
Decoded sentence:  Filez !

-
Input sentence:  Run.
Decoded sentence:  Filez !

-
Input sentence:  Run.
Decoded sentence:  Filez !

-
Input sentence:  Who?
Decoded sentence:  Qui ?

-
Input sentence:  Wow!
Decoded sentence:  Achanons-nous !

-
Input sentence:  Wow!
Decoded sentence:  Achanons-nous !

-
Input sentence:  Wow!
Decoded sentence:  Achanons-nous !

-
Input sentence:  Duck!
Decoded sentence:  Fais !

-
Input sentence:  Duck!
Decoded sentence:  Fais !

-
Input sentence:  Duck!
Decoded sentence:  Fais !

-
Input sentence:  Fire!
Decoded sentence:  Au feu !

-
Input sentence:  Help!
Decoded sentence:  À l'aide !

-
Input sentence:  Hide.
Decoded sentence:  Accument-il !

-
Input sentence:  Hide.
Decoded sentence:  Accument-il !

-
Input sentence:  Jump!
Decoded sentence:  Saute.

-
Input sentence:  Jum

-
Input sentence:  Wait.
Decoded sentence:  Attendez !

-
Input sentence:  Wait.
Decoded sentence:  Attendez !

-
Input sentence:  Wait.
Decoded sentence:  Attendez !

-
Input sentence:  Begin.
Decoded sentence:  Commence.

-
Input sentence:  Begin.
Decoded sentence:  Commence.

-
Input sentence:  Go on.
Decoded sentence:  Va !

-
Input sentence:  Go on.
Decoded sentence:  Va !

-
Input sentence:  Go on.
Decoded sentence:  Va !

-
Input sentence:  Hello!
Decoded sentence:  Bonjour !

-
Input sentence:  Hello!
Decoded sentence:  Bonjour !

-
Input sentence:  I see.
Decoded sentence:  Je vois coirir.

-
Input sentence:  I see.
Decoded sentence:  Je vois coirir.

-
Input sentence:  I try.
Decoded sentence:  J'essaye.

-
Input sentence:  I won!
Decoded sentence:  J'ai gagné !

-
Input sentence:  I won!
Decoded sentence:  J'ai gagné !

-
Input sentence:  I won.
Decoded sentence:  J'ai cain.

-
Input sentence:  Oh no!
Decoded sentence:  Estacentenen !

-
Input sentence:  Relax.
Decoded sente

-
Input sentence:  Relax.
Decoded sentence:  Détends-toi !

-
Input sentence:  Relax.
Decoded sentence:  Détends-toi !

-
Input sentence:  Relax.
Decoded sentence:  Détends-toi !

-
Input sentence:  Relax.
Decoded sentence:  Détends-toi !

-
Input sentence:  Relax.
Decoded sentence:  Détends-toi !

-
Input sentence:  Relax.
Decoded sentence:  Détends-toi !

-
Input sentence:  Smile.
Decoded sentence:  Souriez !

-
Input sentence:  Smile.
Decoded sentence:  Souriez !

-
Input sentence:  Smile.
Decoded sentence:  Souriez !

-
Input sentence:  Sorry?
Decoded sentence:  Va vois ?

-
Input sentence:  Attack!
Decoded sentence:  Attaquez !

-
Input sentence:  Attack!
Decoded sentence:  Attaquez !

-
Input sentence:  Attack!
Decoded sentence:  Attaquez !

-
Input sentence:  Attack!
Decoded sentence:  Attaquez !

-
Input sentence:  Buy it.
Decoded sentence:  Achète-le !

-
Input sentence:  Buy it.
Decoded sentence:  Achète-le !

-
Input sentence:  Buy it.
Decoded sentence:  Achète-le !

-
Input

-
Input sentence:  Get up.
Decoded sentence:  De calme !

-
Input sentence:  Get up.
Decoded sentence:  De calme !

-
Input sentence:  Get up.
Decoded sentence:  De calme !

-
Input sentence:  Go now.
Decoded sentence:  Va chercher Tom.

-
Input sentence:  Go now.
Decoded sentence:  Va chercher Tom.

-
Input sentence:  Go now.
Decoded sentence:  Va chercher Tom.

-
Input sentence:  Got it!
Decoded sentence:  Compris !

-
Input sentence:  Got it!
Decoded sentence:  Compris !

-
Input sentence:  Got it!
Decoded sentence:  Compris !

-
Input sentence:  Got it?
Decoded sentence:  Viai ?

-
Input sentence:  Got it?
Decoded sentence:  Viai ?

