<a href="https://colab.research.google.com/github/Imran0897/Language_Translation_Eng_to_French/blob/main/Language_Translation_Eng_to_French.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import os
import nltk
nltk.download("stopwords")
nltk.download('punkt')
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM,Embedding,Input,Dense,SpatialDropout1D,Activation
from tensorflow.keras.models import Model,Sequential

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
df = pd.read_csv('/content/drive/MyDrive/Practice/eng_-french.csv')

In [40]:
df

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [41]:
df.columns = ['English','French']
df.head()

Unnamed: 0,English,French
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   English  175621 non-null  object
 1   French   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [43]:
# Data preprocessing

In [44]:
# Clean English column

def clean_english(text):
  text = text.lower()
  text = re.sub('[^a-z!?]'," ",text)
  text = nltk.word_tokenize(text)
  text = " ".join([i.strip() for i in text])
  return text


In [45]:
data=df[:]

In [46]:
clean_english(data.iloc[1,0])

'run !'

In [47]:
# clean french language

def clean_french(text):
  text = text.lower()
  #remove any characters not a-z and ?!,'
  # characters a-z and (éâàçêêëôîû) chars of frensh lang which contain accent
  text = re.sub('[^a-zéâàçêêëôîû!?]'," ",text)
  return text



In [48]:
data.iloc[4,1],clean_french(data.iloc[4,1])

('Ça alors\u202f!', 'ça alors !')

In [49]:
data.iloc[6,1],clean_french(data.iloc[6,1])

("À l'aide\u202f!", 'à l aide !')

In [50]:
data['English'] = data['English'].apply(lambda txt:clean_english(txt))
data['French'] = data['French'].apply(lambda txt:clean_french(txt))


In [51]:
data['English']

0                                                        hi
1                                                     run !
2                                                     run !
3                                                     who ?
4                                                     wow !
                                ...                        
175616    top down economics never works said obama the ...
175617    a carbon footprint is the amount of carbon dio...
175618    death is something that we re often discourage...
175619    since there are usually multiple websites on a...
175620    if someone who doesn t know your background sa...
Name: English, Length: 175621, dtype: object

In [52]:
data['French']

0                                                    salut!
1                                                   cours !
2                                                  courez !
3                                                     qui ?
4                                                ça alors !
                                ...                        
175616      l économie en partant du haut vers le bas  ç...
175617    une empreinte carbone est la somme de pollutio...
175618    la mort est une chose qu on nous décourage sou...
175619    puisqu il y a de multiples sites web sur chaqu...
175620    si quelqu un qui ne connaît pas vos antécédent...
Name: French, Length: 175621, dtype: object

In [53]:
# add <start> <end> token to decoder sentence (French)

data['French'] = data['French'].apply(lambda txt:f"<start> {txt} <end>")

In [54]:
data.tail(10)

Unnamed: 0,English,French
175611,five tremors in excess of magnitude on the ric...,<start> cinq secousses dépassant la magnitude ...
175612,no matter how much you try to convince people ...,<start> peu importe le temps que tu passeras à...
175613,a child who is a native speaker usually knows ...,<start> un enfant qui est un locuteur natif co...
175614,there are four main causes of alcohol related ...,<start> il y a quatre causes principales de dé...
175615,we need to uphold laws against discrimination ...,<start> nous devons faire respecter les lois c...
175616,top down economics never works said obama the ...,<start> l économie en partant du haut vers l...
175617,a carbon footprint is the amount of carbon dio...,<start> une empreinte carbone est la somme de ...
175618,death is something that we re often discourage...,<start> la mort est une chose qu on nous décou...
175619,since there are usually multiple websites on a...,<start> puisqu il y a de multiples sites web s...
175620,if someone who doesn t know your background sa...,<start> si quelqu un qui ne connaît pas vos an...


In [55]:
# tokenization and build vocabalury

english_tokenize=Tokenizer(filters='#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n')
english_tokenize.fit_on_texts(data["English"])

In [56]:
num_encoder_tokens=len(english_tokenize.word_index)
num_encoder_tokens

13905

In [57]:
encoder=english_tokenize.texts_to_sequences(data["English"])

In [58]:
encoder[:5]

[[2752], [417, 124], [417, 124], [76, 5], [3489, 124]]

In [59]:
max_encoder_sequence_len=np.max([len(enc) for enc in encoder])
max_encoder_sequence_len

47

In [60]:
french_tokenize=Tokenizer(filters="#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n")
french_tokenize.fit_on_texts(data["French"])

In [61]:
num_decoder_tokens=len(french_tokenize.word_index)
num_decoder_tokens

24129

In [62]:
decoder=french_tokenize.texts_to_sequences(data["French"])
decoder[:5]

[[2, 15399, 1],
 [2, 551, 40, 1],
 [2, 4807, 40, 1],
 [2, 46, 6, 1],
 [2, 38, 381, 40, 1]]

In [63]:
max_decoder_sequence_len=np.max([len(dec) for dec in decoder])
max_decoder_sequence_len

61

In [64]:
idx_2_txt_encoder={k:i for i,k in english_tokenize.word_index.items()}
idx_2_txt_encoder

{1: 'i',
 2: 'you',
 3: 'to',
 4: 'the',
 5: '?',
 6: 'a',
 7: 't',
 8: 'is',
 9: 'that',
 10: 'tom',
 11: 'it',
 12: 'he',
 13: 's',
 14: 'do',
 15: 'of',
 16: 'this',
 17: 'in',
 18: 'me',
 19: 'have',
 20: 'don',
 21: 'we',
 22: 'was',
 23: 'what',
 24: 'my',
 25: 'can',
 26: 'for',
 27: 'are',
 28: 'm',
 29: 'your',
 30: 'be',
 31: 're',
 32: 'she',
 33: 'want',
 34: 'not',
 35: 'know',
 36: 'like',
 37: 'on',
 38: 'with',
 39: 'they',
 40: 'his',
 41: 'all',
 42: 'did',
 43: 'at',
 44: 'how',
 45: 'go',
 46: 'think',
 47: 'there',
 48: 'll',
 49: 'him',
 50: 've',
 51: 'and',
 52: 'time',
 53: 'about',
 54: 'here',
 55: 'very',
 56: 'didn',
 57: 'get',
 58: 'were',
 59: 'no',
 60: 'as',
 61: 'one',
 62: 'her',
 63: 'will',
 64: 'had',
 65: 'if',
 66: 'why',
 67: 'just',
 68: 'up',
 69: 'out',
 70: 'going',
 71: 'has',
 72: 'd',
 73: 'would',
 74: 'so',
 75: 'need',
 76: 'who',
 77: 'good',
 78: 'mary',
 79: 'tell',
 80: 'let',
 81: 'should',
 82: 'an',
 83: 'see',
 84: 'when',
 85

In [65]:
idx_2_txt_decoder={k:i for i,k in french_tokenize.word_index.items()}
idx_2_txt_decoder

{1: 'end',
 2: 'start',
 3: 'je',
 4: 'de',
 5: 'pas',
 6: '?',
 7: 'est',
 8: 'vous',
 9: 'que',
 10: 'il',
 11: 'à',
 12: 'ne',
 13: 'le',
 14: 'la',
 15: 'tu',
 16: 'ce',
 17: 'a',
 18: 'j',
 19: 'n',
 20: 'tom',
 21: 'l',
 22: 'un',
 23: 'ai',
 24: 'nous',
 25: 'en',
 26: 'd',
 27: 'une',
 28: 's',
 29: 'les',
 30: 'me',
 31: 'suis',
 32: 'c',
 33: 'pour',
 34: 'elle',
 35: 'qu',
 36: 'faire',
 37: 're',
 38: 'ça',
 39: 'm',
 40: '!',
 41: 'dans',
 42: 'y',
 43: 'plus',
 44: 'des',
 45: 'te',
 46: 'qui',
 47: 'moi',
 48: 'tout',
 49: 'veux',
 50: 't',
 51: 'être',
 52: 'fait',
 53: 'était',
 54: 'avec',
 55: 'êtes',
 56: 'mon',
 57: 'du',
 58: 'au',
 59: 'si',
 60: 'se',
 61: 'as',
 62: 'et',
 63: 'avez',
 64: 'sont',
 65: 'cette',
 66: 'ils',
 67: 'es',
 68: 'son',
 69: 'tr',
 70: 'peux',
 71: 'cela',
 72: 'votre',
 73: 'temps',
 74: 'pourquoi',
 75: 'été',
 76: 'dit',
 77: 'sur',
 78: 'lui',
 79: 'ici',
 80: 'ma',
 81: 'pense',
 82: 'sais',
 83: 'chose',
 84: 'jamais',
 85: 'toi'

In [66]:
idx_2_txt_decoder[0]="<pad>"
idx_2_txt_encoder[0]="<pad>"

In [67]:
idx_2_txt_encoder[0]

'<pad>'

In [68]:
# Pad Sequence

In [69]:
encoder_seq=pad_sequences(encoder,maxlen=max_encoder_sequence_len,padding="post")
encoder_seq.shape

(175621, 47)

In [70]:
encoder_seq

array([[2752,    0,    0, ...,    0,    0,    0],
       [ 417,  124,    0, ...,    0,    0,    0],
       [ 417,  124,    0, ...,    0,    0,    0],
       ...,
       [ 607,    8,   99, ...,    0,    0,    0],
       [ 361,   47,   27, ...,    0,    0,    0],
       [  65,  276,   76, ...,    6, 1100, 1448]], dtype=int32)

In [71]:
decoder_inp=pad_sequences([arr[:-1] for arr in decoder],maxlen=max_decoder_sequence_len,padding="post")
decoder_inp.shape

(175621, 61)

In [72]:
[arr[:-1] for arr in decoder][1]

[2, 551, 40]

In [73]:
decoder[3]

[2, 46, 6, 1]

In [74]:
decoder_output=pad_sequences([arr[1:] for arr in decoder],maxlen=max_decoder_sequence_len,padding="post")
decoder_output.shape

(175621, 61)

In [75]:
decoder_output

array([[15399,     1,     0, ...,     0,     0,     0],
       [  551,    40,     1, ...,     0,     0,     0],
       [ 4807,    40,     1, ...,     0,     0,     0],
       ...,
       [   14,   327,     7, ...,     0,     0,     0],
       [ 7523,    10,    42, ...,     0,     0,     0],
       [   59,   173,    22, ...,  2136,     1,     0]], dtype=int32)

In [76]:
[arr[1:] for arr in decoder][3]

[46, 6, 1]

In [77]:
# Design LSTM NN (Encoder & Decoder)

In [78]:
# encoder model
encoder_input=Input(shape=(None,),name="encoder_input_layer")
encoder_embedding=Embedding(num_encoder_tokens,50,input_length=max_encoder_sequence_len,name="encoder_embedding_layer")(encoder_input)
encoder_lstm=LSTM(32,activation="tanh",return_sequences=True,return_state=True,name="encoder_lstm_1_layer")(encoder_embedding)
encoder_lstm2=LSTM(32,activation="tanh",return_state=True,name="encoder_lstm_2_layer")(encoder_lstm)
_,state_h,state_c=encoder_lstm2
encoder_states=[state_h,state_c]

In [79]:
Embedding(num_encoder_tokens,300,input_length=max_encoder_sequence_len,name="encoder_embedding_layer")

<keras.layers.core.embedding.Embedding at 0x7f3181d02020>

In [80]:
Embedding(num_encoder_tokens,300,input_length=max_encoder_sequence_len,name="encoder_embedding_layer")(encoder_input)

<KerasTensor: shape=(None, None, 300) dtype=float32 (created by layer 'encoder_embedding_layer')>

In [82]:
# decoder model
decoder_input=Input(shape=(None,),name="decoder_input_layer")
decoder_embedding=Embedding(num_decoder_tokens,300,input_length=max_decoder_sequence_len,name="decoder_embedding_layer")(decoder_input)
decoder_lstm=LSTM(32,activation="tanh",return_state=True,return_sequences=True,name="decoder_lstm_layer")
decoder_outputs,_,_=decoder_lstm(decoder_embedding,initial_state=encoder_states)
decoder_dense=Dense(num_decoder_tokens+1,activation="softmax",name="deocer_final_layer")
outputs=decoder_dense(decoder_outputs)

In [83]:
model=Model([encoder_input,decoder_input],outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input_layer (InputLaye  [(None, None)]      0           []                               
 r)                                                                                               
                                                                                                  
 encoder_embedding_layer (Embed  (None, None, 50)    695250      ['encoder_input_layer[0][0]']    
 ding)                                                                                            
                                                                                                  
 decoder_input_layer (InputLaye  [(None, None)]      0           []                               
 r)                                                                                           

In [84]:
encoder_seq.shape,decoder_inp.shape,decoder_output.shape

((175621, 47), (175621, 61), (175621, 61))

In [87]:
loss = tf.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])
history=model.fit(
    [encoder_seq,decoder_inp],
    decoder_output,
    epochs=2,
    batch_size=64
)

Epoch 1/2
Epoch 2/2
