<a href="https://colab.research.google.com/github/Kavyapm1960/project/blob/main/eng_fr_translate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# basic libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# cleaning data
import re
import os
import nltk
nltk.download("stopwords")
nltk.download('punkt')

# save vocabulary in files
import pickle

# tokenization
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Model
from tensorflow.keras.layers import LSTM,Embedding,Input,Dense,SpatialDropout1D,Activation
from tensorflow.keras.models import Model,Sequential

# training model dependanices
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
df=pd.read_csv("/content/drive/MyDrive/eng_-french.csv")
df.columns=["english","frensh"]
df.head()


Unnamed: 0,english,frensh
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [5]:
data=df[:]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   english  175621 non-null  object
 1   frensh   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [6]:
# clean english column
def clean_english(text):
  text=text.lower() # lower case

  # remove any characters not a-z and ?!,'
  text=re.sub(u"[^a-z!?',]"," ",text)

  # word tokenization
  text=nltk.word_tokenize(text)

  # join text
  text=" ".join([i.strip() for i in text])

  return text
clean_english(data.iloc[0,0])

'hi'

In [7]:
data.iloc[1,0],clean_english(data.iloc[1,0])


('Run!', 'run !')

In [8]:
# clean frensh language
def clean_frensh(text):
  text=text.lower() # lower case

  # remove any characters not a-z and ?!,'
  # characters a-z and (éâàçêêëôîû) chars of frensh lang which contain accent
  text=re.sub(u"[^a-zéâàçêêëôîû!?',]"," ",text)

  return text
clean_frensh(data.iloc[0,1])

'salut!'

In [9]:
data.iloc[4,1],clean_frensh(data.iloc[4,1])

('Ça alors\u202f!', 'ça alors !')

In [10]:
data.iloc[6,1],clean_frensh(data.iloc[6,1])

("À l'aide\u202f!", "à l'aide !")

In [11]:
# i show this two functions are ready to apply in dataframe
data["english"]=data["english"].apply(lambda txt:clean_english(txt))
data["frensh"]=data["frensh"].apply(lambda txt:clean_frensh(txt))
# add <start> <end> token to decoder sentence (Frensh)
data["frensh"]=data["frensh"].apply(lambda txt:f"<start> {txt} <end>")

In [12]:
data.sample(10)

Unnamed: 0,english,frensh
154325,i 'm going to work out the problem by myself,<start> je vais résoudre le probl me moi même ...
57346,this plan is fool proof,<start> ce plan est sans faute <end>
75309,i ca n't make ends meet now,<start> je n'arrive pas à joindre les deux bou...
157620,"i know you 're busy , but i could use some help",<start> je sais que tu es occupée mais je ne r...
14693,i know he did it,<start> je sais qu'il l'a fait <end>
104948,she turned down his invitation,<start> elle déclina son invitation <end>
521,do it now,<start> faites le maintenant <end>
125834,is that what you really want to do ?,<start> est ce là ce que vous voulez vraiment ...
137423,i think this translation is incorrect,<start> je pense que cette traduction est inco...
13145,you are the one,<start> tu es l'élu <end>


In [13]:
# english tokenizer
english_tokenize=Tokenizer(filters='#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n')
english_tokenize.fit_on_texts(data["english"])

In [14]:
num_encoder_tokens=len(english_tokenize.word_index)
num_encoder_tokens

13904

In [15]:
encoder=english_tokenize.texts_to_sequences(data["english"])
encoder[:5]

[[2745], [408, 124], [408, 124], [77, 5], [3483, 124]]

In [16]:
max_encoder_sequence_len=np.max([len(enc) for enc in encoder])
max_encoder_sequence_len

47

In [17]:
# frensh tokenizer
french_tokenize=Tokenizer(filters="#$%&()*+,-./:;<=>@[\\]^_`{|}~\t\n")
french_tokenize.fit_on_texts(data["frensh"])

In [18]:
num_decoder_tokens=len(french_tokenize.word_index)
num_decoder_tokens

26942

In [19]:
decoder=french_tokenize.texts_to_sequences(data["frensh"])
decoder[:5]

[[2, 16889, 1],
 [2, 572, 33, 1],
 [2, 5116, 33, 1],
 [2, 39, 6, 1],
 [2, 32, 393, 33, 1]]

In [20]:
max_decoder_sequence_len=np.max([len(dec) for dec in decoder])
max_decoder_sequence_len

57

In [21]:
idx_2_txt_decoder={k:i for i,k in french_tokenize.word_index.items()}
idx_2_txt_decoder[1]

'end'

In [22]:
idx_2_txt_encoder={k:i for i,k in english_tokenize.word_index.items()}
idx_2_txt_encoder[2]

'you'

In [23]:
idx_2_txt_decoder[0]="<pad>"
idx_2_txt_encoder[0]="<pad>"

In [24]:
encoder_seq=pad_sequences(encoder,maxlen=max_encoder_sequence_len,padding="post")
encoder_seq.shape

(175621, 47)

In [25]:
decoder_inp=pad_sequences([arr[:-1] for arr in decoder],maxlen=max_decoder_sequence_len,padding="post")
decoder_inp.shape

(175621, 57)

In [26]:
decoder_output=pad_sequences([arr[1:] for arr in decoder],maxlen=max_decoder_sequence_len,padding="post")
decoder_output.shape

(175621, 57)

In [27]:
print([idx_2_txt_decoder[i] for i in decoder_output[0]])

['salut!', 'end', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [28]:
print([idx_2_txt_encoder[i] for i in encoder_seq[0]])

['hi', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [29]:
# encoder model
encoder_input=Input(shape=(None,),name="encoder_input_layer")
encoder_embedding=Embedding(num_encoder_tokens,300,input_length=max_encoder_sequence_len,name="encoder_embedding_layer")(encoder_input)
encoder_lstm=LSTM(256,activation="tanh",return_sequences=True,return_state=True,name="encoder_lstm_1_layer")(encoder_embedding)
encoder_lstm2=LSTM(256,activation="tanh",return_state=True,name="encoder_lstm_2_layer")(encoder_lstm)
_,state_h,state_c=encoder_lstm2
encoder_states=[state_h,state_c]

In [30]:
# decoder model
decoder_input=Input(shape=(None,),name="decoder_input_layer")
decoder_embedding=Embedding(num_decoder_tokens,300,input_length=max_decoder_sequence_len,name="decoder_embedding_layer")(decoder_input)
decoder_lstm=LSTM(256,activation="tanh",return_state=True,return_sequences=True,name="decoder_lstm_layer")
decoder_outputs,_,_=decoder_lstm(decoder_embedding,initial_state=encoder_states)
decoder_dense=Dense(num_decoder_tokens+1,activation="softmax",name="deocer_final_layer")
outputs=decoder_dense(decoder_outputs)

In [31]:
model=Model([encoder_input,decoder_input],outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input_layer (Input  [(None, None)]               0         []                            
 Layer)                                                                                           
                                                                                                  
 encoder_embedding_layer (E  (None, None, 300)            4171200   ['encoder_input_layer[0][0]'] 
 mbedding)                                                                                        
                                                                                                  
 decoder_input_layer (Input  [(None, None)]               0         []                            
 Layer)                                                                                       

In [35]:
encoder_seq.shape,decoder_inp.shape,decoder_output.shape

((175621, 47), (175621, 57), (175621, 57))

In [38]:
import tensorflow as tf

# Define the loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy()

# Define the optimizer
optimizer = tf.keras.optimizers.RMSprop()

# Compile the model
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Define early stopping callback
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Train the model
history = model.fit(
    [encoder_seq, decoder_inp],
    decoder_output,
    epochs=5,
    batch_size=128,  # Reducing batch size to conserve memory
    callbacks=[callback]  # Including the early stopping callback
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
model.save("Translate_Eng_FR.h5")

  saving_api.save_model(


In [40]:
model.save_weights("/kaggle/working/model_NMT")

In [41]:
def make_references():
  encoder_reference_model=Model(encoder_input,encoder_states)

  decoder_state_h=Input(shape=(256,))
  decoder_state_c=Input(shape=(256,))
  decoder_input_states=[decoder_state_h,decoder_state_c]

  decoder_outputs,state_h,state_c=decoder_lstm(decoder_embedding,initial_state=decoder_input_states)

  decoder_state=[state_h,state_c]
  decoder_outputs=decoder_dense(decoder_outputs)
  decoder_reference_model=Model([decoder_input]+decoder_input_states,[decoder_outputs]+decoder_state)

  return encoder_reference_model,decoder_reference_model

In [42]:
def prepare_text(text):
  text=clean_english(text)

  res=[english_tokenize.word_index[i] for i in text.split(" ")]
  pad=pad_sequences([res],maxlen=max_encoder_sequence_len,padding="post")
  return pad
prepare_text("How are you")

array([[43, 23,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)

In [None]:
for i in range(20):
    enc_model,dec_model=make_references()

    states_value=enc_model(prepare_text(input("Enter text :- ")))

    empty_target_seq=np.zeros((1,1))
    empty_target_seq[0,0]=french_tokenize.word_index["start"]

    stop_condition=False
    decoded_translaition=""

    while not stop_condition:
      dec_output,h,c=dec_model.predict([empty_target_seq]+states_value)
      sampled_word_index=np.argmax(dec_output[0,-1,:])
      sampled_word=None

      for word,index in french_tokenize.word_index.items():
        if sampled_word_index == index:
          decoded_translaition+=' {}'.format(word)
          sampled_word=word

        if sampled_word == "end" or len(decoded_translaition.split(" ")) >= max_decoder_sequence_len:
          stop_condition=True

      empty_target_seq=np.zeros((1,1))
      empty_target_seq[0,0]=sampled_word_index
      states_value=[h,c]
    print(decoded_translaition)

Enter text :- come
 soyez ! end
Enter text :- go
 soyez ! end
Enter text :- are you coming
 vous êtes tr s end
Enter text :- happy birthday
 soyez de la maison end
