## **Importing required libraries (Dependencies)**

In [2]:
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from nltk.translate.bleu_score import sentence_bleu
from numpy import array, argmax, random, take
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer


## **Defining functions to read the file and split it into sentence pairs**

In [3]:
def readtext(filename):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [4]:
def lines(text):
    """Split raw text into rows and each row into its tab-separated fields.

    Leading and trailing whitespace of the whole text is removed first, then
    the text is cut on newline characters and every row is cut on tabs.

    Returns:
        A list with one list of field strings per row.
    """
    return [row.split('\t') for row in text.strip().split('\n')]

In [5]:
# Load the raw tab-separated corpus and turn the parsed rows into a NumPy array
data = readtext("/kaggle/input/french-english/fra.txt")
fra_eng = array(lines(data))

In [28]:
# Confirm the parsed corpus is now a NumPy array
type(fra_eng)

numpy.ndarray

In [29]:
# Inspect the raw rows: English sentence, French sentence, attribution string
fra_eng

array([['Go.', 'Va !',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'],
       ['Go.', 'Marche.',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)'],
       ['Go.', 'En route !',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)'],
       ...,
       ["If someone who doesn't know your background says that you sound like a native speaker, it means they probably noticed something about your speaking that made them realize you weren't a native speaker. In other words, you don't really sound like a native speaker.",
        "Si quelqu'un qui ne connaît pas vos antécédents dit que vous parlez comme un locuteur natif, cela veut dire qu'il a probablement remarqué quelque chose à propos de votre élocution qui lui a fait prendre conscience que vous n'êtes pas un locuteur natif. En d'autres termes, vous ne parlez pas vraiment comme un locuteur natif.",
        'CC-BY 2.0 (Fr

**Taking the first 80,000 sentence pairs for training**

In [6]:
# Keep only the first 80,000 rows (all columns) for this experiment
fra_eng = fra_eng[:80000,:]

In [7]:
# Cleaning the data: remove ASCII punctuation (string.punctuation) from the
# English (column 0) and French (column 1) sentences.
# The translation table is identical for every sentence, so build it once
# instead of once per sentence.
_punct_table = str.maketrans('', '', string.punctuation)
fra_eng[:, 0] = [s.translate(_punct_table) for s in fra_eng[:, 0]]
fra_eng[:, 1] = [s.translate(_punct_table) for s in fra_eng[:, 1]]

In [32]:
# Inspect the array after punctuation removal
fra_eng

array([['Go', 'Va ',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'],
       ['Go', 'Marche',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)'],
       ['Go', 'En route ',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)'],
       ...,
       ['You did the right thing', 'Vous avez fait ce quil fallait',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #1663857 (Spamster) & #1673003 (sacredceltic)'],
       ['You did the right thing', 'Tu as fait ce quil fallait',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #1663857 (Spamster) & #1673006 (sacredceltic)'],
       ['You didnt get very far', 'Tu nes pas allé très loin',
        'CC-BY 2.0 (France) Attribution: tatoeba.org #2283619 (CK) & #2284282 (sacredceltic)']],
      dtype='<U349')

In [33]:
# Lower-case both sentence columns (0 = English, 1 = French)
for col in (0, 1):
    fra_eng[:, col] = [sentence.lower() for sentence in fra_eng[:, col]]

In [34]:
# Number of whitespace-separated words in every English / French sentence
eng_length = [len(sentence.split()) for sentence in fra_eng[:, 0]]
fra_length = [len(sentence.split()) for sentence in fra_eng[:, 1]]

In [35]:
# One row per sentence pair: word counts of the English and French sides
length_df = pd.DataFrame({'eng':eng_length, 'fra':fra_length})

In [36]:
# Distribution of English sentence lengths (in words)
length_df['eng'].value_counts()

eng
4    29984
3    20438
5    19726
2     5989
6     3616
1      159
7       88
Name: count, dtype: int64

**The longest English sentence is 7 words long**

In [37]:
# Distribution of French sentence lengths (in words)
length_df['fra'].value_counts()

fra
4     22881
5     18120
3     17202
6      9893
2      5729
7      3511
1      1240
8      1071
9       267
10       68
11       16
12        1
14        1
Name: count, dtype: int64

**The longest French sentence is 14 words long**

In [38]:
# Tokenization is the process of converting each word in the vocabulary into
# an integer based on frequency of occurrence

def tokenization(lines):
    """Create a Tokenizer, fit it on the given texts and return it."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# English side: fit the tokenizer and size the vocabulary.
eng_tokenizer = tokenization(fra_eng[:, 0])
# +1 because Keras word indices start at 1; 0 is the padding value
eng_vocab_size = len(eng_tokenizer.word_index) + 1

# Longest English sentence in words. Derived from the data rather than
# hard-coded, so it stays correct if the subset size or cleaning changes
# (a stale value would make pad_sequences silently truncate sentences).
eng_l = max(len(sentence.split()) for sentence in fra_eng[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)

# French side: same steps.
fra_tokenizer = tokenization(fra_eng[:, 1])
fra_vocab_size = len(fra_tokenizer.word_index) + 1

# Longest French sentence in words, derived from the data as above.
fra_l = max(len(sentence.split()) for sentence in fra_eng[:, 1])
print('French Vocabulary Size: %d' % fra_vocab_size)


English Vocabulary Size: 7820
French Vocabulary Size: 18312


In [39]:
# Convert texts into integer sequences and pad them up to maxlen
def encode_pad(tokenizer,length,lines):
    """Encode `lines` with `tokenizer` and post-pad every sequence to `length`.

    Args:
        tokenizer: Object providing `texts_to_sequences`.
        length: Value passed to `pad_sequences` as `maxlen`.
        lines: Iterable of sentences to encode.

    Returns:
        Whatever `pad_sequences` returns for the encoded sentences.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [40]:
# Splitting the sentence pairs into training (80%) and testing (20%) data.
# random_state is fixed so the split is reproducible.
# train_test_split is imported in the notebook's top import cell.
train, test = train_test_split(fra_eng, test_size=0.2, random_state=12)

In [41]:

# French sentences (column 1) are the model input, English (column 0) the target.
trainX, testX = (encode_pad(fra_tokenizer, fra_l, split[:, 1]) for split in (train, test))
trainY, testY = (encode_pad(eng_tokenizer, eng_l, split[:, 0]) for split in (train, test))

In [42]:
# Size of the French (input) vocabulary, used for the embedding layer below
fra_vocab_size

18312


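**Now we'll build the Sequential model. The first layer is an embedding layer, which projects each token into an N-dimensional vector space. LSTM (long short-term memory) is a recurrent neural-network architecture: unlike a feed-forward network it has feedback connections, so it can process whole sequences rather than single data points. Here one LSTM encodes the French sentence into a single vector, that vector is repeated once per output timestep, and a second LSTM followed by a softmax layer decodes it into English tokens. Note that the LSTMs used here are unidirectional (`Bidirectional` is imported but not used), so each step only sees the tokens that came before it.**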



In [43]:
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    """Assemble the encoder-decoder translation network.

    Args:
        in_vocab: Size of the input vocabulary (embedding rows).
        out_vocab: Size of the output vocabulary (softmax width).
        in_timesteps: Length of the padded input sequences.
        out_timesteps: Number of output positions to generate.
        units: Embedding dimension and LSTM width.

    Returns:
        The built, not yet compiled, Sequential model.
    """
    network = Sequential([
        # Encoder: embed the tokens (index 0 is masked) and summarise them in one vector
        Embedding(in_vocab, units,  mask_zero=True),
        LSTM(units),
        # Feed that summary vector to the decoder once per output position
        RepeatVector(out_timesteps),
        # Decoder: one softmax over the output vocabulary per position
        LSTM(units, return_sequences=True),
        Dense(out_vocab, activation='softmax'),
    ])
    network.build(input_shape=(None, in_timesteps))
    return network

# 512-unit encoder/decoder translating French (input) into English (output)
model = build_model(
    in_vocab=fra_vocab_size,
    out_vocab=eng_vocab_size,
    in_timesteps=fra_l,
    out_timesteps=eng_l,
    units=512,
)
rms = optimizers.RMSprop(learning_rate=0.001)
# Sparse loss: the targets are integer token ids rather than one-hot vectors
model.compile(loss='sparse_categorical_crossentropy', optimizer=rms)

In [44]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

In [63]:
filename = 'model_best.keras'
# Save the model to disk only when the validation loss reaches a new minimum
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# The targets get a trailing axis of size 1: (samples, timesteps, 1)
history = model.fit(
    trainX,
    np.expand_dims(trainY, axis=-1),
    epochs=50,
    batch_size=512,
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=1,
)

Epoch 1/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 304ms/step - loss: 1.3225
Epoch 1: val_loss improved from inf to 1.79391, saving model to model_best.keras
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 341ms/step - loss: 1.3225 - val_loss: 1.7939
Epoch 2/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step - loss: 1.2953
Epoch 2: val_loss improved from 1.79391 to 1.78225, saving model to model_best.keras
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 320ms/step - loss: 1.2954 - val_loss: 1.7823
Epoch 3/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step - loss: 1.2668
Epoch 3: val_loss did not improve from 1.78225
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 314ms/step - loss: 1.2669 - val_loss: 1.8374
Epoch 4/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293ms/step - loss: 1.2531
Epoch 4: val_loss improved from 1.78225

In [64]:
# Reload the best checkpoint written during training
model = load_model('model_best.keras')
# testX is passed through with its existing (samples, timesteps) shape
preds_probabilities = model.predict(testX)
# Most probable English token id at every output position
preds = preds_probabilities.argmax(axis=-1)

[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step


In [65]:
def get_word(n, tokenizer):
    """Return the word whose tokenizer index is `n`, or None if there is none.

    Args:
        n: Integer token index (a Python int or a NumPy integer).
        tokenizer: Object exposing `word_index` (word -> index) and, when
            available, the reverse map `index_word` (index -> word).

    Returns:
        The matching word, or None when no word has that index.
    """
    # Prefer the reverse map: a single dict lookup instead of scanning the
    # whole vocabulary on every call.
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word is not None:
        return index_word.get(n)

    # Fallback for tokenizer objects that only provide word_index.
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

In [66]:
# Turn every row of predicted token ids back into an English sentence.
preds_text = []
for token_ids in preds:
    words = []
    previous_word = None
    for position, token_id in enumerate(token_ids):
        word = get_word(token_id, eng_tokenizer)
        # Emit an empty string for ids without a word and for a word that
        # merely repeats the previous position. The previous word is
        # remembered rather than looked up a second time.
        if word is None or (position > 0 and word == previous_word):
            words.append('')
        else:
            words.append(word)
        previous_word = word

    preds_text.append(' '.join(words))

In [67]:
# Reference English sentences next to the model's decoded predictions
pred_df = pd.DataFrame({'actual' : test[:,0], 'predicted' : preds_text})
# Show the last 25 test pairs
pred_df.tail(25)

Unnamed: 0,actual,predicted
15975,we took a long walk,we did a walk
15976,youre very open,youre very open
15977,stop acting like a baby,stop acting like a baby
15978,ive got eyes,i am no
15979,its all because of you,its all to you
15980,here drink this,heres this
15981,my life was a wreck,my life was in
15982,im prepared,im free
15983,she has few friends,she has few friends
15984,i like walnuts,i like watching


In [69]:
# Sentence-level BLEU for every (reference, prediction) pair, then the mean.
# sentence_bleu is imported in the notebook's top import cell.
sumn = 0
for actual, predicted in zip(pred_df['actual'], pred_df['predicted']):
    # sentence_bleu takes a list of tokenised references and one tokenised hypothesis
    score = sentence_bleu([actual.split()], predicted.split())
    sumn += score

print("The average BLEU score for the translation is {:.2f} %".format(sumn*100/len(pred_df)))
# Here we have calculated bleu score for every translation and taken an average

The average BLEU score for the translation is 64.46 %
