In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, TimeDistributed, RepeatVector
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# New Section

In [None]:
# df = pd.read_csv('eng-french_processed.csv')
# df = pd.read_csv('eng-french.csv')
# df = pd.read_csv('eng-french_processed_100k.csv')
df = pd.read_csv('eng-french_processed_50k.csv')
df.head()

Unnamed: 0,English,French
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [None]:
df.shape

(49999, 2)

In [None]:
# Separating the English and French datasets
eng = df['English']
fra = df['French']

In [None]:
eng_word_counter = Counter([word for sentence in eng for word in sentence.split()])
print("Total count of English words:",len([word for sentence in eng for word in sentence.split()]))
print("Count of distinct English words:",len(eng_word_counter))
print("10 most common English words:",list(zip(*eng_word_counter.most_common(10)))[0])

Total count of English words: 192358
Count of distinct English words: 9946
10 most common English words: ('I', 'a', 'you', 'is', 'to', 'the', "I'm", 'He', 'Tom', 'was')


In [None]:
fra_word_counter = Counter([word for sentence in fra for word in sentence.split()])
print("Total count of French words:",len([word for sentence in fra for word in sentence.split()]))
print("Count of distinct French words:",len(fra_word_counter))
print("10 most common French words:",list(zip(*fra_word_counter.most_common(10)))[0])

Total count of French words: 221631
Count of distinct French words: 17389
10 most common French words: ('Je', '?', 'pas', 'de', 'Il', '!', 'est', 'ne', 'le', 'suis')


In [None]:
def word_count(line):
  return len(line.split())

In [None]:
df['English_word_count'] = df['English'].apply(lambda x: word_count(x))
df['French_word_count'] = df['French'].apply(lambda x: word_count(x))

In [None]:
def create_tokenizer(sentences):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(sentences)
  return tokenizer

In [None]:
def max_sentence_length(lines):
  return max(len(sentence.split()) for sentence in lines)

In [None]:
def encode_sequences(tokenizer,sentences,max_sent_len):
  text_to_seq = tokenizer.texts_to_sequences(sentences) # encode sequences with integers
  text_pad_seq = pad_sequences(text_to_seq,maxlen=max_sent_len,padding='post') # pad sequences with 0
  return text_pad_seq

English vocabulary size: 6005
Maximum length of English sentences: 7


In [None]:
# Prepare English tokenizer
eng_tokenizer = create_tokenizer(eng)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
max_eng_sent_len = max_sentence_length(eng)
print("English vocabulary size:", eng_vocab_size)
print("Maximum length of English sentences:", max_eng_sent_len)

In [None]:
# Prepare French tokenizer
fra_tokenizer = create_tokenizer(fra)
fra_vocab_size = len(fra_tokenizer.word_index) + 1
max_fra_sent_len = max_sentence_length(fra)
print("French vocabulary size:", fra_vocab_size)
print("Maximum length of French sentences:", max_fra_sent_len)

French vocabulary size: 12903
Maximum length of French sentences: 14


In [None]:
import pickle
# saving
with open('eng_tokenizer.pickle', 'wb') as handle:
    pickle.dump(eng_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('fra_tokenizer.pickle', 'wb') as handle:
    pickle.dump(fra_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
# with open('tokenizer.pickle', 'rb') as handle:
#     tokenizer = pickle.load(handle)

In [None]:
max_eng_sent_len = 22
max_fra_sent_len = 22

In [None]:
# Perform encoding of sequences
X = encode_sequences(eng_tokenizer,eng,max_eng_sent_len)
y = encode_sequences(fra_tokenizer,fra,max_fra_sent_len)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15,shuffle=True,random_state=101)

In [None]:
def create_model(src_vocab_size,dest_vocab_size,src_timesteps,dest_timesteps,no_units):
  model = Sequential()
  model.add(Embedding(src_vocab_size,no_units,input_length=src_timesteps,mask_zero=True))
  model.add(LSTM(no_units))
  model.add(RepeatVector(dest_timesteps))
  model.add(LSTM(no_units,return_sequences=True))
  model.add(TimeDistributed(Dense(1024,activation='relu')))
  model.add(Dropout(0.2))
  model.add(TimeDistributed(Dense(dest_vocab_size,activation='softmax')))
  return model

In [None]:
model = create_model(eng_vocab_size,fra_vocab_size,max_eng_sent_len,max_fra_sent_len,512)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 22, 512)           3074560   
                                                                 
 lstm_4 (LSTM)               (None, 512)               2099200   
                                                                 
 repeat_vector_2 (RepeatVec  (None, 22, 512)           0         
 tor)                                                            
                                                                 
 lstm_5 (LSTM)               (None, 22, 512)           2099200   
                                                                 
 time_distributed_4 (TimeDi  (None, 22, 1024)          525312    
 stributed)                                                      
                                                                 
 dropout_2 (Dropout)         (None, 22, 1024)         

In [None]:
model.compile(loss=SparseCategoricalCrossentropy(),optimizer='rmsprop',metrics='accuracy')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
es = EarlyStopping(monitor='val_accuracy',patience=3,mode='max',verbose=1)
lr = ReduceLROnPlateau(monitor='val_accuracy',patience=2,mode='max',verbose=1,factor=0.1,min_lr=0.001)

In [72]:
r = model.fit(X_train,
              y_train.reshape(y_train.shape[0],y_train.shape[1],1),
              epochs=5,
              batch_size=512,
              callbacks=[es,lr],
              validation_data=(X_test,y_test.reshape(y_test.shape[0],y_test.shape[1],1)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 5: early stopping


In [73]:
model.evaluate(X_test,y_test.reshape(y_test.shape[0],y_test.shape[1],1))



[1.5479532480239868, 0.802321195602417]

In [None]:
model.save('english_to_french_translator.h5')

In [83]:
translator_model = load_model('english_to_french_translator.h5')
translator_model

<keras.src.engine.sequential.Sequential at 0x7ab079382e90>

In [84]:
sample_test, orig_fra_sent = X_test[:22], y_test[:22]
sample_test[:2], orig_fra_sent[:2]

(array([[  10, 1280,   71,  696,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [  14,    2,  280,   40,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]],
       dtype=int32),
 array([[   4, 1875,    8,  497,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [   2,   26,   51, 9062,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]],
       dtype=int32))

In [85]:
test_predictions = translator_model.predict(sample_test)
test_predictions[:2]



array([[[1.5602694e-03, 5.2095681e-02, 2.5886193e-02, ...,
         8.7083981e-06, 1.0203895e-05, 9.4036577e-06],
        [4.2427011e-02, 3.8045149e-03, 1.6080741e-02, ...,
         1.0707522e-05, 1.1230518e-05, 1.1849377e-05],
        [4.2098337e-01, 1.5743786e-03, 6.6679507e-03, ...,
         7.0639308e-06, 6.3451112e-06, 7.4372279e-06],
        ...,
        [9.9999601e-01, 4.4096303e-08, 4.7194693e-08, ...,
         1.3363821e-12, 1.0148505e-12, 1.2182551e-12],
        [9.9999601e-01, 4.3683269e-08, 4.6753165e-08, ...,
         1.3167478e-12, 9.9990979e-13, 1.2006449e-12],
        [9.9999624e-01, 4.3360529e-08, 4.6408900e-08, ...,
         1.3014672e-12, 9.8827587e-13, 1.1869289e-12]],

       [[1.9760507e-03, 5.0759345e-02, 2.8021658e-02, ...,
         7.8192297e-06, 9.1868333e-06, 8.4632029e-06],
        [6.6630401e-02, 3.7678394e-03, 1.6560014e-02, ...,
         9.7623833e-06, 1.0220115e-05, 1.0799988e-05],
        [5.5746108e-01, 1.3126085e-03, 5.3472789e-03, ...,
         5.033

In [86]:
def convert_pred_to_sent(input_seq):
    sent = ''
    for idx in input_seq:
      if idx:
        sent += fra_tokenizer.index_word[idx] + ' '
    sent = sent[:-1]
    return sent

In [87]:
def convert_idx_to_sent(input_seq,tokenizer):
    sent = ''
    for idx in input_seq:
      if idx:
        sent += tokenizer.index_word[idx] + ' '
    return sent

In [88]:
len(sample_test), len(orig_fra_sent), len(test_predictions)

(22, 22, 22)

In [89]:
orig_eng_text = []
orig_fra_text = []
pred_fra_text = []

for i in range(len(sample_test)):
  orig_eng_text.append(convert_idx_to_sent(sample_test[i],eng_tokenizer))
  pred_fra_text.append(convert_pred_to_sent(np.argmax(test_predictions[i],axis=1)))
  orig_fra_text.append(convert_idx_to_sent(orig_fra_sent[i],fra_tokenizer))

In [90]:
predictions = pd.DataFrame()
predictions['Original English Sentence'] = orig_eng_text
predictions['Original French Sentence'] = orig_fra_text
predictions['Predicted French Sentence'] = pred_fra_text
predictions

Unnamed: 0,Original English Sentence,Original French Sentence,Predicted French Sentence
0,he raised his arm,il leva le bras,tu le
1,are you from here,vous êtes du coin,tu
2,are you a psychologist,es tu psychologue,tu
3,we're taking off,nous sommes en train de décoller,tu
4,she's assertive,elle a de l'assurance,tu
5,this looks like silk,ça ressemble à de la soie,tu
6,don't judge me,ne me jugez pas,tu
7,no one was watching,personne ne regardait,tu le
8,you need not go there,tu n'as pas besoin de t'y rendre,tu pas
9,my tooth hurts,j'ai mal à la dent,tu
