In [1]:
#importing required libraries
import numpy as np
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

# Data Parsing

In [2]:
# Start with empty containers: English source sentences and French target sentences
input_texts, target_texts = [], []

In [3]:
# Load the parallel corpus, one sentence per line.
# The encoding is given explicitly because the French text contains accented
# characters: relying on the platform default encoding garbles them on systems
# whose default is not UTF-8, and the garbled tokens then end up in the vocabulary.
# The `with` blocks close each file even if reading raises.

# Load English data
with open(r"small_vocab_en.txt", encoding="utf-8") as f1:
    input_texts = f1.readlines()

# Load French data
with open(r"small_vocab_fr.txt", encoding="utf-8") as f2:
    target_texts = f2.readlines()

#printing some example data
# (each line still carries its trailing newline from readlines())
for sample_i in range(2):
    print('English sample {}:  {}'.format(sample_i + 1, input_texts[sample_i]))
    print('French sample {}:  {}\n'.format(sample_i + 1, target_texts[sample_i]))

English sample 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .

French sample 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .


English sample 2:  the united states is usually chilly during july , and it is usually freezing in november .

French sample 2:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .




# Data Preprocessing

In [4]:
# Tokenizing data: one independent word-level tokenizer per language
eng_tokenizer, fre_tokenizer = Tokenizer(), Tokenizer()

# Learn each vocabulary from its own corpus
eng_tokenizer.fit_on_texts(input_texts)
fre_tokenizer.fit_on_texts(target_texts)

# Replace every sentence by its list of integer word ids
prepro_eng = eng_tokenizer.texts_to_sequences(input_texts)
prepro_fre = fre_tokenizer.texts_to_sequences(target_texts)

In [5]:
# Pad at the end ('post') so every sentence of a language has the same length;
# the two languages are padded independently of each other.
prepro_eng, prepro_fre = (
    pad_sequences(sequences, padding='post')
    for sequences in (prepro_eng, prepro_fre)
)

In [6]:
# Summarise the preprocessed data: padded sentence length and vocabulary size per language
max_english_sequence_length, max_french_sequence_length = prepro_eng.shape[1], prepro_fre.shape[1]
english_vocab_size, french_vocab_size = (
    len(tok.word_index) for tok in (eng_tokenizer, fre_tokenizer)
)

print('Data Preprocessed')
for label, value in (
    ("Max English sentence length:", max_english_sequence_length),
    ("Max French sentence length:", max_french_sequence_length),
    ("English vocabulary size:", english_vocab_size),
    ("French vocabulary size:", french_vocab_size),
):
    print(label, value)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 345


# Defining Model

In [8]:
def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size,
                learning_rate=0.003, embedding_dim=128, rnn_units=128,
                dense_units=512, dropout_rate=0.5):
    """Build and compile the encoder-decoder translation model.

    Architecture: Embedding -> bidirectional GRU encoder (final state only)
    -> RepeatVector -> bidirectional GRU decoder (full sequence)
    -> per-timestep Dense/ReLU -> Dropout -> per-timestep softmax.

    Args:
        input_shape: Shape of the padded source id array. Only the entries
            from index 1 onwards are used (index 1 is the sequence length).
        output_sequence_length: Number of timesteps the decoder emits.
        english_vocab_size: Number of rows in the embedding table. The caller
            is expected to include the padding id 0 in this count.
        french_vocab_size: Width of the output softmax, i.e. the number of
            target ids that can be predicted (padding id included).
        learning_rate: Step size passed to the Adam optimizer.
        embedding_dim: Size of each word embedding vector.
        rnn_units: Units per direction in both the encoder and decoder GRU.
        dense_units: Width of the hidden per-timestep Dense layer.
        dropout_rate: Dropout fraction applied before the output layer.

    Returns:
        A compiled Sequential model using sparse categorical cross-entropy
        and reporting accuracy. With the default keyword values it is the
        same network as the previously hard-coded configuration.
    """
    # Build the layers
    model = Sequential()
    # Embedding
    model.add(Embedding(english_vocab_size, embedding_dim, input_length=input_shape[1],
                         input_shape=input_shape[1:]))
    # Encoder: collapses the whole source sentence into one vector
    model.add(Bidirectional(GRU(rnn_units)))
    # Feed that vector to the decoder once per output timestep
    model.add(RepeatVector(output_sequence_length))
    # Decoder
    model.add(Bidirectional(GRU(rnn_units, return_sequences=True)))
    model.add(TimeDistributed(Dense(dense_units, activation='relu')))
    model.add(Dropout(dropout_rate))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

print('Final Model Loaded')

Final Model Loaded


In [9]:
#Initialising Model
# Each vocabulary size gets +1 so that id 0 (the padding id) has a slot too
translator = model_final(
    input_shape=prepro_eng.shape,
    output_sequence_length=prepro_fre.shape[1],
    english_vocab_size=len(eng_tokenizer.word_index) + 1,
    french_vocab_size=len(fre_tokenizer.word_index) + 1,
)
translator.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 128)           25600     
                                                                 
 bidirectional (Bidirection  (None, 256)               198144    
 al)                                                             
                                                                 
 repeat_vector (RepeatVecto  (None, 21, 256)           0         
 r)                                                              
                                                                 
 bidirectional_1 (Bidirecti  (None, 21, 256)           296448    
 onal)                                                           
                                                                 
 time_distributed (TimeDist  (None, 21, 512)           131584    
 ributed)                                               

# Training the Model

In [10]:
# Hold out 33% of the sentence pairs for testing; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    prepro_eng,
    prepro_fre,
    test_size=0.33,
    random_state=42,
)

# Fit on the training portion, with a further 20% of it set aside for validation
translator.fit(
    x=x_train,
    y=y_train,
    batch_size=1024,
    epochs=25,
    validation_split=0.2,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.src.callbacks.History at 0x23409d2bf10>

# Testing the Model

In [11]:
# Score the trained model on the held-out test split.
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
print("test loss and test accuracy are:")
results = translator.evaluate(x=x_test, y=y_test, batch_size=1024)
print(results)

test loss and test accuracy are:
[0.1372891217470169, 0.9607860445976257]


In [21]:
#function for converting ids back to text
def logits_to_text(logits, tokenizer ,mode='default'):
    """Turn word ids, or per-timestep score rows, back into a sentence.

    Args:
        logits: With ``mode='probability_scores'``, a 2-D array with one row
            of scores per timestep; the highest-scoring column of each row is
            taken as the word id. With any other mode, an iterable of word ids.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.
        mode: ``'probability_scores'`` to arg-max each row first; any other
            value treats ``logits`` as ids already.

    Returns:
        The decoded words joined by single spaces; id 0 is shown as ``<PAD>``.
    """
    # Invert word -> id into id -> word, then reserve id 0 for padding
    lookup = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
    lookup[0] = '<PAD>'

    if mode == 'probability_scores':
        word_ids = np.argmax(logits, 1)
    else:
        word_ids = logits

    return ' '.join(lookup[word_id] for word_id in word_ids)

In [24]:
# Translate the first three test sentences and print, for each one,
# the source sentence, the model's output and the reference translation.
predictions = translator.predict(x_test[:3])
for sample_no in range(3):
    source_text = logits_to_text(x_test[sample_no], eng_tokenizer)
    # predictions hold per-timestep scores, so they need the arg-max mode
    machine_text = logits_to_text(predictions[sample_no], fre_tokenizer, mode='probability_scores')
    reference_text = logits_to_text(y_test[sample_no], fre_tokenizer)

    print("English :  ", source_text)
    print("Machine Translation :  ", machine_text)
    print("Correct Translation :  ", reference_text)
    print("*******************************************************************************************")

English :   china is usually busy during september but it is sometimes cold in spring <PAD> <PAD>
Machine Translation :   chine est généralement occupé en septembre mais il est parfois froid au printemps <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Correct Translation :   chine est généralement occupé en septembre mais il est parfois froid au printemps <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
*******************************************************************************************
English :   he dislikes pears and peaches <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Machine Translation :   il n'aime les poires et les pêches <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Correct Translation :   il aime pas les poires et les pêches <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
*******************************************************************************************
English :   the un