In [21]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import tensorflow
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
import sentencepiece
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense,Input,Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [3]:
# Read the file line by line into a DataFrame
def read_file(where):
    with open(where, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()
    return  pd.DataFrame(lines, columns=["sentence"])

In [4]:
#Data loading 
en_data = read_file("/kaggle/input/machine-translation/europarl-v7.fr-en.en")
print(en_data.head())
fr_data = read_file("/kaggle/input/machine-translation/europarl-v7.fr-en.fr")
print(fr_data.head())

                                            sentence
0                          Resumption of the session
1  I declare resumed the session of the European ...
2  Although, as you will have seen, the dreaded '...
3  You have requested a debate on this subject in...
4  In the meantime, I should like to observe a mi...
                                            sentence
0                              Reprise de la session
1  Je déclare reprise la session du Parlement eur...
2  Comme vous avez pu le constater, le grand "bog...
3  Vous avez souhaité un débat à ce sujet dans le...
4  En attendant, je souhaiterais, comme un certai...


In [5]:
#Just half the dataset
sample_size = 500000
en_data_sample = en_data[:sample_size]
fr_data_sample = fr_data[:sample_size]

In [6]:
#Exporting the english samples to train them into sentencepiece and get vocabulary with 8k in size 
en_data_sample.to_csv('en_sample.txt', index=False, header=False)
sentencepiece.SentencePieceTrainer.train(input='en_sample.txt', model_prefix='bpe_en', vocab_size=10000,
                                        control_symbols=['<start>', '<end>'])

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: en_sample.txt
  input_format: 
  model_prefix: bpe_en
  model_type: UNIGRAM
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  control_symbols: <start>
  control_symbols: <end>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_s

In [7]:
fr_data_sample.to_csv('fr_sample.txt', index=False, header=False)
sentencepiece.SentencePieceTrainer.train(input='fr_sample.txt', model_prefix='bpe_fr', vocab_size=10000,
                                         control_symbols=['<start>', '<end>'])

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: fr_sample.txt
  input_format: 
  model_prefix: bpe_fr
  model_type: UNIGRAM
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  control_symbols: <start>
  control_symbols: <end>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_s

In [8]:
#Model loading
sp_en = sentencepiece.SentencePieceProcessor(model_file='bpe_en.model')
sp_fr = sentencepiece.SentencePieceProcessor(model_file='bpe_fr.model')

start_id = sp_fr.piece_to_id("<start>")
end_id = sp_fr.piece_to_id("<end>")

In [16]:
#Converting the text into list of strings
en_sentences = en_data_sample['sentence'].astype(str).tolist()
fr_sentences = fr_data_sample['sentence'].astype(str).tolist()
#Tokenization
#Encode + Add special tokens
en_tokenized = [sp_en.encode(s, out_type=int) for s in en_sentences]
fr_tokenized = [[start_id] + sp_fr.encode(s, out_type=int) + [end_id] for s in fr_sentences]

#Pad sequences
max_len = 30
en_padded = pad_sequences(en_tokenized, maxlen=max_len, padding='post')
fr_padded = pad_sequences(fr_tokenized, maxlen=max_len+2, padding='post')  # +2 for <start>, <end>

#Shift for decoder input/target
decoder_input = fr_padded[:, :-1] 
decoder_target = fr_padded[:, 1:] 
vocab_size_en = sp_en.get_piece_size()
vocab_size_fr = sp_fr.get_piece_size()

In [10]:
#Encoder Part
embedding_dim = 100 #Hyper paramter to be tuned if needed
lstm_units = 256 #Memory cells -> Neurons

encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(vocab_size_en, embedding_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True,use_cudnn=True)(enc_emb)

encoder_states = [state_h, state_c]

decoder_input_layer = Input(shape=(None,), name='decoder_input')
decoder_embed = Embedding(input_dim=vocab_size_fr,
                          output_dim=embedding_dim,
                          mask_zero=True,
                          name='decoder_embedding')(decoder_input_layer)

decoder_outputs, _, _ = LSTM(units=lstm_units,
                             return_sequences=True,
                             return_state=True,
                             name='decoder_lstm',
                             use_cudnn=True)(decoder_embed, initial_state=encoder_states)

decoder_outputs = Dense(vocab_size_fr, activation='softmax')(decoder_outputs)

model = Model(inputs=[encoder_inputs, decoder_input_layer], outputs=decoder_outputs)


I0000 00:00:1746387502.014913      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [11]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summary
model.summary()

In [12]:
#Early stopping on validation loss
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1)

In [13]:
# Real training data
decoder_input_data = fr_padded[:, :-1]
decoder_target_data = fr_padded[:, 1:]
history = model.fit(
        [en_padded, decoder_input_data],                  # encoder input and decoder input data
        decoder_target_data[..., np.newaxis],             # labels
        batch_size=32,
        epochs=10,
        validation_split=0.1,
        callbacks=[es],
        verbose=1
)

Epoch 1/10


I0000 00:00:1746387508.124728     793 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m14063/14063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s[0m 30ms/step - accuracy: 0.2263 - loss: 4.4749 - val_accuracy: 0.2789 - val_loss: 4.8321
Epoch 2/10
[1m14063/14063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 30ms/step - accuracy: 0.3372 - loss: 3.3285 - val_accuracy: 0.3032 - val_loss: 4.5770
Epoch 3/10
[1m14063/14063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m420s[0m 30ms/step - accuracy: 0.3654 - loss: 2.9914 - val_accuracy: 0.3195 - val_loss: 4.4060
Epoch 4/10
[1m14063/14063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m422s[0m 30ms/step - accuracy: 0.3824 - loss: 2.8089 - val_accuracy: 0.3322 - val_loss: 4.3208
Epoch 5/10
[1m14063/14063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m421s[0m 30ms/step - accuracy: 0.3930 - loss: 2.6845 - val_accuracy: 0.3357 - val_loss: 4.2026
Epoch 6/10
[1m14063/14063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m419s[0m 30ms/step - accuracy: 0.3996 - loss: 2.5930 - val_accuracy: 0.3397 - val_loss: 4.14

In [22]:
no_samples = 5000
bleu = 0
smooth_fn = SmoothingFunction().method1

for i in range(no_samples):
    x1 = model.predict([ en_padded[i+6:i+7],decoder_input_data[i+6:i+7] ],verbose = 0)
    pred_ids = np.argmax(x1[0], axis=-1)  # shape: (sequence_len,)

    # Decode the token ID sequences to text
    reference_ids = decoder_target_data[i+6].tolist()
    reference_text = sp_fr.decode_ids(reference_ids)
    predicted_text = sp_fr.decode_ids(pred_ids.tolist())

    # Tokenize the decoded text for BLEU
    reference_tokens = reference_text.strip().split()
    predicted_tokens = predicted_text.strip().split()

    # Compute BLEU score
    bleu += sentence_bleu([reference_tokens], predicted_tokens,smoothing_function = smooth_fn)
print("Average BLEU Score:", bleu/no_samples)

Average BLEU Score: 0.07070826352751303
