In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input,Embedding, Dense, LSTM
from tensorflow.keras.models import Model

In [12]:
# Loading CSV (columns: 'source', 'target') in chunks 
chunksize = 10000
chunks = pd.read_csv('/kaggle/input/en-fr-translation-dataset/en-fr.csv', chunksize=chunksize)   
source_texts, target_texts = [], []

for chunk in chunks:
    source_texts.extend(chunk['en'].astype(str).to_list())
    target_texts.extend(chunk['fr'].astype(str).to_list())

source_texts = source_texts[:100000]
target_texts = target_texts[:100000]

In [13]:
# Tokenizing
num_words = 10000
src_tokenizer = Tokenizer(num_words=num_words, filters='', oov_token='<OOV>')
tgt_tokenizer = Tokenizer(num_words=num_words, filters='', oov_token='<OOV>')

src_tokenizer.fit_on_texts(source_texts)
tgt_tokenizer.fit_on_texts(target_texts)

src_sequences = src_tokenizer.texts_to_sequences(source_texts)
tgt_sequences = tgt_tokenizer.texts_to_sequences(target_texts)

max_src_len = int(np.percentile([len(seq) for seq in src_sequences], 75))
max_tgt_len =int(np.percentile([len(seq) for seq in tgt_sequences], 75))

src_sequences = pad_sequences(src_sequences, maxlen=max_src_len, padding='post')
tgt_sequences = pad_sequences(tgt_sequences, maxlen=max_tgt_len, padding='post')

In [14]:
src_train, src_val, tgt_train, tgt_val = train_test_split(src_sequences, tgt_sequences, test_size=0.2, random_state = 42)

In [15]:
embed_dim=256
units=512

In [16]:
##Encoder
enco_inputs = Input(shape=(max_src_len,))
enc_emb = Embedding(num_words, embed_dim)(enco_inputs)
enco_lstm, state_h, state_c = LSTM(units, return_state=True)(enc_emb)
encoder_states=[state_h, state_c]

In [17]:
##Decoder
deco_inputs = Input(shape=(max_tgt_len,))
deco_emb = Embedding(num_words, embed_dim)(deco_inputs)
deco_lstm = LSTM(units, return_sequences=True, return_state=True)
deco_outputs, _, _ = deco_lstm(deco_emb, initial_state = encoder_states) 
deco_dense = Dense(num_words, activation = 'softmax')
deco_outputs = deco_dense(deco_outputs)

In [18]:
model = Model([enco_inputs, deco_inputs], deco_outputs)
model.compile(optimizer = 'adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [19]:
# Prepare decoder target data (shifted by one)
tgt_train_out = np.expand_dims(np.roll(tgt_train, -1, axis=1), -1)
tgt_val_out = np.expand_dims(np.roll(tgt_val, -1, axis=1), -1)

In [20]:
chunk_size =10000
total_size = len(src_train)

for i in range(0, total_size, chunk_size):
    print(f"Training on rows {i} to {i+chunk_size}...")
    src_chunk = src_train[i:i+chunk_size]
    tgt_chunk = tgt_train[i:i+chunk_size]

    tgt_out_chunk = np.expand_dims(np.roll(tgt_chunk, -1, axis =1), -1)

    model.fit([src_chunk, tgt_chunk], tgt_out_chunk,
              batch_size=64, epochs=2)

Training on rows 0 to 10000...
Epoch 1/2
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 67ms/step - accuracy: 0.3537 - loss: 5.3569
Epoch 2/2
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 68ms/step - accuracy: 0.4240 - loss: 4.0076
Training on rows 10000 to 20000...
Epoch 1/2
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 70ms/step - accuracy: 0.4373 - loss: 3.8161
Epoch 2/2
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 70ms/step - accuracy: 0.4521 - loss: 3.5477
Training on rows 20000 to 30000...
Epoch 1/2
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 71ms/step - accuracy: 0.4627 - loss: 3.4618
Epoch 2/2
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 70ms/step - accuracy: 0.4755 - loss: 3.2392
Training on rows 30000 to 40000...
Epoch 1/2
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 69ms/step - accuracy: 0.4726 - loss: 3.2787
Epoch 2/2
[1m157

In [21]:
from nltk.translate.bleu_score import sentence_bleu

def decode_sequence(input_seq):
    # For simplicity, just use the model to predict one batch
    pred = model.predict([input_seq, np.zeros((input_seq.shape[0], max_tgt_len))])
    pred_seq = np.argmax(pred, axis=-1)
    return pred_seq

# Evaluate on a few samples
for i in range(5):
    src = src_val[i:i+1]
    tgt = tgt_val[i]
    pred_seq = decode_sequence(src)[0]
    tgt_words = [w for w in tgt if w != 0]
    pred_words = [w for w in pred_seq if w != 0]
    reference = [tgt_tokenizer.sequences_to_texts([tgt_words])[0].split()]
    candidate = tgt_tokenizer.sequences_to_texts([pred_words])[0].split()
    print("BLEU:", sentence_bleu(reference, candidate))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293ms/step
BLEU: 4.117940420322104e-237
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
BLEU: 5.573026331357879e-238
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
BLEU: 0


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
BLEU: 4.634131446844473e-244
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
BLEU: 0
