In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/language-translation-englishfrench/eng_-french.csv


# **Important Libraries**

In [2]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Bidirectional, Concatenate, Layer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# **Load Dataset**

In [3]:
# Load dataset
df = pd.read_csv('/kaggle/input/language-translation-englishfrench/eng_-french.csv', names=["English", "French"], header=0)
df.head()

Unnamed: 0,English,French
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


# **Data Cleaning and Spliting**

In [4]:
# Clean English text
def clean_english_text(text):
    text = text.lower()
    contractions = {
        "i'm": "i am", "you're": "you are", "it's": "it is",
        "can't": "cannot", "don't": "do not", "didn't": "did not",
        "i've": "i have", "we're": "we are", "isn't": "is not",
        "won't": "will not", "aren't": "are not"
    }
    for contraction, full_form in contractions.items():
        text = re.sub(r'\b{}\b'.format(contraction), full_form, text)
    text = re.sub(r"[^a-z\s]+", "", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean French text
def clean_french_text(text):
    text = text.lower()
    contractions = {
        "c'est": "ce est", "j'ai": "je ai", "il y a": "il y avoir",
        "n'est": "ne est", "qu'est": "que est", "d'accord": "de accord"
    }
    for contraction, full_form in contractions.items():
        text = re.sub(r'\b{}\b'.format(contraction), full_form, text)
    text = re.sub(r"[^a-z\u00e0\u00e8\u00e9\u00e2\u00ea\u00ee\u00f4\u00fb\u00e7\u00f9\u00ef\u00fc\u0153\s]+", "", text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [5]:
# Apply text cleaning
df["English"] = df["English"].apply(clean_english_text)
df["French"] = df["French"].apply(clean_french_text)

# Add special tokens for French sequences
df["French"] = df["French"].apply(lambda x: f"<start> {x} <end>")

# Extract cleaned sentences
english_sentences = df["English"].tolist()
french_sentences = df["French"].tolist()
print("Cleaned English Sentences:", english_sentences[:5])
print("Cleaned French Sentences:", french_sentences[:5])

Cleaned English Sentences: ['hi', 'run', 'run', 'who', 'wow']
Cleaned French Sentences: ['<start> salut <end>', '<start> cours <end>', '<start> courez <end>', '<start> qui <end>', '<start> ça alors <end>']


In [6]:
# Split data
train_english, test_english, train_french, test_french = train_test_split(
    df["English"], df["French"], test_size=0.2, random_state=42
)

# **Tokenization**

In [7]:
# Preprocessing function
def preprocess_text(tokenizer, texts, max_len):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_len, padding='post')

# Tokenization and Padding
max_vocab_size = 10000
max_sequence_length = 20

english_tokenizer = Tokenizer(num_words=max_vocab_size)
english_tokenizer.fit_on_texts(train_english)

french_tokenizer = Tokenizer(num_words=max_vocab_size)
french_tokenizer.fit_on_texts(train_french)

train_english_padded = preprocess_text(english_tokenizer, train_english, max_sequence_length)
train_french_padded = preprocess_text(french_tokenizer, train_french, max_sequence_length)
test_english_padded = preprocess_text(english_tokenizer, test_english, max_sequence_length)
test_french_padded = preprocess_text(french_tokenizer, test_french, max_sequence_length)

# Prepare decoder target sequences
train_decoder_target_data = train_french_padded[:, 1:]
train_decoder_target_data = pad_sequences(train_decoder_target_data, maxlen=max_sequence_length, padding='post')

# Vocabulary sizes
english_vocab_size = len(english_tokenizer.word_index) + 1
french_vocab_size = len(french_tokenizer.word_index) + 1

# **Model**

In [8]:
# Custom Attention Layer
class AttentionLayer(Layer):
    def call(self, inputs):
        decoder_outputs, encoder_outputs = inputs
        attention_scores = tf.matmul(decoder_outputs, encoder_outputs, transpose_b=True)
        attention_weights = tf.nn.softmax(attention_scores, axis=-1)
        context_vector = tf.matmul(attention_weights, encoder_outputs)
        return context_vector

In [9]:
# Seq2Seq Model with Bidirectional LSTM and Attention Mechanism
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(input_dim=english_vocab_size, output_dim=256, mask_zero=True)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(256, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(input_dim=french_vocab_size, output_dim=256, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

attention_layer = AttentionLayer()
attention_result = attention_layer([decoder_outputs, encoder_outputs])
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_result])

decoder_dense = Dense(french_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()



In [10]:
# Train the model with EarlyStopping callbacks
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3)

model.fit(
    [train_english_padded, train_french_padded],
    np.expand_dims(train_decoder_target_data, -1),
    batch_size=64,
    epochs=5,
    validation_split=0.2,
    callbacks=[early_stopping_callback]
)

Epoch 1/5
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 206ms/step - accuracy: 0.7227 - loss: 2.0938 - val_accuracy: 0.8455 - val_loss: 0.8443
Epoch 2/5
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 206ms/step - accuracy: 0.8616 - loss: 0.7011 - val_accuracy: 0.8773 - val_loss: 0.6015
Epoch 3/5
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 206ms/step - accuracy: 0.8969 - loss: 0.4478 - val_accuracy: 0.8883 - val_loss: 0.5308
Epoch 4/5
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 206ms/step - accuracy: 0.9164 - loss: 0.3329 - val_accuracy: 0.8937 - val_loss: 0.5033
Epoch 5/5
[1m1757/1757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 207ms/step - accuracy: 0.9302 - loss: 0.2627 - val_accuracy: 0.8958 - val_loss: 0.4989


<keras.src.callbacks.history.History at 0x7ef8ea735b40>

# **Evaluate and Prediciton**

In [11]:
# Evaluate the model
test_decoder_target_data = test_french_padded[:, 1:]
test_decoder_target_data = pad_sequences(test_decoder_target_data, maxlen=max_sequence_length, padding='post')
loss, accuracy = model.evaluate(
    [test_english_padded, test_french_padded],
    np.expand_dims(test_decoder_target_data, -1)
)
print(f"Test Loss: {loss:.2f}, Test Accuracy: {accuracy:.2f}")

[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 50ms/step - accuracy: 0.8961 - loss: 0.4906
Test Loss: 0.49, Test Accuracy: 0.90


In [12]:
# Inference models with Attention Mechanism
encoder_model = Model(encoder_inputs, [encoder_outputs] + encoder_states)

decoder_state_input_h = Input(shape=(512,))
decoder_state_input_c = Input(shape=(512,))
decoder_hidden_states_input = Input(shape=(max_sequence_length, 512))

decoder_lstm_outputs, state_h_decoded, state_c_decoded = decoder_lstm(
    decoder_embedding,
    initial_state=[decoder_state_input_h, decoder_state_input_c]
)
attention_result_decoded = attention_layer([decoder_lstm_outputs, decoder_hidden_states_input])
decoder_concat_input_decoded = Concatenate(axis=-1)([decoder_lstm_outputs, attention_result_decoded])
decoder_outputs_decoded = decoder_dense(decoder_concat_input_decoded)

decoder_model = Model(
    [decoder_inputs] + [decoder_state_input_h, decoder_state_input_c, decoder_hidden_states_input],
    [decoder_outputs_decoded] + [state_h_decoded, state_c_decoded]
)

In [13]:
# Reverse lookup for French vocabulary
reverse_french_vocab = {i: word for word, i in french_tokenizer.word_index.items()}

def decode_sequence(input_seq):
    encoder_outputs, state_h, state_c = encoder_model.predict(input_seq, verbose=0)  
    states_value = [state_h, state_c]
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = french_tokenizer.word_index.get('<start>', 0)

    decoded_sentence = ''
    for _ in range(max_sequence_length): 
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value + [encoder_outputs])
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_french_vocab.get(sampled_token_index, '')

        if sampled_word == '<end>':
            break

        # Append the word to the sentence
        decoded_sentence += ' ' + sampled_word
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.replace('end','').strip()

In [14]:
# Create a list to store the actual and predicted translations
translations = []

# Test translations
for i in range(10):
    input_seq = test_english_padded[i:i + 1]
    translated_sentence = decode_sequence(input_seq)
    actual_sentence = test_french.iloc[i]
    
    # Append both actual and predicted sentences to the list
    translations.append({"Actual": actual_sentence, "Predicted": translated_sentence})

# Convert the list to a DataFrame
translations_df = pd.DataFrame(translations)

# Print the DataFrame with actual and predicted translations
translations_df

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

Unnamed: 0,Actual,Predicted
0,<start> prends place <end>,place
1,<start> jaimerais que tom soit là <end>,jespère que tom était là
2,<start> comment sest passée laudition <end>,comment est allé
3,<start> je nai pas dami avec lequel je puisse ...,je nai aucun ami à parler de mes problèmes
4,<start> jaime beaucoup cette jupe puisje lessa...,japprécie vraiment cette jupe peut
5,<start> que lui estil arrivé <end>,ce est lui
6,<start> prends deux cartes de ton choix <end>,prs deux cartes vous
7,<start> je ne crains pas de mourir <end>,je ne suis pas encore effrayé
8,<start> lhorloge sest arrêtée <end>,lhorloge a
9,<start> tu ferais bien de tassurer que ce est ...,vous feriez mieux de assurer que ce est vrai
