In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [9]:
# STEP 3: Load and Clean Data
def load_data(path='fra.txt', num_examples = 50000):
    with open(path, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    
    pairs = [line.split('\t')[:2] for line in lines[:num_examples]]
    
    input_texts = []
    target_texts = []
    for eng, fra in pairs:
        input_texts.append(eng)
        target_texts.append(f"<start> {fra} <end>")
    
    return input_texts, target_texts

input_texts, target_texts = load_data()

In [10]:
# STEP 4: Tokenize
input_tokenizer = Tokenizer(filters='')
target_tokenizer = Tokenizer(filters='')

input_tokenizer.fit_on_texts(input_texts)
target_tokenizer.fit_on_texts(target_texts)

input_seqs = input_tokenizer.texts_to_sequences(input_texts)
target_seqs = target_tokenizer.texts_to_sequences(target_texts)


In [11]:
# STEP 5: Pad sequences
input_tensor = pad_sequences(input_seqs, padding='post')
target_tensor = pad_sequences(target_seqs, padding='post')

print("Input tensor shape:", input_tensor.shape)
print("Target tensor shape:", target_tensor.shape)


Input tensor shape: (50000, 7)
Target tensor shape: (50000, 14)


In [12]:
# STEP 7: Vocabulary Sizes
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

print("Input vocab size:", input_vocab_size)
print("Target vocab size:", target_vocab_size)


Input vocab size: 9130
Target vocab size: 17458


In [13]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense


In [14]:
embedding_dim = 256
units = 512

input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1


In [15]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Save encoder states to initialize decoder
encoder_states = [state_h, state_c]


In [16]:
# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(target_vocab_size, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

decoder_dense = Dense(target_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [17]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


In [18]:
# Remove the first <start> token from target and prepare as labels
decoder_target_data = target_tensor[:, 1:]


In [19]:
batch_size = 64
epochs = 10

model.fit(
    [input_tensor, target_tensor[:, :-1]],
    tf.expand_dims(decoder_target_data, -1),
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2
)


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m 482ms/step - loss: 2.7013 - val_loss: 2.0848
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m305s[0m 488ms/step - loss: 1.4937 - val_loss: 1.8286
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 498ms/step - loss: 1.1978 - val_loss: 1.6865
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m302s[0m 483ms/step - loss: 0.9919 - val_loss: 1.6156
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 480ms/step - loss: 0.8232 - val_loss: 1.5638
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m301s[0m 481ms/step - loss: 0.6842 - val_loss: 1.5437
Epoch 7/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m299s[0m 479ms/step - loss: 0.5658 - val_loss: 1.5169
Epoch 8/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m307s[0m 491ms/step - loss: 0.4685 - val_loss: 1.5220
Epoch 9/

<keras.src.callbacks.history.History at 0x1ecf2989ba0>

In [20]:
# Encoder model for inference (takes input sentence, returns hidden states)
encoder_model = Model(encoder_inputs, encoder_states)


In [21]:
# Decoder inputs
decoder_state_input_h = Input(shape=(units,))
decoder_state_input_c = Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Decoder embedding
dec_emb_inf = dec_emb_layer(decoder_inputs)

# LSTM with previous states
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    dec_emb_inf, initial_state=decoder_states_inputs)

decoder_states_inf = [state_h_inf, state_c_inf]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

# Full decoder inference model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states_inf)

In [22]:
target_idx_word = {i: w for w, i in target_tokenizer.word_index.items()}

def decode_sequence(input_seq):
    # Encode the input
    states_value = encoder_model.predict(input_seq)

    # Start with <start> token
    target_seq = np.array([[target_tokenizer.word_index['<start>']]])

    decoded_sentence = ''
    stop_condition = False
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample the token with highest probability (greedy search)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = target_idx_word.get(sampled_token_index, '')

        if sampled_word == '<end>' or len(decoded_sentence.split()) > 50:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        # Update the target sequence and states
        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return decoded_sentence.strip()


In [23]:
# Pick 5 random sentences to test
for i in range(5):
    input_seq = input_tensor[i:i+1]
    decoded = decode_sequence(input_seq)
    
    print(f"Input    : {input_texts[i]}")
    print(f"Predicted: {decoded}")
    print(f"Target   : {target_texts[i].replace('<start>', '').replace('<end>', '').strip()}")
    print("-" * 50)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Input    : Go.
Predicted: en route !
Target   : Va !
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Input    : Go.
Predicted: en route !
Target   : Marche.
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/

In [24]:
import tensorflow as tf
from tensorflow.keras.layers import Layer

class BahdanauAttention(Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis)))

        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, tf.squeeze(attention_weights, -1)


In [25]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm = tf.keras.layers.LSTM(dec_units, return_sequences=True, return_state=True)
        self.attention = BahdanauAttention(dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state_h, state_c = self.lstm(x)
        x = self.fc(output)
        return x, state_h, state_c, attention_weights


In [26]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_attention(attention, input_sentence, predicted_sentence):
    fig = plt.figure(figsize=(10, 8))
    sns.heatmap(attention, xticklabels=input_sentence.split(), yticklabels=predicted_sentence.split(), cmap='viridis')
    plt.xlabel('Input')
    plt.ylabel('Predicted')
    plt.title("Attention Heatmap")
    plt.show()


In [27]:
import matplotlib.pyplot as plt

def plot_training_curves(history):
    plt.figure(figsize=(12, 5))

    # Loss
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Train Loss')
    if 'val_loss' in history.history:
        plt.plot(history.history['val_loss'], label='Val Loss')
    plt.title('Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Accuracy (optional)
    if 'accuracy' in history.history:
        plt.subplot(1, 2, 2)
        plt.plot(history.history['accuracy'], label='Train Acc')
        if 'val_accuracy' in history.history:
            plt.plot(history.history['val_accuracy'], label='Val Acc')
        plt.title('Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()

    plt.tight_layout()
    plt.show()
