#Basic Attention Mechanism with LSTM Encoder-Decoder

#✅ Overview:

We implement the Bahdanau-style (additive) attention manually to enhance the LSTM model using teacher forcing.

📦 Step-by-Step Implementation

1. Install Libraries

In [1]:
!pip install tensorflow numpy pandas



#2. Prepare Data

In [2]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define a small parallel English-Hindi toy dataset
data = [
    ["hello", "नमस्ते"],
    ["how are you", "आप कैसे हैं"],
    ["i am fine", "मैं ठीक हूँ"],
    ["what is your name", "आपका नाम क्या है"],
    ["my name is john", "मेरा नाम जॉन है"],
    ["thank you", "धन्यवाद"],
    ["good morning", "सुप्रभात"],
    ["good night", "शुभ रात्रि"]
]

# Separate English and Hindi texts into two lists
eng_texts, hin_texts = zip(*data)

# Add special start-of-sentence (<sos>) and end-of-sentence (<eos>) tokens to Hindi texts
hin_texts_input = ['<sos> ' + txt for txt in hin_texts]  # For decoder input
hin_texts_output = [txt + ' <eos>' for txt in hin_texts]  # For decoder output (target)

# Initialize tokenizers for English and Hindi
eng_tokenizer = Tokenizer()               # Default filters remove punctuation and lowercase text
hin_tokenizer = Tokenizer(filters='')     # Hindi tokenizer with no filters (to preserve characters)

# Fit tokenizers on respective text data
eng_tokenizer.fit_on_texts(eng_texts)
hin_tokenizer.fit_on_texts(hin_texts_input + hin_texts_output)  # Combine input & output for full vocabulary

# Convert text sequences into integer sequences and pad them to uniform length
eng_seq = pad_sequences(eng_tokenizer.texts_to_sequences(eng_texts), padding='post')         # English input
hin_seq_input = pad_sequences(hin_tokenizer.texts_to_sequences(hin_texts_input), padding='post')   # Hindi decoder input
hin_seq_output = pad_sequences(hin_tokenizer.texts_to_sequences(hin_texts_output), padding='post') # Hindi decoder output

# Calculate vocabulary sizes (+1 for padding/UNK token)
eng_vocab = len(eng_tokenizer.word_index) + 1
hin_vocab = len(hin_tokenizer.word_index) + 1

# Display the shape of the final English sequence matrix
print("Input shape:", eng_seq.shape)


Input shape: (8, 4)


#3. Define Encoder, Decoder with Bahdanau Attention

In [3]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, dot, Activation
from tensorflow.keras.models import Model

# Set the dimensionality of the LSTM hidden units
latent_dim = 256

# ------------------------ ENCODER ------------------------

# Input layer for encoder sequence (English sentences)
enc_input = Input(shape=(None,))  # variable-length input

# Embedding layer to learn word embeddings for encoder input
enc_emb = Embedding(input_dim=eng_vocab, output_dim=latent_dim)(enc_input)

# LSTM layer returns full sequence and final hidden + cell states
# enc_lstm: all hidden states for every time step (used for attention)
# enc_h, enc_c: final hidden and cell states (used to initialize decoder)
enc_lstm, enc_h, enc_c = LSTM(latent_dim, return_sequences=True, return_state=True)(enc_emb)

# Store final states to initialize decoder
enc_states = [enc_h, enc_c]

# ------------------------ DECODER ------------------------

# Input layer for decoder sequence (Hindi input with <sos>)
dec_input = Input(shape=(None,))  # variable-length output sequence

# Embedding layer for decoder input
dec_emb = Embedding(input_dim=hin_vocab, output_dim=latent_dim)(dec_input)

# LSTM decoder receives encoder's final states as initial state
# dec_lstm: all decoder time-step outputs
dec_lstm, _, _ = LSTM(latent_dim, return_sequences=True, return_state=True)(dec_emb, initial_state=enc_states)

# ------------------------ ATTENTION ------------------------

# Compute attention scores using dot product between decoder output and encoder output
# score shape: (batch_size, dec_seq_len, enc_seq_len)
score = dot([dec_lstm, enc_lstm], axes=[2, 2])

# Normalize attention scores to get probabilities
attention_weights = Activation('softmax')(score)  # softmax over encoder steps

# Multiply attention weights with encoder hidden states to get context vector
# context_vector shape: (batch_size, dec_seq_len, latent_dim)
context_vector = dot([attention_weights, enc_lstm], axes=[2, 1])

# ------------------------ CONCATENATION & OUTPUT ------------------------

# Concatenate decoder output and context vector for each time step
# shape becomes (batch_size, dec_seq_len, latent_dim * 2)
dec_concat = Concatenate()([context_vector, dec_lstm])

# Final output layer: softmax over vocabulary to predict next word
output = Dense(hin_vocab, activation='softmax')(dec_concat)

# Define the complete model taking encoder and decoder inputs
model = Model([enc_input, dec_input], output)

# Compile model with Adam optimizer and sparse categorical crossentropy loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Print model architecture
model.summary()


#4. Train the Model

In [4]:
hin_seq_output = np.expand_dims(hin_seq_output, -1)
model.fit([eng_seq, hin_seq_input], hin_seq_output, epochs=300, batch_size=2)


Epoch 1/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 76ms/step - loss: 2.9742
Epoch 2/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - loss: 2.7970
Epoch 3/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - loss: 2.4747
Epoch 4/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - loss: 2.0572
Epoch 5/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step - loss: 1.9061
Epoch 6/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - loss: 1.8022
Epoch 7/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 1.8159
Epoch 8/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 1.5852
Epoch 9/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 1.5441
Epoch 10/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 1.5087
Epoch 11/

<keras.src.callbacks.history.History at 0x7ce394bda2d0>

#Luong Attention Mechanism

✅ Overview:

Luong attention is a multiplicative method where we align decoder hidden state with all encoder states using dot product.

📦 Step-by-Step Implementation

1. Add Luong Attention Mechanism

In [5]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, dot, Activation
from tensorflow.keras.models import Model

latent_dim = 256  # Dimensionality of LSTM hidden states and embeddings

# ---------------- Encoder ----------------
# Define input layer for the encoder (English sentence)
enc_input = Input(shape=(None,))

# Embed the input words (word index → dense vector)
enc_emb = Embedding(input_dim=eng_vocab, output_dim=latent_dim)(enc_input)

# Pass the embeddings to LSTM. Return the full sequence, and the final hidden and cell states
enc_output, enc_h, enc_c = LSTM(latent_dim, return_sequences=True, return_state=True)(enc_emb)
# enc_output: sequence of encoder hidden states (for attention)
# enc_h, enc_c: final states to initialize decoder

# ---------------- Decoder ----------------
# Define input layer for the decoder (Hindi sentence)
dec_input = Input(shape=(None,))

# Embed the decoder input words
dec_emb = Embedding(input_dim=hin_vocab, output_dim=latent_dim)(dec_input)

# Pass embeddings to LSTM, initializing with encoder's final states
dec_output, _, _ = LSTM(latent_dim, return_sequences=True, return_state=True)(dec_emb, initial_state=[enc_h, enc_c])
# dec_output: sequence of decoder hidden states

# ---------------- Luong Attention (Dot Product) ----------------
# Calculate attention scores via dot product between decoder and encoder outputs
score = dot([dec_output, enc_output], axes=[2, 2])  # shape: (batch_size, dec_seq_len, enc_seq_len)

# Apply softmax to get attention weights
attention_weights = Activation('softmax')(score)  # shape: (batch_size, dec_seq_len, enc_seq_len)

# Multiply attention weights with encoder outputs to get context vectors
context = dot([attention_weights, enc_output], axes=[2, 1])  # shape: (batch_size, dec_seq_len, latent_dim)

# ---------------- Context + Decoder Output ----------------
# Concatenate context vectors with decoder LSTM outputs
combined = Concatenate()([context, dec_output])  # shape: (batch_size, dec_seq_len, 2*latent_dim)

# Final Dense layer to generate probabilities over the Hindi vocabulary
output = Dense(hin_vocab, activation='softmax')(combined)

# ---------------- Model Compilation ----------------
# Define the full model with encoder and decoder inputs
model = Model([enc_input, dec_input], output)

# Compile the model with Adam optimizer and sparse categorical crossentropy loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Display the model architecture summary
model.summary()


#2. Train Luong Attention Model

In [6]:
model.fit([eng_seq, hin_seq_input], hin_seq_output, epochs=300, batch_size=2)


Epoch 1/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step - loss: 2.9818
Epoch 2/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 2.8569
Epoch 3/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 2.6389
Epoch 4/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 2.0010
Epoch 5/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 1.9935
Epoch 6/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 1.7375
Epoch 7/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 1.6733
Epoch 8/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 1.4793
Epoch 9/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 1.4000
Epoch 10/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 1.2380
Epoch 11/

<keras.src.callbacks.history.History at 0x7ce391810ed0>

#📝 Summary

| Feature          | Bahdanau Attention       | Luong Attention             |
| ---------------- | ------------------------ | --------------------------- |
| Type             | Additive                 | Multiplicative (Dot)        |
| Extra Parameters | Yes (weights in scoring) | No (simpler, faster)        |
| Suitable for     | Variable alignment       | Simple, symmetric alignment |
