#Encoder-Decoder with LSTM for English-to-Hindi translation

#🔧 Requirements
Install required packages:


In [1]:
!pip install tensorflow numpy pandas




#📦 Step 1: Load a Dataset (Toy English-Hindi Pairs)
We'll use a small custom dataset for simplicity:

In [2]:
# Sample English-Hindi translation pairs
data = [
    ["hello", "नमस्ते"],
    ["how are you", "आप कैसे हैं"],
    ["i am fine", "मैं ठीक हूँ"],
    ["what is your name", "आपका नाम क्या है"],
    ["my name is john", "मेरा नाम जॉन है"],
    ["thank you", "धन्यवाद"],
    ["good morning", "सुप्रभात"],
    ["good night", "शुभ रात्रि"]
]


#🧼 Step 2: Preprocessing

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ---------------- Split English and Hindi sentences ----------------
# Assuming 'data' is a list of tuples: [(eng1, hin1), (eng2, hin2), ...]
eng_texts, hin_texts = zip(*data)

# ---------------- Add Start and End Tokens ----------------
# Add <sos> (start of sentence) at beginning of Hindi input
hin_texts_input = ['<sos> ' + text for text in hin_texts]

# Add <eos> (end of sentence) at end of Hindi output
hin_texts_output = [text + ' <eos>' for text in hin_texts]

# ---------------- Tokenization ----------------
# Tokenizer for English
eng_tokenizer = Tokenizer()  # Default filters will remove punctuation etc.
eng_tokenizer.fit_on_texts(eng_texts)  # Build word index
eng_seq = eng_tokenizer.texts_to_sequences(eng_texts)  # Convert text to sequence of integers
eng_seq = pad_sequences(eng_seq, padding='post')  # Pad sequences with zeros at the end

# Tokenizer for Hindi
hin_tokenizer = Tokenizer(filters='')  # Keep all tokens including punctuation
hin_tokenizer.fit_on_texts(hin_texts_input + hin_texts_output)  # Fit on both input and output to ensure all words are in vocab
hin_seq_input = hin_tokenizer.texts_to_sequences(hin_texts_input)  # Tokenize input with <sos>
hin_seq_output = hin_tokenizer.texts_to_sequences(hin_texts_output)  # Tokenize output with <eos>
hin_seq_input = pad_sequences(hin_seq_input, padding='post')  # Pad Hindi input
hin_seq_output = pad_sequences(hin_seq_output, padding='post')  # Pad Hindi output

# ---------------- Vocabulary Sizes ----------------
eng_vocab_size = len(eng_tokenizer.word_index) + 1  # +1 for padding token (index 0)
hin_vocab_size = len(hin_tokenizer.word_index) + 1

# ---------------- Debug Info ----------------
print("English sequence shape:", eng_seq.shape)
print("Hindi input/output shape:", hin_seq_input.shape, hin_seq_output.shape)


English sequence shape: (8, 4)
Hindi input/output shape: (8, 5) (8, 5)


#🧠 Step 3: Define the Model (Encoder-Decoder with LSTM)

In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# -------------------- Model Hyperparameters --------------------
latent_dim = 256  # Dimensionality of the LSTM hidden states

# -------------------- Encoder --------------------
# Define encoder input layer (takes sequences of token IDs)
enc_input = Input(shape=(None,))  # 'None' allows variable-length sequences

# Embedding layer to convert token IDs to dense vectors
enc_emb = Embedding(input_dim=eng_vocab_size, output_dim=latent_dim)(enc_input)

# LSTM layer: only outputs final state, not sequences
enc_lstm, h, c = LSTM(latent_dim, return_state=True)(enc_emb)

# Store encoder's final hidden and cell states
enc_states = [h, c]

# -------------------- Decoder --------------------
# Define decoder input layer (takes target sequence with <sos> tokens)
dec_input = Input(shape=(None,))

# Embedding layer for Hindi tokens
dec_emb = Embedding(input_dim=hin_vocab_size, output_dim=latent_dim)(dec_input)

# LSTM for decoder, returns entire output sequence (for teacher forcing)
# It uses the encoder's final states as initial state
dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dec_output, _, _ = dec_lstm(dec_emb, initial_state=enc_states)

# Final Dense layer with softmax to output probability over vocabulary
dec_dense = Dense(hin_vocab_size, activation='softmax')
dec_output = dec_dense(dec_output)

# -------------------- Full Model --------------------
# The model takes encoder input and decoder input and returns decoder output
model = Model([enc_input, dec_input], dec_output)

# Compile the model with appropriate loss and optimizer
# sparse_categorical_crossentropy expects integer targets (not one-hot)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Display model architecture
model.summary()


#🏋️ Step 4: Training with Teacher Forcing

In [5]:
# -------------------- Reshape Hindi Output --------------------
# The model expects the target output to be 3D: (samples, timesteps, features)
# Our `hin_seq_output` is currently 2D (samples, timesteps)
# We add an extra dimension to represent features = 1 (class ID at each timestep)
hin_seq_output = np.expand_dims(hin_seq_output, -1)

# -------------------- Model Training --------------------
# Fit the model using encoder and decoder input sequences
# Output is the shifted decoder target (with <eos> at end)
# Batch size of 2 is used here for small datasets or testing
# 300 epochs for better convergence (can be tuned based on loss)
model.fit(
    [eng_seq, hin_seq_input],   # Inputs: English input + Hindi input with <sos>
    hin_seq_output,             # Target: Hindi output with <eos>
    batch_size=2,
    epochs=300
)


Epoch 1/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 92ms/step - loss: 2.9755
Epoch 2/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 2.8111
Epoch 3/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 2.5276
Epoch 4/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - loss: 2.2159
Epoch 5/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 2.1243
Epoch 6/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - loss: 1.8076
Epoch 7/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 1.6876
Epoch 8/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 1.4517
Epoch 9/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 1.3473
Epoch 10/300
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 1.5625
Epoch 11/

<keras.src.callbacks.history.History at 0x7823f74625d0>

#🔁 Step 5: Inference Models for Prediction
🔸 Encoder Model

In [6]:
# -------------------- Encoder Inference Model --------------------
# During inference (translation), we only need the encoder to produce the context (hidden and cell states)
# This model takes the input English sequence and outputs the internal states of the encoder LSTM
# These states will be passed as the initial states to the decoder during prediction time
encoder_model = Model(enc_input, enc_states)


🔸 Decoder Model

In [7]:
# -------------------- Decoder Inference Model --------------------

# Define placeholders for the decoder's initial states (coming from encoder)
# These will be fed at each time step during inference (word-by-word decoding)
dec_state_input_h = Input(shape=(latent_dim,))  # Decoder LSTM hidden state
dec_state_input_c = Input(shape=(latent_dim,))  # Decoder LSTM cell state
dec_states_inputs = [dec_state_input_h, dec_state_input_c]

# Reuse the same decoder input layer (word input at each timestep) from training
# But during inference, it will take one word at a time
dec_emb2 = Embedding(input_dim=hin_vocab_size, output_dim=latent_dim)(dec_input)

# Reuse the decoder LSTM layer, now it runs with input word and previous states
# It returns output + new states (which will be fed again into the next time step)
dec_output2, state_h2, state_c2 = dec_lstm(dec_emb2, initial_state=dec_states_inputs)
dec_states2 = [state_h2, state_c2]

# Final dense layer to convert decoder output to vocabulary probabilities
dec_output2 = dec_dense(dec_output2)

# Define the final inference decoder model
# Inputs: current decoder word input + previous states
# Outputs: predicted next word probabilities + updated states for next timestep
decoder_model = Model(
    [dec_input] + dec_states_inputs,
    [dec_output2] + dec_states2
)


#🔤 Step 6: Translate New Sentence

In [8]:
# Create reverse mappings: index → word for English and Hindi vocabularies
reverse_eng_index = {v: k for k, v in eng_tokenizer.word_index.items()}
reverse_hin_index = {v: k for k, v in hin_tokenizer.word_index.items()}
hin_word_index = hin_tokenizer.word_index  # word → index for Hindi

# Function to translate an English sentence to Hindi using trained seq2seq model
def translate_sentence(input_sentence):
    # Step 1: Convert input English sentence to sequence of token IDs
    input_seq = eng_tokenizer.texts_to_sequences([input_sentence])

    # Step 2: Pad the input sequence to match model's expected input shape
    input_seq = pad_sequences(input_seq, maxlen=eng_seq.shape[1], padding='post')

    # Step 3: Encode the input sequence using the encoder model to get initial states
    states_value = encoder_model.predict(input_seq)

    # Step 4: Initialize target sequence with the <sos> token to start decoding
    target_seq = np.zeros((1, 1))  # shape: (batch_size, 1)
    target_seq[0, 0] = hin_word_index['<sos>']

    # Step 5: Initialize an empty string to store the translated Hindi sentence
    decoded_sentence = ''

    # Step 6: Start decoding word-by-word until <eos> or max length is reached
    stop_condition = False
    while not stop_condition:
        # Predict next word using decoder model + current target token + previous states
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the token with highest probability at current time step
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Convert token ID back to the corresponding Hindi word
        sampled_word = reverse_hin_index.get(sampled_token_index, '')

        # If <eos> token is predicted or too long, stop decoding
        if sampled_word == '<eos>' or len(decoded_sentence.split()) > 10:
            stop_condition = True
        else:
            # Append predicted word to the output sentence
            decoded_sentence += sampled_word + ' '

        # Update the target_seq to the predicted word for next time step
        target_seq[0, 0] = sampled_token_index

        # Update states for next time step
        states_value = [h, c]

    # Return the final translated Hindi sentence
    return decoded_sentence.strip()


# ---------- Sample Translations ----------
print(translate_sentence("thank you"))
print(translate_sentence("what is your name"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
धन्यवाद
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
आपका नाम क्या है
