In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
import numpy as np
import re

# --- Configuration ---
BATCH_SIZE = 64
EPOCHS = 30  # Start with 30, increase to 100+ for better results
LATENT_DIM = 256  # Size of the "context vector"
NUM_SAMPLES = 10000  # Number of sentences to train on
DATA_PATH = 'hin.txt'  # <-- This is our manually downloaded file

# ----------------------------------------
# ## Step 1: Load and Preprocess Data
# ----------------------------------------

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

with open(DATA_PATH, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines[:min(NUM_SAMPLES, len(lines) - 1)]:
    # Tab-separated: English[TAB]Hindi[TAB]Attribution
    parts = line.split('\t')
    if len(parts) < 2:
        continue # Skip lines that don't have both English and Hindi

    input_text = parts[0]
    target_text = parts[1]

    # Clean punctuation
    input_text = re.sub(r"([?.!,])", r" \1", input_text.lower().strip())
    target_text = re.sub(r"([?.!,])", r" \1", target_text.lower().strip())
    
    # We add '[START]' and '[END]' tokens to the *target* (Hindi) sentence.
    # This tells the model when to start and stop translating.
    target_text = f"[START] {target_text} [END]"

    input_texts.append(input_text)
    target_texts.append(target_text)

print(f"Total samples: {len(input_texts)}")
print(f"Example Input (English): {input_texts[0]}")
print(f"Example Target (Hindi): {target_texts[0]}")

# ----------------------------------------
# ## Step 2: Tokenization (Vectorizing)
# ----------------------------------------

# We need to convert our words into numbers (indices)
# We use separate "dictionaries" (Tokenizers) for English and Hindi

# Input (English) Tokenizer
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='', oov_token='<unk>' # <unk> = "unknown word"
)
input_tokenizer.fit_on_texts(input_texts)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)

# Target (Hindi) Tokenizer
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='', oov_token='<unk>'
)
target_tokenizer.fit_on_texts(target_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

# Get key properties for our model
input_word_index = input_tokenizer.word_index
target_word_index = target_tokenizer.word_index

num_encoder_tokens = len(input_word_index) + 1
num_decoder_tokens = len(target_word_index) + 1

max_encoder_seq_length = max(len(seq) for seq in input_sequences)
max_decoder_seq_length = max(len(seq) for seq in target_sequences)

print(f"\nUnique English tokens: {num_encoder_tokens}")
print(f"Unique Hindi tokens: {num_decoder_tokens}")
print(f"Max English sentence length: {max_encoder_seq_length}")
print(f"Max Hindi sentence length: {max_decoder_seq_length}")

# ----------------------------------------
# ## Step 3: Prepare Data for Training
# ----------------------------------------

# Pad all sequences to be the same length
encoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(
    input_sequences, maxlen=max_encoder_seq_length, padding='post'
)

# For the decoder, we need two versions:
# 1. decoder_input_data: The target sentence (e.g., "[START] word1 word2 [END]")
decoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(
    target_sequences, maxlen=max_decoder_seq_length, padding='post'
)

# 2. decoder_target_data: The *answer* the model should predict.
#    This is the same as decoder_input_data, but "shifted" one step.
#    e.g., "word1 word2 [END]"
#    The model sees "[START]" and must predict "word1".
#    The model sees "word1" and must predict "word2".
#
# We use 'sparse_categorical_crossentropy' as our loss, so we don't need
# to one-hot encode. We just need a 3D array of shape:
# (num_samples, max_seq_length, 1) holding the *index* of the correct word.

decoder_target_data = np.zeros(
    (len(target_sequences), max_decoder_seq_length, 1), dtype="float32"
)

for i, seq in enumerate(target_sequences):
    for t in range(len(seq)):
        if t > 0:
            # decoder_target_data[i, t-1, 0] will be the index of word `t`
            decoder_target_data[i, t - 1, 0] = seq[t]


print(f"\n--- Data Shapes ---")
print(f"encoder_input_data shape: {encoder_input_data.shape}")
print(f"decoder_input_data shape: {decoder_input_data.shape}")
print(f"decoder_target_data shape: {decoder_target_data.shape}")

Total samples: 3116
Example Input (English): wow !
Example Target (Hindi): [START] वाह ! [END]

Unique English tokens: 2509
Unique Hindi tokens: 3214
Max English sentence length: 25
Max Hindi sentence length: 29

--- Data Shapes ---
encoder_input_data shape: (3116, 25)
decoder_input_data shape: (3116, 29)
decoder_target_data shape: (3116, 29, 1)


In [3]:

# ----------------------------------------
# ## Step 4: Build the Training Model
# ----------------------------------------

# --- The Encoder ---
# This part reads the English sentence and compresses it into a "context vector".

# 1. Input layer
encoder_inputs = Input(shape=(None,), name='encoder_input')

# 2. Embedding layer: Converts word indices (like 5) into dense vectors (like [0.1, -0.3, ...])
enc_embedding_layer = Embedding(num_encoder_tokens, LATENT_DIM, mask_zero=True)
enc_emb = enc_embedding_layer(encoder_inputs)

# 3. LSTM layer: Processes the sequence.
#    We set 'return_state=True' to get the final "context vector" (h and c states).
encoder_lstm = LSTM(LATENT_DIM, return_state=True, name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# We only need the states (the context) to pass to the decoder.
encoder_states = [state_h, state_c]

# --- The Decoder ---
# This part takes the "context vector" and generates the Hindi sentence.

# 1. Input layer (for the Hindi sentence)
decoder_inputs = Input(shape=(None,), name='decoder_input')

# 2. Embedding layer (for Hindi words)
dec_embedding_layer = Embedding(num_decoder_tokens, LATENT_DIM, mask_zero=True)
dec_emb = dec_embedding_layer(decoder_inputs)

# 3. LSTM layer
#    We set 'return_sequences=True' to get an output at *every* time step (for every word).
#    We give it the 'encoder_states' as its starting point.
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(
    dec_emb, initial_state=encoder_states # <-- This is the seq2seq magic!
)

# 4. Output (Dense) layer
#    This converts the LSTM's output vector into a probability score for
#    *every possible word* in the Hindi vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# --- The Full Training Model ---
# Connects the Encoder and Decoder into one model.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy', # Good for integer targets
    metrics=['accuracy']
)

model.summary()


# ----------------------------------------
# ## Step 5: Train the Model
# ----------------------------------------

print("\n--- Starting Model Training ---")

history = model.fit(
    [encoder_input_data, decoder_input_data],  # Inputs
    decoder_target_data,                       # Target (the "answers")
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2
)

print("--- Model Training Complete ---")
model.save('s2s_model.h5')

# ----------------------------------------
# ## Step 6: Build the Inference (Testing) Models
# ----------------------------------------

# > **Why do we need new models for testing?**
# >
# > The `model` we just trained is for **training**. It uses "Teacher Forcing," which means it gets the *entire* correct Hindi sentence as an input all at once.
# >
# > For **testing** (inference), we don't have the Hindi sentence! We need to generate it *one word at a time*.
# >
# > We do this by splitting our trained model into two parts:
# > 1.  **Encoder Model:** Takes an English sentence and outputs the context vector (states).
# > 2.  **Decoder Model:** Takes the context vector *and* the *last predicted word* to predict the *next* word.

# 1. The Encoder Model (same as before)
encoder_model = Model(encoder_inputs, encoder_states)

# 2. The Decoder Model (this is the new part)
# We need to define new inputs for the states, as they will be fed in a loop
decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Get the embedding of the *single* input word
dec_emb_inference = dec_embedding_layer(decoder_inputs)

# Run the LSTM for *one step* using the previous states
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    dec_emb_inference, initial_state=decoder_states_inputs
)
decoder_states_inf = [state_h_inf, state_c_inf]

# Get the word probabilities
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

# This is the final decoder model for testing
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states_inf
)

print("\n--- Inference Models Built ---")


# ----------------------------------------
# ## Step 7: Test the Model (Translate!)
# ----------------------------------------

# Create reverse lookups to convert indices back to words
reverse_input_word_index = {i: word for word, i in input_word_index.items()}
reverse_target_word_index = {i: word for word, i in target_word_index.items()}

# Get the special [START] and [END] token indices
start_token_index = target_word_index['[start]']
end_token_index = target_word_index['[end]']


def decode_sequence(input_seq):
    # 1. Encode the input sentence to get the "context"
    states_value = encoder_model.predict(input_seq)

    # 2. Start the decoder with just the [START] token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token_index

    stop_condition = False
    decoded_sentence = []

    # 3. Loop to generate words one by one
    while not stop_condition:
        # 4. Predict the next word and get the new states
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value
        )

        # 5. Get the word with the highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_word_index.get(sampled_token_index, '<unk>')

        # 6. Check if we should stop
        if (sampled_word == '[end]' or
            len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True
        else:
            decoded_sentence.append(sampled_word)

        # 7. Update the inputs for the next loop
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index # The new input is the word we just predicted
        states_value = [h, c] # The new state is the state we just got

    return " ".join(decoded_sentence)


# --- Let's test it! ---
print("\n--- Model Testing ---")
for i in range(10):  # Test on 10 random samples
    
    # Get a random test sample
    test_index = np.random.choice(len(input_texts))
    input_seq = encoder_input_data[test_index: test_index + 1] # Needs to be batch shape
    
    original_input = input_texts[test_index]
    original_target = target_texts[test_index]
    
    # Translate
    decoded_sentence = decode_sequence(input_seq)
    
    print("-" * 50)
    print(f"Input:    {original_input}")
    print(f"Original: {original_target.replace('[START]', '').replace('[END]', '').strip()}")
    print(f"Predicted: {decoded_sentence}")


--- Starting Model Training ---
Epoch 1/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 170ms/step - accuracy: 0.4789 - loss: 6.6602 - val_accuracy: 0.6185 - val_loss: 6.0052
Epoch 2/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 163ms/step - accuracy: 0.7631 - loss: 5.2860 - val_accuracy: 0.6244 - val_loss: 5.8713
Epoch 3/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 158ms/step - accuracy: 0.3585 - loss: 4.9886 - val_accuracy: 0.0765 - val_loss: 5.6506
Epoch 4/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 164ms/step - accuracy: 0.0745 - loss: 4.7938 - val_accuracy: 0.0763 - val_loss: 5.6166
Epoch 5/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 167ms/step - accuracy: 0.0752 - loss: 4.7053 - val_accuracy: 0.0759 - val_loss: 5.6544
Epoch 6/30
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 165ms/step - accuracy: 0.0749 - loss: 4.6553 - val_accuracy: 0.0760 - val_loss: 5



--- Model Training Complete ---

--- Inference Models Built ---

--- Model Testing ---
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
--------------------------------------------------
Input:    the house is haunted .
Original: इस घर में भूत है।
Predicted: यह बहुत बहुत है।
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
# the Above Accuracy Can be increased by various methods