In [1]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import pathlib
import random
import string
import re
import numpy as np

import tensorflow.data as tf_data
import tensorflow.strings as tf_strings

import keras
from keras import layers
from keras import ops
from keras.layers import TextVectorization


2025-08-22 08:46:19.501994: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755852379.718681      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755852379.783330      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import pandas as pd

# Path to your CSV file
text_file = "/kaggle/input/english-hindi-dataset/Dataset_English_Hindi.csv"

# Read CSV (assuming first column = English, second column = Hindi)
df = pd.read_csv(text_file, header=None, names=["en", "hi"])

# Preview data
print(df.head())

# Extract English and Hindi sentences as lists
english_texts = df["en"].astype(str).tolist()
hindi_texts   = df["hi"].astype(str).tolist()


        en      hi
0  English   Hindi
1    Help!   बचाओ!
2    Jump.   उछलो.
3    Jump.   कूदो.
4    Jump.  छलांग.


In [5]:
import pandas as pd

# Load CSV file (two columns: English, Hindi)
text_file = "/kaggle/input/english-hindi-dataset/Dataset_English_Hindi.csv"
df = pd.read_csv(text_file, header=None, names=["en", "hi"])

# Prepare sentence pairs with start/end tokens for Hindi
text_pairs = []
for eng, hin in zip(df["en"], df["hi"]):
    hin = "[start] " + str(hin) + " [end]"
    text_pairs.append((str(eng), hin))

print("Sample pairs:")
for i in range(5):
    print(text_pairs[i])


Sample pairs:
('English', '[start] Hindi [end]')
('Help!', '[start] बचाओ! [end]')
('Jump.', '[start] उछलो. [end]')
('Jump.', '[start] कूदो. [end]')
('Jump.', '[start] छलांग. [end]')


In [6]:
import random

# Shuffle the sentence pairs
random.shuffle(text_pairs)

# Split into train/val/test
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples

train_pairs = text_pairs[:num_train_samples]
val_pairs   = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs  = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")


130477 total pairs
91335 training pairs
19571 validation pairs
19571 test pairs


In [22]:
# Characters to strip (punctuation)
strip_chars = string.punctuation + "¿।"   # added Hindi danda "।"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

# Vocabulary & sequence setup
vocab_size = 15000
sequence_length = 20
batch_size = 128


In [23]:
def custom_standardization(input_string):
    # Lowercase + remove punctuation (but keep Devanagari intact)
    lowercase = tf_strings.lower(input_string)
    return tf_strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")

# English vectorizer
eng_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Hindi vectorizer (target)
hin_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,  # +1 for [end] token
    standardize=custom_standardization,
)

# Prepare training data
train_eng_texts = [pair[0] for pair in train_pairs]  # English
train_hin_texts = [pair[1] for pair in train_pairs]  # Hindi (with [start] [end])

# Adapt vectorizers on training texts
eng_vectorization.adapt(train_eng_texts)
hin_vectorization.adapt(train_hin_texts)


In [9]:
def format_dataset(eng, hin):
    eng = eng_vectorization(eng)
    hin = hin_vectorization(hin)
    return (
        {
            "encoder_inputs": eng,
            "decoder_inputs": hin[:, :-1],  # teacher forcing (shifted input)
        },
        hin[:, 1:],  # target (shifted output)
    )

def make_dataset(pairs):
    eng_texts, hin_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    hin_texts = list(hin_texts)
    dataset = tf_data.Dataset.from_tensor_slices((eng_texts, hin_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.cache().shuffle(2048).prefetch(16)

# Training and validation datasets
train_ds = make_dataset(train_pairs)
val_ds   = make_dataset(val_pairs)


In [24]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")


inputs["encoder_inputs"].shape: (64, 20)
inputs["decoder_inputs"].shape: (64, 20)
targets.shape: (64, 20)


In [25]:
import keras.ops as ops

class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = ops.cast(mask[:, None, :], dtype="int32")
        else:
            padding_mask = None

        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "dense_dim": self.dense_dim,
                "num_heads": self.num_heads,
            }
        )
        return config


In [26]:
from keras import layers, ops

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Token embeddings (word embeddings)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim, mask_zero=True
        )
        # Positional embeddings (for sequence positions)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        # inputs.shape = (batch_size, sequence_length)
        length = ops.shape(inputs)[-1]   # dynamic length
        positions = ops.arange(start=0, stop=length, step=1)
        positions = ops.expand_dims(positions, axis=0)   # shape (1, seq_len)
        
        # Embedding lookup
        embedded_tokens = self.token_embeddings(inputs)           # (batch, seq_len, embed_dim)
        embedded_positions = self.position_embeddings(positions)  # (1, seq_len, embed_dim)
        
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        # Mask pad tokens (id=0) for attention
        return ops.not_equal(inputs, 0)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config



In [27]:
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads

        # Self-attention (masked, causal)
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        # Cross-attention (attends to encoder outputs)
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )

        # Feed-forward network
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(latent_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )

        # Layer norms
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()

        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        # Step 1: Causal mask (prevent attending to future tokens)
        causal_mask = self.get_causal_attention_mask(inputs)

        # Step 2: Combine causal + padding mask
        if mask is not None:
            padding_mask = ops.cast(mask[:, None, :], dtype="int32")  # (batch, 1, seq_len)
            combined_mask = ops.minimum(padding_mask, causal_mask)   # apply both
        else:
            combined_mask = causal_mask

        # Step 3: Masked self-attention
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask,   # always apply causal mask here
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # Step 4: Cross-attention with encoder outputs
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=combined_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        # Step 5: Feed-forward
        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        """Generates a lower-triangular (causal) mask"""
        input_shape = ops.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]

        i = ops.arange(seq_len)[:, None]
        j = ops.arange(seq_len)
        mask = ops.cast(i >= j, dtype="int32")   # lower-triangular

        mask = ops.reshape(mask, (1, seq_len, seq_len))  # (1, seq, seq)
        mask = ops.tile(mask, (batch_size, 1, 1))        # (batch, seq, seq)
        return mask

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "latent_dim": self.latent_dim,
                "num_heads": self.num_heads,
            }
        )
        return config


In [28]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Hyperparameters
embed_dim = 256
latent_dim = 2048
num_heads = 8

# Encoder
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs, name="encoder")

# Decoder
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoder_state_inputs = keras.Input(shape=(None, embed_dim), name="encoder_state_inputs")

x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoder_state_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)

decoder = keras.Model(
    [decoder_inputs, encoder_state_inputs], decoder_outputs, name="decoder"
)

# Full Transformer
decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

transformer.summary()


In [29]:
# Number of epochs
epochs = 60  # At least 30 for convergence

# Print model summary
transformer.summary()

# Compile the model
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Fit the model
history = transformer.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds
)


Epoch 1/60


W0000 00:00:1755854917.818545     101 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m 357/1428[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m32s[0m 31ms/step - accuracy: 0.1044 - loss: 6.5658

W0000 00:00:1755854938.656750      99 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m1427/1428[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 36ms/step - accuracy: 0.1342 - loss: 5.9374

W0000 00:00:1755854979.053730     100 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
W0000 00:00:1755854980.052516      99 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m1428/1428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 39ms/step - accuracy: 0.1342 - loss: 5.9368 - val_accuracy: 0.1890 - val_loss: 4.9013
Epoch 2/60
[1m1428/1428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 31ms/step - accuracy: 0.1856 - loss: 4.9619 - val_accuracy: 0.2015 - val_loss: 4.6316
Epoch 3/60
[1m1428/1428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 32ms/step - accuracy: 0.2026 - loss: 4.6924 - val_accuracy: 0.2125 - val_loss: 4.4836
Epoch 4/60
[1m1428/1428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 32ms/step - accuracy: 0.2158 - loss: 4.5253 - val_accuracy: 0.2195 - val_loss: 4.4274
Epoch 5/60
[1m1428/1428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 31ms/step - accuracy: 0.2252 - loss: 4.4163 - val_accuracy: 0.2241 - val_loss: 4.3599
Epoch 6/60
[1m1428/1428[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 31ms/step - accuracy: 0.2345 - loss: 4.3353 - val_accuracy: 0.2280 - val_loss: 4.3619
Epoch 7/60
[1m

In [30]:
# Get Hindi vocabulary and lookup dictionary
hin_vocab = hin_vectorization.get_vocabulary()
hin_index_lookup = dict(zip(range(len(hin_vocab)), hin_vocab))

max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    # Vectorize the English input
    tokenized_input_sentence = eng_vectorization([input_sentence])

    # Start token for Hindi decoding
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Vectorize current decoded Hindi sequence (excluding last token for prediction step)
        tokenized_target_sentence = hin_vectorization([decoded_sentence])[:, :-1]

        # Predict next token
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        # Convert prediction to token index
        sampled_token_index = ops.convert_to_numpy(
            ops.argmax(predictions[0, i, :])
        ).item()

        # Lookup Hindi word
        sampled_token = hin_index_lookup[sampled_token_index]

        # Append to the decoded sequence
        decoded_sentence += " " + sampled_token

        # Stop if end token reached
        if sampled_token == "[end]":
            break

    return decoded_sentence


# Test the translation on some random English sentences
test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(30):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequence(input_sentence)
    print("English:", input_sentence)
    print("Hindi Translation:", translated)
    print("=====================")


English: Jawala Singh , a small farmer from Ludhiana district in Punjab , was not in such a miserable situation three years ago when he sold his tractor to repay a debt .
Hindi Translation: [start] [UNK] सिंह जिला देश में अभी तक तीन साल की उम्र से [UNK] में ही छोटी छोटी तीन महीने पहले
English: (Applause)
Hindi Translation: [start] तालियाँ [end]
English: and on painting day, we all gathered in Nyamirambo,
Hindi Translation: [start] और एक दिन हम सभी को [UNK] पर एक ही पूजा करते हैं [end]
English: But on the whole the supremacy of thought and perception of unity in diversity are precious traits of the Indian mind and they are mirrored in all the cultures which had developed in India .
Hindi Translation: [start] किंतु इसी [UNK] की [UNK] और [UNK] की [UNK] [UNK] [UNK] की एकता का [UNK] के [UNK] के [UNK] से
English: Where do they occur ?
Hindi Translation: [start] जहाँ वे उन्हें कहाँ पाई जाती हैं [end]
English: the worlds percentage is more than 59.5%
Hindi Translation: [start] विश्व का प्रतिशत

In [34]:
# Function to decode a given English sentence into Hindi
def decode_sequence(input_sentence):
    # Vectorize the English input
    tokenized_input_sentence = eng_vectorization([input_sentence])

    # Start token for Hindi decoding
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Vectorize current decoded Hindi sequence (excluding last token for prediction step)
        tokenized_target_sentence = hin_vectorization([decoded_sentence])[:, :-1]

        # Predict next token
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        # Convert prediction to token index
        sampled_token_index = ops.convert_to_numpy(
            ops.argmax(predictions[0, i, :])
        ).item()

        # Lookup Hindi word
        sampled_token = hin_index_lookup[sampled_token_index]

        # Append to the decoded sequence
        decoded_sentence += " " + sampled_token

        # Stop if end token reached
        if sampled_token == "[end]":
            break

    # Clean up [start] and [end] tokens for readability
    return decoded_sentence.replace("[start]", "").replace("[end]", "").strip()


# -----------------------------
# Try your own custom sentences
# -----------------------------
custom_sentences = [
    "How are you?",
    "I am going to school.",
    "What is your name?",
    "Today is a beautiful day.",
    "i love you"
]

for input_sentence in custom_sentences:
    translated = decode_sequence(input_sentence)
    print("English:", input_sentence)
    print("Hindi Translation:", translated)
    print("=====================")


English: How are you?
Hindi Translation: आप क्यों
English: I am going to school.
Hindi Translation: मैं स्कूल जा रहा हूँ
English: What is your name?
Hindi Translation: आपका नाम क्या है
English: Today is a beautiful day.
Hindi Translation: आज बहुत अच्छे पक्षी हैं
English: i love you
Hindi Translation: मुझे पसंद है


In [36]:
transformer.save("eng_hin_transformer.h5")

In [40]:
import gradio as gr
from keras import ops

# Function to decode a given English sentence into Hindi
def decode_sequence(input_sentence):
    # Vectorize the English input
    tokenized_input_sentence = eng_vectorization([input_sentence])

    # Start token for Hindi decoding
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Vectorize current decoded Hindi sequence (excluding last token for prediction step)
        tokenized_target_sentence = hin_vectorization([decoded_sentence])[:, :-1]

        # Predict next token
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        # Convert prediction to token index
        sampled_token_index = ops.convert_to_numpy(
            ops.argmax(predictions[0, i, :])
        ).item()

        # Lookup Hindi word
        sampled_token = hin_index_lookup[sampled_token_index]

        # Append to the decoded sequence
        decoded_sentence += " " + sampled_token

        # Stop if end token reached
        if sampled_token == "[end]":
            break

    # Clean up [start] and [end] tokens for readability
    return decoded_sentence.replace("[start]", "").replace("[end]", "").strip()


# Wrap with Gradio
def translate(input_text):
    return decode_sequence(input_text)

demo = gr.Interface(
    fn=translate,
    inputs=gr.Textbox(lines=2, placeholder="Enter an English sentence..."),
    outputs="text",
    title="English → Hindi Translator",
    description="Enter an English sentence and get its Hindi translation."
)

demo.launch()


* Running on local URL:  http://127.0.0.1:7863
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://438cce0fbb993aca6d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)






Created dataset file at: .gradio/flagged/dataset1.csv
