In [137]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [138]:
english_sentences = [
    "hello",
    "how are you",
    "good morning",
    "thank you",
    "welcome"
]

tamil_sentences = [
    "வணக்கம்",
    "நீங்கள் எப்படி இருக்கிறீர்கள்",
    "காலை வணக்கம்",
    "நன்றி",
    "வரவேற்கிறேன்"
]

# Add start and end tokens (plain words)
tamil_sentences = ["startseq " + s + " endseq" for s in tamil_sentences]


In [139]:
eng_tokenizer = Tokenizer(filters='')
tam_tokenizer = Tokenizer(filters='')

eng_tokenizer.fit_on_texts(english_sentences)
tam_tokenizer.fit_on_texts(tamil_sentences)

eng_seq = eng_tokenizer.texts_to_sequences(english_sentences)
tam_seq = tam_tokenizer.texts_to_sequences(tamil_sentences)


In [140]:
max_eng_len = max(len(seq) for seq in eng_seq)
max_tam_len = max(len(seq) for seq in tam_seq)

encoder_input = pad_sequences(eng_seq, maxlen=max_eng_len, padding='post')
decoder_input = pad_sequences(tam_seq, maxlen=max_tam_len, padding='post')

# Decoder output (shifted)
decoder_output = np.zeros_like(decoder_input)
decoder_output[:, :-1] = decoder_input[:, 1:]


In [141]:
latent_dim = 256

encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(len(eng_tokenizer.word_index)+1, latent_dim)
encoder_embedded = encoder_embedding(encoder_inputs)

encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedded)

encoder_states = [state_h, state_c]


In [142]:
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(len(tam_tokenizer.word_index)+1, latent_dim)
decoder_embedded = decoder_embedding(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(
    decoder_embedded,
    initial_state=encoder_states
)

decoder_dense = Dense(len(tam_tokenizer.word_index)+1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [143]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(
    [encoder_input, decoder_input],
    decoder_output,
    batch_size=2,
    epochs=300,
    verbose=1
)


Epoch 1/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - accuracy: 0.2800 - loss: 2.2826
Epoch 2/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.4275 - loss: 2.1410
Epoch 3/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.5025 - loss: 1.8754
Epoch 4/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.5025 - loss: 1.4664
Epoch 5/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.4275 - loss: 1.3919
Epoch 6/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.4525 - loss: 1.2976
Epoch 7/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.4725 - loss: 1.1637
Epoch 8/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.6650 - loss: 1.0968
Epoch 9/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7fd0805b38f0>

In [144]:
encoder_model = Model(encoder_inputs, encoder_states)


In [145]:
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_emb_inf = decoder_embedding(decoder_inputs)

decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    decoder_emb_inf,
    initial_state=decoder_states_inputs
)

decoder_states_inf = [state_h_inf, state_c_inf]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states_inf
)


In [146]:
reverse_tam_index = {v: k for k, v in tam_tokenizer.word_index.items()}

start_token = tam_tokenizer.word_index["startseq"]
end_token = tam_tokenizer.word_index["endseq"]

def translate(sentence):
    seq = eng_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_eng_len, padding='post')

    states = encoder_model.predict(seq, verbose=0)

    target_seq = np.array([[start_token]])
    decoded_sentence = []

    for _ in range(max_tam_len):
        output, h, c = decoder_model.predict([target_seq] + states, verbose=0)
        token = np.argmax(output[0, -1])

        if token == end_token:
            break

        word = reverse_tam_index.get(token, "")
        decoded_sentence.append(word)

        target_seq = np.array([[token]])
        states = [h, c]

    return " ".join(decoded_sentence)


In [148]:
print("English:", "welcome")
print("Tamil  :", translate("welcome"))


English: welcome
Tamil  : வரவேற்கிறேன்
