In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Load dataset
df = pd.read_csv("french_tamil_words.csv")
french_words = df['french'].astype(str)
tamil_words = df['tamil'].astype(str)

# Create char-level vocabulary
input_chars = sorted(set("".join(french_words)))
target_chars = sorted(set("".join(tamil_words)) | set(["\t", "\n"]))  # Start (\t) and End (\n) tokens

input_token_index = {ch: i for i, ch in enumerate(input_chars)}
target_token_index = {ch: i for i, ch in enumerate(target_chars)}
reverse_target_index = {i: ch for ch, i in target_token_index.items()}

max_encoder_seq_length = 5  # all French words are 5-letter
max_decoder_seq_length = max([len(txt) for txt in tamil_words]) + 2  # + start and end

# Vectorize input and target data
encoder_input_data = np.zeros((len(french_words), max_encoder_seq_length, len(input_chars)))
decoder_input_data = np.zeros((len(french_words), max_decoder_seq_length, len(target_chars)))
decoder_target_data = np.zeros((len(french_words), max_decoder_seq_length, len(target_chars)))

for i, (input_text, target_text) in enumerate(zip(french_words, tamil_words)):
    target_text = '\t' + target_text + '\n'
    for t, ch in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[ch]] = 1
    for t, ch in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[ch]] = 1
        if t > 0:
            decoder_target_data[i, t - 1, target_token_index[ch]] = 1

# Build the model
latent_dim = 256

encoder_inputs = Input(shape=(None, len(input_chars)))
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, len(target_chars)))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(len(target_chars), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=16, epochs=300)

# Save model
model.save("french_to_tamil_model.h5")


2025-05-29 18:35:37.297154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748523937.823360  108763 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748523937.974880  108763 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748523939.629876  108763 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748523939.629949  108763 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748523939.629953  108763 computation_placer.cc:177] computation placer alr

Epoch 1/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 77ms/step - loss: 1.7875
Epoch 2/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 69ms/step - loss: 1.5524
Epoch 3/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - loss: 1.4715
Epoch 4/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 68ms/step - loss: 1.4130
Epoch 5/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 60ms/step - loss: 1.3816
Epoch 6/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 57ms/step - loss: 1.3591
Epoch 7/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 56ms/step - loss: 1.3154
Epoch 8/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 56ms/step - loss: 1.2302
Epoch 9/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 71ms/step - loss: 1.1902
Epoch 10/300
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 59ms/step - l



In [2]:
# Inference models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1, len(target_chars)))
    target_seq[0, 0, target_token_index['\t']] = 1

    decoded_sentence = ''
    for _ in range(max_decoder_seq_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_index[sampled_token_index]
        if sampled_char == '\n':
            break
        decoded_sentence += sampled_char

        target_seq = np.zeros((1, 1, len(target_chars)))
        target_seq[0, 0, sampled_token_index] = 1
        states_value = [h, c]

    return decoded_sentence


In [3]:
import tkinter as tk

def vectorize_input(word):
    vec = np.zeros((1, max_encoder_seq_length, len(input_chars)))
    for t, ch in enumerate(word):
        if ch in input_token_index:
            vec[0, t, input_token_index[ch]] = 1
    return vec

def handle_translate():
    word = entry.get().strip().lower()
    if len(word) != 5:
        output_label.config(text="Only 5-letter French words allowed.")
        return
    vec = vectorize_input(word)
    translated = decode_sequence(vec)
    output_label.config(text=f"Tamil: {translated}")

root = tk.Tk()
root.title("Custom French to Tamil Translator")
root.geometry("400x200")

tk.Label(root, text="Enter 5-letter French word:").pack(pady=10)
entry = tk.Entry(root, font=('Arial', 14))
entry.pack()

tk.Button(root, text="Translate", command=handle_translate).pack(pady=10)
output_label = tk.Label(root, text="", font=('Arial', 16), fg="blue")
output_label.pack()

root.mainloop()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 278ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
