In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
# Generate random English and Tamil sentences for demonstration (replace with your dataset)
english_sentences = [
    "Hello, how are you?",
    "I like machine learning.",
    "Translate this text.",
]
tamil_sentences = [
    "வணக்கம், நீங்கள் எப்படி?",
    "நான் இயந்திர கற்பதை பிடிக்கிறேன்.",
    "இந்த உரையை மொழிபெயர் செய்க.",
]


In [3]:
# Tokenize English and Tamil sentences using word-level tokenization
tokenizer_en = Tokenizer()  # Word-level tokenizer
tokenizer_en.fit_on_texts(english_sentences)
english_sequences = tokenizer_en.texts_to_sequences(english_sentences)


In [4]:
tokenizer_ta = Tokenizer()  # Word-level tokenizer
tokenizer_ta.fit_on_texts(tamil_sentences)
tamil_sequences = tokenizer_ta.texts_to_sequences(tamil_sentences)


In [5]:
# Pad sequences for model input
max_len = max(len(seq) for seq in english_sequences)
english_sequences = pad_sequences(english_sequences, maxlen=max_len, padding='post')
tamil_sequences = pad_sequences(tamil_sequences, maxlen=max_len, padding='post')


In [6]:
# Create an encoder-decoder model
vocab_size_en = len(tokenizer_en.word_index) + 1
vocab_size_ta = len(tokenizer_ta.word_index) + 1
embedding_dim = 256


In [7]:
input_layer = keras.layers.Input(shape=(max_len,))
encoder = Embedding(input_dim=vocab_size_en, output_dim=embedding_dim)(input_layer)
encoder = LSTM(256, return_sequences=True)(encoder)


In [8]:
decoder = LSTM(256, return_sequences=True)(encoder)
output_layer = Dense(vocab_size_ta, activation='softmax')(decoder)


In [9]:
model = Model(input_layer, output_layer)


In [10]:
model.compile(optimizer='adam', loss='categorical_crossentropy')


In [11]:
# Train the model with one-hot encoded target sequences (provide your training data)
# tamil_sequences_onehot = tf.one_hot(tamil_sequences, depth=vocab_size_ta)
# model.fit(english_sequences, tamil_sequences_onehot, epochs=100)


In [12]:
# Inference: Translate English to Tamil
def translate_to_tamil(input_text):
    input_sequence = tokenizer_en.texts_to_sequences([input_text])
    input_sequence = pad_sequences(input_sequence, maxlen=max_len, padding='post')
    prediction = model.predict(input_sequence)
    predicted_sequence = np.argmax(prediction, axis=-1)
    tamil_text = tokenizer_ta.sequences_to_texts(predicted_sequence)[0]
    return tamil_text


In [13]:
# Test translation
input_text = "How are you?"
tamil_translation = translate_to_tamil(input_text)
print(f"English: {input_text}")
print(f"Tamil: {tamil_translation}")

English: How are you?
Tamil: வணக்கம் மொழிபெயர் மொழிபெயர் இயந்திர
