### Import Libraries

In [1]:
import numpy as np
import pandas as pd

### Load Dataset

In [2]:
with open("/kaggle/input/tamiltxt/tam.txt", encoding="utf8") as f:
    data = f.read().split("\n")

X_txt = []
y_txt = []

X_voc = set()
y_voc = set()
    
for line in data:
    # Skip In-Valid Seq
    if(len(line.split("\t")) != 3):
        continue
    
    target, inp, _ = line.split("\t")
    target = "\t" + target + "\n" # "\t" -> Start Seq, "\n" -> End Seq
    
    X_txt.append(inp)
    y_txt.append(target)
    
    X_voc.update(set(inp))
    y_voc.update(set(target))

In [3]:
print("No of Records : ", len(X_txt), end="\n\n")

X_voc = sorted(list(X_voc))
y_voc = sorted(list(y_voc))

print("Vocab Size of Input tok : ", len(X_voc))
print("vocab Size of Output tok : ", len(y_voc), end="\n\n")

max_encoder = max([len(t) for t in X_txt])
max_decoder = max([len(t) for t in y_txt])

print("Max Seq length for input : ", max_encoder)
print("Max Seq length for output : ", max_decoder)

No of Records :  201

Vocab Size of Input tok :  52
vocab Size of Output tok :  55

Max Seq length for input :  109
Max Seq length for output :  96


### Tokenization

In [4]:
input_tok_enc = dict([(char, i) for i, char in enumerate(X_voc)])
input_tok_dec = dict([(i, char) for i, char in enumerate(X_voc)])

target_tok_enc = dict([(char, i) for i, char in enumerate(y_voc)])
target_tok_dec = dict([(i, char) for i, char in enumerate(y_voc)])

In [5]:
X = [[input_tok_enc[char] for char in text] for text in X_txt]
y = [[target_tok_enc[char] for char in text] for text in y_txt]

### Padding

In [6]:
from tensorflow.keras.utils import pad_sequences

X = pad_sequences(X, maxlen=max_encoder, padding="post", value=input_tok_enc[' '])
y = pad_sequences(y, maxlen=max_decoder, padding="post", value=target_tok_enc[' '])

2024-03-31 03:58:11.443475: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 03:58:11.443576: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 03:58:11.586665: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Model

In [7]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, CategoryEncoding

latent_dim = 256

# Encoder Block
encoder_inputs = Input(shape=(None, len(X_voc)))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

encoder_states = [state_h, state_c]

# Decoder Block
decoder_inputs = (Input(shape=(None, len(y_voc))))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(len(y_voc), activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [8]:
import tensorflow as tf

model.compile(optimizer=tf.keras.optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

#### Model - Training

In [9]:
from sklearn.preprocessing import OneHotEncoder

encoder_input_data = np.zeros((len(X), max_encoder, len(X_voc)), dtype="float")
for i, line in enumerate(X):
    for j, k in enumerate(line):
        encoder_input_data[i][j][k] = 1.0
        
decoder_input_data = np.zeros((len(X), max_decoder, len(y_voc)), dtype="float")
decoder_target_data = np.zeros((len(X), max_decoder, len(y_voc)), dtype="float")
for i, line in enumerate(y):
    for j, k in enumerate(line):
        decoder_input_data[i, j, k] = 1.0
        if(j > 0):
            decoder_target_data[i, j-1, k] = 1.0
            
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=64, epochs=5000, validation_split=0.2)

Epoch 1/5000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 180ms/step - accuracy: 0.3394 - loss: 3.8919 - val_accuracy: 0.6519 - val_loss: 3.6119
Epoch 2/5000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.8102 - loss: 3.2701 - val_accuracy: 0.6509 - val_loss: 2.0635
Epoch 3/5000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.8114 - loss: 1.2451 - val_accuracy: 0.6509 - val_loss: 2.3600
Epoch 4/5000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.8104 - loss: 1.0974 - val_accuracy: 0.6468 - val_loss: 2.0770
Epoch 5/5000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.7396 - loss: 1.2493 - val_accuracy: 0.6532 - val_loss: 2.0739
Epoch 6/5000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.8123 - loss: 0.9811 - val_accuracy: 0.6509 - val_loss: 2.0031
Epoch 7/5000
[1m3/3[0m [32m━━━

<keras.src.callbacks.history.History at 0x7997f8795210>

#### Load Model Weights 

In [10]:
model.save('tam2eng.keras')

#### Inference Model

In [11]:
encoder_inputs = model.input[0]
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output # Lstm 1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]
decoder_state_input_h = Input(shape=(latent_dim, ))
decoder_state_input_c = Input(shape=(latent_dim, ))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3] # Lstm 2
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs)
decoder_states = [state_h_dec, state_c_dec]

decoder_dense = model.layers[4] # Dense Layer
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)

In [12]:
def decode_seq(input_seq):
    
    # Encoder
    state_value = encoder_model.predict(input_seq, verbose=0)
    
    target_seq = np.zeros((1, 1, len(y_voc)))
    target_seq[0, 0, target_tok_enc["\t"]] = 1.0
    
    stop = False
    decode_sent = ""
    
    while not stop:
        
        output_tok, h, c = decoder_model.predict([target_seq] + state_value, verbose=0)
        
        sample_token_idx = np.argmax(output_tok[0, -1, :])
        sample_char = target_tok_dec[sample_token_idx]
        
        decode_sent += sample_char
        
        if sample_char == "\n" or len(decode_sent) > max_decoder:
            stop = True
            
        target_seq = np.zeros((1, 1, len(y_voc)))
        target_seq[0, 0, sample_token_idx] = 1.0
        
        state_value = [h, c]
        
    return decode_sent

In [13]:
for i in range(10):
    
    input_seq = encoder_input_data[i: i+1]
    d_seq = decode_seq(input_seq)
    
    print("Input Seq : ", X_txt[i])
    print("Output Seq : ", d_seq)

Input Seq :  நான் தூங்கினேன்.
Output Seq :  I slept.

Input Seq :  அமைதியாக இருங்கள்
Output Seq :  Calm down.

Input Seq :  நான் நடப்பேன்.
Output Seq :  I'll walk.

Input Seq :  அவன் யார்?
Output Seq :  Who is he?

Input Seq :  யாருக்குத் தெரியும்?
Output Seq :  Who knows?

Input Seq :  அவள் சிரித்தாள்
Output Seq :  She smiled.

Input Seq :  என்னிடம் பேசு
Output Seq :  Talk to me!

Input Seq :  அவள் யார்?
Output Seq :  Who is she?

Input Seq :  போய் தூங்கு
Output Seq :  Go to sleep.

Input Seq :  மழை பெய்யலாம்
Output Seq :  It may rain.



### Testing on UnSeen Data

In [14]:
def inf_testing(text):
    # Tokenize
    tok = [input_tok_enc.get(char, input_tok_enc[" "]) for char in text]
    
    # Encoder Input Fmt
    encoder_inp_seq = np.zeros((1, len(tok), len(X_voc)), dtype="float")
    for i, j in enumerate(tok):
        encoder_inp_seq[0][i][j] = 1.0
        
    output = decode_seq(encoder_inp_seq)
    
    print("Input : ", text)
    print("Output : ", output)
    

lst = ["நான் மிகவும் சந்த ாஷமாக இருக்கிதேன", "அது அவசியமில்லை", "தயவுசெய்து அல மீண்டும் சசய்யவும", "அது ஒரு நல்ை தயாசலை", "அவர்கள் ஒன்ோக தவலை சசய்ய ஒப்புக்சகாண்டைர"]
for sent in lst:
    inf_testing(sent)

Input :  நான் மிகவும் சந்த ாஷமாக இருக்கிதேன
Output :  I had to dom up.

Input :  அது அவசியமில்லை
Output :  Whe ridst.

Input :  தயவுசெய்து அல மீண்டும் சசய்யவும
Output :  Te ma m CD  lock.

Input :  அது ஒரு நல்ை தயாசலை
Output :  Don'd beaghrs.

Input :  அவர்கள் ஒன்ோக தவலை சசய்ய ஒப்புக்சகாண்டைர
Output :  When can we ut?

