In [1]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import sentencepiece as spm
from tensorflow.keras.layers import (
    Input, Embedding, Dense, Add,
    LayerNormalization, MultiHeadAttention
)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# ---------------- CONFIG ----------------
# Parallel-corpus CSV with "english" and "telugu" columns. The location can be
# overridden per machine with the TRANSLATION_DATA_PATH environment variable so
# the notebook does not depend on one user's download folder; the original
# path stays as the fallback.
DATA_PATH = os.environ.get(
    "TRANSLATION_DATA_PATH",
    r"C:\Users\koush\Downloads\english_to_telugu_data.csv",
)

# Tokenizer / sequence shape
VOCAB_SIZE = 16000   # SentencePiece vocabulary size, used for both languages
MAX_LEN = 40         # every token sequence is padded to this length

# Model
EMB_DIM = 256        # embedding width
NUM_HEADS = 4        # heads per MultiHeadAttention layer
FF_DIM = 512         # hidden width of the feed-forward sub-layers
ENC_LAYERS = 6       # number of encoder blocks
DEC_LAYERS = 6       # number of decoder blocks

# Training
BATCH_SIZE = 50
EPOCHS = 2

# Optional row cap for quick experiments (pairs with the commented-out slice
# right after read_csv).
#use_rows = 50000

In [3]:
# Read the whole parallel corpus into memory.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
# For a quick experiment on a subset, enable: df = df.iloc[:use_rows, :]

In [4]:
# (rows, columns) of the corpus as loaded from DATA_PATH.
df.shape

(1499448, 2)

In [5]:

# Keep only the two text columns, then discard any pair with a missing side.
df = df.loc[:, ["english", "telugu"]].dropna(axis=0, how="any")

def clean(text):
    """Normalise one sentence for tokenizer training.

    Lower-cases the text, removes every character that is not a Latin letter,
    a character in the Telugu block (U+0C00-U+0C7F) or whitespace, and then
    collapses whitespace runs to single spaces with no leading/trailing space.

    Args:
        text: the raw sentence (a str).

    Returns:
        The normalised sentence; may be "" if nothing survives the filter.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-zA-Z\u0C00-\u0C7F\s]", "", lowered)
    return re.sub(r"\s+", " ", letters_only).strip()

# Normalise both sides of every sentence pair. astype(str) guards against
# cells that pandas parsed as numbers, which would otherwise fail on .lower().
df["english"] = df["english"].astype(str).apply(clean)
df["telugu"]  = df["telugu"].astype(str).apply(clean)

# clean() can reduce a sentence to "" (e.g. digits/punctuation only). A pair
# with an empty side carries no translation signal, so drop it before the
# tokenizers and the model ever see it.
non_empty = (df["english"] != "") & (df["telugu"] != "")
df = df[non_empty].reset_index(drop=True)


In [6]:
# (rows, columns) after column selection, dropna and cleaning.
df.shape

(1499448, 2)

In [7]:
# Write the cleaned English sentences, one per line, as the tokenizer corpus.
with open("eng.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(sentence + "\n" for sentence in df["english"])

# Train the English BPE tokenizer; writes files under the "spm_eng" prefix.
# Special-token ids are pinned so that 0 is padding, 1 <bos>, 2 <eos>, 3 <unk>.
spm.SentencePieceTrainer.train(
    input="eng.txt",
    model_prefix="spm_eng",
    model_type="bpe",
    vocab_size=VOCAB_SIZE,
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3,
)


In [8]:
# Write the cleaned Telugu sentences, one per line, as the tokenizer corpus.
with open("tel.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(sentence + "\n" for sentence in df["telugu"])

# Train the Telugu BPE tokenizer; writes files under the "spm_tel" prefix.
# Special-token ids are pinned so that 0 is padding, 1 <bos>, 2 <eos>, 3 <unk>.
spm.SentencePieceTrainer.train(
    input="tel.txt",
    model_prefix="spm_tel",
    model_type="bpe",
    vocab_size=VOCAB_SIZE,
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3,
)


In [None]:
# Load the two trained tokenizers from the model files named after the
# model_prefix values used during training.
sp_tel = spm.SentencePieceProcessor(model_file="spm_tel.model")
sp_eng = spm.SentencePieceProcessor(model_file="spm_eng.model")

In [None]:
def encode_eng(text):
    """Tokenize an English sentence into SentencePiece ids.

    No <bos>/<eos> markers are added; the ids come straight from `sp_eng`.
    """
    token_ids = sp_eng.encode(text, out_type=int)
    return token_ids

def encode_tel(text):
    """Tokenize a Telugu sentence and wrap it in <bos> ... <eos>.

    Args:
        text: the cleaned Telugu sentence.

    Returns:
        A list of ints: the tokenizer's <bos> id, the sentence's token ids,
        then the tokenizer's <eos> id.
    """
    token_ids = sp_tel.encode(text, out_type=int)
    # Ask the tokenizer for its own marker ids rather than hard-coding 1 and 2,
    # so this stays correct if the trainer's bos_id/eos_id settings change.
    return [sp_tel.bos_id()] + token_ids + [sp_tel.eos_id()]


In [None]:
# Tokenize every sentence; each result is a list with one id list per row.
eng_seq = list(map(encode_eng, df["english"]))
tel_seq = list(map(encode_tel, df["telugu"]))

In [None]:
# Bring every sequence to exactly MAX_LEN ids, padding with trailing zeros.
# pad_sequences truncates from the FRONT by default (truncating="pre"), which
# for over-long sentences would cut the opening tokens, including the Telugu
# <bos> that the decoder input must start with. Truncate at the end instead
# (an over-long target then loses its tail/<eos> rather than its start).
eng_pad = pad_sequences(
    eng_seq, maxlen=MAX_LEN, padding="post", truncating="post"
)
tel_pad = pad_sequences(
    tel_seq, maxlen=MAX_LEN, padding="post", truncating="post"
)

In [None]:
# Shift by one position: the decoder is fed every column except the last and
# is scored against every column except the first.
decoder_input, decoder_target = tel_pad[:, :-1], tel_pad[:, 1:]

In [None]:
# The shift above dropped one column from each array; pad both back out to
# MAX_LEN columns (zeros appended at the end) to match the model inputs.
decoder_input = pad_sequences(decoder_input, maxlen=MAX_LEN, padding="post")
decoder_target = pad_sequences(decoder_target, maxlen=MAX_LEN, padding="post")


# This is positional_encoding

In [None]:
def positional_encoding(length, depth):
    """Build a fixed sinusoidal position table.

    Args:
        length: number of positions (rows) to encode.
        depth: width of each position vector.

    Returns:
        A float32 tensor of shape (1, length, depth). Along the last axis the
        sine terms come first, followed by the cosine terms.
    """
    half_depth = depth / 2
    positions = np.arange(length)[:, None]                  # (length, 1)
    depths = np.arange(half_depth)[None, :] / half_depth    # (1, ceil(depth / 2))
    angle_rates = 1 / (10000**depths)
    angle_rads = positions * angle_rates
    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1
    )
    # With an odd depth the sin and cos halves together are one column too
    # wide; trim so the table always has exactly `depth` columns. This is a
    # no-op for even depths, so existing results are unchanged.
    pos_encoding = pos_encoding[:, :int(depth)]
    return tf.cast(pos_encoding[None, ...], tf.float32)

# One shared position table, added to both the encoder and decoder embeddings.
pos_enc = positional_encoding(length=MAX_LEN, depth=EMB_DIM)

# This is a Transformer model with 6 encoder layers and 6 decoder layers

In [None]:
# Build the encoder-decoder Transformer with the Keras functional API.
# NOTE(review): no layer is given an explicit name, so names are auto-generated
# in creation order; presumably the weight files loaded later depend on this
# exact construction order - confirm before reordering anything here.

# Two token-id inputs of length MAX_LEN: source sentence and shifted target.
enc_in = Input(shape=(MAX_LEN,))
dec_in = Input(shape=(MAX_LEN,))
# Encoder embedding; mask_zero=True marks id 0 (the padding id) as masked.
enc_emb = Embedding(VOCAB_SIZE, EMB_DIM, mask_zero=True)(enc_in)
# Add the fixed position table to the token embeddings.
enc_x = Add()([enc_emb, pos_enc])

# Encoder stack: each block is self-attention and a two-layer feed-forward
# network, each followed by a residual Add and LayerNormalization.
for _ in range(ENC_LAYERS):
    # key_dim is EMB_DIM // NUM_HEADS per head.
    attn = MultiHeadAttention(NUM_HEADS, EMB_DIM // NUM_HEADS)(
        enc_x, enc_x
    )
    enc_x = LayerNormalization()(Add()([enc_x, attn]))

    ff = Dense(FF_DIM, activation="relu")(enc_x)
    ff = Dense(EMB_DIM)(ff)
    enc_x = LayerNormalization()(Add()([enc_x, ff]))

encoder_out = enc_x
# Lower-triangular (MAX_LEN, MAX_LEN) matrix of ones: band_part(-1, 0) keeps the
# diagonal and everything below it, so position i may attend only to j <= i.
look_ahead_mask = tf.linalg.band_part(
    tf.ones((MAX_LEN, MAX_LEN)), -1, 0
)

# NOTE(review): unlike the encoder embedding, this one has no mask_zero, so
# padded decoder positions do not appear to be masked - confirm intent.
dec_emb = Embedding(VOCAB_SIZE, EMB_DIM)(dec_in)
dec_x = Add()([dec_emb, pos_enc])

# Decoder stack: masked self-attention, cross-attention over the encoder
# output, then feed-forward; each followed by residual Add + LayerNormalization.
for _ in range(DEC_LAYERS):
    self_attn = MultiHeadAttention(
        NUM_HEADS, EMB_DIM // NUM_HEADS
    )(dec_x, dec_x, attention_mask=look_ahead_mask)

    dec_x = LayerNormalization()(Add()([dec_x, self_attn]))

    # Queries come from the decoder, keys/values from the encoder output.
    # No explicit attention_mask is passed for this layer.
    cross_attn = MultiHeadAttention(
        NUM_HEADS, EMB_DIM // NUM_HEADS
    )(dec_x, encoder_out)

    dec_x = LayerNormalization()(Add()([dec_x, cross_attn]))

    ff = Dense(FF_DIM, activation="relu")(dec_x)
    ff = Dense(EMB_DIM)(ff)
    dec_x = LayerNormalization()(Add()([dec_x, ff]))


# Per-position probability distribution over the VOCAB_SIZE target tokens.
out = Dense(VOCAB_SIZE, activation="softmax")(dec_x)

model = Model([enc_in, dec_in], out)
# NOTE(review): the loss and the accuracy metric are the plain built-ins, and
# the targets are zero-padded to MAX_LEN, so padded positions presumably count
# toward both (which would inflate accuracy) - confirm and consider a masked
# loss/metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)
model.summary()


In [None]:
# Load previously saved weights into the freshly built model before training.
# NOTE(review): this is a fixed /content/ path (presumably a Colab session),
# while DATA_PATH in the config cell is a Windows path - confirm which
# environment this notebook is meant to run in.
model.load_weights("/content/translator_4lks_15.weights.h5")


  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Train on (source ids, shifted target ids) -> next-token targets.
# validation_split=0.2 holds out a fifth of the samples for validation.
history = model.fit(
    [eng_pad, decoder_input],
    decoder_target,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,  # was a literal 2 that silently ignored the config constant
    validation_split=0.2
)

Epoch 1/2
[1m2304/9600[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m10:51[0m 89ms/step - accuracy: 0.9093 - loss: 0.4429

In [None]:
# Load a complete saved model (architecture + weights) from a .keras file.
# NOTE(review): this rebinds `model`; when the notebook is run top to bottom it
# replaces the model that was just built and fit above - confirm that
# discarding the trained model here is intended.
import keras
model = keras.models.load_model("/content/translator_4lks_15.keras")

In [None]:
# Print the layer-by-layer summary of whichever model `model` currently names.
model.summary()

In [None]:
# Save the full model to a .keras file under a new checkpoint name.
# NOTE(review): `model` is whatever was bound last; if the load_model cell ran
# after training, this saves the reloaded checkpoint, not the model fit
# above - confirm which one is meant to be saved.
import keras
keras.models.save_model(model, "/content/translator_6lks_29.keras")

In [None]:
# Also save just the weights of the current `model` to a .weights.h5 file.
model.save_weights("/content/translator_6lks_29.weights.h5")