In [3]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

In [None]:
# Hugging Face Hub id of the pretrained checkpoint. The tokenizer and the
# model weights are both loaded from this name in later cells.
model_checkpoint: str = "Helsinki-NLP/opus-mt-mul-en"

In [None]:
import pandas as pd
from datasets import Dataset

# Load the tab-separated parallel corpus. The file has no header row; the
# first column is named "as" (source sentences) and the second "en" (targets).
# dtype=str together with keep_default_na=False stops pandas from converting
# sentences that literally read "NA", "null", "None", "nan", ... into NaN.
df = pd.read_csv(
    "cleaned_dataset.tsv",
    sep="\t",
    names=["as", "en"],
    header=None,
    dtype=str,
    keep_default_na=False,
)

# Discard pairs where either side is missing or blank. A missing value would
# reach the tokenizer as None, and a blank side carries no training signal.
df = df.dropna(subset=["as", "en"])
both_sides_present = df["as"].str.strip().ne("") & df["en"].str.strip().ne("")
df = df[both_sides_present].reset_index(drop=True)

raw_dataset = Dataset.from_pandas(df, preserve_index=False)

# First split: hold out 10% of all pairs as the test set (fixed seed).
splits = raw_dataset.train_test_split(test_size=0.1, seed=42)
train_ds = splits["train"]
test_ds  = splits["test"]

# Second split: hold out 10% of the remaining pairs as the validation set.
split2 = train_ds.train_test_split(test_size=0.1, seed=42)
train_ds      = split2["train"]
validation_ds = split2["test"]

In [None]:
# Tokenizer that belongs to the chosen checkpoint; use_fast=True asks for the
# fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [None]:
# Maximum number of tokens kept per source / target sentence.
max_input_length: int = 128
max_target_length: int = 128

# Dataset columns holding the source text and the target text.
source_col: str = "as"
target_col: str = "en"

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: Batch of rows indexed by column name; must provide the
            `source_col` and `target_col` columns as lists of strings
            (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the source sentences, with an additional
        "labels" entry holding the token ids of the target sentences. Both
        sides are truncated and padded to `max_input_length` /
        `max_target_length` respectively.
    """
    model_inputs = tokenizer(
        examples[source_col],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    # `text_target=` encodes the targets in target mode; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples[target_col],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the train, validation and test splits (in that order) in batched mode.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_ds, validation_ds, test_ds)
)

In [None]:
# TensorFlow seq2seq model initialised from the checkpoint; from_pt=True asks
# the loader to read the checkpoint's PyTorch weights.
model = TFAutoModelForSeq2SeqLM.from_pretrained(
    model_checkpoint,
    from_pt=True,
)

Training

In [None]:
# Training hyperparameters consumed by the dataset, optimizer and fit cells.
batch_size: int = 16
learning_rate: float = 2e-5
weight_decay: float = 0.01
num_train_epochs: int = 50

In [None]:
# Collator for the training and validation batches: returns TensorFlow tensors
# and pads label sequences with the tokenizer's pad token id.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    label_pad_token_id=tokenizer.pad_token_id,
    return_tensors="tf",
    model=model,
)

In [None]:
# Collator for the generation dataset: same tokenizer and model, but sequence
# lengths are padded up to a multiple of 128.
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    pad_to_multiple_of=128,
    return_tensors="tf",
    model=model,
)

In [None]:
# Shuffled tf.data pipeline over the tokenized training split.
train_dataset = model.prepare_tf_dataset(
    tokenized_train,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Validation pipeline: same collator and batch size as training, fixed order.
validation_dataset = model.prepare_tf_dataset(
    tokenized_val,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Validation split again, batched in groups of 8 with the generation collator.
# NOTE(review): nothing in the visible cells consumes this dataset.
generation_dataset = model.prepare_tf_dataset(
    tokenized_val,
    collate_fn=generation_data_collator,
    batch_size=8,
    shuffle=False,
)

In [None]:
# AdamWeightDecay optimizer configured from the hyperparameter cell.
optimizer = AdamWeightDecay(
    learning_rate=learning_rate,
    weight_decay_rate=weight_decay,
)

# Only the optimizer is given to compile(); no loss argument is passed, so the
# model presumably falls back to its built-in loss — confirm if this matters.
model.compile(optimizer=optimizer)

In [None]:
# Fine-tune for num_train_epochs epochs, evaluating on the validation pipeline.
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)

In [None]:
# Save the tokenizer next to the fine-tuned model so that "model/" can be
# reloaded on its own (tokenizer and model) without the original checkpoint.
tokenizer.save_pretrained("model/")
model.save_pretrained("model/")

Evaluation

In [None]:
from sacrebleu import corpus_bleu, corpus_chrf, corpus_ter
from tqdm.auto import tqdm

# Decoding configuration for the test-set evaluation.
batch_size        = 16
num_beams         = 4
max_input_length  = 128
max_target_length = 128

preds, refs = [], []

# Fetch each column once. Accessing test_ds["as"] / test_ds["en"] inside the
# loop repeats the column lookup for every batch (and, where column access
# returns a full list, re-reads the entire column each time).
src_sentences = test_ds["as"]
ref_sentences = test_ds["en"]

for i in tqdm(range(0, len(test_ds), batch_size), desc="Evaluating"):
    batch_src = src_sentences[i : i + batch_size]
    batch_tgt = ref_sentences[i : i + batch_size]

    # Pad only up to the longest sentence of this batch.
    inputs = tokenizer(
        batch_src,
        return_tensors="tf",
        truncation=True,
        padding="longest",
        max_length=max_input_length
    )

    # Beam-search decoding, capped at max_target_length.
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_target_length,
        num_beams=num_beams,
        early_stopping=True
    )
    batch_preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds.extend(batch_preds)
    refs.extend(batch_tgt)

# Corpus-level metrics; sacrebleu takes a list of reference streams, hence [refs].
bleu = corpus_bleu(preds, [refs]).score
chrf = corpus_chrf(preds, [refs]).score
ter  = corpus_ter(preds, [refs]).score

print(f"\nBLEU: {bleu:.2f}")
print(f"chrF: {chrf:.2f}")
print(f"TER : {ter:.2f}")

Inference

In [None]:
# Token limits used by translate() below (same values as in the earlier cells).
max_input_length: int = 128
max_target_length: int = 128

def translate(text: str,
              max_length: int = max_target_length,
              num_beams: int = 4) -> str:
    """Translate a single source sentence with beam search.

    Args:
        text: Source sentence to translate.
        max_length: Value passed to `model.generate` as `max_length`; the
            default is the value of `max_target_length` at definition time.
        num_beams: Beam width passed to `model.generate`.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # padding="longest" matches the evaluation loop and avoids padding one
    # sentence out to max_input_length tokens; truncation still applies.
    inputs = tokenizer(
        text,
        return_tensors="tf",
        truncation=True,
        padding="longest",
        max_length=max_input_length
    )
    output_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Interactive loop: read a sentence, print its translation, repeat until the
# user types "exit"/"quit" (case-insensitive) or input is no longer available.
print("Assamese→English Translator (type 'exit' to quit)\n")
while True:
    try:
        user_input = input("Enter Assamese sentence: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin closed or the kernel was interrupted while waiting for input:
        # leave the loop cleanly instead of ending the cell with a traceback.
        print("Goodbye!")
        break
    if not user_input:
        # Ignore blank lines and prompt again.
        continue
    if user_input.lower() in {"exit", "quit"}:
        print("Goodbye!")
        break
    translation = translate(user_input)
    print("→ English:", translation, "\n")