In [20]:
# If these are already installed in ancient-ai-env, you can skip this cell.
# Otherwise run it once.

try:
    import datasets  # type: ignore
except ImportError:
    %pip install datasets --quiet

try:
    import transformers  # type: ignore
except ImportError:
    %pip install transformers accelerate sentencepiece --quiet


In [21]:
from pathlib import Path
import unicodedata
import re

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

BASE_DIR = Path("..").resolve()
DATA_DIR = BASE_DIR / "data" / "corpora" / "sa_en_itihasa"
MODEL_DIR = BASE_DIR / "models" / "indictrans2-en"
LOG_DIR = BASE_DIR / "logs" / "translation_finetune"
LOG_DIR.mkdir(parents=True, exist_ok=True)

print("Project root :", BASE_DIR)
print("Corpus dir   :", DATA_DIR, "exists" if DATA_DIR.exists() else "MISSING")
print("Model dir    :", MODEL_DIR, "exists" if MODEL_DIR.exists() else "MISSING")
print("Log dir      :", LOG_DIR)


Project root : /Users/jyotirmoy/Desktop/Image/ancient-script-ai
Corpus dir   : /Users/jyotirmoy/Desktop/Image/ancient-script-ai/data/corpora/sa_en_itihasa exists
Model dir    : /Users/jyotirmoy/Desktop/Image/ancient-script-ai/models/indictrans2-en exists
Log dir      : /Users/jyotirmoy/Desktop/Image/ancient-script-ai/logs/translation_finetune


In [22]:
def normalize_text(text: str) -> str:
    """
    Basic Unicode normalization and cleanup for Sanskrit / English text.
    - NFC normalization
    - remove zero-width characters
    - strip whitespace
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFC", text)
    text = re.sub(r"[\u200b\u200c\u200d]", "", text)  # zero-width chars
    text = text.strip()
    return text


In [23]:
train_en_path = DATA_DIR / "train.en"
train_sa_path = DATA_DIR / "train.sn"

if not train_en_path.exists() or not train_sa_path.exists():
    raise FileNotFoundError(
        f"Expected files train.en and train.sn inside {DATA_DIR}, "
        f"found: train.en={train_en_path.exists()}, train.sn={train_sa_path.exists()}"
    )

sanskrit_lines = [normalize_text(l) for l in train_sa_path.read_text(encoding="utf-8").splitlines()]
english_lines  = [normalize_text(l) for l in train_en_path.read_text(encoding="utf-8").splitlines()]

if len(sanskrit_lines) != len(english_lines):
    raise ValueError(f"Line mismatch: Sanskrit={len(sanskrit_lines)}, English={len(english_lines)}")

df = pd.DataFrame({"src_sa": sanskrit_lines, "tgt_en": english_lines})
print("Total sentence pairs:", len(df))
df.head()


Total sentence pairs: 75161


Unnamed: 0,src_sa,tgt_en
0,ॐ तपः स्वाध्यायनिरतं तपस्वी वाग्विदां वरम्। ना...,"The ascetic Vālmīki asked Nārada, the best of ..."
1,कोन्वस्मिन् साम्प्रतं लोके गुणवान् कश्च वीर्यव...,Who at present in this world is like crowned w...
2,चारित्रेण च को युक्तः सर्वभूतेषु को हितः। विद्...,"Who is qualified by virtue of his character, a..."
3,आत्मवान् को जितक्रोधो द्युतिमान् कोऽनसूयकः। कस...,"Who has subdued his heart, and controlled his ..."
4,एतदिच्छाम्यहं श्रोतुं परं कौतूहलं हि मे। महर्ष...,I have great curiosity to hear of such a perso...


In [24]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

print("Train size:", len(train_df))
print("Val size  :", len(val_df))

train_df.head()


Train size: 67644
Val size  : 7517


Unnamed: 0,src_sa,tgt_en
41469,नागं जिघांसुः सहसा चिक्षेप च महाबलः। स विस्फुल...,Then that highly powerful hero desirous of sla...
36243,प्रोक्षिता यत्र बहवो वराहाद्या मृगा वने। शक्रे...,Here were massacred many boars and Other anima...
39166,इहैव तैर्जितः सर्गो येषां साम्ये स्थितं मनः। न...,Even here the material world is conquered by t...
48921,धनंजयस्ततः कृष्णमब्रवीत् पश्य केशव। आचार्यरथमु...,"Thereafter Dhananjaya addressing Kesava said-""..."
40873,क्रुद्धं तमुवीक्ष्य भयेन राजन् सम्मूर्च्छितो न...,"Beholding Bhima, I have been, O sire, unmanned..."


In [25]:
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds   = Dataset.from_pandas(val_df.reset_index(drop=True))

raw_datasets = DatasetDict(
    {
        "train": train_ds,
        "validation": val_ds,
    }
)

raw_datasets


DatasetDict({
    train: Dataset({
        features: ['src_sa', 'tgt_en'],
        num_rows: 67644
    })
    validation: Dataset({
        features: ['src_sa', 'tgt_en'],
        num_rows: 7517
    })
})

In [26]:
if not MODEL_DIR.exists():
    raise FileNotFoundError(f"Model directory not found: {MODEL_DIR}")

tokenizer = AutoTokenizer.from_pretrained(str(MODEL_DIR), use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(str(MODEL_DIR))

print("Tokenizer vocab size:", tokenizer.vocab_size)
print("Model loaded:", type(model))


The repository /Users/jyotirmoy/Desktop/Image/ancient-script-ai/models/indictrans2-en contains custom code which must be executed to correctly load the model. You can inspect the repository content at /Users/jyotirmoy/Desktop/Image/ancient-script-ai/models/indictrans2-en .
 You can inspect the repository content at https://hf.co//Users/jyotirmoy/Desktop/Image/ancient-script-ai/models/indictrans2-en.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y
The repository /Users/jyotirmoy/Desktop/Image/ancient-script-ai/models/indictrans2-en contains custom code which must be executed to correctly load the model. You can inspect the repository content at /Users/jyotirmoy/Desktop/Image/ancient-script-ai/models/indictrans2-en .
 You can inspect the repository content at https://hf.co//Users/jyotirmoy/Desktop/Image/ancient-script-ai/models/indictrans2-en.
You can avoid this prompt in future by passing the argument `t

Tokenizer vocab size: 122706
Model loaded: <class 'transformers_modules.indictrans2_hyphen_en.modeling_indictrans.IndicTransForConditionalGeneration'>


In [29]:
max_source_length = 256
max_target_length = 256

SRC_LANG_TAG = "<2sa>"   # Sanskrit source
TGT_LANG_TAG = "<2en>"   # English target

def preprocess_function(examples):
    """
    Tokenize Sanskrit source and English target text.
    Adds required IndicTrans2 language tags to each example.
    """
    # add language tags
    inputs = [f"{SRC_LANG_TAG} {TGT_LANG_TAG} {text}" for text in examples["src_sa"]]
    targets = [f"{TGT_LANG_TAG} {text}" for text in examples["tgt_en"]]

    model_inputs = tokenizer(
        inputs,
        max_length=max_source_length,
        truncation=True,
    )

    # Tokenize targets separately
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_target_length,
            truncation=True,
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [30]:
# Re-map with corrected preprocessing
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# sanity check again
example = tokenized_datasets["train"][0]
print(tokenizer.decode(example["input_ids"], skip_special_tokens=False))


Map:   0%|          | 0/67644 [00:00<?, ? examples/s]



Map:   0%|          | 0/7517 [00:00<?, ? examples/s]

<unk><unk></s>


In [31]:
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

tokenized_datasets


Map:   0%|          | 0/67644 [00:00<?, ? examples/s]

Map:   0%|          | 0/7517 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 67644
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7517
    })
})

In [32]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [40]:
example = tokenized_datasets["train"][0]
input_ids = example["input_ids"]
label_ids = example["labels"]

print("Keys:", example.keys())
print("Lengths:", len(input_ids), len(label_ids))
print("First 20 input_ids:", input_ids[:20])
print("First 20 label_ids:", label_ids[:20])

print("\nDecoded Sanskrit (keep special tokens):")
print(tokenizer.decode(input_ids, skip_special_tokens=False))

print("\nDecoded English (keep special tokens):")
print(tokenizer.decode(label_ids, skip_special_tokens=False))


Keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Lengths: 3 1
First 20 input_ids: [3, 3, 2]
First 20 label_ids: [2]

Decoded Sanskrit (keep special tokens):
<unk><unk></s>

Decoded English (keep special tokens):
</s>


In [41]:
SRC_LANG_TAG = "<2sa>"
TGT_LANG_TAG = "<2en>"

max_source_length = 256
max_target_length = 256

def preprocess_function(examples):
    inputs = [f"{SRC_LANG_TAG} {TGT_LANG_TAG} {text}" for text in examples["src_sa"]]
    targets = [f"{TGT_LANG_TAG} {text}" for text in examples["tgt_en"]]

    model_inputs = tokenizer(
        inputs,
        max_length=max_source_length,
        truncation=True,
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_target_length,
            truncation=True,
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)


Map:   0%|          | 0/67644 [00:00<?, ? examples/s]

Map:   0%|          | 0/7517 [00:00<?, ? examples/s]

In [42]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=str(LOG_DIR / "sa_en_indictrans2_ft"),
    overwrite_output_dir=True,

    # new name in recent transformers versions
    eval_strategy="epoch",          # instead of evaluation_strategy="epoch"

    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,

    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=False,
    report_to="none",
)


In [43]:
training_args = TrainingArguments(
    output_dir=str(LOG_DIR / "sa_en_indictrans2_ft"),
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=False,
    report_to="none",
)

training_args = training_args.set_evaluate(strategy="epoch")
training_args = training_args.set_save(strategy="epoch")
training_args = training_args.set_logging(strategy="steps", steps=50, report_to="none")


In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(
