In [1]:
# Environment Setup & Installations
!pip install -q transformers "datasets<4.0.0" peft bitsandbytes accelerate sacrebleu evaluate indic-nlp-library sentencepiece scipy huggingface_hub

import os
import torch
from google.colab import drive

# Mount Google Drive for local datasets and checkpoints
drive.mount('/content/drive')

CHECKPOINT_DIR = '/content/drive/MyDrive/IndicBART_Summarization_Checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

print(f"Environment Setup Complete. GPU: {torch.cuda.get_device_name(0)}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m129.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.1/121.1 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Environment Setup Complete. GPU: Tesla T4


In [2]:
# Authenticate & Load Data
from huggingface_hub import login
from datasets import load_dataset, concatenate_datasets
from google.colab import userdata

# --- Hugging Face Authentication ---
try:
    hf_token = userdata.get('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')
    login(token=hf_token)
    print("Logged into Hugging Face via Colab Secrets.")
except userdata.SecretNotFoundError:
    print("Colab Secret 'HF_TOKEN' not found. Please paste your token below:")
    # Replace with your token if not using Secrets
    MANUAL_TOKEN = "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"
    login(token=MANUAL_TOKEN)

# --- Load XL-Sum (Tamil & Telugu) ---
print("Loading Tamil and Telugu from XL-Sum...")
ds_ta = load_dataset("csebuetnlp/xlsum", name="tamil", split="train[:1000]", token=True, trust_remote_code=True)
ds_te = load_dataset("csebuetnlp/xlsum", name="telugu", split="train[:1000]", token=True, trust_remote_code=True)

# --- Load Custom Sanskrit Data ---
print("Loading Sanskrit data from Google Drive...")
# Update this path and extension (.csv or .jsonl) to match your Samanantar/Sāmayik file
SANSKRIT_DATA_PATH = '/content/drive/MyDrive/sanskrit_data.csv'
try:
    ds_sa = load_dataset('csv', data_files=SANSKRIT_DATA_PATH, split='train')
    # Rename columns to match XL-Sum if necessary
    # ds_sa = ds_sa.rename_column("sanskrit_sentence", "text")
    # ds_sa = ds_sa.rename_column("english_translation", "summary")
except Exception as e:
    print(f"Could not load Sanskrit data: {e}. Proceeding with dummy data for structure.")
    ds_sa = ds_ta.select(range(2)) # Fallback if file doesn't exist yet

# --- Tagging and Concatenation ---
def add_lang_tag(example, lang_code):
    example['lang'] = lang_code
    return example

ds_ta = ds_ta.map(lambda x: add_lang_tag(x, 'ta'))
ds_te = ds_te.map(lambda x: add_lang_tag(x, 'te'))
ds_sa = ds_sa.map(lambda x: add_lang_tag(x, 'sa'))

print("Combining datasets...")
full_dataset = concatenate_datasets([ds_ta, ds_te, ds_sa])

# --- Data Cleaning ---
def clean_data(batch):
    if len(str(batch['text'])) < 10 or len(str(batch['summary'])) < 5:
        return False
    if len(str(batch['summary'])) > len(str(batch['text'])):
        return False
    return True

full_dataset = full_dataset.filter(clean_data)
print(f"Total clean training samples: {len(full_dataset)}")

Colab Secret 'HF_TOKEN' not found. Please paste your token below:
Loading Tamil and Telugu from XL-Sum...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

xlsum.py: 0.00B [00:00, ?B/s]

tamil/train/0000.parquet:   0%|          | 0.00/61.3M [00:00<?, ?B/s]

tamil/test/0000.parquet:   0%|          | 0.00/6.97M [00:00<?, ?B/s]

tamil/validation/0000.parquet:   0%|          | 0.00/7.09M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16222 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2027 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2027 [00:00<?, ? examples/s]

telugu/train/0000.parquet:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

telugu/test/0000.parquet:   0%|          | 0.00/5.01M [00:00<?, ?B/s]

telugu/validation/0000.parquet:   0%|          | 0.00/4.96M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10421 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1302 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1302 [00:00<?, ? examples/s]

Loading Sanskrit data from Google Drive...
Could not load Sanskrit data: Unable to find '/content/drive/MyDrive/sanskrit_data.csv'. Proceeding with dummy data for structure.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Combining datasets...


Filter:   0%|          | 0/2002 [00:00<?, ? examples/s]

Total clean training samples: 2002


In [3]:
# @title 3. Script Unification & Tokenization
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from transformers import AutoTokenizer

# --- 1. Script Unification (Indic NLP) ---
# IndicBART requires all Indic scripts (Tamil, Telugu, etc.) to be in Devanagari.
lang_map = {'ta': 'ta', 'te': 'te', 'sa': 'sa'}

def unify_script(batch):
    src_lang = batch['lang']
    if src_lang in lang_map:
        # Transliterate source text to Devanagari (Hindi script)
        batch['text'] = UnicodeIndicTransliterator.transliterate(
            batch['text'],
            lang_map[src_lang],
            'hi'
        )
    return batch

print("Applying Script Unification to Devanagari script...")
full_dataset = full_dataset.map(unify_script)

# --- 2. Initialize Tokenizer ---
print("Initializing IndicBART Tokenizer...")
model_checkpoint = "ai4bharat/IndicBART"
# IndicBART uses AlbertTokenizer internally
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=False, use_fast=False)

MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 128

# --- 3. Preprocessing Function ---
def preprocess_function(examples):
    # Input Format: "Sentence </s> <2xx>" where xx is source lang
    inputs = [f"{text} </s> <2{lang}>" for text, lang in zip(examples["text"], examples["lang"])]

    # Target Format: "<2en> Summary </s>"
    # We explicitly prepend <2en> so the model learns to start in English
    targets = [f"<2en> {summary} </s>" for summary in examples["summary"]]

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Tokenize targets (FIXED: Uses text_target instead of as_target_tokenizer)
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Replace padding token id with -100 so it's ignored in loss calculation
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]

    return model_inputs

print("Tokenizing combined dataset...")
# Splitting for evaluation before tokenizing
tokenized_datasets = full_dataset.train_test_split(test_size=0.1)
tokenized_datasets = tokenized_datasets.map(preprocess_function, batched=True)

print(f"Tokenization complete! Samples: {len(tokenized_datasets['train'])}")

Applying Script Unification to Devanagari script...


Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

Initializing IndicBART Tokenizer...


config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

Tokenizing combined dataset...


Map:   0%|          | 0/1801 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Tokenization complete! Samples: 1801


In [7]:
# @title 4. Load Model with QLoRA & Target Module Detection
import bitsandbytes as bnb
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # lm_head should usually be excluded
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

print("Loading IndicBART in 4-bit...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32 # More stable than float16 for T4
)

model = MBartForConditionalGeneration.from_pretrained(
    model_checkpoint,
    quantization_config=bnb_config,
    device_map="auto"
)

model = prepare_model_for_kbit_training(model)

# Automatically detect all linear layers (q, v, k, out, fc1, fc2)
target_modules = find_all_linear_names(model)
print(f"Targeting modules: {target_modules}")

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Loading IndicBART in 4-bit...


Loading weights:   0%|          | 0/267 [00:00<?, ?it/s]



Targeting modules: ['v_proj', 'k_proj', 'out_proj', 'fc1', 'q_proj', 'fc2']
trainable params: 4,325,376 || all params: 444,993,536 || trainable%: 0.9720


In [8]:
# @title 5. Train and Save (Stable Config)
training_args = Seq2SeqTrainingArguments(
    output_dir=CHECKPOINT_DIR,
    per_device_train_batch_size=1,       # Safest batch size
    gradient_accumulation_steps=16,      # Effective batch size of 16
    learning_rate=2e-5,                  # Very conservative learning rate
    max_grad_norm=0.3,                   # Tight gradient clipping
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    fp16=False,                          # DISABLE THIS: Prevents the 0.000/nan issue
    label_smoothing_factor=0.1,
    report_to="none",
    push_to_hub=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# CLEAR CACHE before starting
torch.cuda.empty_cache()

print("Starting training. Loss should now appear as a number > 0.")
trainer.train()

# Final Save
MODEL_SAVE_PATH = f"{CHECKPOINT_DIR}/final_adapter_stable"
trainer.save_model(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 3, 'bos_token_id': 2}.


Starting training. Loss should now appear as a number > 0.


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Chrf
1,No log,7.218479,1.1222,0.3317,1.1056,1.0945,0.0825,0.2678
2,No log,6.880081,0.7739,0.0,0.7629,0.7573,0.0352,0.1786
3,No log,6.774679,0.5196,0.0,0.5307,0.4422,0.0409,0.1702


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


('/content/drive/MyDrive/IndicBART_Summarization_Checkpoints/final_adapter_stable/tokenizer_config.json',
 '/content/drive/MyDrive/IndicBART_Summarization_Checkpoints/final_adapter_stable/tokenizer.json')

In [16]:
# @title 6. Test the Trained Model (Forced English Output)
def generate_summary(text, source_lang_code):
    model.eval()

    # 1. Script Unification
    if source_lang_code in lang_map:
        text = UnicodeIndicTransliterator.transliterate(text, lang_map[source_lang_code], 'hi')

    # 2. Format Input
    formatted_text = f"{text} </s> <2{source_lang_code}>"
    tokenized_inputs = tokenizer(
        formatted_text,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to("cuda")

    # 3. Target Language ID
    # Get the specific integer ID for the English token
    en_id = tokenizer.convert_tokens_to_ids("<2en>")

    with torch.no_grad():
        outputs = model.generate(
            input_ids=tokenized_inputs.input_ids,
            attention_mask=tokenized_inputs.attention_mask,
            max_new_tokens=128,
            min_new_tokens=20,
            num_beams=5,
            length_penalty=2.0,
            repetition_penalty=3.0,

            # --- THE KEY FIXES ---
            decoder_start_token_id=en_id, # Forces the first token to be English
            forced_bos_token_id=en_id,    # Stronger override for MBart models
            # ---------------------

            early_stopping=True
        )

    # 4. Decode
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Clean up any leftover language tags if they appear in text
    return summary.replace("<2en>", "").strip()

print("--- Live Inference Test ---")
sample = "சென்னை: தமிழகத்தில் அடுத்த 3 நாட்களுக்கு பரவலாக மழை பெய்ய வாய்ப்பு உள்ளதாக வானிலை ஆய்வு மையம் தெரிவித்துள்ளது."
print(f"Input Tamil: {sample}")
print(f"Generated Summary: {generate_summary(sample, 'ta')}")

--- Live Inference Test ---
Input Tamil: சென்னை: தமிழகத்தில் அடுத்த 3 நாட்களுக்கு பரவலாக மழை பெய்ய வாய்ப்பு உள்ளதாக வானிலை ஆய்வு மையம் தெரிவித்துள்ளது.
Generated Summary: 達 चनन: तमळकततल अटतत 3 नटकळकक परवलक मळ पयय वयपप उळळतक वनल आयव मयम तरवततळळत.</s>


In [19]:
# @title 2B. [NEW - ACTIVE] Cross-Lingual Data (CrossSum)
from huggingface_hub import login
from datasets import load_dataset, concatenate_datasets
from google.colab import userdata

# --- 1. Hugging Face Authentication ---
try:
    hf_token = userdata.get('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')
    login(token=hf_token)
    print("Logged into Hugging Face via Colab Secrets.")
except userdata.SecretNotFoundError:
    print("Colab Secret 'HF_TOKEN' not found. Please paste your token below:")
    MANUAL_TOKEN = "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"
    login(token=MANUAL_TOKEN)

# --- 2. Load CrossSum (Tamil & Telugu -> English) ---
print("Loading cross-lingual Tamil and Telugu datasets from CrossSum...")
# CrossSum natively maps the Indic source article to an English summary
ds_ta = load_dataset("csebuetnlp/CrossSum", "tamil-english", split="train[:1000]", trust_remote_code=True)
ds_te = load_dataset("csebuetnlp/CrossSum", "telugu-english", split="train[:1000]", trust_remote_code=True)

# Smart Renamer: Ensures columns always match what Steps 3, 4, and 5 expect
def unify_columns(ds):
    if "article" in ds.column_names:
        ds = ds.rename_column("article", "text")
    if "source" in ds.column_names:
        ds = ds.rename_column("source", "text")
    if "target" in ds.column_names:
        ds = ds.rename_column("target", "summary")
    return ds

ds_ta = unify_columns(ds_ta)
ds_te = unify_columns(ds_te)

# --- 3. Load Custom Sanskrit Data ---
print("Loading Sanskrit data...")
SANSKRIT_DATA_PATH = '/content/drive/MyDrive/sanskrit_data.csv'
try:
    ds_sa = load_dataset('csv', data_files=SANSKRIT_DATA_PATH, split='train')
    ds_sa = unify_columns(ds_sa) # Apply the same smart renamer
except Exception as e:
    print(f"Could not load Sanskrit data: {e}. Proceeding with dummy data.")
    ds_sa = ds_ta.select(range(2))

# --- 4. Tagging and Concatenation ---
def add_lang_tag(example, lang_code):
    example['lang'] = lang_code
    return example

ds_ta = ds_ta.map(lambda x: add_lang_tag(x, 'ta'))
ds_te = ds_te.map(lambda x: add_lang_tag(x, 'te'))
ds_sa = ds_sa.map(lambda x: add_lang_tag(x, 'sa'))

columns_to_keep = ['text', 'summary', 'lang']
ds_ta = ds_ta.select_columns(columns_to_keep)
ds_te = ds_te.select_columns(columns_to_keep)
ds_sa = ds_sa.select_columns(columns_to_keep)

print("Combining datasets...")
full_dataset = concatenate_datasets([ds_ta, ds_te, ds_sa])

# --- 5. Data Cleaning ---
def clean_data(batch):
    if not batch['text'] or not batch['summary']:
        return False
    if len(str(batch['text'])) < 10 or len(str(batch['summary'])) < 5:
        return False
    return True

full_dataset = full_dataset.filter(clean_data)
print(f"Total clean cross-lingual training samples: {len(full_dataset)}")

# --- VISUAL VERIFICATION ---
print("\n--- SANITY CHECK: First Dataset Entry ---")
print(f"Language Tag: {full_dataset[0]['lang']}")
print(f"Source Text (Truncated): {str(full_dataset[0]['text'])[:150]}...")
print(f"Target Summary: {full_dataset[0]['summary']}")
print("-----------------------------------------")

Colab Secret 'HF_TOKEN' not found. Please paste your token below:
Loading cross-lingual Tamil and Telugu datasets from CrossSum...
Loading Sanskrit data...
Could not load Sanskrit data: Unable to find '/content/drive/MyDrive/sanskrit_data.csv'. Proceeding with dummy data.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Combining datasets...


Filter:   0%|          | 0/2002 [00:00<?, ? examples/s]

Total clean cross-lingual training samples: 2002

--- SANITY CHECK: First Dataset Entry ---
Language Tag: ta
Source Text (Truncated): ஆதித்யாவின் உடலில் துணி என்ன... ஒரு நூல் கூட இல்லை. அவர் என்னிடம் பேசும்போது, நண்டு, முட்டை, சீன முட்டைக்கோஸ் ஆகியற்றை வாணலில் வதக்கினார். அந்தப் பெரி...
Target Summary: What is it like to be a nudist in a country that prizes modesty and where public nudity is strictly forbidden? Clara Rondonuwu of the BBC's Indonesian service went to meet some members of the country's nudist community to find out.
-----------------------------------------


In [20]:
# @title 3. Script Unification & Tokenization
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from transformers import AutoTokenizer

# --- 1. Script Unification (Indic NLP) ---
# IndicBART requires all Indic scripts (Tamil, Telugu, etc.) to be in Devanagari.
lang_map = {'ta': 'ta', 'te': 'te', 'sa': 'sa'}

def unify_script(batch):
    src_lang = batch['lang']
    if src_lang in lang_map:
        # Transliterate source text to Devanagari (Hindi script)
        batch['text'] = UnicodeIndicTransliterator.transliterate(
            batch['text'],
            lang_map[src_lang],
            'hi'
        )
    return batch

print("Applying Script Unification to Devanagari script...")
full_dataset = full_dataset.map(unify_script)

# --- 2. Initialize Tokenizer ---
print("Initializing IndicBART Tokenizer...")
model_checkpoint = "ai4bharat/IndicBART"
# IndicBART uses AlbertTokenizer internally
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=False, use_fast=False)

MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 128

# --- 3. Preprocessing Function ---
def preprocess_function(examples):
    # Input Format: "Sentence </s> <2xx>" where xx is source lang
    inputs = [f"{text} </s> <2{lang}>" for text, lang in zip(examples["text"], examples["lang"])]

    # Target Format: "<2en> Summary </s>"
    # We explicitly prepend <2en> so the model learns to start in English
    targets = [f"<2en> {summary} </s>" for summary in examples["summary"]]

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Tokenize targets (FIXED: Uses text_target instead of as_target_tokenizer)
    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Replace padding token id with -100 so it's ignored in loss calculation
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]

    return model_inputs

print("Tokenizing combined dataset...")
# Splitting for evaluation before tokenizing
tokenized_datasets = full_dataset.train_test_split(test_size=0.1)
tokenized_datasets = tokenized_datasets.map(preprocess_function, batched=True)

print(f"Tokenization complete! Samples: {len(tokenized_datasets['train'])}")

Applying Script Unification to Devanagari script...


Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

Initializing IndicBART Tokenizer...
Tokenizing combined dataset...


Map:   0%|          | 0/1801 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Tokenization complete! Samples: 1801


In [21]:
# @title 4. Load Model with QLoRA & Target Module Detection
import bitsandbytes as bnb
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # lm_head should usually be excluded
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

print("Loading IndicBART in 4-bit...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32 # More stable than float16 for T4
)

model = MBartForConditionalGeneration.from_pretrained(
    model_checkpoint,
    quantization_config=bnb_config,
    device_map="auto"
)

model = prepare_model_for_kbit_training(model)

# Automatically detect all linear layers (q, v, k, out, fc1, fc2)
target_modules = find_all_linear_names(model)
print(f"Targeting modules: {target_modules}")

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Loading IndicBART in 4-bit...


Loading weights:   0%|          | 0/267 [00:00<?, ?it/s]



Targeting modules: ['v_proj', 'k_proj', 'out_proj', 'fc1', 'q_proj', 'fc2']
trainable params: 4,325,376 || all params: 444,993,536 || trainable%: 0.9720


In [22]:
# @title 5. Train and Save (Stable Config)
training_args = Seq2SeqTrainingArguments(
    output_dir=CHECKPOINT_DIR,
    per_device_train_batch_size=1,       # Safest batch size
    gradient_accumulation_steps=16,      # Effective batch size of 16
    learning_rate=2e-5,                  # Very conservative learning rate
    max_grad_norm=0.3,                   # Tight gradient clipping
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    fp16=False,                          # DISABLE THIS: Prevents the 0.000/nan issue
    label_smoothing_factor=0.1,
    report_to="none",
    push_to_hub=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# CLEAR CACHE before starting
torch.cuda.empty_cache()

print("Starting training. Loss should now appear as a number > 0.")
trainer.train()

# Final Save
MODEL_SAVE_PATH = f"{CHECKPOINT_DIR}/final_adapter_stable"
trainer.save_model(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 3, 'bos_token_id': 2}.


Starting training. Loss should now appear as a number > 0.


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Chrf
1,No log,6.654782,0.0374,0.0,0.0486,0.0486,0.0023,0.1191
2,No log,6.277786,0.105,0.0,0.1082,0.1082,0.0026,0.2921
3,No log,6.179753,0.2148,0.0,0.1931,0.1907,0.003,0.4816


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


('/content/drive/MyDrive/IndicBART_Summarization_Checkpoints/final_adapter_stable/tokenizer_config.json',
 '/content/drive/MyDrive/IndicBART_Summarization_Checkpoints/final_adapter_stable/tokenizer.json')

In [23]:
# @title 6. Test the Trained Model (Forced English Output)
def generate_summary(text, source_lang_code):
    model.eval()

    # 1. Script Unification
    if source_lang_code in lang_map:
        text = UnicodeIndicTransliterator.transliterate(text, lang_map[source_lang_code], 'hi')

    # 2. Format Input
    formatted_text = f"{text} </s> <2{source_lang_code}>"
    tokenized_inputs = tokenizer(
        formatted_text,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to("cuda")

    # 3. Target Language ID
    # Get the specific integer ID for the English token
    en_id = tokenizer.convert_tokens_to_ids("<2en>")

    with torch.no_grad():
        outputs = model.generate(
            input_ids=tokenized_inputs.input_ids,
            attention_mask=tokenized_inputs.attention_mask,
            max_new_tokens=128,
            min_new_tokens=20,
            num_beams=5,
            length_penalty=2.0,
            repetition_penalty=3.0,

            # --- THE KEY FIXES ---
            decoder_start_token_id=en_id, # Forces the first token to be English
            forced_bos_token_id=en_id,    # Stronger override for MBart models
            # ---------------------

            early_stopping=True
        )

    # 4. Decode
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Clean up any leftover language tags if they appear in text
    return summary.replace("<2en>", "").strip()

print("--- Live Inference Test ---")
sample = "சென்னை: தமிழகத்தில் அடுத்த 3 நாட்களுக்கு பரவலாக மழை பெய்ய வாய்ப்பு உள்ளதாக வானிலை ஆய்வு மையம் தெரிவித்துள்ளது."
print(f"Input Tamil: {sample}")
print(f"Generated Summary: {generate_summary(sample, 'ta')}")

--- Live Inference Test ---
Input Tamil: சென்னை: தமிழகத்தில் அடுத்த 3 நாட்களுக்கு பரவலாக மழை பெய்ய வாய்ப்பு உள்ளதாக வானிலை ஆய்வு மையம் தெரிவித்துள்ளது.
Generated Summary: 達 चनन: तमळकततल अटतत 3 नटकळकक परवलक मळ पयय वयपप उळळतक वनल आयव मयम तरवततळळत.</s>


In [24]:
# @title 6. Test the Trained Model (Hard-Forced English Output)
from peft import PeftModel
import torch

def generate_summary_forced(text, source_lang_code):
    model.eval()

    # 1. Script Unification
    if source_lang_code in lang_map:
        text = UnicodeIndicTransliterator.transliterate(text, lang_map[source_lang_code], 'hi')

    # 2. Format Input
    formatted_text = f"{text} </s> <2{source_lang_code}>"
    tokenized_inputs = tokenizer(
        formatted_text,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to("cuda")

    # 3. Create the physical Decoder Input Tensor
    en_id = tokenizer.convert_tokens_to_ids("<2en>")
    # We explicitly build a tensor with the English ID to feed directly to the decoder
    forced_decoder_input = torch.tensor([[en_id]], dtype=torch.long).to("cuda")

    # 4. Generate with injected decoder input
    with torch.no_grad():
        outputs = model.generate(
            input_ids=tokenized_inputs.input_ids,
            attention_mask=tokenized_inputs.attention_mask,
            decoder_input_ids=forced_decoder_input, # <--- THE SLEDGEHAMMER
            max_new_tokens=128,
            min_new_tokens=15,
            num_beams=5,
            length_penalty=1.5,
            repetition_penalty=2.5,
            early_stopping=True
        )

    # 5. Decode
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary.replace("<2en>", "").strip()

print("--- Live Inference Test ---")
sample = "சென்னை: தமிழகத்தில் அடுத்த 3 நாட்களுக்கு பரவலாக மழை பெய்ய வாய்ப்பு உள்ளதாக வானிலை ஆய்வு மையம் தெரிவித்துள்ளது."
print(f"Input Tamil: {sample}")
print(f"Generated Summary: {generate_summary_forced(sample, 'ta')}")

--- Live Inference Test ---
Input Tamil: சென்னை: தமிழகத்தில் அடுத்த 3 நாட்களுக்கு பரவலாக மழை பெய்ய வாய்ப்பு உள்ளதாக வானிலை ஆய்வு மையம் தெரிவித்துள்ளது.
Generated Summary: 達 चनन: तमळकततल अटतत 3 नटकळकक परवलक मळ पयय वयपप उळळतक वनल आयव मयम तरवततळळत.</s>


In [25]:
# @title Diagnostic: What is the Model Actually Seeing?

print("--- 1. Tokenizer Vocabulary Check ---")
# Check if the tokenizer actually knows the English tag, or if it's returning 'None'
en_token_id = tokenizer.vocab.get("<2en>", "ERROR: TOKEN NOT FOUND")
print(f"ID for <2en>: {en_token_id}")

print("\n--- 2. Dataset Label Check ---")
# Extract the first training example
sample_inputs = tokenized_datasets["train"][0]["input_ids"]
sample_labels = tokenized_datasets["train"][0]["labels"]

# Remove the -100 padding so we can decode the raw text
clean_labels = [l for l in sample_labels if l != -100]

print("Model Input (What it reads):")
print(tokenizer.decode(sample_inputs, skip_special_tokens=False))

print("\nModel Target (What it is supposed to learn):")
print(tokenizer.decode(clean_labels, skip_special_tokens=False))

--- 1. Tokenizer Vocabulary Check ---
ID for <2en>: 64004

--- 2. Dataset Label Check ---
Model Input (What it reads):
[CLS] पतवक नटटलळळ तळरचलकळ परवयटमपत अङकळळ अतकरकळ परटटवत कम जङ-उन वळककमक कणटरनतर. इननलयल, ऒर मननरपतत नलयततन कटटमन पण 70 चतवतम नरवटनतळळत कणट कम 'पचचरर' पनतकवम, मलम ओटटल ऒनरल 'मन तटटकळवट मचमन नलयलळळ' कळयल तटटकळ कणट अवर 'अतरचचयटनततकवम' अननटटन अरच ऊटकम चयत वळयटटळळत. अणआयतङकळ उरवककवतरक अटतत, तनत परळतर वळरचचकक वट करय मननरम अळतत वरकरत. चनवन ऎललपपकतय ऒटटयळळ वट हमयङ मकणततलळळ ननक इटङकळ इनत आयवपपणयनपत कम जङ-उन परवयटटर. करपपक ओरञचन मननरपतत नलयततन कटटमन पणकळ आरमपककपपटट 17 आणटकळ आकयळळ नलयल, इतवर 70 चतवत पणकळ मटटम नरवटनतळळत अवर कररञचटटयळळर. मलम, यमपञचन नकरततलळळ ऒर ओटटलन कटटमन पणकळ तटङकपपटट आर आणटकळकयळळ नलयल, अतन मरपचच वलकळकट इतवर मरकळळपपटवलल. अतपनर, आनप चररल वटतय परवयटट अवर, अङकळळ कळयल तटटकळ, "अळकककवम, इरटटकवम, चकतरमरर" नलयलम उळळतक करपपटटतक कचऎनए चयत मकम तरवततळळत. अतरकटतत, अङकळळ प तयरककम तळरचलय परवयटट कम जङ-उन, मकण अरच "ऒळङकरर मरयल" इयङक वरवतक करनर. पर चयतकळ: चमक ऊटकङकळल पपच तमळ:</s><2