In [2]:
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import GenerationConfig, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

import torch
from peft import PeftModel
import evaluate 
import numpy as np

from datasets import load_dataset

In [3]:
ds = load_dataset("ai4bharat/samanantar", "kn")
ds

DatasetDict({
    train: Dataset({
        features: ['idx', 'src', 'tgt'],
        num_rows: 4093524
    })
})

In [4]:
# Sampling a smaller subset
ds = ds['train'].train_test_split(5000, shuffle=True, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['idx', 'src', 'tgt'],
        num_rows: 4088524
    })
    test: Dataset({
        features: ['idx', 'src', 'tgt'],
        num_rows: 5000
    })
})

In [5]:
train_ds = ds['train'].shuffle(seed=42).select(range(25000))
test_ds = ds['test']
print(f"Test Dataset: {test_ds}")
print(f"Train Dataset: {train_ds}")

Test Dataset: Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 5000
})
Train Dataset: Dataset({
    features: ['idx', 'src', 'tgt'],
    num_rows: 25000
})


In [6]:
model_id = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_id, model_max_length=512)
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def format_text(src, trgt=None):
    if trgt is None:
        return f""" Translate English to Kannada: English: {src}, Kannada:"""
    else:
        return f""" Translate English to Kannada: English: {src}, Kannada:{trgt}"""

In [8]:
def tokenize_text(example):
    src = example["src"]
    trgt = example["tgt"]

    full_text = format_text(src, trgt)

    prompt_text = format_text(src)

    # Tokenize full text
    tokenized_full = tokenizer(
        full_text,
        truncation=True,
        max_length=256,
        padding=False
    )

    # Tokenize prompt-only
    tokenized_prompt = tokenizer(
        prompt_text,
        truncation=True,
        max_length=256,
        padding=False
    )

    input_ids = tokenized_full["input_ids"]

    # Create labels
    labels = input_ids.copy()

    # Mask English + instruction tokens
    prompt_len = len(tokenized_prompt["input_ids"])
    labels[:prompt_len] = [-100] * prompt_len

    return {
        "input_ids": input_ids,
        "attention_mask": tokenized_full["attention_mask"],
        "labels": labels
    }

In [9]:
test_tokenized_ds = test_ds.map(tokenize_text, remove_columns=ds.column_names['train'])
test_tokenized_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5000
})

In [10]:
output_dir = "./llama-kannada-lora"
last_checkpoint = get_last_checkpoint(output_dir)

print(last_checkpoint)

./llama-kannada-lora\checkpoint-2500


In [11]:
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    torch_dtype=torch.float16,
    device_map="auto"
)

In [12]:
model = PeftModel.from_pretrained(
    base_model,
    last_checkpoint
)

model.eval()


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [13]:
model.generation_config = GenerationConfig(
    max_new_tokens=128,
    do_sample=False,   # important for eval stability
)

In [14]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,          # REQUIRED
    label_pad_token_id=-100
)

In [15]:

sacrebleu = evaluate.load("sacrebleu")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # Decode predictions and labels
    preds = np.where(preds < 0, tokenizer.pad_token_id, preds)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels (mask token) with pad_token_id for decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-processing: Extract the generated Kannada text
    # Assuming your prompt ends with "Kannada:"
    decoded_preds = [pred.split("Kannada:")[-1].strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # 1. Compute SacreBLEU (expects list of list for references)
    bleu_results = sacrebleu.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels]
    )

    # 2. Compute BERTScore (expects list of strings for references)
    # lang="kn" ensures it uses the correct multilingual model for Kannada
    bert_results = bertscore.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        lang="kn"
    )

    return {
        "bleu": bleu_results["score"],
        "bertscore_precision": np.mean(bert_results["precision"]),
        "bertscore_recall": np.mean(bert_results["recall"]),
        "bertscore_f1": np.mean(bert_results["f1"]),
    }


training_args = Seq2SeqTrainingArguments(

    per_device_eval_batch_size=16,
    predict_with_generate=True, 

    fp16=True,                  
    report_to="none",

)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset=test_tokenized_ds.select(range(1000)), 
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
metrics = trainer.evaluate()

In [21]:
for key in metrics:
    print(f"{key}: {round(metrics[key], 4)}")

eval_loss: 0.5815
eval_model_preparation_time: 0.0
eval_bleu: 31.9009
eval_bertscore_precision: 0.763
eval_bertscore_recall: 0.888
eval_bertscore_f1: 0.8194
eval_runtime: 438.2946
eval_samples_per_second: 2.282
eval_steps_per_second: 0.144
