In [5]:
%pip install peft accelerate bitsandbytes


Collecting peft
  Using cached peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Using cached peft-0.18.0-py3-none-any.whl (556 kB)
Using cached bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
Installing collected packages: bitsandbytes, peft
[2K   [38;2;114;156;31m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2/2[0m [peft]‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1/2[0m [peft]
[1A[2KSuccessfully installed bitsandbytes-0.42.0 peft-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Cell 1: Install (if needed) and imports

# If peft is not installed, uncomment this:
# %pip install peft accelerate

from pathlib import Path
import os

import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

from peft import LoraConfig, get_peft_model


In [None]:
# Cell 2: Paths and configuration
import torch 
# When the notebook is launched from a "notebooks/" folder the project root is
# its parent directory; otherwise the current working directory is used.
PROJECT_ROOT = Path.cwd().resolve().parents[0] if Path.cwd().name == "notebooks" else Path.cwd()
DATA_ROOT = PROJECT_ROOT / "data" / "corpora"

# Sanskrit -> English fine-tuning corpus (directory name on disk: sn_en_itihasa)
CORPUS_DIR = DATA_ROOT / "sn_en_itihasa"

MODEL_NAME = "facebook/nllb-200-distilled-600M"
OUTPUT_DIR = PROJECT_ROOT / "models" / "nllb_sa_en_lora"

# Maximum token lengths for source / target sequences.
MAX_SOURCE_LEN = 128
MAX_TARGET_LEN = 128

BATCH_SIZE = 2   # keep small for MPS memory
EPOCHS = 2
LEARNING_RATE = 2e-5

# Device selection. `torch.backends.cuda` has no `is_available()`; the
# availability checks live at `torch.backends.mps.is_available()` and
# `torch.cuda.is_available()`. Prefer the Apple GPU, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device


device(type='mps')

In [15]:
# Cell 3: Load parallel data from sa_en_itihasa (train + dev)

def load_parallel_split(corpus_dir: Path, split: str = "train") -> "Dataset":
    """Load one split of a line-aligned parallel corpus into a `Dataset`.

    Reads `<split>.sn` (source) and `<split>.en` (target) from `corpus_dir`,
    strips surrounding whitespace from every line and pairs the lines by
    position.

    Args:
        corpus_dir: Directory containing the `<split>.sn` / `<split>.en` files.
        split: File stem of the split to load (e.g. "train", "dev").

    Returns:
        A `Dataset` with two string columns, "source_text" and "target_text".

    Raises:
        AssertionError: If either of the two files does not exist.

    Warns:
        RuntimeWarning: If the two files have different line counts. The data
            is still truncated to the shorter file, but a mismatch usually
            means the pairs are no longer aligned, so it must not go unnoticed.
    """
    import warnings

    src_file = corpus_dir / f"{split}.sn"
    tgt_file = corpus_dir / f"{split}.en"

    assert src_file.exists(), f"Missing: {src_file}"
    assert tgt_file.exists(), f"Missing: {tgt_file}"

    with src_file.open("r", encoding="utf-8") as f_src, tgt_file.open("r", encoding="utf-8") as f_tgt:
        src_lines = [line.strip() for line in f_src]
        tgt_lines = [line.strip() for line in f_tgt]

    # Align lengths: keep only as many pairs as the shorter file provides.
    n = min(len(src_lines), len(tgt_lines))
    if len(src_lines) != len(tgt_lines):
        warnings.warn(
            f"Line count mismatch for split '{split}': "
            f"{len(src_lines)} source vs {len(tgt_lines)} target lines; "
            f"truncating to {n} pairs (sentence alignment may be broken).",
            RuntimeWarning,
            stacklevel=2,
        )

    return Dataset.from_dict({
        "source_text": src_lines[:n],
        "target_text": tgt_lines[:n],
    })

# Load both splits in order (training first, then the corpus' "dev" split,
# which is exposed under the "validation" key).
train_ds, val_ds = (
    load_parallel_split(CORPUS_DIR, split_name) for split_name in ("train", "dev")
)

datasets = DatasetDict({"train": train_ds, "validation": val_ds})

datasets


DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text'],
        num_rows: 75161
    })
    validation: Dataset({
        features: ['source_text', 'target_text'],
        num_rows: 6148
    })
})

In [16]:
# Cell 4: Load the NLLB tokenizer/model, move the model to the Apple GPU when
# available, and wrap it with LoRA adapters.

print("üîπ Loading NLLB base model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Apple Silicon (MPS) when PyTorch reports it, otherwise CPU.
mps_ready = torch.backends.mps.is_available()
device = torch.device("mps" if mps_ready else "cpu")
print(
    "‚öôÔ∏è  MPS device detected ‚Äî using Apple GPU for training."
    if mps_ready
    else "‚ö†Ô∏è  MPS not available. Falling back to CPU."
)

model = model.to(device)

# LoRA (low-rank adapter) hyper-parameters.
lora_settings = dict(
    task_type="SEQ_2_SEQ_LM",
    r=8,                                  # adapter rank
    lora_alpha=32,                        # scaling factor
    lora_dropout=0.1,                     # dropout applied inside the adapters
    target_modules=["q_proj", "v_proj"],  # attention projections that receive adapters
)
lora_config = LoraConfig(**lora_settings)

# Wrap the base model with PEFT.
model = get_peft_model(model, lora_config)

# Report how many parameters remain trainable after wrapping.
print("\n‚úÖ Model is ready for fine-tuning.")
model.print_trainable_parameters()



üîπ Loading NLLB base model...


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 0d8ae635-bdad-45f3-b901-063a4312cc00)')' thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/facebook/nllb-200-distilled-600M/f8d333a098d19b4fd9a8b18f94170487ad3f821d/tokenizer_config.json
Retrying in 1s [Retry 1/5].


‚öôÔ∏è  MPS device detected ‚Äî using Apple GPU for training.

‚úÖ Model is ready for fine-tuning.
trainable params: 1,179,648 || all params: 616,253,440 || trainable%: 0.1914


In [17]:
# Cell 5: Preprocessing function (modern Hugging Face API)
# Sanskrit ‚Üí English

# NLLB language codes for the translation direction (Sanskrit -> English).
SRC_LANG_CODE = "san_Deva"
TGT_LANG_CODE = "eng_Latn"

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs into model inputs and labels.

    Args:
        examples: Batch mapping with "source_text" and "target_text" lists
            (as produced by `datasets.map(..., batched=True)`).

    Returns:
        The tokenizer output for the source side ("input_ids",
        "attention_mask") with an added "labels" entry holding the target
        token ids, where padding positions are replaced by -100.
    """
    tokenizer.src_lang = SRC_LANG_CODE
    tokenizer.tgt_lang = TGT_LANG_CODE

    # Source side, truncated/padded to MAX_SOURCE_LEN.
    model_inputs = tokenizer(
        examples["source_text"],
        max_length=MAX_SOURCE_LEN,
        truncation=True,
        padding="max_length",
    )

    # Target side is tokenized separately so it honours MAX_TARGET_LEN.
    target_encodings = tokenizer(
        text_target=examples["target_text"],
        max_length=MAX_TARGET_LEN,
        truncation=True,
        padding="max_length",
    )

    # Labels are padded here with the tokenizer's pad id. The loss only skips
    # positions equal to -100, so without this replacement every padding
    # position would be scored as a real target token.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in target_encodings["input_ids"]
    ]

    return model_inputs

# Tokenize every split in batches; the raw text columns are dropped so only
# the tokenizer's output columns remain.
raw_text_columns = ["source_text", "target_text"]
tokenized_datasets = datasets.map(
    preprocess_function, batched=True, remove_columns=raw_text_columns
)

tokenized_datasets


Map:   0%|          | 0/75161 [00:00<?, ? examples/s]

Map:   0%|          | 0/6148 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 75161
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6148
    })
})

In [18]:
# Cell 6: Batch collator for seq2seq training, built from the tokenizer and
# the (PEFT-wrapped) model.

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [19]:
from transformers import Seq2SeqTrainingArguments

# Cell 7: Training configuration (compatible across new transformers versions)

# Checkpoints and logs are written to the same run directory.
run_dir = str(PROJECT_ROOT / "logs" / "nllb_sa_en_lora")

training_args = Seq2SeqTrainingArguments(
    # Where things are written
    output_dir=run_dir,
    logging_dir=run_dir,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluation / reporting
    predict_with_generate=True,
    logging_steps=50,
    report_to=[],  # empty list: no external reporting integrations
)

training_args




In [None]:
# ============================================================
# Cell 8: Fast LoRA fine-tuning on MPS with safe Trainer
# ============================================================

import torch
from transformers import Seq2SeqTrainer

# Make sure the model is on the best available device: Apple GPU (MPS) first,
# then CUDA, then CPU. `torch.backends.cuda` has no `is_available()`; the
# checks are `torch.backends.mps.is_available()` and `torch.cuda.is_available()`.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
print(f"‚öôÔ∏è Using device: {device}")

# ---- Seq2SeqTrainer subclass whose loss hook swallows `num_items_in_batch` ----
class MPSSeq2SeqTrainer(Seq2SeqTrainer):
    """`Seq2SeqTrainer` whose `compute_loss` accepts but discards `num_items_in_batch`."""

    def compute_loss(
        self,
        model,
        inputs,
        return_outputs: bool = False,
        num_items_in_batch: int | None = None,
    ):
        """Compute the loss via the parent class without `num_items_in_batch`.

        Args:
            model: Model being trained.
            inputs: Batch of model inputs.
            return_outputs: Forwarded unchanged to the parent implementation.
            num_items_in_batch: Accepted because the training loop passes it;
                intentionally never forwarded to the parent or the model.

        Returns:
            Whatever the parent `compute_loss` returns for these arguments.
        """
        # Keep the keyword in the signature but drop its value on purpose.
        del num_items_in_batch
        parent_compute_loss = super().compute_loss
        return parent_compute_loss(model, inputs, return_outputs=return_outputs)

# ---- Use a smaller subset for faster experiments ----
# You can increase these numbers later once you're happy.
# The sizes are clamped so the cell also works on corpora smaller than the cap.
train_full = tokenized_datasets["train"]
val_full = tokenized_datasets["validation"]
train_subset = train_full.select(range(min(4000, len(train_full))))   # up to the first 4k examples
val_subset = val_full.select(range(min(400, len(val_full))))          # up to the first 400 examples

print(f"Using {len(train_subset)} train examples and {len(val_subset)} validation examples.")

# ---- Define Trainer (no monkey patching) ----
# `processing_class` is the current name of the deprecated `tokenizer` argument.
trainer = MPSSeq2SeqTrainer(
    model=model,
    args=training_args,           # same TrainingArguments from earlier cell
    train_dataset=train_subset,
    eval_dataset=val_subset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# NOTE(review): the RecursionError recorded in this cell's saved output is not
# explained by the code in this cell. If an earlier run in the same kernel
# monkey-patched a Trainer method, restart the kernel and run all cells in
# order before retrying -- TODO confirm.
print("üöÄ Starting LoRA fine-tuning for Sanskrit ‚Üí English on Apple MPS GPU...")
train_result = trainer.train()
print("‚úÖ Fine-tuning complete.")

# Save just the LoRA fine-tuned weights.
# `model.state_dict()` on the PEFT-wrapped model also contains every frozen
# base-model tensor, so keep only the adapter ("lora_") entries. Key names are
# left untouched, so they can be restored into an identically wrapped model
# with `model.load_state_dict(torch.load(path), strict=False)`.
output_path = "models/nllb_sa_en_lora_fast.pth"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)  # torch.save does not create directories
lora_state = {
    name: tensor.detach().cpu()   # store on CPU so the file loads on any machine
    for name, tensor in model.state_dict().items()
    if "lora_" in name
}
torch.save(lora_state, output_path)
print(f"üíæ Saved LoRA weights to: {output_path}")
train_result


‚öôÔ∏è Using device: mps
Using 4000 train examples and 400 validation examples.
üöÄ Starting LoRA fine-tuning for Sanskrit ‚Üí English on Apple MPS GPU...


  trainer = MPSSeq2SeqTrainer(


RecursionError: maximum recursion depth exceeded