# LoRA Fine-Tuning LLaMA-3B with Hugging Face Transformers

> **Description:** LoRA fine-tuning of LLaMA-3B on a Hugging Face dataset with W&B tracking, using the Transformers library.

### Requirements

* **Container:** `nvcr.io/nvidia/pytorch:25.09-py3`
* **GPUs:** 1–8 NVIDIA GPUs (≥24 GB per GPU recommended). H100/A100 or newer preferred.
* **Storage:** ~10–50 GB for datasets & checkpoints. See the [Storage Guide](https://docs.nvidia.com/dgx-cloud/lepton/features/storage/#use-storage-for-workloads).
* **Shared Memory (SHM):** ≥8 GB recommended for tokenization/dataloaders.
* **External Accounts:** Hugging Face token (needed for gated models) and a Weights & Biases API key (optional — W&B login is skipped when `WANDB_API_KEY` is not set).

### 1. Install Dependencies

In [None]:
# No DeepSpeed in this notebook to avoid libaio/AIO build/link issues
!pip -q install --upgrade pip
!pip -q install "transformers>=4.44" datasets accelerate peft bitsandbytes evaluate wandb ipywidgets tqdm

# Create Triton autotune dir to silence a warning
import os; os.makedirs('/root/.triton/autotune', exist_ok=True)

# Hide noisy FutureWarnings globally
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="`torch.cuda.amp.custom_", category=FutureWarning)

### 2. Environment Variables & Quick Mode

In [None]:
import os, warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="`torch.cuda.amp.custom_", category=FutureWarning)

# ---- User-configurable ----
# Every setting is read from the environment and falls back to the default
# shown here when the variable is not set.
MODEL_NAME      = os.environ.get('MODEL_NAME', 'meta-llama/Llama-3.2-3B')
DATASET_NAME    = os.environ.get('DATASET_NAME', 'imdb')
OUTPUT_DIR      = os.environ.get('OUTPUT_DIR', './outputs/llama3b_lora_imdb')
WANDB_API_KEY   = os.environ.get('WANDB_API_KEY', '')
WANDB_PROJECT   = os.environ.get('WANDB_PROJECT', 'llama3b-lora')
WANDB_ENTITY    = os.environ.get('WANDB_ENTITY', '')
HF_TOKEN        = os.environ.get('HF_TOKEN', '')  # needed for gated models

# Write the resolved values back into the environment so code that reads
# os.environ later sees the same settings. An explicit name -> value mapping
# is used instead of looking the variables up by name with eval().
for k, v in {
    'MODEL_NAME': MODEL_NAME,
    'DATASET_NAME': DATASET_NAME,
    'OUTPUT_DIR': OUTPUT_DIR,
    'WANDB_API_KEY': WANDB_API_KEY,
    'WANDB_PROJECT': WANDB_PROJECT,
    'WANDB_ENTITY': WANDB_ENTITY,
    'HF_TOKEN': HF_TOKEN,
}.items():
    os.environ[k] = v

# Optional logins: W&B and Hugging Face are attempted independently. A missing
# credential prints a notice; any exception raised along the way (including a
# failed import) is reported and the cell keeps going.
try:
    import wandb
    if not WANDB_API_KEY:
        print("WANDB_API_KEY not set; skipping W&B login.")
    else:
        wandb.login(key=WANDB_API_KEY)
        print("W&B login successful.")
except Exception as wandb_err:
    print("W&B login skipped:", wandb_err)

try:
    from huggingface_hub import login
    if not HF_TOKEN:
        print("HF_TOKEN not set; skipping HF login.")
    else:
        login(token=HF_TOKEN)
        print("HF login successful.")
except Exception as hf_err:
    print("HF login skipped:", hf_err)

# ---- Quick Mode (<= ~5 minutes on 1 GPU) ----
# Master switch; the cells below branch on it.
QUICK_MODE = True

# Values used only when QUICK_MODE is on.
QUICK_MAX_LEN = 256          # tokenizer max_length
QUICK_TRAIN_SAMPLES = 400    # upper bound on training rows
QUICK_MAX_STEPS = 120        # TrainingArguments.max_steps
QUICK_LR = 3e-4              # learning rate
QUICK_BSZ = 2                # per-device train and eval batch size
QUICK_GRAD_ACC = 1           # gradient accumulation steps
QUICK_LOG_STEPS = 10         # logging interval in steps

print(f"Quick Mode: {QUICK_MODE}")

### 3. Imports

In [None]:
# Intended to avoid tqdm's IProgress warning: if tqdm cannot be imported,
# install ipywidgets and tqdm with pip (best effort) and import it again.
# NOTE(review): the fallback only runs when importing tqdm itself fails; it
# does not check whether ipywidgets is present — confirm that is intended.
try:
    from tqdm import tqdm  # noqa: F401
except Exception:
    import subprocess
    import sys
    pip_cmd = [sys.executable, "-m", "pip", "install", "-q", "ipywidgets", "tqdm"]
    subprocess.run(pip_cmd, check=False)  # check=False: a failed install does not raise here
    from tqdm import tqdm  # noqa: F401

import warnings; warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="`torch.cuda.amp.custom_", category=FutureWarning)

import os, json, torch, random
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer,
    DataCollatorForLanguageModeling, set_seed
)
from peft import LoraConfig, get_peft_model

# Fix the random seed through transformers' set_seed helper.
set_seed(42)
# Print the number of CUDA devices torch reports for this process.
print(f"GPUs visible: {torch.cuda.device_count()}")

### 4. Load Dataset

In [None]:
# Load the dataset named by DATASET_NAME. The code below reads its "train" and
# "test" splits and each row's "text" and "label" fields.
raw = load_dataset(DATASET_NAME)

SYSTEM_PROMPT = "You are a helpful assistant that classifies movie review sentiment."
def build_prompt(text, label=None):
    """Render one review into the prompt template used by this notebook.

    Args:
        text: Review text placed in the ``[INPUT]`` section.
        label: Sentiment label. A value with ``int(label) == 1`` is rendered as
            ``positive``; any other int-convertible value as ``negative``.
            When ``None`` (the default) no answer is appended and the prompt
            ends right after the ``[RESPONSE]`` line, which is the form needed
            to let a model generate the answer.

    Returns:
        The formatted prompt string. With a label it ends in
        ``positive</s>`` or ``negative</s>``.
    """
    prompt = (
        f"<s>[SYSTEM]\n{SYSTEM_PROMPT}\n[/SYSTEM]\n"
        f"[INSTRUCTION]\nClassify the sentiment of the following review as positive or negative.\n[/INSTRUCTION]\n"
        f"[INPUT]\n{text}\n[/INPUT]\n"
        f"[RESPONSE]\n"
    )
    if label is None:
        # Inference form: leave the response open for the model to complete.
        return prompt
    target = "positive" if int(label) == 1 else "negative"
    return f"{prompt}{target}</s>"

def format_row(row):
    """Turn a dataset row into a single-field record holding its prompt.

    Reads ``row["text"]`` and ``row["label"]`` and returns ``{"text": prompt}``
    where ``prompt`` comes from ``build_prompt``.
    """
    review, sentiment = row["text"], row["label"]
    prompt = build_prompt(review, sentiment)
    return {"text": prompt}

# Choose the rows for this run. Quick Mode uses a small subset of each split.
# The split is shuffled with a fixed seed before the first N rows are taken:
# a plain head-of-split slice contains a single class whenever the split is
# stored grouped by label (NOTE(review): the Hub's imdb splits are commonly
# reported to be ordered this way — confirm for other datasets).
SUBSET_SEED = 42
if QUICK_MODE:
    n_train = min(QUICK_TRAIN_SAMPLES, len(raw["train"]))
    n_eval = min(64, len(raw["test"]))
    train_base = raw["train"].shuffle(seed=SUBSET_SEED).select(range(n_train))
    eval_base  = raw["test"].shuffle(seed=SUBSET_SEED).select(range(n_eval))
else:
    train_base = raw["train"]
    eval_base  = raw["test"]

# Replace every original column with the single formatted "text" prompt column.
train_ds = train_base.map(format_row, remove_columns=train_base.column_names)
eval_ds  = eval_base.map(format_row,  remove_columns=eval_base.column_names)

print(f"Train samples: {len(train_ds)} | Eval samples: {len(eval_ds)}")

### 5. Model & LoRA Setup

In [None]:
# Tokenizer: reuse EOS as the padding token when none is defined, pad on the right.
# NOTE(review): trust_remote_code=True is passed for both the tokenizer and the
# model while MODEL_NAME is user-configurable — confirm this is wanted.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Base model: bfloat16 weights when CUDA is available, float32 otherwise.
if torch.cuda.is_available():
    load_dtype = torch.bfloat16
else:
    load_dtype = torch.float32
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=load_dtype,
    trust_remote_code=True,
)

# LoRA adapter configuration, applied to the projection modules listed here.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with the adapter and report the trainable parameters.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

### 6. Training Configuration & Trainer (No DeepSpeed; Quick-aware)

In [None]:
import warnings; warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="`torch.cuda.amp.custom_", category=FutureWarning)

# Tokenization (Quick-aware): 1024 tokens for full runs, QUICK_MAX_LEN otherwise.
MAX_LEN = 1024 if not QUICK_MODE else QUICK_MAX_LEN
def tokenize_fn(batch):
    """Tokenize the "text" field of a batch, truncating to MAX_LEN without padding."""
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=MAX_LEN, padding=False)

# Tokenize the train split, then the eval split, in batches; the raw "text"
# column is dropped from the result.
train_tok, eval_tok = (
    split.map(tokenize_fn, batched=True, remove_columns=["text"])
    for split in (train_ds, eval_ds)
)

# Collator (mlm=False, i.e. not masked-language-model collation)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# TrainingArguments (no DeepSpeed anywhere)
# bf16 training is requested only when a CUDA device that supports it is
# present. This matches the model-loading cell, which already falls back to
# float32 without CUDA; a hard-coded bf16=True would not.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Arguments that are identical in Quick and full mode.
common_args = dict(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    lr_scheduler_type="cosine",
    bf16=use_bf16,
)

if QUICK_MODE:
    # Short run: step-capped, no evaluation or checkpointing during training.
    args = TrainingArguments(
        **common_args,
        per_device_train_batch_size=QUICK_BSZ,
        per_device_eval_batch_size=QUICK_BSZ,
        gradient_accumulation_steps=QUICK_GRAD_ACC,
        learning_rate=QUICK_LR,
        max_steps=QUICK_MAX_STEPS,
        warmup_ratio=0.0,
        logging_steps=QUICK_LOG_STEPS,
        eval_strategy="no",
        save_strategy="no",
        report_to=[],  # skip W&B in quick mode for speed
        run_name="quick-5min-run",
    )
else:
    # Full run: periodic evaluation and checkpoints; W&B reporting only when
    # an API key is present in the environment.
    args = TrainingArguments(
        **common_args,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=2e-4,
        warmup_ratio=0.03,
        logging_steps=10,
        eval_strategy="steps",
        eval_steps=100,
        save_steps=500,
        save_total_limit=2,
        report_to=["wandb"] if os.environ.get("WANDB_API_KEY") else [],
        run_name="llama3b-lora-imdb",
    )

# The Trainer gets an eval set only for full runs, capped at 1000 rows.
if QUICK_MODE:
    trainer_eval_ds = None
else:
    eval_cap = min(1000, len(eval_tok))
    trainer_eval_ds = eval_tok.select(range(eval_cap))

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=trainer_eval_ds,
    data_collator=collator,
)

print(f"Trainer ready | QUICK_MODE={QUICK_MODE} | MAX_LEN={MAX_LEN} | train_samples={len(train_tok)}")

### 7. Train & Evaluate (Quick/Full)

In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message="`torch.cuda.amp.custom_", category=FutureWarning)

# Run training; the returned result object is kept for later inspection.
train_result = trainer.train()
print("Training done.")

# Save artifacts
import os
adapter_subdir = "adapter_quick" if QUICK_MODE else "adapter"
adapter_dir = os.path.join(OUTPUT_DIR, adapter_subdir)
tok_dir = os.path.join(OUTPUT_DIR, "tokenizer")
for artifact_dir in (adapter_dir, tok_dir):
    os.makedirs(artifact_dir, exist_ok=True)

# Saving is best effort: a failure is printed and the cell continues.
try:
    trainer.save_model(adapter_dir)
    tokenizer.save_pretrained(tok_dir)
    print(f"Saved adapter to: {adapter_dir}")
except Exception as save_err:
    print("Save skipped:", save_err)

# Tiny eval in Quick Mode
if QUICK_MODE:
    try:
        # Reuse eval_tok from the tokenization cell: in Quick Mode it already
        # holds the (at most) 64-row test subset, formatted with build_prompt
        # and truncated to QUICK_MAX_LEN. This avoids downloading/loading the
        # dataset again and no longer redefines the global format_row and
        # tokenize_fn helpers as a side effect.
        metrics = trainer.evaluate(eval_dataset=eval_tok)
        print("Quick eval metrics:", metrics)
    except Exception as e:
        # Best effort: the quick evaluation is a sanity check, not a required step.
        print("Quick eval skipped:", e)
else:
    # Full mode evaluates on the eval dataset the Trainer was constructed with.
    metrics = trainer.evaluate()
    print("Eval metrics:", metrics)

### 8. Inference Test

In [None]:
# Simple generation to sanity-check the fine-tuned adapter
def generate_sentiment(review: str, max_new_tokens: int = 64):
    """Generate the model's sentiment answer for a single review.

    Args:
        review: Review text to classify.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded output sequence (prompt followed by the generated text),
        with special tokens skipped.
    """
    response_marker = "[RESPONSE]\n"
    templated = build_prompt(review, label=1)
    # build_prompt appends the answer for the given label after the response
    # marker. Cut the prompt right after the marker so the answer is not
    # handed to the model; rindex picks the template's own marker even if the
    # review text happens to contain the same string.
    prompt = templated[: templated.rindex(response_marker) + len(response_marker)]
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        # do_sample=False: greedy decoding, so the check is repeatable.
        out = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    return tokenizer.decode(out[0], skip_special_tokens=True)

print(generate_sentiment("A surprisingly heartfelt and funny film with great performances."))

### 9. Experiment Tracking (Weights & Biases)

In [None]:
import os
# Start a W&B run when an API key is present in the environment; otherwise
# print a notice. Any exception (including a failed import) is reported and
# does not stop the notebook.
try:
    import wandb
    if not os.environ.get("WANDB_API_KEY"):
        print("WANDB_API_KEY not set; skipping W&B init.")
    else:
        run_settings = dict(
            project=os.environ.get("WANDB_PROJECT", "llama3b-lora"),
            entity=os.environ.get("WANDB_ENTITY") or None,  # empty string -> None
            config={"model": MODEL_NAME, "dataset": DATASET_NAME},
            name="llama3b-lora-imdb",
        )
        wandb.init(**run_settings)
        print("W&B run initialized.")
except Exception as init_err:
    print("W&B init skipped:", init_err)

### 10. Export / Save Artifacts (Final Step)

In [None]:
import os
# Everything exported by this cell goes under <OUTPUT_DIR>/export.
export_dir = os.path.join(OUTPUT_DIR, "export")
os.makedirs(export_dir, exist_ok=True)

# Best effort: write the adapter and the tokenizer into their own
# sub-directories; a failure is printed instead of raised.
try:
    adapter_out = os.path.join(export_dir, "adapter")
    model.save_pretrained(adapter_out)
    tokenizer_out = os.path.join(export_dir, "tokenizer")
    tokenizer.save_pretrained(tokenizer_out)
    print("Saved adapter & tokenizer.")
except Exception as export_err:
    print("Adapter/tokenizer save skipped:", export_err)

# Optional: merge LoRA (requires base + adapter; may be memory heavy; keep off in quick mode)
# from peft import PeftModel
# base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, trust_remote_code=True)
# merged = PeftModel.from_pretrained(base, os.path.join(OUTPUT_DIR, "adapter_quick" if QUICK_MODE else "adapter"))
# merged = merged.merge_and_unload()
# merged.save_pretrained(os.path.join(export_dir, "merged_model"))
# print("Merged full model saved.")