# Dolly v2-3B LoRA Fine-Tuning
This notebook fine-tunes **Databricks' Dolly v2-3B** using the **LaMini-instruction** dataset.
LoRA (Low-Rank Adaptation) enables efficient fine-tuning even on smaller GPUs.

In [None]:

!pip install -q transformers accelerate peft datasets bitsandbytes sentencepiece

import os
import numpy as np
import pandas as pd
import torch
from typing import Dict, List
from functools import partial
import copy

from datasets import load_dataset, disable_caching
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    pipeline,
    logging as hf_logging
)
from transformers import DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training

# Switch off the `datasets` caching mechanism and raise the Hugging Face
# logging verbosity to INFO for the rest of the session.
disable_caching()
hf_logging.set_verbosity_info()


## 2. Load Dataset

In [None]:

# Fetch the training split, then keep only rows 0..199 so the run stays small.
dataset = load_dataset("MBZUAI/LaMini-instruction", split="train")
small_dataset = dataset.select(range(200))

# Show the slice summary and its first record as a quick sanity check.
print("Dataset slice:", small_dataset)
print("One sample:", small_dataset[0])


## 3. Prompt Templates

In [None]:

# Prompt scaffold: a fixed preamble, the instruction line, and a trailing
# "Response:" marker after which the answer text is appended directly.
prompt_template = "\n".join(
    [
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.",
        "Instruction: {instruction}",
        "Response:",
    ]
)
# The answer is inserted verbatim, with no surrounding text.
answer_template = "{response}"

# Sanity check: render the prompt template with the first sample's instruction.
print("Example prompt:\n", prompt_template.format(instruction=small_dataset[0]['instruction']))


## 4. Add Prompt/Answer/Text Fields

In [None]:

def _add_text(rec: Dict[str, str]) -> Dict[str, str]:
    """Attach rendered ``prompt``, ``answer`` and ``text`` fields to a record.

    Args:
        rec: A dataset record with ``"instruction"`` and ``"response"`` entries.

    Returns:
        The same record object, updated in place with three extra keys:
        ``prompt`` (the filled prompt template), ``answer`` (the filled answer
        template) and ``text`` (prompt immediately followed by answer).

    Raises:
        ValueError: If the instruction or the response is empty or otherwise
            falsy.
    """
    # Read both fields before validating so a missing key surfaces first.
    instruction, response = rec["instruction"], rec["response"]
    if not instruction:
        raise ValueError(f"Missing instruction in record: {rec}")
    if not response:
        raise ValueError(f"Missing response in record: {rec}")

    prompt = prompt_template.format(instruction=instruction)
    answer = answer_template.format(response=response)

    rec["prompt"] = prompt
    rec["answer"] = answer
    rec["text"] = prompt + answer
    return rec

# Apply _add_text to every record (one record per call), then show the first
# record so the new prompt/answer/text fields can be inspected.
small_dataset = small_dataset.map(_add_text)
print("After adding text field:", small_dataset[0])


## 5. Load Model and Tokenizer

In [None]:

model_id = "databricks/dolly-v2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token so padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token

# Load the base model in half precision, without 8-bit quantisation;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=False
)
# Size the embedding matrix to the tokenizer's vocabulary length.
# NOTE(review): presumably a no-op when the sizes already match — confirm.
model.resize_token_embeddings(len(tokenizer))


## 6. Tokenization & Preprocessing

In [None]:

# Token length that every example is padded or truncated to during tokenization.
MAX_LENGTH = 256

def _preprocess_batch(batch: Dict[str, List[str]]) -> Dict[str, List]:
    """Tokenize a batch of ``text`` strings and build causal-LM labels.

    An end-of-sequence token is appended to every text so each example ends
    with one explicit, supervised stop token. Rows are then padded or
    truncated to ``MAX_LENGTH``. Labels mirror ``input_ids`` except at padding
    positions, which are set to ``-100`` so the loss skips them.

    This notebook sets the pad token to the EOS token, so copying
    ``input_ids`` verbatim into the labels would make the model train on
    every padding slot.

    Args:
        batch: Mapping with a ``"text"`` list of strings, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``, ...) with an
        added ``"labels"`` entry of the same shape as ``input_ids``.
    """
    ignore_index = -100  # label value skipped by the cross-entropy loss
    eos = tokenizer.eos_token
    texts = [text + eos for text in batch["text"]]

    tokens = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH
    )

    # attention_mask is 0 on padding positions, so it separates real tokens
    # (including the appended EOS) from pad tokens even though both share the
    # same token id.
    tokens["labels"] = [
        [token_id if attended else ignore_index for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

# Tokenize in batches and drop the raw string columns listed in remove_columns.
# NOTE(review): "text" is not listed, so it stays in the encoded dataset —
# confirm that is intended.
encoded_small_dataset = small_dataset.map(
    _preprocess_batch,
    batched=True,
    remove_columns=["instruction", "response", "prompt", "answer"]
)

# NOTE(review): _preprocess_batch pads and truncates every row to MAX_LENGTH,
# so this filter appears to keep all rows — confirm whether it is still needed.
processed_dataset = encoded_small_dataset.filter(lambda rec: len(rec["input_ids"]) <= MAX_LENGTH)
# Hold out 14 examples for evaluation; seed=0 keeps the split reproducible.
split_dataset = processed_dataset.train_test_split(test_size=14, seed=0)
print("Dataset split:", split_dataset)

# Batch collator configured to pad to MAX_LENGTH (rounded to a multiple of 8).
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
    pad_to_multiple_of=8,
    padding='max_length',
    max_length=MAX_LENGTH
)


## 7. Configure LoRA

In [None]:

# LoRA hyperparameters: adapter rank, scaling factor, and adapter dropout.
LORA_R = 256
LORA_ALPHA = 512
LORA_DROPOUT = 0.05

# Adapters are attached only to modules named "query_key_value".
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"]
)

# NOTE(review): the model was loaded with load_in_8bit=False, yet it is passed
# through the int8-training preparation helper here — confirm this step is
# still wanted.
# NOTE(review): newer peft releases appear to replace this helper with
# prepare_model_for_kbit_training — verify against the installed version.
model = prepare_model_for_int8_training(model)
# Wrap the base model with the LoRA adapters and report the trainable parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


## 8. Training Configuration

In [None]:

# Core training hyperparameters and the folder used for checkpoints and logs.
EPOCHS = 3
LEARNING_RATE = 1e-4
MODEL_SAVE_FOLDER_NAME = "dolly-3b-lora"

# Batch size 1 per device; logging, evaluation and checkpointing once per epoch.
# NOTE(review): the base model is loaded in torch.float16 while fp16=True is
# also set here — confirm this combination trains without gradient-scaling
# errors on the installed library versions.
training_args = TrainingArguments(
    output_dir=MODEL_SAVE_FOLDER_NAME,
    overwrite_output_dir=True,
    fp16=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir=f"{MODEL_SAVE_FOLDER_NAME}/logs"
)

# Tie together the LoRA-wrapped model, the train/test splits and the collator.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator
)

# Turn off the model's use_cache flag for the training run.
model.config.use_cache = False


## 9. Train Model

In [None]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()

## 10. Save Model

In [None]:

# Write the trained model, the trainer's saved artefacts and the model config
# into MODEL_SAVE_FOLDER_NAME.
trainer.model.save_pretrained(MODEL_SAVE_FOLDER_NAME)
trainer.save_model(MODEL_SAVE_FOLDER_NAME)
trainer.model.config.save_pretrained(MODEL_SAVE_FOLDER_NAME)
# Success message; the leading character is the check-mark emoji (U+2705).
print(f"✅ Model saved to {MODEL_SAVE_FOLDER_NAME}")


## 11. Inference Test

In [None]:

def postprocess(response: str) -> str:
    """Return the completion that follows the first ``Response:`` marker.

    Only the first marker is treated as the prompt/answer boundary; any later
    ``Response:`` occurrences are part of the generated text and are kept
    as-is rather than being removed.

    Args:
        response: Full generated text, i.e. the prompt followed by the model's
            completion.

    Returns:
        The text after the first ``Response:``, with surrounding whitespace
        stripped.

    Raises:
        ValueError: If ``response`` does not contain ``Response:`` at all.
    """
    _, marker, completion = response.partition("Response:")
    if not marker:
        raise ValueError("Unexpected response format, expected 'Response:'")
    return completion.strip()

# Instruction used for a quick smoke test of the fine-tuned model.
inference_prompt = "List 5 reasons why someone should learn to cook"

# Put the model into inference mode before generating:
#  - eval() disables dropout layers, including the LoRA dropout;
#  - use_cache was set to False for training, so re-enable it for generation.
# NOTE: set use_cache back to False before resuming training on this model.
trainer.model.eval()
trainer.model.config.use_cache = True

inf_pipeline = pipeline(
    "text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
    max_length=256,
    trust_remote_code=True
)

# Generate from the rendered prompt, then keep only the part after "Response:".
raw = inf_pipeline(prompt_template.format(instruction=inference_prompt))[0]["generated_text"]
final_response = postprocess(raw)
print("Generated response:\n", final_response)
