## Fine-Tuning Using LoRA and QLoRA

This script demonstrates fine-tuning the Llama 2 model with QLoRA (Quantized Low-Rank Adaptation): LoRA (Low-Rank Adaptation) adapters are trained on top of a base model loaded in 4-bit precision.
It performs:
1. Dataset loading and preparation.
2. Model configuration with quantization.
3. Fine-tuning using SFTTrainer.
4. Saving and reloading the fine-tuned model.
5. Example usage with text generation.

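Dependencies:
- accelerate
- peft
- bitsandbytes
- transformers
- trl
- datasets (imported by the script; not pinned in the install cell below)
- torch (imported by the script; not pinned in the install cell below)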

**Note:**
 - For QLoRA training, we use a LoRA rank (`lora_r`) of 64 and a scaling parameter (`lora_alpha`) of 16. The Llama 2 model is loaded in 4-bit precision using the NF4 data type, and training runs for a single epoch.

In [None]:
# Install Required Packages
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [None]:
# Import Libraries
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# Configuration
# Hub identifiers: base checkpoint, dataset, and the directory name used when
# the fine-tuned weights are saved.
model_name = "NousResearch/Llama-2-7b-chat-hf"
dataset_name = "mlabonne/guanaco-llama2-1k"
new_model = "Llama-2-7b-chat-finetune"

# LoRA Parameters
# Forwarded to LoraConfig as r, lora_alpha and lora_dropout respectively.
lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

# Quantization Parameters
# Forwarded to BitsAndBytesConfig. The compute dtype is kept as a string and
# resolved to a torch dtype with getattr(torch, ...) where the config is built.
use_4bit, use_nested_quant = True, False
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"

# Training Arguments
# Plain keyword mapping; it is expanded into TrainingArguments(**training_args).
training_args = dict(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
)

In [None]:
# Load Dataset
# Only the "train" split of the dataset named by dataset_name is loaded; the
# result is later handed to SFTTrainer as train_dataset.
dataset = load_dataset(dataset_name, split="train")

# Quantization Configuration
# bnb_4bit_compute_dtype is stored as a string (e.g. "float16"); look up the
# torch attribute of the same name to get the actual dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
# Bundle the quantization switches defined in the configuration cell. This
# object is passed to from_pretrained() as quantization_config when the model
# is loaded for training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# GPU Compatibility Check
# Advisory only: when 4-bit loading is combined with float16 compute, tell the
# user if the GPU could train with bf16 instead. No setting is changed here.
# The device capability is queried only when CUDA is available, because
# torch.cuda.get_device_capability() raises when no CUDA device can be used
# and a hint should never abort the cell.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    # major >= 8 is the compute-capability threshold this script treats as
    # bfloat16-capable (see the message printed below).
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load Model and Tokenizer
# Load the base causal LM named by model_name with the quantization settings
# built above (bnb_config).
# NOTE(review): device_map={"": 0} is understood to place the whole model on
# CUDA device 0 -- confirm against the Transformers/Accelerate docs.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
)
# Turn off the use_cache flag on the model config before training.
model.config.use_cache = False
# NOTE(review): pretraining_tp is a Llama config field; 1 is presumably the
# "no tensor-parallel emulation" setting -- confirm in the LlamaConfig docs.
model.config.pretraining_tp = 1

# Tokenizer Configuration
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run locally; verify that this checkpoint actually needs it.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

# LoRA Configuration
# Adapter hyperparameters come from the configuration cell (lora_r,
# lora_alpha, lora_dropout). bias="none" and task_type="CAUSAL_LM" are fixed
# here; this object is passed to SFTTrainer as peft_config.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training Arguments
# Expand the plain training_args dict into a TrainingArguments instance; every
# key in the dict must therefore be a valid TrainingArguments keyword.
training_arguments = TrainingArguments(**training_args)

# Fine-Tuning Setup
# Wire together the quantized base model, the training split, the LoRA
# configuration, the tokenizer and the training arguments.
# dataset_text_field="text" names the dataset column that is read as training
# text; packing is disabled.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

# Train Model
# Runs the fine-tuning loop configured by training_arguments.
trainer.train()

# Save Trained Model
# Write the trainer's model to the directory named by new_model. That same
# directory is read back with PeftModel.from_pretrained() in the merge cell,
# i.e. it is treated as the LoRA adapter checkpoint.
trainer.model.save_pretrained(new_model)

In [None]:
# Text Generation Example
# `logging` is the transformers logging module imported at the top of the
# file; only CRITICAL messages are shown from here on.
logging.set_verbosity(logging.CRITICAL)

# Text-generation pipeline over the in-memory (just fine-tuned) model, with
# the overall length capped at 200.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# Wrap the question in the <s>[INST] ... [/INST] markers and print the text of
# the first returned candidate.
prompt = "What is a large language model?"
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]["generated_text"])

In [None]:
# Clean Up VRAM
import gc

# Drop the notebook-level references to the training model, the generation
# pipeline and the trainer so the objects they hold become collectable.
del model, pipe, trainer

# Collect the now-unreachable Python objects (this is what frees their CUDA
# tensors) ...
gc.collect()
# ... then hand the freed blocks back to the driver. Without this, PyTorch's
# caching allocator keeps the memory reserved, so the VRAM still looks
# occupied from outside the process. empty_cache() only releases unoccupied
# cached blocks; it does not touch tensors that are still referenced.
torch.cuda.empty_cache()

In [None]:
# Merge and Save LoRA Model
# Reload the base checkpoint in float16 without any quantization_config, so
# the adapter is merged into half-precision weights rather than the 4-bit
# weights used during training.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
# Attach the LoRA adapter saved under new_model, then fold its weights into
# the base model so the result no longer needs the adapter.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Keep a copy of the tokenizer next to the adapter checkpoint (original
# behaviour of this cell).
tokenizer.save_pretrained(new_model)

# Persist the merged model as well. Previously only the tokenizer was written,
# so the merged weights existed only in memory and were lost when the process
# ended. A separate directory is used because new_model already contains the
# adapter files written by trainer.model.save_pretrained(); mixing adapter and
# full-model files in one directory makes later loading ambiguous.
merged_model_dir = f"{new_model}-merged"
model.save_pretrained(merged_model_dir)
tokenizer.save_pretrained(merged_model_dir)
