# Finetuning Llama 2 via LoRA

This setup used about 17 GB of VRAM. With bitsandbytes 8-bit quantization the memory footprint can be reduced further, but that option only works on WSL2 or native Linux.

In [None]:
# If running in Colab or for first-time local use, uncomment the lines below.
# Versions are problematic; this combination should work for NVIDIA 3000-series and newer GPUs.
# Install the CUDA build of torch first: pip skips packages that are already installed,
# so running the torch line after the general install would change nothing.
#%pip install torch --index-url https://download.pytorch.org/whl/cu124
#%pip install "transformers==4.38.2" "peft==0.8.2" torch datasets "accelerate==0.27.2" sentencepiece

In [None]:
import torch, transformers, accelerate
# Report library versions and CUDA visibility so environment problems show up
# before any model is downloaded.
cuda_ok = torch.cuda.is_available()
env_report = [
    ("transformers", transformers.__version__),
    ("accelerate", accelerate.__version__),
    ("PyTorch:", torch.__version__),
    ("CUDA verfügbar:", cuda_ok),
    ("CUDA Version:", torch.version.cuda),
    ("Geräte:", torch.cuda.device_count()),
    # The device name is only queried when a CUDA device is actually available.
    ("Gerätename:", torch.cuda.get_device_name(0) if cuda_ok else "Keine CUDA-GPU"),
]
print("\n".join(f"{label} {value}" for label, value in env_report))

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
import os
import math

## Get model from HF

In [None]:
# Pre-trained base checkpoint, deliberately not a chat variant (to show the difference).
MODEL_NAME = "NousResearch/Llama-2-7b-hf"

# Loading options for the weights: automatic device placement, bfloat16 weights.
model_load_kwargs = {"device_map": "auto", "torch_dtype": torch.bfloat16}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)

## Configure LoRA

In [None]:
# LoRA hyperparameters, gathered in one mapping so they are easy to tweak.
lora_settings = dict(
    task_type=TaskType.CAUSAL_LM,
    r=8,                                  # rank
    lora_alpha=32,                        # scaling factor
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # common Llama 2 setup
)
lora_config = LoraConfig(**lora_settings)

# Wrap the base model with the adapter and report how many parameters will be trained.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


## Get Dataset

In [None]:
# Demo-sized settings: number of TinyStories examples and per-example token budget.
NUM_STORIES = 2000
MAX_SEQ_LENGTH = 256

# Batch padding needs a pad token. Fall back to EOS only when the tokenizer of the
# chosen checkpoint does not define one, so an existing pad token is left untouched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Download TinyStories (use a small subset for demo)
ds = load_dataset("roneneldan/TinyStories", split=f"train[:{NUM_STORIES}]")


def tokenize_function(example, max_length=MAX_SEQ_LENGTH):
    """Tokenize a batch of stories for causal-LM (next-token prediction) training.

    Args:
        example: Batch mapping with a "text" entry holding the story strings.
        max_length: Maximum number of tokens kept per story; longer stories are truncated.

    Returns:
        The tokenizer output for the batch. Sequences are left unpadded on purpose:
        padding is applied per batch by the data collator instead of padding every
        example up to ``max_length`` here.
    """
    return tokenizer(example["text"], truncation=True, max_length=max_length)


# Tokenize the dataset, drop the raw text column and hold out 5% for evaluation.
tokenized_ds = (
    ds.map(tokenize_function, batched=True, remove_columns=["text"])
    .train_test_split(test_size=0.05, seed=42)
)

# Data collator for dynamic padding: pads each batch and creates the labels.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal LM, not masked like BERT
)

## Set training args

In [None]:
training_args = TrainingArguments(
    output_dir="./outputs",
    # --- optimisation ---
    num_train_epochs=2,               # kept small for the demo; increase for a better fit
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # --- precision: bfloat16 mixed precision on (needs GPU support), float16 off ---
    bf16=True,
    fp16=False,
    # --- evaluation, logging, checkpointing ---
    evaluation_strategy="steps",
    eval_steps=250,
    logging_steps=50,
    save_strategy="no",
    report_to="none",
)


## Train LoRA

In [None]:
# Name the two splits explicitly before handing them to the Trainer.
train_split = tokenized_ds["train"]
eval_split = tokenized_ds["test"]

# Combine the LoRA-wrapped model, the training arguments, the data and the collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Run the fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

## (Optional) Load LoRA

In [None]:
## Remove comments to load LoRA from disk
## Prerequisite: the adapter directory below must already exist, e.g. written by the
## save cell at the end of this notebook during an earlier run.

# PeftModel is not imported in the import cell of this notebook, so import it here
#from peft import PeftModel

#LORA_PATH = "./lora-tinystories"

# Load tokenizer
#tokenizer = AutoTokenizer.from_pretrained(LORA_PATH)

# Load base model
# NOTE: load_in_8bit relies on bitsandbytes (see the platform remark at the top of the notebook)
#base_model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     load_in_8bit=True,
#     device_map="auto"
# )

# Load LoRA adapter on top of base model
# model = PeftModel.from_pretrained(base_model, LORA_PATH)
# model.eval()

# print("Loaded LoRA adapter and tokenizer from", LORA_PATH)

## Evaluate

In [16]:
# - model: your LoRA-finetuned model (after training - as model is modified in-place)
# - base_model: a fresh, original model loaded from Hugging Face (no LoRA, no finetuning)
# The reference copy is loaded in the same dtype as the fine-tuned model. Without
# torch_dtype the weights are loaded in float32, which needs roughly twice the memory
# and makes the side-by-side comparison uneven.
# NOTE(review): PEFT's `model.disable_adapter()` context manager may allow base-model
# generations without a second copy in memory - confirm before relying on it.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
base_model.eval()
# The fine-tuned model may still be in train mode after training; eval() switches
# dropout (including the LoRA dropout) off for generation.
model.eval()

# Seed applied before every generation so both models sample from the same RNG state
# and the cell output is reproducible.
GENERATION_SEED = 42

# Your prompt list, easily extendable
PROMPTS = [
    "Write me a story about a dragon and a lama. Once upon a time, there was a tiny dragon who",
    "Tell me a story about a small cat who learns to share her toys.",
    "Write a bedtime story about two best friends who go on an adventure to find a lost balloon.",
    "Explain the difference between a llama and an alpaca in simple terms.",
    "What is the capital of France? Give a fun fact about the city.",
    # Add more prompts below if you like!
]

# Containers for per-prompt perplexities; they are not filled in this cell.
lora_ppls = []
base_ppls = []


def generate_completion(lm, prompt, max_new_tokens=150, temperature=0.7, seed=GENERATION_SEED):
    """Sample a continuation of ``prompt`` from ``lm`` and return it as text.

    Args:
        lm: Model to generate with (fine-tuned or base).
        prompt: Text the model should continue.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        seed: Value passed to ``torch.manual_seed`` right before sampling.

    Returns:
        The decoded sequence (prompt plus continuation) without special tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(lm.device)
    torch.manual_seed(seed)
    # Keyword arguments plus an explicit attention mask avoid relying on positional
    # argument handling and on generate() inferring the mask.
    output_ids = lm.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


for prompt in PROMPTS:
    print("="*60)
    print(f"Prompt: {prompt}\n")

    # LoRA finetuned model output
    lora_output = generate_completion(model, prompt)
    print("LoRA Finetuned Output:\n", lora_output)

    # Base model output
    base_output = generate_completion(base_model, prompt)
    print("Base Model Output:\n", base_output)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.28it/s]


Prompt: Write me a story about a dragon and a lama. Once upon a time, there was a tiny dragon who

LoRA Finetuned Output:
 Write me a story about a dragon and a lama. Once upon a time, there was a tiny dragon who lived in a cave. The dragon was very lonely, so he decided to go on an adventure. He flew out of the cave and saw a lama. The lama was very friendly and the dragon was happy to meet him.

The dragon and the lama became best friends. They played together and had lots of fun. The dragon taught the lama how to fly and the lama taught the dragon how to dance. They were the best of friends and had lots of fun together.

One day, the dragon and the lama were playing when they heard a loud noise. It was a big, bad monster! The dragon and the lama were scared and
Base Model Output:
 Write me a story about a dragon and a lama. Once upon a time, there was a tiny dragon who lived in a cave in the mountains. He was very lonely and wanted to make friends, but he was too small to fly and to

In [None]:
# Write the adapter (via the model) and the tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./lora-tinystories")
print("Adapter and tokenizer saved to ./lora-tinystories")