# Step 1: Install All the Required Packages

In [None]:
pip install trl peft transformers accelerate bitsandbytes


Collecting trl
  Downloading trl-0.19.0-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.0.0->trl)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70

# Step 2: Import All the Required Libraries

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import torch

# Step 3: QLoRA - Quantize the model using BitsAndBytesConfig

In [None]:
# Hub id of the base checkpoint, shared by the tokenizer and the model.
BASE_MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"

# Load tokenizer. `trust_remote_code` is intentionally not set: the loader then
# never executes Python code shipped inside the Hub repository.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token  # important for training

# 16-bit dtype used for the matmuls on top of the 4-bit weights:
# bf16 where the GPU supports it, fp16 otherwise.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

# QLoRA-style 4-bit quantization. `load_in_4bit=True` alone keeps the library
# defaults (FP4 quantization, float32 compute); the QLoRA recipe uses NF4,
# double quantization and a 16-bit compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base model with its weights quantized on the fly.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",  # How to load the model across available devices (CPU, single/multiple GPUs, etc.).
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

# Step 4: Setting the LoRA Configuration & preparing for LoRA fine-tuning

In [None]:
# LoRA adapter settings (PEFT).
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,  # rank of the low-rank update matrices
    lora_alpha=16,  # scaling factor: controls the magnitude of the LoRA-updated weights
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # adapt the query projection and value projection
)

# Gradient checkpointing is a memory-saving technique used while training large models.
model.gradient_checkpointing_enable()
# The cache must be off for checkpointing to work.
model.config.use_cache = False
# Make sure the model is in training mode.
model.train()

# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622


# Step 5: Downloading the dataset to fine-tune & initializing the training arguments

In [None]:
from datasets import load_dataset

# Small quotes dataset used for a quick test run (the recorded run shows a
# 2508-example train split).
dataset = load_dataset(path="Abirate/english_quotes")

def formatting_func(example):
    """Return the training text for one dataset example.

    Args:
        example: Mapping that holds the text under the ``"quote"`` key.

    Returns:
        The value stored under ``"quote"``, unchanged.

    Raises:
        KeyError: If ``example`` has no ``"quote"`` key.
    """
    quote_text = example["quote"]
    return quote_text

# No need to tokenize manually; let SFTTrainer handle it

from transformers import TrainingArguments

# Pick the mixed-precision mode from the hardware instead of hard-coding bf16:
# bf16 when the GPU supports it, fp16 on other GPUs, full precision without a GPU.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=1,
    # One optimizer update per 4 micro-batches (effective batch size 4), so the
    # 2508-example train split would take about 627 updates per epoch.
    gradient_accumulation_steps=4,
    # Stop after 20 optimizer updates instead of running a full epoch.
    max_steps=20,
    # A positive max_steps takes precedence, so this is only used if max_steps is removed.
    num_train_epochs=1,
    # The learning rate rises linearly from 0 over the first 5 steps, then
    # follows the chosen schedule.
    warmup_steps=5,
    learning_rate=2e-4,  # 0.0002
    logging_steps=10,  # log the training loss every 10 steps
    save_strategy="no",
    bf16=use_bf16,
    fp16=use_fp16,
    report_to="none",
)


README.md: 0.00B [00:00, ?B/s]

quotes.jsonl:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

# Step 6: Initializing Supervised Fine-tuning Trainer and kick starting the training process

In [None]:
# The SFTTrainer is a specialized trainer from Hugging Face's trl (Transformers Reinforcement Learning)
# library, designed specifically for Supervised Fine-Tuning (SFT) of language models like LLaMA, GPT, Falcon, etc.
#
# `model` is already a PEFT model (wrapped by get_peft_model in step 4), so no
# `peft_config` is passed here. Supplying both is redundant, and how the trainer
# treats that combination depends on the installed TRL release.
#
# The tokenizer prepared in step 3 (pad token set to EOS) is passed explicitly
# as `processing_class`, so training uses that exact tokenizer rather than one
# the trainer loads by itself.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    processing_class=tokenizer,
    formatting_func=formatting_func,
)

trainer.train()

Applying formatting function to train dataset:   0%|          | 0/2508 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/2508 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2508 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2508 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss
10,2.4533
20,1.7631


TrainOutput(global_step=20, training_loss=2.1081936836242674, metrics={'train_runtime': 207.7381, 'train_samples_per_second': 0.385, 'train_steps_per_second': 0.096, 'total_flos': 159351286947840.0, 'train_loss': 2.1081936836242674})

# Step 7: Testing the fine-tuned LoRA model (i.e., the model with 4,194,304 trainable LoRA parameters)

In [None]:
from transformers import pipeline

# Load tokenizer again (needed for generation)
# tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf", trust_remote_code=True)

# Put model in eval mode
model.eval()

# The model was placed on its device(s) when it was loaded with
# device_map="auto", so it is not moved again here; the inputs are sent to
# wherever the model already lives.
device = model.device

# Example prompt — you can change this to anything
prompt = "Give me an inspiring quote about Exploration."

# Tokenize the prompt and move the tensors next to the model
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate response (no gradients are needed for inference)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,  # Tokens 50 - will be faster in response
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        # Training switched `model.config.use_cache` off for gradient
        # checkpointing; ask for the key/value cache explicitly so each new
        # token does not recompute the whole prefix.
        use_cache=True,
    )

# Decode and print
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


Give me an inspiring quote about Exploration. Hinweis: I am not sure if this is the right place to post this, please let me know if it's not!
Posted by Taro Takemoto 4 months ago
Greetings, fellow explorers! As someone who has always been fascinated by new horizons and uncharted territories, I would like to share with you one of my favorite quotes on exploration:
"The unknown is what makes life worth living." - Albert Camus
This quote resonates deeply with me because it captures the essence of why we should embrace exploration in all its forms. Whether it's traveling to a new country, trying a new hobby, or taking risks in our personal lives, there is something exhilarating about stepping into the unknown. It's where we find growth, creativity, and transformation.
As humans, we have an inherent desire for discovery and adventure. We crave novelty and excitement, whether it's through experiencing different cultures, meeting new people, or pushing ourselves beyond our limits. And it's pr

# Saving the LoRA Updated Weight Model locally

In [None]:
# Local directory name under which the fine-tuned model is written
new_model = "Llama-2-7b-chat-finetune"

# Persist the trained model held by the trainer into that directory
trainer.model.save_pretrained(save_directory=new_model)

# Pushing it to HuggingFace Hub - Lora Updated Weight Model

In [None]:
!huggingface-cli login

model.push_to_hub("jaich/Llama-2-7b-chat-finetune", check_pr=True)

tokenizer.push_to_hub("jaich/Llama-2-7b-chat-finetune",check_pr=True)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `HF_WRITE_TOKEN` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-aut

adapter_model.safetensors:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jaich/Llama-2-7b-chat-finetune/commit/f3d528e721a36e3ecf906d0f55efff33383484f5', commit_message='Upload tokenizer', commit_description='', oid='f3d528e721a36e3ecf906d0f55efff33383484f5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jaich/Llama-2-7b-chat-finetune', endpoint='https://huggingface.co', repo_type='model', repo_id='jaich/Llama-2-7b-chat-finetune'), pr_revision=None, pr_num=None)

# Step 8: Store New Llama2 Model (Llama-2-7b-chat-finetune)

# Merging the Base Model (Llama-2-7b) with the LoRA Updated Weight Model


How can we store our new Llama-2-7b-chat-finetune model now? We need to merge the weights from LoRA with the base model. Unfortunately, as far as I know, there is no straightforward way to do it: we need to reload the base model in FP16 precision and use the peft library to merge everything.

In [None]:
import gc

from peft import LoraConfig, PeftModel

# Free the GPU before reloading. The 4-bit training model (and, when this cell
# is re-run, the previously merged model) still holds GPU memory through these
# names. With that memory taken, loading the FP16 base model with
# device_map="auto" spills layers to CPU/disk, and loading the adapter on such
# a partly offloaded model is what appears to raise
# KeyError: 'base_model.model.model.model.embed_tokens'.
# NOTE: `trainer` and the training-time `model` are no longer usable afterwards.
trainer = None
model = None
base_model = None
gc.collect()
torch.cuda.empty_cache()

# Reload model in FP16 and merge it with LoRA weights.
# device_map={"": 0} places the whole model on GPU 0, so an undersized GPU
# fails with a clear out-of-memory error instead of silently offloading layers.
base_model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-chat-hf",
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
# Attach the adapter saved under `new_model`, then fold it into the base weights.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it (no remote code is needed for this checkpoint)
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



KeyError: 'base_model.model.model.model.embed_tokens'