<a href="https://colab.research.google.com/github/MoSahil147/AI-A-Z-Projects/blob/main/Fine_Tuning_of_LLMs_using_Hugging_Face.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning LLMs with Hugging Face

## Step 1: Installing and importing the libraries

In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [2]:
!pip install huggingface_hub



In [3]:
import torch
from trl import SFTTrainer
from peft import LoraConfig
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/usr/local/lib/python3.11/dist-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


## Step 2: Loading the model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Pick the compute device once; later cells branch on `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

if device == "cuda":
    # Load the base weights 4-bit quantized (NF4, fp16 compute) so the model
    # stays resident on the GPU. The previous full-precision load with an 8GB
    # cap and disk offload pushes layers off the GPU, and offloaded layers
    # cannot be fine-tuned.
    llama_model = AutoModelForCausalLM.from_pretrained(
        "aboonaji/llama2finetune-v2",
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        ),
        device_map="auto",
    )
else:
    # CPU fallback: plain load with library defaults (bitsandbytes
    # quantization needs a GPU build).
    llama_model = AutoModelForCausalLM.from_pretrained("aboonaji/llama2finetune-v2")

# The KV cache is only useful for generation; turn it off for training.
llama_model.config.use_cache = False
llama_model.config.pretraining_tp = 1

if device == "cuda":
    # Enable gradient checkpointing only on GPU
    llama_model.gradient_checkpointing_enable()
    # With frozen base weights + LoRA adapters, checkpointed segments need
    # inputs that require grad, otherwise the loss can end up with no grad path.
    llama_model.enable_input_require_grads()

Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Step 3: Loading the tokenizer

In [None]:
# Fetch the tokenizer that ships with the fine-tuning checkpoint.
# NOTE(review): trust_remote_code=True permits repository-provided code to run
# while loading — confirm this repository actually needs it.
llama_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="aboonaji/llama2finetune-v2",
    trust_remote_code=True,
)
# Pad on the right, and reuse the end-of-sequence token as the padding token.
llama_tokenizer.padding_side = "right"
llama_tokenizer.pad_token = llama_tokenizer.eos_token

## Step 4: Setting the training arguments

In [None]:
from transformers import TrainingArguments

# Settings that are identical on every device: results directory, a batch of
# one sequence per device, no gradient accumulation, and a 100-step run.
shared_training_kwargs = dict(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    max_steps=100,
)

# Mixed-precision flag: fp16 when running on CUDA, bf16 on any other device.
# NOTE(review): the non-CUDA path requests bf16 — confirm the runtime supports
# it, or drop the flag there.
precision_flag = "fp16" if device == "cuda" else "bf16"

training_arguments = TrainingArguments(
    **shared_training_kwargs,
    **{precision_flag: True},
)

## Step 5: Creating the Supervised Fine-Tuning trainer

In [None]:
from trl import SFTTrainer
from peft import LoraConfig
from datasets import load_dataset

# Training split of the medical-terms dataset; the trainer reads its "text" column.
medical_terms_train = load_dataset(
    path="aboonaji/wiki_medical_terms_llam2_format",
    split="train",
)

# LoRA adapter settings for causal language modelling.
lora_settings = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Build the supervised fine-tuning trainer. Sequences are capped at 256 tokens
# to lower memory usage.
llama_sft_trainer = SFTTrainer(
    model=llama_model,
    args=training_arguments,
    train_dataset=medical_terms_train,
    tokenizer=llama_tokenizer,
    peft_config=lora_settings,
    dataset_text_field="text",
    max_seq_length=256,
)

## Step 6: Training the model

In [None]:
# Run supervised fine-tuning with the trainer built in Step 5. The run length
# (max_steps=100) and output directory ("./results") come from the training
# arguments defined in Step 4.
llama_sft_trainer.train()

## Step 7: Chatting with the model

In [None]:
user_prompt = "Please tell me about Ascariasis"

# Training leaves the model in train mode, which keeps the LoRA dropout active.
# Switch to eval mode so generation is not perturbed by dropout.
llama_model.eval()

# Text-generation pipeline over the fine-tuned model; prompt plus completion
# are capped at 300 tokens.
text_generation_pipeline = pipeline(task = "text-generation", model = llama_model, tokenizer = llama_tokenizer, max_length = 300)

# Wrap the question in the same [INST] ... [/INST] template used by this notebook.
model_answer = text_generation_pipeline(f"<s>[INST] {user_prompt} [/INST]")
print(model_answer[0]['generated_text'])