# **🦙 Fine-Tuning Llama-2-7b**



### **Install dependencies and import libraries**

In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

### **Define some variables**

In [2]:
# Hub id of the base checkpoint that will be fine-tuned
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Hub id of the dataset used for supervised fine-tuning
dataset_name = "GJN08/India_election_2024_and_TRP_game_zone_fire_event"

# Local directory name under which the trained LoRA adapter is saved
new_model = "Llama-2-7b-chat-finetune"

# Map the root module ("") to device index 0, i.e. load the whole model on one device
device_map = {"": 0}

# Never hard-code credentials in a notebook: read the Hugging Face token from the
# HF_TOKEN environment variable. An unset or empty variable yields None, which is
# then passed on unchanged to the loaders that take `use_auth_token`.
HF_TOKEN = os.environ.get("HF_TOKEN") or None

### **Load data and bnb configs**

In [3]:
# Load the training split of the fine-tuning dataset (any preprocessing can go here)
dataset = load_dataset(dataset_name, split="train")

# Quantization settings used when the base model is loaded for QLoRA training.
# The compute dtype is defined once and passed to the config, instead of being
# computed into an unused variable while the config gets a duplicate string.
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",              # NF4 quantization data type
    bnb_4bit_compute_dtype=compute_dtype,   # dtype used for the actual matmuls
    bnb_4bit_use_double_quant=False,        # no nested quantization of the constants
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/9.42k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130 [00:00<?, ? examples/s]

### **Load model**

In [4]:

# Load the base checkpoint with the quantization config, placed according to
# `device_map` and authenticated with the configured Hub token.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_auth_token=HF_TOKEN,
    device_map=device_map,
    quantization_config=bnb_config,
)

# Config tweaks applied before training: tensor-parallel degree of 1 and
# the generation KV cache switched off.
model.config.pretraining_tp = 1
model.config.use_cache = False




config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

### **Load the tokenizer and lora configs**

In [5]:
# Tokenizer that matches the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_auth_token=HF_TOKEN,
    trust_remote_code=True,
)
# Pad on the right (the original author notes this avoids an overflow issue
# with fp16 training) and reuse the EOS token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter hyper-parameters for causal language modelling
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

### **Set args and train the model**

In [6]:

# Hyper-parameters for the Hugging Face Trainer, grouped by concern
training_arguments = TrainingArguments(
    # where results go and how often to log / checkpoint
    output_dir="./results",
    report_to="tensorboard",
    logging_steps=25,
    save_steps=0,
    # schedule length and batching
    num_train_epochs=20,
    max_steps=-1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # mixed precision is disabled for both half-precision formats
    fp16=False,
    bf16=False,
)

# Supervised fine-tuning trainer: wraps the base model with the LoRA config
# and reads the training text from the dataset's "Answer" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="Answer",
    max_seq_length=None,
    packing=False,
)

# Run the training loop; the returned TrainOutput is displayed as the cell result
trainer.train()



Map:   0%|          | 0/130 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,3.9134
50,1.5834
75,0.8641
100,0.6652
125,0.4977
150,0.3936
175,0.348
200,0.2903
225,0.2421
250,0.2385


Step,Training Loss
25,3.9134
50,1.5834
75,0.8641
100,0.6652
125,0.4977
150,0.3936
175,0.348
200,0.2903
225,0.2421
250,0.2385


TrainOutput(global_step=660, training_loss=0.4428786165786512, metrics={'train_runtime': 733.0221, 'train_samples_per_second': 3.547, 'train_steps_per_second': 0.9, 'total_flos': 2754099330662400.0, 'train_loss': 0.4428786165786512, 'epoch': 20.0})

### **Save the new model**

In [7]:
# Write the trained model held by the trainer to the `new_model` directory
trainer.model.save_pretrained(save_directory=new_model)

### **Function to generate an answer from the new LLM**

In [20]:
# Ignore warnings: `logging` here is the module imported from `transformers`,
# so this raises that library's log threshold to CRITICAL.
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our new model
def Generate_Answer(prompt):
    """Generate the model's reply to ``prompt``.

    The prompt is wrapped as ``<s>[INST] {prompt} [/INST]``, passed through a
    text-generation pipeline built from the notebook-level ``model`` and
    ``tokenizer``, and the echoed prompt is removed from the result.

    Args:
        prompt: The user question as a plain string.

    Returns:
        The generated text that follows the formatted prompt. If the pipeline
        output does not start with the formatted prompt, the full generated
        text is returned instead of an arbitrarily truncated string.
    """
    formatted_prompt = f"<s>[INST] {prompt} [/INST]"
    # max_length=200 is passed through unchanged from the original cell.
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
    generated_text = pipe(formatted_prompt)[0]['generated_text']
    # Strip the echoed prompt using the formatted prompt itself rather than a
    # hand-counted offset (len(prompt) + 18), so the slice cannot drift if the
    # template is ever edited.
    if generated_text.startswith(formatted_prompt):
        return generated_text[len(formatted_prompt):]
    return generated_text

 The Congress party gained 47 seats compared to 2019. "  " 


In [None]:
# Ask the model one sample question and print its reply
prompt = " How many seats did the BJP win in the 2024 Indian general elections ?"
answer = Generate_Answer(prompt=prompt)
print(answer)

### **Delete the model from RAM and GPU memory**

In [21]:
# Empty VRAM
import gc

# Drop the notebook-level references to the large training objects.
# `pipe` is only ever created as a local inside Generate_Answer, so a plain
# `del pipe` raises NameError on a fresh kernel. Popping from globals() removes
# each name if it exists and makes the cell safe to re-run.
for _name in ("model", "pipe", "trainer"):
    globals().pop(_name, None)
del _name

# Two collection passes, as in the original cell; the count from the second
# pass is the cell's displayed result.
gc.collect()
gc.collect()

20933

In [22]:
# NOTE(review): `tf` and `cuda` are not referenced anywhere in the visible
# notebook; these imports look like leftovers — confirm before removing.
import tensorflow as tf
from numba import cuda

# Ask PyTorch to release the GPU memory its caching allocator is holding but
# no longer using, after the model references were deleted in the previous cell.
torch.cuda.empty_cache()

### **Merge the new model**

In [23]:
# Reload the base model in FP16 and merge the saved LoRA weights into it.
# The same Hub token is passed as in the earlier loads of this checkpoint, so
# this cell authenticates the same way as the rest of the notebook.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
    use_auth_token=HF_TOKEN,
)
# Attach the adapter saved under `new_model`, then fold it into the base weights
# so the result is a plain model with no PEFT wrapper.
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload the tokenizer so it can be saved next to the merged model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_auth_token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### **Save the new fine-tuned model**

In [24]:
# Output layout: one parent directory with separate sub-folders for the merged
# model weights and for the tokenizer files.
save_directory = "./model_directory"
model_path = os.path.join(save_directory, "model")
tokenizer_path = os.path.join(save_directory, "tokenizer")

os.makedirs(save_directory, exist_ok=True)

# Write the merged model first, then the tokenizer; the tokenizer call's return
# value is what the cell displays.
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

('./model_directory/tokenizer/tokenizer_config.json',
 './model_directory/tokenizer/special_tokens_map.json',
 './model_directory/tokenizer/tokenizer.model',
 './model_directory/tokenizer/added_tokens.json',
 './model_directory/tokenizer/tokenizer.json')

### **Push the new model to Hugging Face**

In [25]:
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import HfApi, HfFolder, Repository, create_repo, upload_folder

# Read the Hugging Face Hub token from the HF_TOKEN environment variable instead
# of hard-coding a credential in the notebook. When it is unset, the Hub calls
# below receive token=None.
hf_token = os.environ.get("HF_TOKEN") or None

# Persist the token for later Hub calls (only when one was actually provided)
if hf_token:
    HfFolder.save_token(hf_token)

# Local folders written by the save step, relative to the working directory
# rather than the Colab-only absolute path /content/...
model_path = os.path.join(".", "model_directory", "model")
tokenizer_path = os.path.join(".", "model_directory", "tokenizer")

# Repository name (change to your desired repo name)
repo_name = "Your repo name"

# Create the repository; exist_ok=True lets the cell be re-run without failing
# when the repository already exists.
api = HfApi()
api.create_repo(repo_name, private=False, exist_ok=True, token=hf_token)

# Upload the model weights AND the tokenizer files. The original cell defined
# `tokenizer_path` but never uploaded it, so the pushed repo had no tokenizer.
upload_folder(
    folder_path=model_path,
    repo_id=repo_name,
    commit_message="Initial commit",
    token=hf_token
)
upload_folder(
    folder_path=tokenizer_path,
    repo_id=repo_name,
    commit_message="Add tokenizer files",
    token=hf_token
)

print(f"Model pushed to Hugging Face Hub: https://huggingface.co/{repo_name}")


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Model pushed to Hugging Face Hub: https://huggingface.co/GJN08/llama_2_finetuned_on_election2024_trp_game_zone
