<a href="https://colab.research.google.com/github/Lior-Baruch/LLM-Advanced-FineTuning/blob/main/SFT_DPO_llama_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install all dependencies
!pip install -q -U peft transformers datasets bitsandbytes trl accelerate wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [2]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import TrainingArguments
from peft import LoraConfig, AutoPeftModelForCausalLM
from datasets import load_dataset, Dataset
from trl import SFTTrainer, DPOTrainer

from huggingface_hub import notebook_login

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
from huggingface_hub import notebook_login

# log in to the Hugging Face hub (required for private datasets/models)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
def print_trainable_params(model):
    total_params = 0
    trainable_params  = 0
    for name, param in model.named_parameters():
        total_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || total params: {total_params} || trainable%: {100 * trainable_params / total_params}"
    )

## Load base model (quantized) llama-2-7b and tokenizer

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig

# Load the 7b llama-2 model
model_id = "meta-llama/Llama-2-7b-hf"

# Set quantization config (to save memory)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

# Load model, quantized
base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map={"": 0})
base_model.config.use_cache = False

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, device_map={"": 0})

# Set it to a new token to correctly attend to EOS tokens.
tokenizer.pad_token = tokenizer.eos_token
#tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Print the number of learned parameters
print_trainable_params(base_model)


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

trainable params: 262410240 || total params: 3500412928 || trainable%: 7.496550989769399


### define LoRA adapter confing

In [7]:
# Define LoRA ("low-rank attention") config
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)


# Load ultrachat dataset

In [8]:
from datasets import load_dataset

# Load dataset ultrachat (User Assisted Chat Dataset)
train_dataset = load_dataset("stingning/ultrachat", split="train[:1%]")
df_train_sft = train_dataset.to_pandas()
print(df_train_sft.shape)
df_train_sft.head()

Downloading readme:   0%|          | 0.00/3.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/964M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/927M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/952M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/966M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/958M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/675M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/533M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

(14684, 2)


Unnamed: 0,id,data
0,0,[How can cross training benefit groups like ru...
1,1,[Are there any particular physical benefits to...
2,2,[What percentage of the Earth's surface is cov...
3,3,[How does language translation technology impa...
4,4,[What is the most popular smartphone brand the...


In [9]:
print(train_dataset[1])

{'id': '1', 'data': ['Are there any particular physical benefits to mindful walking, such as improved posture or increased physical fitness?', 'Yes, there are physical benefits to mindful walking, such as improved posture, increased physical fitness, and better balance. Mindful walking can also help relieve tension in the body, reduce stress, and improve flexibility. It can also improve circulation and help with weight management. By tuning into the body, mindful walking can also help individuals identify and address any imbalances or discomfort, leading to a healthier and more aligned body.', 'That sounds great! Can you give me some tips on how to incorporate mindful walking into my daily routine?', 'Sure, here are some tips on how to incorporate mindful walking into your daily routine:\n\n1. Choose a quiet and peaceful environment for your walk. This could be a park, a quiet neighborhood, or even a garden.\n\n2. Start by tuning into your breath and becoming aware of your surroundings

## define formatting_func (for our data)

In [10]:
# formatting function for the SFTTrainer (to format the dataset correctly)
def formatting_func(example):
    text = f"### USER: {example['data'][0]}\n### ASSISTANT: {example['data'][1]}"
    return text

## define SFT training args

In [11]:
from transformers import TrainingArguments

# Define training arguments

output_dir = "LBK95/llama-7b-qlora-ultrachat_2" # Set output directory for fine-tuned model
per_device_train_batch_size = 4  # Set batch size per device for training
gradient_accumulation_steps = 4  # Set the number of gradient accumulation steps
optim = "paged_adamw_32bit" # Set optimizer
save_steps = 10 # Set the number of steps before saving
logging_steps = 5 # Set the number of steps before logging
learning_rate = 2e-4 # Set learning rate
max_grad_norm = 0.3 # Set max gradient norm
max_steps = 200 # Set max steps
warmup_ratio = 0.03 # Set warmup ratio for learning rate scheduler
lr_scheduler_type = "cosine" # Set learning rate scheduler type


# Create TrainingArguments object with all of our defined arguments
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True,
    push_to_hub=True,
)


## create SFTTrainer

In [12]:
# Create SFTTrainer object (it is a wrapper around the Trainer object for Supervised-Fine-Tuning)
trainer = SFTTrainer(
    model=base_model,
    args=training_arguments,
    train_dataset=train_dataset,
    peft_config=lora_config,
    packing=True,
    tokenizer=tokenizer,
    max_seq_length=1024,
    formatting_func=formatting_func,
)


Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
print_trainable_params(base_model)

trainable params: 16777216 || total params: 3517190144 || trainable%: 0.477006226934315


# Train using SFT

In [15]:
# Train the model using the SFTTrainer (this will take a while, and save the model to the output directory)

#trainer.train()
#trainer.push_to_hub()

# End of SFT, Start of DPO

## Load DPO dataset and prepare for DPO trainer

In [16]:
dataset_dpo = load_dataset("jondurbin/truthy-dpo-v0.1", split="train")

df_dpo = dataset_dpo.to_pandas()
df_dpo.head()

# keep rows with 'system' column = 'You are an unbiased, uncensored, helpful assistant.'
df_dpo = df_dpo[df_dpo["system"] == "You are an unbiased, uncensored, helpful assistant."]
df_dpo.head()

# keep only columns 'prompt', 'chosen', 'rejected'
df_dpo = df_dpo[["prompt", "chosen", "rejected"]]

# change every text in promt from str to user: str. asistent:
df_dpo["prompt"] = df_dpo["prompt"].apply(lambda x: "### USER: " + x + "\n### ASSISTANT: ")
filtered_dataset = Dataset.from_pandas(df_dpo)
print(df_dpo.shape)
df_dpo.head()


Downloading readme:   0%|          | 0.00/904 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/653k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

(687, 3)


Unnamed: 0,prompt,chosen,rejected
0,### USER: What's the nearest national park to ...,"As an AI, I don't have a physical location, so...","I don't have access to the user's location, so..."
1,### USER: Do you possess the ability to naviga...,"No, I do not possess the ability to navigate o...","Yes, I can navigate and move within a physical..."
2,### USER: Do wooden pencils contain lead as th...,"No, wooden pencils do not contain lead in thei...","Yes, wooden pencils typically contain a core m..."
3,### USER: Is there a dark side of the moon?\n#...,There is no permanently dark side of the moon....,"Yes, there is a dark side of the moon. The far..."
4,### USER: What's the weather like where you ar...,"As an artificial intelligence, I don't have a ...",I'm not sure. I don't have access to the curre...


In [17]:
# def chatml_format(example):
#     # Format system
#     if len(example['system']) > 0:
#         message = {"role": "system", "content": example['system']}
#         system = tokenizer.apply_chat_template([message], tokenize=False)
#     else:
#         system = ""

#     # Format instruction
#     message = {"role": "user", "content": example['question']}
#     prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)
#     # Format chosen answer
#     chosen = example['chosen'] + "<|im_end|>\n"

#     # Format rejected answer
#     rejected = example['rejected'] + "<|im_end|>\n"

#     return {
#         "prompt": system + prompt,
#         "chosen": chosen,
#         "rejected": rejected,
#     }

# # Load dataset
# dataset = load_dataset("Intel/orca_dpo_pairs")['train']

# # Save columns
# original_columns = dataset.column_names

# # Format dataset
# dataset = dataset.map(
#     chatml_format,
#     remove_columns=original_columns
# )
# filtered_dataset = dataset

# Load our saved SFT model from the hub

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch

SFT_model_id = "LBK95/llama-7b-qlora-ultrachat_2"
#base_model_id = "meta-llama/Llama-2-7b-hf"

# Set quantization config (to save memory)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

model = AutoPeftModelForCausalLM.from_pretrained(
    SFT_model_id, # location of saved SFT model
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
    is_trainable=True,
)
model.config.use_cache = False

model_ref = AutoPeftModelForCausalLM.from_pretrained(
    SFT_model_id,  # same model as the main one
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(SFT_model_id)
# Set it to a new token to correctly attend to EOS tokens.
tokenizer.pad_token = tokenizer.eos_token

In [65]:
print_trainable_params(model)
print_trainable_params(model_ref)

trainable params: 16777216 || total params: 3517190144 || trainable%: 0.477006226934315
trainable params: 0 || total params: 3517190144 || trainable%: 0.0


## define DPO training args

In [67]:
DPO_model_id = "LBK95/llama-7b-qlora-ultrachat_2-DPO"

# Training arguments
training_arguments = TrainingArguments(
    output_dir = DPO_model_id,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=50,
    save_strategy="no",
    logging_steps=1,
    optim="paged_adamw_32bit",
    warmup_steps=10,
    bf16=True,
    report_to="wandb",
    push_to_hub=True,
)

## define DPO trainer

In [68]:
from trl import DPOTrainer

dpo_trainer = DPOTrainer(
    model,          # base model from SFT pipeline
    model_ref,             # typically a copy of the SFT trained base model
    beta=0.1,              # temperature hyperparameter of DPO
    train_dataset=filtered_dataset, # dataset prepared above
    tokenizer=tokenizer,   # tokenizer
    args=training_arguments,    # training arguments e.g. batch size, lr, etc.
)

Map:   0%|          | 0/687 [00:00<?, ? examples/s]

In [69]:
print_trainable_params(dpo_trainer.model)
print_trainable_params(dpo_trainer.ref_model)

trainable params: 16777216 || total params: 3517190144 || trainable%: 0.477006226934315
trainable params: 0 || total params: 3517190144 || trainable%: 0.0
512


## Train using DPO

In [70]:
dpo_trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.6931
2,0.6931
3,0.6855
4,0.6627
5,0.6404
6,0.5973
7,0.5629
8,0.5049
9,0.4121
10,0.4269


TrainOutput(global_step=50, training_loss=0.2225299643352628, metrics={'train_runtime': 316.2209, 'train_samples_per_second': 2.53, 'train_steps_per_second': 0.158, 'total_flos': 0.0, 'train_loss': 0.2225299643352628, 'epoch': 1.16})

In [71]:
dpo_trainer.push_to_hub()


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/LBK95/llama-7b-qlora-ultrachat_2-DPO/commit/098c28d7c5dbf5e8417bf388edaf097372e71c31', commit_message='End of training', commit_description='', oid='098c28d7c5dbf5e8417bf388edaf097372e71c31', pr_url=None, pr_revision=None, pr_num=None)

# compare base (llama2), SFT and SDT+DPO model

In [22]:
# clear cuda memory
torch.cuda.empty_cache()

In [7]:
base_model_id = "meta-llama/Llama-2-7b-hf"
SFT_model_id = "LBK95/llama-7b-qlora-ultrachat_2"
DPO_model_id = "LBK95/llama-7b-qlora-ultrachat_2-DPO"

# Set quantization config (to save memory)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=quantization_config)
base_model.eval()

SFT_model = AutoModelForCausalLM.from_pretrained(SFT_model_id, quantization_config=quantization_config)
SFT_model.eval()

DPO_model = AutoModelForCausalLM.from_pretrained(DPO_model_id, quantization_config=quantization_config)
DPO_model.eval()

print_trainable_params(base_model)
print_trainable_params(SFT_model)
print_trainable_params(DPO_model)




tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/628 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

trainable params: 262410240 || total params: 3500412928 || trainable%: 7.496550989769399
trainable params: 0 || total params: 3517190144 || trainable%: 0.0
trainable params: 0 || total params: 3517190144 || trainable%: 0.0


In [8]:
# freeze base model params
for param in base_model.parameters():
    param.requires_grad = False

# freeze SFT model params
for param in SFT_model.parameters():
    param.requires_grad = False

# freeze DPO model params
for param in DPO_model.parameters():
    param.requires_grad = False

print_trainable_params(base_model)
print_trainable_params(SFT_model)
print_trainable_params(DPO_model)

trainable params: 0 || total params: 3500412928 || trainable%: 0.0
trainable params: 0 || total params: 3517190144 || trainable%: 0.0
trainable params: 0 || total params: 3517190144 || trainable%: 0.0


In [9]:
text = "### USER: What's the weather like where you are?\n### ASSISSTANT: "

inputs = tokenizer(text, return_tensors="pt")

# print the inputs
print("Input promt:")
print(tokenizer.decode(inputs.input_ids[0], skip_special_tokens=False))

outputs = base_model.generate(inputs.input_ids, max_new_tokens=250, do_sample=False)
print()
print("Base model:")
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

outputs = SFT_model.generate(inputs.input_ids, max_new_tokens=250, do_sample=False)
print()
print("SFT model:")
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

outputs = DPO_model.generate(inputs.input_ids, max_new_tokens=250, do_sample=False)
print()
print("DPO model:")
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

Input promt:
<s> ### USER: What's the weather like where you are?
### ASSISSTANT: 

Base model:
<s> ### USER: What's the weather like where you are?
### ASSISSTANT: 
### USER: What's the weather like where you are?
### ASSISTANT: 
### USER: What's the weather like where you are?
### ASSISTANT: 
### USER: What's the weather like where you are?
### ASSISTANT: 
### USER: What's the weather like where you are?
### ASSISTANT: 
### USER: What's the weather like where you are?
### ASSISTANT: 
### USER: What's the weather like where you are?
### ASSISTANT: 
### USER: What's the weather like where you are?
### ASSISTANT: 
### USER: What's the weather like where you are?
### ASSISTANT: 
### USER: What's the weather like where you are?
### ASSISTANT: 
### USER: What's the weather like where you are?
### ASSISTANT: 

SFT model:
<s> ### USER: What's the weather like where you are?
### ASSISSTANT: 10:00 AM: The weather is currently sunny and warm with a temperature of 25 degrees Celsius. The sky is 