In [1]:
# Interactive Hugging Face Hub login; renders a login widget in the notebook.
# NOTE(review): presumably required because the training config later sets
# push_to_hub=True — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig
from trl import setup_chat_format,ORPOConfig, ORPOTrainer
from datasets import load_dataset
import multiprocessing
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model  


model_id = "microsoft/phi-2"

# 4-bit NF4 quantization settings, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model, then the tokenizer, from the same checkpoint id.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # FA2 does not work yet
    # attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Set the chat template to OAI ChatML; remove this step if you start from a
# model that has already been fine-tuned with a chat template.
model, tokenizer = setup_chat_format(model, tokenizer)
tokenizer.padding_side = "right"  # to prevent warnings

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Bare attribute access (the method is not called): the cell output is the bound
# method's repr, which embeds the tokenizer's own repr (special tokens, padding
# side, added tokens).
tokenizer.get_special_tokens_mask

<bound method PreTrainedTokenizerBase.get_special_tokens_mask of CodeGenTokenizerFast(name_or_path='microsoft/phi-2', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50257: AddedToken("                               ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("                              ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("                             ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedT

In [3]:
# Prepare the quantized model for k-bit training, with gradient checkpointing on.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Adapter settings
lora_target_modules = ["q_proj", "k_proj", "v_proj", "dense"]
# NOTE(review): presumably listed because the chat-format setup added new special
# tokens whose embeddings need training — confirm.
lora_modules_to_save = ["lm_head", "embed_tokens"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=lora_target_modules,
    modules_to_save=lora_modules_to_save,
)
model = get_peft_model(model, lora_config)

# Turn off `use_cache` on the model config for training.
model.config.use_cache = False

In [4]:
# In the following config, we combine the usual HF Trainer args with the ORPOConfig args (beta).

cfg = ORPOConfig(
    output_dir='model/phi2-2b-orpo',        # usual HF Trainer args: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer.args
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=2,          # batch size per device during training
    gradient_accumulation_steps=2,          # micro-batches accumulated before each optimizer update
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=20,                       # log every 20 steps
    bf16=True,                              # use bfloat16 precision
    # tf32=True,                            # use tf32
    learning_rate=5e-5,                     # peak learning rate
    # Warmup is given in steps only. Also passing `warmup_ratio` would have no
    # effect, because `warmup_steps` overrides it.
    warmup_steps=100,
    lr_scheduler_type="cosine",
    max_prompt_length=512,
    remove_unused_columns=False,
    max_length=1024,
    beta=0.1,                               # ORPO beta
    save_total_limit=3,                     # args related to saving the model...
    save_strategy="epoch",
    push_to_hub=True,
    hub_model_id='Nandhu/phi-2b-orpo'
)

In [5]:
# https://github.com/huggingface/trl/blob/main/examples/scripts/orpo.py

# Load the preference dataset; its "train" and "test" splits are used below.
ds = load_dataset("alvarobartt/dpo-mix-7k-simplified")

def process(row):
    """Render the conversation fields of one dataset row with the chat template.

    The "prompt", "chosen" and "rejected" entries are each passed through
    ``tokenizer.apply_chat_template(..., tokenize=False)`` and overwritten in
    place with the result.

    Args:
        row: A dataset example containing "prompt", "chosen" and "rejected".

    Returns:
        The same ``row`` object, with those three fields replaced.
    """
    for field in ("prompt", "chosen", "rejected"):
        row[field] = tokenizer.apply_chat_template(row[field], tokenize=False)
    return row

# Apply `process` to every example using one worker per CPU core, bypassing the
# on-disk cache so the mapping is always recomputed.
n_workers = multiprocessing.cpu_count()
ds = ds.map(process, num_proc=n_workers, load_from_cache_file=False)

# Pick out the splits handed to the trainer.
train_dataset, eval_dataset = ds["train"], ds["test"]

Map (num_proc=24):   0%|          | 0/6750 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/750 [00:00<?, ? examples/s]

In [6]:
# Peek at the first processed example and the size of the training split.
train_dataset[0], len(train_dataset)

({'dataset': 'argilla/distilabel-intel-orca-dpo-pairs',
  'prompt': '<|im_start|>user\nQ:Question: how old julio cesar chavez when he fought de la hoya I found the following answer on Google: He holds records for most successful consecutive defenses of world titles (27), most title fights (37), most title-fight victories (31) and he is after Joe Louis with (23) for most title defenses won by knockout (21). Is that a correct answer? Yes or no.\nA:<|im_end|>\n',
  'chosen': "<|im_start|>assistant\n Yes, the information you found on Google is correct. Julio César Chávez holds several records related to world title defenses and victories, and he is considered one of the greatest boxers in history. Here is a detailed answer to your question:\n\nJulio César Chávez was born on July 12, 1962, in Ciudad Obregón, Sonora, Mexico. He began boxing at a young age and quickly made a name for himself in the sport, winning his first world title in 1984 when he defeated Mario Miranda for the WBC super f

In [7]:
# Build the ORPO trainer from the PEFT-wrapped model, the ORPOConfig (`cfg`),
# the processed train/test splits and the chat-template tokenizer.
# NOTE(review): `eval_dataset` is supplied, but `cfg` sets no evaluation
# schedule — confirm whether evaluation is expected to run during training.
orpo_trainer = ORPOTrainer(
    model=model,
    args=cfg,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

In [8]:
# Run ORPO training; metrics are logged at the interval set by `logging_steps` in `cfg`.
orpo_trainer.train()

  0%|          | 0/5061 [00:00<?, ?it/s]

Could not estimate the number of tokens of the input, floating-point operations will not be computed


{'loss': 7.0773, 'grad_norm': 448.0132141113281, 'learning_rate': 1e-05, 'rewards/chosen': -0.19050651788711548, 'rewards/rejected': -0.2069549262523651, 'rewards/accuracies': 0.48750001192092896, 'rewards/margins': 0.01644841395318508, 'logps/rejected': -2.069549083709717, 'logps/chosen': -1.9050649404525757, 'logits/rejected': 0.9096983671188354, 'logits/chosen': 0.8772487640380859, 'nll_loss': 6.980725288391113, 'log_odds_ratio': -0.9662173986434937, 'log_odds_chosen': 0.16117547452449799, 'epoch': 0.01}
{'loss': 6.7482, 'grad_norm': 725.6022338867188, 'learning_rate': 2e-05, 'rewards/chosen': -0.21175460517406464, 'rewards/rejected': -0.19826290011405945, 'rewards/accuracies': 0.4375, 'rewards/margins': -0.013491692952811718, 'logps/rejected': -1.9826290607452393, 'logps/chosen': -2.1175456047058105, 'logits/rejected': 0.9796890020370483, 'logits/chosen': 1.024097204208374, 'nll_loss': 6.6566033363342285, 'log_odds_ratio': -0.9162625074386597, 'log_odds_chosen': -0.1422257423400879

In [None]:
# Push the trainer's artifacts to the Hugging Face Hub (repo set by `hub_model_id` in `cfg`).
orpo_trainer.push_to_hub()

In [None]:

#### UNCOMMENT TO MERGE PEFT AND BASE MODEL ####
# from peft import AutoPeftModelForCausalLM

# # Load PEFT model on CPU
# model = AutoPeftModelForCausalLM.from_pretrained(
#     "model/phi2-2b-orpo",
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True,
# )
# # Merge LoRA and base model and save
# merged_model = model.merge_and_unload()
# merged_model.save_pretrained("model/phi2-2b-orpo",safe_serialization=True, max_shard_size="2GB")


### Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Local directory the model is loaded from (same path as the training output_dir).
modelpath = "model/phi2-2b-orpo"

# Load in bfloat16 with automatic device placement.
load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    # attn_implementation="flash_attention_2",
)
model = AutoModelForCausalLM.from_pretrained(modelpath, **load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(modelpath)

In [None]:
# Build a single-turn chat and generate a reply with the loaded model.
question = "What is the meaning of life?"
messages = [
    {"role": "user", "content": question},
]

# Tokenize through the chat template with the generation prompt appended.
# The inputs are moved to the model's own device rather than a hard-coded "cuda",
# so the cell follows wherever device_map="auto" placed the model.
input_tokens = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
output_tokens = model.generate(input_tokens, max_new_tokens=200)

# Decode the first (only) returned sequence back to text.
output = tokenizer.decode(output_tokens[0])

print(output)