In [1]:
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoftQConfig, LoraConfig, get_peft_model, TaskType
import torch

from huggingface_hub import notebook_login

# notebook_login()

In [2]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "distilbert/distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules = ["q_proj", "v_proj"]
)
# peft_model = get_peft_model(base_model, lora_config)

# peft_model.print_trainable_parameters()

In [9]:
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(peft_model.device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]


In [13]:
print(tokenizer.decode(input_ids[0]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>

Who are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [14]:
outputs = peft_model.generate(
    input_ids,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Arrrr, me hearty! Me name be Captain Chatbot, the scurviest pirate to ever sail the Seven Seas... er, chat with ye on the Interwebs! Me and me trusty parrot, Polly, be here to swab the decks of yer queries and answer yer questions with a pirate's flair! So hoist the colors, me matey, and let's set sail fer a swashbucklin' good time!


In [2]:
import json

with open("datasets/M1_preference_data_15052024.json", "r") as f:
    data = json.load(f)

print(len(data))

1522


In [3]:
# TODO: the prompt here can be constructed for llama specifically during data processing

import copy

from tqdm.notebook import tqdm

dpo_dataset_dict = {
    "prompt": [],
    "chosen": [],
    "rejected": [],
}

msg = {"role": "system", "content": "You are an expert professor, teaching a student how to solve a problem. The student expects an accurate and correct answer to the question that has a thorough explanation of how the correct answer is reached."}

for dp in tqdm(data):
    
    qn = dp["question_complete"]
    
    for pref in dp["preference"]:
        assert pref["overall"] in ["A", "B"]
        
        dpo_dataset_dict["prompt"].append(qn)
        
        msg_qn = {"role": "user", "content": qn}
        
        msg_chosen = {"role": "system", "content": pref[pref["overall"]]}
        msg_rejected = {"role": "system", "content": pref["A" if pref["overall"] == "B" else "B"]}
        
        # chosen = tokenizer.apply_chat_template(msg + msg_qn + msg_chosen, tokenize=False)
        dpo_dataset_dict["chosen"].append([msg, msg_qn, msg_chosen])
        
        # rejected = tokenizer.apply_chat_template(msg + msg_qn + msg_rejected, tokenize=False)
        dpo_dataset_dict["rejected"].append([msg, msg_qn, msg_rejected])
        
        

  0%|          | 0/1522 [00:00<?, ?it/s]

In [4]:
with open("datasets/input_dpo_dataset.json", "w") as f:
    json.dump(dpo_dataset_dict, f)

In [5]:
dpo_dataset_dict["chosen"][17234]

[{'role': 'system',
  'content': 'You are an expert professor, teaching a student how to solve a problem. The student expects an accurate and correct answer to the question that has a thorough explanation of how the correct answer is reached.'},
 {'role': 'user',
  'content': 'Question: Select \\emph{incorrect} statement. The exhaustive search?\n\nOptions:\nA. can be used to find a secret key of AES.\nB. is a brute force attack.\nC. is not applicable against perfectly secure cipher.\nD. runs in time polynomial in the length of the key.'},
 {'role': 'system',
  'content': 'The correct statement is: "runs in time polynomial in the length of the key."\n\nJustification:\n1. The exhaustive search, also known as a brute force attack, involves trying every possible key until the correct one is found. This method is not guaranteed to run in polynomial time in the length of the key, as the number of possible keys can grow exponentially with the key length. Therefore, this statement is incorrect

In [9]:
import multiprocessing

from datasets import Dataset

dpo_ds = Dataset.from_dict(dpo_dataset_dict)


def process(row):
    row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
    return row

ds = dpo_ds.map(
    process,
    num_proc=multiprocessing.cpu_count(),
    load_from_cache_file=False,
)

ds.to_json("datasets/dpo_hf_dataset.json")

ds = ds.train_test_split(test_size=0.2)

Map (num_proc=64):   0%|          | 0/26738 [00:00<?, ? examples/s]

In [20]:
ds["train"]

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 21390
})

In [5]:
ref_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

ref_lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules = ["q_proj", "v_proj"]
)
ref_peft_model = get_peft_model(ref_model, ref_lora_config)



ValueError: Target modules {'q_proj', 'v_proj'} not found in the base model. Please check the target modules and try again.

In [4]:
from transformers import TrainingArguments
from datasets import Dataset
from trl import DPOTrainer

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    learning_rate=1e-3,
    gradient_accumulation_steps=1,
    logging_steps=10,
    eval_steps=500,
    optim="rmsprop",
    warmup_steps=150,
    output_dir="logs",
    
)

tokenizer.pad_token = tokenizer.eos_token

ds_loaded = Dataset.from_json("datasets/dpo_hf_dataset.json")
ds_loaded = ds_loaded.train_test_split(test_size=0.2)

trainer = DPOTrainer(
    base_model,# peft_model,
    # ref_model,# ref_peft_model,
    args=training_args,
    train_dataset=ds_loaded["train"],
    eval_dataset=ds_loaded["test"],
    tokenizer=tokenizer,
    peft_config=lora_config,
    # callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
)



Map:   0%|          | 0/21390 [00:00<?, ? examples/s]

Map:   0%|          | 0/5348 [00:00<?, ? examples/s]

In [5]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmanetheren_personal[0m ([33mcs433-jal[0m). Use [1m`wandb login --relogin`[0m to force relogin


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
10,0.6939
20,0.8324
30,0.7253
40,1.1121
50,0.9178
60,0.6897
70,0.7827
80,0.5755
90,1.1639
100,0.895


