In [1]:
import os

# Point the Hugging Face cache locations at one shared directory. These
# variables are exported in the first cell, ahead of the cell that imports
# `datasets` and `transformers`.
cache_dir = '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache'
os.environ.update({
    'TRANSFORMERS_CACHE': cache_dir,
    'HF_HOME': cache_dir,
    'HF_HUB_CACHE': cache_dir + '/hub',
})

# Hub access token, read from the `hf_token` environment variable (None when unset).
hf_token = os.environ.get('hf_token')

In [2]:
import re
import numpy as np
import random
from datasets import load_from_disk, Dataset, load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


### Preparing data in preference format


In [3]:
# Load the dataset saved under ../datasets/gsm8k/feedback/. Its `question`
# column is what the prompts are later built from.
data=load_from_disk('../datasets/gsm8k/feedback/')
data

Dataset({
    features: ['question', 'answer'],
    num_rows: 1000
})

In [4]:
# Load the generated outputs of experiment 2.0.3 from JSON and select the
# 'train' split of the result. Later cells read `output`, `model_answer` and
# `score` from it.
teacher_data_path = '../outputs/exp-2.0.3/eval_1/generated_outputs.json'
teacher_data = load_dataset('json',data_files=teacher_data_path)['train']
teacher_data

Dataset({
    features: ['input', 'output', 'token_ids', 'log_probs', 'all_returned_log_probs', 'model_answer', 'GT_Answer', 'score'],
    num_rows: 1000
})

In [5]:
# Load the log-prob file of experiment 2.1.1 from JSON and select the 'train'
# split of the result. Later cells read `student_reasoning`, `student_answer`
# and `teacher_log_probs` from it.
student_data_path='../outputs/exp-2.1.1/eval_1/logprobs1.json'
student_data=load_dataset('json', data_files=student_data_path)['train']
student_data

Dataset({
    features: ['prompt', 'gt_reasoning', 'gt_answer', 'student_token_ids', 'student_reasoning', 'student_answer', 'student_correctness', 'student_log_probs', 'teacher_log_probs', 'teacher_correctness'],
    num_rows: 1000
})

In [6]:
# student_data_path='../outputs/exp-2.1.1/eval_1/logprobs.json'
# student_data1=load_dataset('json', data_files=student_data_path)['train']
# student_data1

In [7]:
def formatting_prompt_func(questions):
    """Wrap each question in the chat-style prompt used by this notebook.

    Args:
        questions: Iterable of problem statements.

    Returns:
        list[str]: One prompt per question, in input order. Each prompt is a
        user turn holding the instructions and the problem, followed by an
        opened assistant header so a completion can start right after it.
    """
    return [
        (
            '<|start_header_id|>user<|end_header_id|>\n\n'
            'Given the following problem, reason and give a final answer to the problem.\n'
            f'Problem: {problem}\n'
            'Your response should end with "The final answer is [answer]" '
            'where [answer] is the response to the problem.\n'
            '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
        )
        for problem in questions
    ]


In [8]:
# Build one formatted prompt per question and display the first one as a sanity check.
prompt= formatting_prompt_func(data['question'])
prompt[0]

'<|start_header_id|>user<|end_header_id|>\n\nGiven the following problem, reason and give a final answer to the problem.\nProblem: A box is 8 inches in height, 10 inches in width, and 12 inches in length. A wooden building block is 3 inches in height, 2 inches in width, and 4 inches in length. How many building blocks can fit into the box?\nYour response should end with "The final answer is [answer]" where [answer] is the response to the problem.\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [9]:
# Chosen response: the first element of each entry in the teacher `output` column.
chosen= [tr_output[0] for tr_output in teacher_data['output']] 
# Rejected response: the `student_reasoning` column of the student file.
# NOTE(review): chosen/rejected are later paired by row position, which assumes
# both files list the problems in the same order — confirm.
rejected= student_data['student_reasoning']

In [10]:
def get_prob(teacher_log_prob):
    """Turn per-token log-probabilities into one value per sequence.

    For every inner sequence the log-probabilities are averaged and the mean
    is exponentiated; for natural-log inputs that is the geometric mean of
    the token probabilities.

    Args:
        teacher_log_prob: Sequence of sequences of token log-probabilities,
            one inner sequence per example.

    Returns:
        numpy.ndarray: One value per input sequence, in input order.
    """
    mean_log_probs = [
        np.mean(np.array(token_log_probs)) for token_log_probs in teacher_log_prob
    ]
    return np.exp(mean_log_probs)

In [11]:
# One sequence-level value per row, computed from the `teacher_log_probs`
# column of the student file.
# NOTE(review): presumably these are the teacher's log-probs over the student's
# generated tokens — confirm against the script that wrote logprobs1.json.
teacher_prob= get_prob(student_data['teacher_log_probs'])
# Print the largest and smallest value to see the range.
print(np.max(teacher_prob))
print(np.min(teacher_prob))

0.9704129501564671
0.193618609447841


In [12]:
# Assemble the preference dataset column by column. Columns come from three
# different sources (`data` via `prompt`, `teacher_data`, `student_data`) and
# are joined purely by row position.
# NOTE(review): nothing in this notebook verifies that row i refers to the same
# problem in all three sources — confirm upstream.
new_data = {
    'prompt': prompt,
    'chosen': chosen,
    'rejected': rejected,
    'tr_answer': teacher_data['model_answer'],
    'stu_answer': student_data['student_answer'],
    'tr_prob':teacher_prob,  # output of get_prob on student_data['teacher_log_probs']
    'tr_score': teacher_data['score']
    }
    
preference_data= Dataset.from_dict(new_data)
preference_data

Dataset({
    features: ['prompt', 'chosen', 'rejected', 'tr_answer', 'stu_answer', 'tr_prob', 'tr_score'],
    num_rows: 1000
})

In [13]:
# Keep only rows whose teacher score equals 1 (presumably "teacher answer is
# correct" — confirm), so every remaining `chosen` response has tr_score == 1.
# Note: this rebinds `preference_data`; the unfiltered dataset is no longer
# reachable under this name afterwards.
preference_data= preference_data.filter(lambda x: x['tr_score']==1)
preference_data

Filter: 100%|██████████| 1000/1000 [00:00<00:00, 103198.68 examples/s]


Dataset({
    features: ['prompt', 'chosen', 'rejected', 'tr_answer', 'stu_answer', 'tr_prob', 'tr_score'],
    num_rows: 951
})

In [14]:
# Subset "wenlong": rows whose `tr_prob` is at most the threshold below.
TR_PROB_MAX = 0.6


def _has_low_teacher_prob(example):
    """Return True when the row's `tr_prob` does not exceed TR_PROB_MAX."""
    return example['tr_prob'] <= TR_PROB_MAX


preference_data_wenlong = preference_data.filter(_has_low_teacher_prob)
preference_data_wenlong

Filter: 100%|██████████| 951/951 [00:00<00:00, 51549.32 examples/s]


Dataset({
    features: ['prompt', 'chosen', 'rejected', 'tr_answer', 'stu_answer', 'tr_prob', 'tr_score'],
    num_rows: 89
})

In [15]:
# Subset "arafat": rows where the stored teacher answer and student answer differ.
# NOTE(review): the comparison is on the raw stored values, so two answers that
# are equal but formatted differently would also count as a mismatch — confirm
# both columns are normalised the same way.
preference_data_arafat= preference_data.filter(lambda x: x['tr_answer']!=x['stu_answer'])
preference_data_arafat

Filter: 100%|██████████| 951/951 [00:00<00:00, 61048.44 examples/s]


Dataset({
    features: ['prompt', 'chosen', 'rejected', 'tr_answer', 'stu_answer', 'tr_prob', 'tr_score'],
    num_rows: 71
})

In [16]:
# Spot-check: display the `chosen` text of row 8 in the tr_prob-filtered subset.
preference_data_wenlong['chosen'][8]

"To determine how much Leila and her friends will save by choosing the first car rental option over the second, we need to calculate the total cost of each option for their trip.\n\nThe trip is 150 kilometers long each way, so the total distance covered in a day (to and from the destination) is 150 km * 2 = 300 km.\n\nThe first option costs $50 a day, excluding gasoline. To calculate the cost of gasoline for this option:\n- The total distance of the trip is 300 km.\n- A liter of gasoline covers 15 km, so the amount of gasoline needed for 300 km is 300 km / 15 km per liter = 20 liters.\n- The cost of gasoline is $0.90 per liter, so the total cost for gasoline is 20 liters * $0.90 per liter = $18.\nTherefore, the total cost for the first option is $50 (rental) + $18 (gasoline) = $68.\n\nThe second option costs $90 a day, including gasoline. There's no need to calculate the gasoline cost separately since it's already included in the daily rate.\n\nTo find out how much they will save by ch

## DPO Code

In [17]:
# train_dpo.py
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer



In [18]:
# Base policy model to fine-tune with DPO.
model_name='meta-llama/Llama-3.2-3B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_token,
    cache_dir=cache_dir
)
# Reuse the EOS token id as the padding id so batches can be padded.
tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype='bfloat16',
    token=hf_token,
    cache_dir=cache_dir
)

training_args = DPOConfig(
    output_dir="Wenlong",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=1e-6, # Default 1e-6
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    save_strategy="epoch",
    warmup_ratio=0.1,
    # The run is short: 89 training examples at 4 per batch with 4 accumulation
    # steps is about 6 optimizer steps per epoch on one device, roughly 30 in
    # total over 5 epochs. Logging every 100 steps would never fire and the
    # training-loss table would stay empty, so log every optimizer step.
    logging_steps=1
)
# No explicit reference model is passed; the trainer is given only the policy
# model, the tokenizer (as processing_class) and the tr_prob-filtered subset.
trainer = DPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=preference_data_wenlong)


Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache/models--meta-llama--Llama-3.2-3B-Instruct/.no_exist/0cb88a4f764b7a12671c53f0838cd831a0843b95/adapter_config.json'
Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/scratch3/workspace/wenlongzhao_umass_edu-reason/dev_kedar/transformers_cache/models--meta-llama--Llama-3.2-3B-Instruct/.no_exist/0cb88a4f764b7a12671c53f0838cd831a0843b95/adapter_config.json'
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.34it/s]
Extracting prompt in train dataset: 100%|██████████| 89/89 [00:00<00:00, 7162.18 examples/s]
Applying chat template to train dataset: 100%|██████████| 89/89 [00:00<00:00, 9886.20 examples/s]
Tokenizing train dataset: 100%|██████████| 89/89 [00:00<00:00, 865.71 examples/s]


In [None]:
# Start DPO fine-tuning with the trainer configured in the previous cell
# (output_dir "Wenlong", save_strategy "epoch").
trainer.train()

Step,Training Loss
