In [1]:
import yaml
import logging

from utils import read_jsonl, write_json
from evaluator import DPOModelEvaluator, repository_check

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("mnlp-2024-auto-evaluator")

# Basic repository check to ensure the submission is correct
repository_check()

# Load the main configuration file
main_config = {}
with open("main_config.yaml") as f:
    try:
        main_config = yaml.safe_load(f)
    except Exception as e:
        logger.error(f"Error loading main_config.yaml: {e}! Please check the file format.")

# Load the task type to identify the model class
task_type = main_config.get("task_type", "causal_lm")

# Load the evaluation methods and the required paths
eval_method = main_config.get("eval_method", ["mcqa"])
policy_model_path = main_config["policy_model_path"]
reference_model_path = main_config["reference_model_path"]
test_data_path = main_config["test_data_path"]

# Load the test data
test_data = read_jsonl(test_data_path)

# Load the model arguments
dpo_model_args = main_config.get("dpo_model_args", {})
rag_model_args = main_config.get("rag_model_args", {})
quantized_model_args = main_config.get("quantized_model_args", {})

# Initialize the metrics dictionary
metrics = {
    "team_name": main_config.get("team_name", "Team Name"),
    "task_type": task_type,
}

# Ensure that the evaluation methods are not conflicting
assert not ("reward" in eval_method and "mcqa" in eval_method), "You cannot evaluate both reward and mcqa at the same time!"


In [2]:
len(test_data)

428

In [3]:
from torch.utils.data import DataLoader

test_dataloader = DataLoader(test_data[:4], batch_size=4)
evaluator = DPOModelEvaluator(
    task_type=task_type,
    policy_model_path=policy_model_path,
    reference_model_path=reference_model_path,
    dpo_model_args=dpo_model_args
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
policy_acc= evaluator.scoring_mcqa(test_dataloader)
eval_method.remove("mcqa")
metrics["policy_acc"] = policy_acc

2024-05-28 09:46:00,142 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-28 09:46:25,863 - INFO - Trained peft adapter loaded
  0%|          | 0/4 [00:00<?, ?it/s]

True answer: D


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 25%|██▌       | 1/4 [00:19<00:59, 19.96s/it]

['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C']
True answer: C


 50%|█████     | 2/4 [00:50<00:51, 25.92s/it]

['D', 'D', 'D', 'D', 'B', 'D', 'D']
True answer: B


 75%|███████▌  | 3/4 [01:31<00:32, 32.80s/it]

['B', 'B', 'B', 'C', 'B', 'B', 'C', 'C']
True answer: D


100%|██████████| 4/4 [01:55<00:00, 28.94s/it]

['D', 'D', 'D', 'D', 'D', 'D', 'C', 'C', 'D']





In [5]:
policy_acc

0.5

In [4]:
# policy_model = evaluator.model_class.from_pretrained(policy_model_path, **evaluator.dpo_model_args)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-28 12:04:57,973 - INFO - Trained peft adapter loaded
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
evaluator.compute_reference_logprobs(test_data)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2024-05-28 12:56:24,077 - INFO - Trained peft adapter loaded
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1
[{'role': 'system', 'content': 'You are an expert professor, teaching a student how to solve a problem by providing a full explanation of the solution.'}, {'role': 'user', 'content': "Explain what's base rate fallacy and list five specific examples of how politicians use it for campaigns."}]


: 

In [6]:
%debug

> [0;32m/home/toskov/project-code-2024/evaluator.py[0m(211)[0;36mcompute_reference_logprobs[0;34m()[0m
[0;32m    209 [0;31m[0;34m[0m[0m
[0m[0;32m    210 [0;31m        [0;32mfor[0m [0mdata[0m [0;32min[0m [0mtest_data[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 211 [0;31m            [0mdata[0m[0;34m[[0m[0;34m"chosen_logps"[0m[0;34m][0m [0;34m=[0m [0mtest_data_map[0m[0;34m[[0m[0mdata[0m[0;34m[[0m[0;34m'prompt'[0m[0;34m][0m[0;34m][0m[0;34m[[0m[0;34m"chosen_logps"[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    212 [0;31m            [0mdata[0m[0;34m[[0m[0;34m"rejected_logps"[0m[0;34m][0m [0;34m=[0m [0mtest_data_map[0m[0;34m[[0m[0mdata[0m[0;34m[[0m[0;34m'prompt'[0m[0;34m][0m[0;34m][0m[0;34m[[0m[0;34m"rejected_logps"[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    213 [0;31m[0;34m[0m[0m
[0m


In [8]:
from transformers import TrainingArguments
 
args = TrainingArguments(
    output_dir="llama3_new",#"doplhin-dpo",               # directory to save and repository id
    num_train_epochs=1,                     # number of training epochs
    per_device_train_batch_size=1,         # batch size per device during training
    per_device_eval_batch_size=1,           # batch size for evaluation
    gradient_accumulation_steps=1,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    learning_rate=5e-5,                     # 10x higher LR than QLoRA paper
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.1,                       # warmup ratio based on QLoRA paper
    lr_scheduler_type="cosine",             # use cosine learning rate scheduler
    logging_steps=25,                       # log every 25 steps
    save_steps=500,                         # when to save checkpoint
    save_total_limit=2,                     # limit the total amount of checkpoints
    evaluation_strategy="steps",            # evaluate every 1000 steps
    eval_steps=700,                         # when to evaluate
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    push_to_hub=False,                      # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)
 
dpo_args = {
    "beta": 0.1,                            # The beta factor in DPO loss. Higher beta means less divergence
    "loss_type": "sigmoid"                  # The loss type for DPO.
}

prompt_length = 402#1024
max_seq_length = 912#1512

In [9]:
from trl import DPOTrainer
from datasets import Dataset

policy_model.dpo_trainer = trainer_for_eval = DPOTrainer(
    policy_model.pretrained_model,
    ref_model=None, # set to none since we use peft
    # peft_config=peft_config,
    args=args,
    train_dataset=Dataset.from_dict({}),
    eval_dataset=Dataset.from_dict({}),
    tokenizer=evaluator.policy_tokenizer,
    max_length=max_seq_length,
    max_prompt_length=prompt_length,
    beta=dpo_args["beta"],
    loss_type=dpo_args["loss_type"],
)

policy_model.dpo_trainer



<trl.trainer.dpo_trainer.DPOTrainer at 0x7f942a6d3130>