In [2]:
# Smoke test: push a minimal system + user exchange through the model and
# display the decoded reply.
messages = [
    {"role": "system", "content": "you are an AI agent."},
    {"role": "user", "content": "Hello."},
]

# Render the conversation with the tokenizer's chat template;
# add_generation_prompt=True asks the template to end on the assistant prompt.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

# "<|eot_id|>" is not guaranteed to exist in this tokenizer's vocabulary.
# When it is missing, convert_tokens_to_ids typically resolves to the
# unknown-token id (or None), which must not be used as a stop token:
# generation would either halt on an unrelated token or receive an invalid id.
stop_candidates = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
terminators = [
    token_id
    for token_id in stop_candidates
    if token_id is not None and token_id != tokenizer.unk_token_id
]

outputs = model.generate(
    input_ids,
    max_new_tokens=128,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.2,
    top_p=0.9,
)

# Drop the prompt tokens so only the newly generated continuation is decoded.
response = outputs[0][input_ids.shape[-1]:]
output = tokenizer.decode(response, skip_special_tokens=True)
output


No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



'Hello<|im>assistant>Hello<|assistant>assistantHello<assistantassistantHello<assistantassistantassistantHello<assistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistan

In [1]:
import re
import json
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

import warnings
warnings.filterwarnings('ignore')

# Ask PyTorch to release unused cached GPU memory before loading the model.
torch.cuda.empty_cache()

# Checkpoint under evaluation. A Mistral variant was used previously and is
# kept here for reference only:
#   "/root/autodl-tmp/models/mistral_7b_instruct_50_dpo"
model_path = "/root/autodl-tmp/models/gemma_2b_it_50_dpo"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Load the weights first, then move the whole model onto the first GPU.
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model = model.to("cuda:0")

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
def calculate_f1(str1, str2):
    """Return the F1 overlap between the sets of elements of two sequences.

    Both arguments are converted with ``set(...)``, so for strings the score
    is computed over *unique characters*; order and repetition are ignored.

    Args:
        str1: Sequence whose unique elements form the precision denominator
            (the evaluation loop passes the predicted value here).
        str2: Sequence whose unique elements form the recall denominator
            (the evaluation loop passes the reference value here).

    Returns:
        float: Harmonic mean of precision and recall, in [0.0, 1.0].
        Returns 0.0 when the two sets share no element, which also covers
        the case where either input is empty.
    """
    set1, set2 = set(str1), set(str2)
    intersection = len(set1 & set2)
    if not intersection:
        # Always return a float so callers (and the JSON dump) see one type.
        return 0.0
    precision = intersection / len(set1)
    recall = intersection / len(set2)
    return 2 * precision * recall / (precision + recall)

In [4]:
import os

import numpy as np

# Evaluate the model on each listed dataset: generate one action per sample,
# parse the ID / Operation / Value fields from the generated and reference
# text, score them, and write per-sample and averaged metrics to JSON.
test_dataset_names = ["test_website"]

# Create the output directory up front so a long generation run is not lost
# to a missing directory when the results are finally written.
results_dir = "/root/results/gemma_2b_it_50_dpo"
os.makedirs(results_dir, exist_ok=True)

# Field-extraction patterns (loop-invariant, so defined once).
id_pattern = r"ID: (\d+)"
op_pattern = r"Operation: (\w+)"
# NOTE(review): `\w+` stops at the first non-word character, so only the first
# word of a multi-word value is compared. Left unchanged so scores stay
# comparable with earlier runs; confirm whether this is intended.
value_pattern = r"Value: (\w+)"

# "<|eot_id|>" is not guaranteed to exist in this tokenizer's vocabulary.
# When it is missing, convert_tokens_to_ids typically resolves to the
# unknown-token id (or None); neither is a valid stop token, so filter them.
stop_candidates = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
terminators = [
    token_id
    for token_id in stop_candidates
    if token_id is not None and token_id != tokenizer.unk_token_id
]


def _mean_or_none(values):
    """Return the mean of `values` as a plain float, or None when it is empty."""
    values = list(values)
    # np.mean([]) yields NaN, which json.dump would write as invalid JSON.
    return float(np.mean(values)) if values else None


for test_dataset_name in test_dataset_names:
    print(f"testing {test_dataset_name} dpo dataset")
    results = []
    num_skipped = 0

    with open(f"/root/data/mind2web_dpo_{test_dataset_name}_ranked_50.json", "r") as file:
        test_dataset = json.load(file)

    for dat in tqdm(test_dataset):
        messages = [
            {"role": "system", "content": dat["instruction"]},
            {"role": "user", "content": dat["input"]},
        ]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        outputs = model.generate(
            input_ids,
            max_new_tokens=128,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
        )

        # Keep only the newly generated tokens (everything after the prompt).
        response = outputs[0][input_ids.shape[-1]:]
        output = tokenizer.decode(response, skip_special_tokens=True)
        gt = dat["output"]

        try:
            # re.search returns None when a field is absent, so `.group`
            # raises AttributeError; that is the only failure handled here.
            output_id = int(re.search(id_pattern, output).group(1))
            gt_id = int(re.search(id_pattern, gt).group(1))
            output_op = re.search(op_pattern, output).group(1)
            gt_op = re.search(op_pattern, gt).group(1)
        except AttributeError:
            # A mandatory field is missing: show both texts and skip the sample.
            # NOTE(review): skipped samples are excluded from every mean below,
            # so the averages cover parseable outputs only.
            print(output)
            print(gt)
            num_skipped += 1
            continue

        # The value field is optional; search once per text.
        gt_value_match = re.search(value_pattern, gt)
        output_value_match = re.search(value_pattern, output)
        gt_value = gt_value_match.group(1) if gt_value_match else None
        output_value = output_value_match.group(1) if output_value_match else None

        # Value F1 is only defined when the reference operation is not CLICK.
        # A CLICK prediction, or a missing value on either side, scores 0.0.
        if gt_op == "CLICK":
            value_f1 = None
        elif output_op != "CLICK" and output_value and gt_value:
            value_f1 = calculate_f1(output_value.lower(), gt_value.lower())
        else:
            value_f1 = 0.0

        # Relative ID distance; both IDs being 0 means a perfect match and
        # must not divide by zero.
        id_denominator = max(gt_id, output_id)
        id_distance = abs(output_id - gt_id) / id_denominator if id_denominator else 0.0

        results.append({
            "step_sr": gt.lower() == output.lower(),
            "id_distance": id_distance,
            "op_sr": gt_op == output_op,
            "value_f1": value_f1
        })

    if num_skipped:
        print(f"skipped {num_skipped} of {len(test_dataset)} samples whose output could not be parsed")

    with open(f"{results_dir}/mind2web_dpo_{test_dataset_name}_ranked_50.json", "w") as file:
        json.dump(results, file, indent=4)

    mean_result = {
        "step_sr": _mean_or_none(result["step_sr"] for result in results),
        "mean_id_distance": _mean_or_none(result["id_distance"] for result in results),
        "op_sr": _mean_or_none(result["op_sr"] for result in results),
        # Only samples with a defined value F1 contribute to this mean.
        "mean_value_f1": _mean_or_none(
            result["value_f1"] for result in results if result["value_f1"] is not None
        )
    }

    print(mean_result)

    with open(f"{results_dir}/mind2web_dpo_{test_dataset_name}_ranked_50_mean.json", "w") as file:
        json.dump(mean_result, file, indent=4)

testing test_website dpo dataset


 59%|█████▊    | 604/1030 [10:06<12:16,  1.73s/it]

ACTION_HISTORY: CLICK
ID: 50887
ID: 508877
ACTION_HISTORY: CLICK
ID: 48887
ACTION_HISTORY: CLICK
ID: 51329
ACTION_HISTORY: CLICK
ID: 5132
ID: 5132
ACTION_HISTORY: CLICK
ID: 51327
ID: 51327
ACTION_HISTORY: CLICK
ID: 48887
ACTION_HISTORY: CLICK
ID: 513
Operation: CLICK
Value: 
ID: 48887


 71%|███████   | 729/1030 [12:18<06:08,  1.23s/it]

Operation: CLICK
Element: #38090
Value: 
Action: CLICK
Element: #38127
Operation: TYPE
Value: 1200
ID: 38090


 76%|███████▌  | 783/1030 [13:13<07:17,  1.77s/it]

Operation: CLICK
Action_HISTORY: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 
TYPE: 0
TYPE: 0
TYPE: 0
TYPE: 0
TYPE: 0
TYPE: 0
TYPE: 0
TYPE: 0
TYPE: 0
TYPE: 0
TYPE: 0
TYPE: 0
TYPE:
Operation: CLICK
Value: 
ID: 38710


100%|██████████| 1030/1030 [17:32<00:00,  1.02s/it]

{'step_sr': 0.4186952288218111, 'mean_id_distance': 0.04673762383723364, 'op_sr': 0.8919182083739046, 'mean_value_f1': 0.4998027255512285}





In [8]:
# Number of per-sample result records collected by the evaluation loop.
len(results)

1030