In [2]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
import os
import json
import random
import re

from eval_agentnet import parse_action, print_result


# Matches one point in the prompt's action syntax,
# e.g. "<|box_start|>(23,13)<|box_end|>". Compiled once, used on every step.
_BOX_POINT_RE = re.compile(r"<\|box_start\|>\((-?\d+),\s*(-?\d+)\)<\|box_end\|>")

# Marker that tells the message builder where a screenshot has to be attached.
_IMAGE_PLACEHOLDER = "<image>"

# History entries are joined with this separator; only the last few are kept.
_HISTORY_SEPARATOR = "\n-------------"
_MAX_HISTORY = 5

# Instruction text shared by every request; the task description is appended.
_PROMPT_HEADER = (
    "You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action and confidence to complete the task. \n\n"
    "## Output Format\n```\nThought: ...\nAction: ...\nConfidence: ...\n```\n\n"
    "## Action Space\n\n"
    "click(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
    "left_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
    "right_single(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
    "drag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\n"
    "hotkey(key='')\n"
    "type(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\n"
    "scroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\n"
    "wait() #Sleep for 5s and take a screenshot to check for any changes.\n"
    "finished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n"
    "## Note\n- Use Chinese in `Thought` part.\n"
    "- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. Evaluate your action to be scored, giving it a score from 1 to 5 in 'Confidence' part. A higher score indicates that you believe this action is more likely to accomplish the current goal for the given screenshot.\n\n"
    "## User Instruction\n"
)


def _extract_point(text):
    """Return the first ``(x, y)`` box point found in *text*, or ``None``.

    Non-string input (for example a missing action) yields ``None`` instead
    of raising, so callers can treat "no coordinates" uniformly.
    """
    if not isinstance(text, str):
        return None
    match = _BOX_POINT_RE.search(text)
    if match is None:
        return None
    x, y = match.groups()
    return int(x), int(y)


def _build_messages(final_goal, image_file, previous_actions_list):
    """Build the chat messages for one step in typed-content form.

    Every message's ``content`` becomes a list of ``{"type": ...}`` parts.
    A message whose text contains the ``<image>`` placeholder additionally
    gets an image part (pointing at *image_file*) placed before its text.
    """
    prompt = f"{_PROMPT_HEADER}{final_goal}"
    messages = [{"role": "system", "content": "You are a helpful assistant."}]

    if previous_actions_list:
        history = _HISTORY_SEPARATOR.join(previous_actions_list[-_MAX_HISTORY:])
        messages += [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": history},
            {"role": "user", "content": _IMAGE_PLACEHOLDER},
        ]
    else:
        messages.append({"role": "user", "content": prompt + _IMAGE_PLACEHOLDER})

    for msg in messages:
        text = msg["content"]
        # NOTE(review): the literal "<image>" stays in the text part, exactly
        # as before - confirm whether the chat template expects it stripped.
        parts = [{"type": "text", "text": text}]
        if _IMAGE_PLACEHOLDER in text:
            parts.insert(0, {"type": "image", "url": image_file})
        msg["content"] = parts
    return messages


def _read_checkpoint(checkpoint_file):
    """Return the number of input lines already processed (0 if unknown)."""
    if not os.path.exists(checkpoint_file):
        return 0
    with open(checkpoint_file, "r") as f:
        raw = f.read().strip()
    try:
        start_line = max(int(raw), 0)
    except ValueError:
        # An empty or corrupt checkpoint must not abort the run.
        print(f"检查点文件 {checkpoint_file} 内容无效，从头开始处理")
        return 0
    print(f"从第 {start_line} 行开始续传")
    return start_line


def process_dataset(path, image_path, model_path, checkpoint_file="train.checkpoint"):
    """Run the model over every trajectory step of a JSONL dataset.

    For each non-empty line of *path* the record is parsed as JSON and its
    ``traj`` steps are replayed (all steps when ``task_difficulty`` is below
    7, otherwise only the first 10). For each step the model generates a
    response for the step's screenshot, the response is printed and parsed
    with ``parse_action``, and - when both the predicted action and the
    teacher action contain a box point - the two points are handed to
    ``print_result``.

    Args:
        path: JSONL file; each record provides ``task_difficulty``, ``traj``
            and ``natural_language_task``. Each step provides ``image``,
            ``claude`` and ``value`` (with ``code_uitars`` and
            ``last_step_correct``).
        image_path: Prefix that is string-concatenated with a step's
            ``image`` value, so it has to end with a path separator.
        model_path: Location passed to ``from_pretrained`` for both the
            processor and the model.
        checkpoint_file: File holding the number of the last fully processed
            input line. It is rewritten after every record and removed once
            the whole file has been processed.
    """
    processor = AutoProcessor.from_pretrained(model_path)
    device = torch.device("cuda:0")
    model = AutoModelForVision2Seq.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
    ).to(device)

    start_line = _read_checkpoint(checkpoint_file)

    # NOTE(review): these counters are reported at the end but nothing in
    # this function increments them - confirm what they were meant to count.
    rule_count = 0
    claude_count = 0

    with open(path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            # Skip lines handled by a previous run; unlike next(f) this
            # cannot fail when the checkpoint exceeds the file length.
            if line_num <= start_line:
                continue
            line = line.strip()
            if not line:  # skip blank lines
                continue
            data = json.loads(line)

            # Intended to accumulate the history of previous actions.
            # NOTE(review): nothing appends to this list, so the history
            # branch of the prompt is never taken - confirm the intent.
            previous_actions_list = []

            if data["task_difficulty"] < 7:
                steps = data["traj"]
            else:
                steps = data["traj"][:10]
            final_goal = data["natural_language_task"]

            for step_idx, action in enumerate(steps, 1):
                image_file = image_path + action["image"]
                messages = _build_messages(
                    final_goal, image_file, previous_actions_list
                )
                teacher_action = (
                    action["value"]["code_uitars"]
                    if action["value"]["last_step_correct"]
                    else action["claude"]
                )

                inputs = processor.apply_chat_template(
                    messages,
                    add_generation_prompt=True,
                    tokenize=True,
                    return_dict=True,
                    return_tensors="pt",
                ).to(model.device)

                outputs = model.generate(**inputs, max_new_tokens=1024)
                # Decode only the newly generated tokens, not the prompt.
                prompt_len = inputs["input_ids"].shape[-1]
                output_test = processor.decode(outputs[0][prompt_len:])
                print(output_test)

                _, action_pred, _ = parse_action(output_test)
                predicted_point = _extract_point(action_pred)
                teacher_point = _extract_point(teacher_action)

                # Actions such as wait()/type() carry no coordinates. Calling
                # print_result anyway used to raise NameError on the first
                # such step, or reuse stale coordinates from an earlier one.
                if predicted_point is None or teacher_point is None:
                    print(f"第 {line_num} 行第 {step_idx} 步没有可比较的坐标，跳过")
                    continue
                print_result(image_file, *predicted_point, *teacher_point)

            print("-------------------------------")
            # Update the checkpoint after every processed record.
            with open(checkpoint_file, "w") as f_check:
                f_check.write(str(line_num))

    # Remove the checkpoint once the whole file has been processed.
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)
    print("处理完成！")
    print("rule_count:", rule_count)
    print("claude_count:", claude_count)


if __name__ == "__main__":
    # Script entry point: the model and dataset locations are fixed here and
    # passed to process_dataset by keyword.
    model_path = "/data/zhouxy35/model/merged0911"
    image_path = "/data/zhouxy35/dataset/AgentNet/social_media/images/"
    path = "/data/zhouxy35/dataset/AgentNet/social_media/social_media_confidence_rule.jsonl"

    print("开始处理数据集...")
    process_dataset(path=path, image_path=image_path, model_path=model_path)


开始处理数据集...


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  3.40it/s]
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Thought: 我看到屏幕左上角有个浏览器图标，要完成登录Twitter的任务，第一步当然是要打开浏览器啦。让我点击一下这个图标，开始我们的旅程。
Action: click(start_box='<|box_start|>(23,13)<|box_end|>')
Confidence:5<|im_end|>


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Thought: 我看到页面正在加载中，屏幕上显示着优美的山景背景和"Good evening, Jason"的问候。虽然我很想马上开始搜索Elon Musk的账号，但现在最明智的做法是保持耐心，等待页面完全加载完毕。毕竟操之过急只会浪费时间，让我们给系统一点时间来完成加载过程。
Action: wait()
Confidence:2<|im_end|>


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Thought: 我看到页面上显示了Twitter的登录界面，但目前还没有登录。要完成后续的搜索和关注操作，我得先登录账号。让我点击一下页面上的登录按钮，开始登录流程。
Action: click(start_box='<|box_start|>(190,328)<|box_end|>')
Confidence:3
-------------Thought: 我已经成功导航到了Twitter的主页面。看到页面顶部有个搜索框，这正是我需要的。为了找到Elon Musk的账号，我得先点击这个搜索框，输入他的用户名。让我开始搜索吧。
Action: click(start_box='<|box_start|>(287,145)<|box_end|>')
Confidence:5
-------------Thought: 我看到搜索框已经准备就绪了，现在是时候开始寻找Elon Musk的账号了。让我在这里输入他的用户名，这样就能找到他的个人主页，然后完成关注/关注的取消操作。搜索框就在眼前，我马上开始输入。
Action: type(content='elon musk')
Confidence:5
-------------Thought: 嗯，我看到页面上显示了Elon Musk的个人主页，但目前是登录状态。要继续操作的话，我得先点击右上角的登录按钮，这样才能进行后续的搜索和关注取消等操作。让我先完成登录这一步吧。
Action: click(start_box='<|box_start|>(1092,97)<|box_end|>')
Confidence:3
-------------Thought: 我注意到页面上显示了Elon Musk的个人主页，但目前是登录状态。要继续操作的话，我得先点击右上角的登录按钮，这样才能进行后续的搜索和关注取消等操作。让我先完成登录这一步吧。
Action: click(start_box='<|box_start|>(1092,97)<|box_end|>')
Confidence:3<|im_end|>


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Thought: 我看到页面右上角有个搜索框，要找到Elon Musk的账号，第一步就是点击那里开始搜索。这个搜索框就在页面顶部，很容易找到。
Action: click(start_box='<|box_start|>(908,131)<|box_end|>')
Confidence:5<|im_end|>


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Thought: 我看到页面右上角有个搜索框，要找到Elon Musk的账号，第一步就是点击那里开始搜索。这个搜索框就在页面顶部，显示着"Search"的提示文字。
Action: click(start_box='<|box_start|>(431,134)<|box_end|>')
Confidence:5
-------------Thought: 搜索框已经准备就绪了，让我输入"Elon Musk"来找到他的账号。我注意到搜索框旁边有个放大镜图标，这正是我需要的输入入口。
Action: type(content='Elon Musk')
Confidence:5
-------------Thought: 嗯，我看到搜索框里已经输入了"Elon Musk"，但搜索结果似乎不太理想。看来需要重新搜索一下，我得先点击搜索框，把现有的内容清空，然后重新输入关键词。这次我要确保输入准确，这样才能找到正确的账号。
Action: click(start_box='<|box_start|>(431,134)<|box_end|>')
Confidence:3
-------------Thought: 我注意到搜索框里已经输入了"Elon Musk"，但搜索结果不太理想。看来需要重新搜索一下，我得先点击搜索框，把现有的内容清空，然后重新输入关键词。这次我要确保输入准确，这样才能找到正确的账号。
Action: click(start_box='<|box_start|>(431,134)<|box_end|>')
Confidence:3
-------------Thought: 我刚才一直在重复点击搜索框，这显然不是个明智的做法。让我重新思考一下：既然已经输入了"Elon Musk"，现在应该专注于搜索结果。我需要点击搜索框右侧的放大镜图标，这样就能触发搜索功能，找到Elon Musk的账号了。
Action: click(start_box='<|box_start|>(458,134)<|box_end|>')
Confidence:3<|im_end|>


Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.


Thought: 我看到页面右上角有个搜索框，要找到Elon Musk的账号，第一步就是点击那里开始搜索。这个搜索框就在页面顶部，显示着"Search"的提示文字。
Action: click(start_box='<|box_start|>(509,131)<|box_end|>')
Confidence:5
-------------Thought: 搜索框已经准备就绪了，让我在这里输入"Elon Musk"。这样系统就能帮我找到他的官方账号，我就可以继续完成后续的关注操作了。
Action: type(content='Elon Musk')
Confidence:5
-------------Thought: 嗯，我看到搜索框里已经输入了"Elon Musk"，但搜索结果似乎不太理想。让我重新点击一下搜索框，看看能不能找到更准确的搜索结果。毕竟要找到他的账号，得先确保搜索关键词准确无误。
Action: click(start_box='<|box_start|>(446,131)<|box_end|>')
Confidence:3
-------------Thought: 我注意到搜索框里已经输入了"Elon Musk"，但搜索结果似乎不太理想。让我重新点击一下搜索框，看看能不能找到更准确的搜索结果。毕竟要找到他的账号，得先确保搜索关键词准确无误。
Action: click(start_box='<|box_start|>(446,131)<|box_end|>')
Confidence:3<|im_end|>


KeyboardInterrupt: 