In [17]:
from datasets import load_dataset
from transformers import AutoTokenizer


# Identifier of the dataset to load; only its "train" split is used below.
DATASET_NAME = "LangAGI-Lab/world_model_for_wa_desc_with_tao_dataset"
ds1 = load_dataset(DATASET_NAME, split="train")


In [18]:
# Tokenizer whose vocabulary is used to measure the length of each sample.
TOKENIZER_NAME = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Per-sample token counts; starts empty and is filled by the counting loop.
token_counts = list()


In [19]:
# Instruction text for the next-state-prediction task. It describes the inputs
# the model receives, the action vocabulary, and the required
# "[Rationale]" / "[Next State]" output layout.
# This string is prepended verbatim to every SFT prompt and is also part of
# the text whose tokens are counted for length filtering, so editing it
# changes both the generated training data and which samples are kept.
instruction="""You are an intelligent agent that predicts next state from given current action, with your own logical reasoning. You will be given web-based tasks.

Here's the information you'll have:
The user's objective: This is the task you're trying to complete.
The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.
The current web page's URL: This is the page you're currently navigating.
The previous action: This is the action you just performed in the previous step. It may be helpful to track your progress.
The current action: This is the current action that you performed to achieve the user's objective in the current web page's accessibility tree.
The format of previous actions can fall into several categories:
Page Operation Actions:

```click [id]```: This action clicks on an element with a specific id on the webpage.
```type [id] [content]```: Use this to type the content into the field with id. By default, the 'Enter' key is pressed after typing unless press_enter_after is set to 0, i.e., ```type [id] [content] [0]```.
```hover [id]```: Hover over an element with id.
```press [key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
```scroll [down]``` or ```scroll [up]```: Scroll the page up or down.

Tab Management Actions:
```new_tab```: Open a new, empty browser tab.
```tab_focus [tab_index]```: Switch the browser's focus to a specific tab using its index.
```close_tab```: Close the currently active tab.

URL Navigation Actions:
```goto [url]```: Navigate to a specific URL.
```go_back```: Navigate to the previously viewed page.
```go_forward```: Navigate to the next page (if a previous 'go_back' action was performed)

Completion Action:
```stop [answer]```: Done when you believe the task is complete.


Follow the following rules for reasoning on next state prediction:
1. Please generate your answer starting with "Let's think step by step" regarding the changes.
2. In the **[Rationale]** part, identify and list the specific changes in the [accessibility tree] caused by the current action. Use a numbered list to detail these changes (e.g., items added, updated, or deleted).
3. Then, generate the **[Next State]** prediction. **Important:** You must predict strictly **ONE deterministic future state**. Do not distinguish between success or failure branches; only predict the immediate, objective result of the action.
4. **Format** your description of this state as a **detailed NUMBERED LIST**. Each item in the list should describe a specific aspect of the state (such as visual layout, content updates, or functional status), exactly as shown in the training data style.
5. Generate the state prediction in the correct format. Start with a "[Next State] The expected effect is that:" phrase.
6. Follow the Output Format structure below strictly.

------------------------------------------------------------
OUTPUT FORMAT
------------------------------------------------------------
[Rationale]
Let's think step by step about [description of the action].

Key changes in the accessibility tree based on this action would include:
1. [Logic for change 1]
2. [Logic for change 2]
...

[Next State] The expected effect is that:
1. [Detail of the updated state aspect 1]
2. [Detail of the updated state aspect 2]
3. [Detail of the updated state aspect 3]
...
"""

In [25]:
# Not used in this cell; kept because other cells may rely on `np` existing.
import numpy as np

# Boundary used only for the summary printed at the end of this cell.
TOKEN_REPORT_LIMIT = 8192
# Print a progress line every this many processed rows.
PROGRESS_EVERY = 1000

# Estimate the token length of every sample (instruction + all row fields).
# NOTE(review): this concatenation omits the field labels and glue text that
# are added when the SFT prompt is assembled, and stringifies
# `previous_actions` with str() instead of joining with newlines, so the
# count is an approximation of the final prompt + response length.
token_counts = []
# Iterate rows once instead of indexing `ds1[i]` for every field, which would
# materialise the same row repeatedly.
for i, row in enumerate(ds1):
    text = (
        instruction
        + row['observation']
        + row['raw_prediction']
        + row["url"]
        + row["objective"]
        + str(row["previous_actions"])
        + row['current_action']
    )

    token_counts.append(len(tokenizer.encode(text)))

    if (i + 1) % PROGRESS_EVERY == 0:
        print(f"Processed: {i+1} items")

# Summarise how many samples fall strictly below / at-or-above the limit.
# The second label is ">=" so that it matches the predicate actually applied.
n_below = sum(1 for x in token_counts if x < TOKEN_REPORT_LIMIT)
n_at_or_above = len(token_counts) - n_below

print(f"\nToken counts distribution:")
print(f"  < {TOKEN_REPORT_LIMIT}: {n_below}")
print(f"  >= {TOKEN_REPORT_LIMIT}: {n_at_or_above}")

Processed: 1000 items
Processed: 2000 items
Processed: 3000 items
Processed: 4000 items
Processed: 5000 items
Processed: 6000 items
Processed: 7000 items
Processed: 8000 items
Processed: 9000 items
Processed: 10000 items
Processed: 11000 items
Processed: 12000 items
Processed: 13000 items
Processed: 14000 items

Token counts distribution:
  < 8192: 14633
  > 8192: 111


In [29]:
# Maximum estimated token count a sample may have to be kept.
# NOTE(review): 8100 is stricter than the 8192 boundary reported by the
# token-count summary; presumably the gap is headroom for prompt glue text
# and chat-template tokens that the estimate does not include. Confirm.
MAX_TOKENS = 8100

# Positions (into `ds1`) of the samples that fit within the budget.
filtered_indices = [i for i, count in enumerate(token_counts) if count <= MAX_TOKENS]

print(f"original: {len(ds1)}")
print(f"filtered: {len(filtered_indices)}")
print(f"removed: {len(ds1) - len(filtered_indices)}")

ds1_filtered = ds1.select(filtered_indices)

# "\n" (not "\d"): "\d" is an invalid escape sequence that printed a literal
# backslash and raised a SyntaxWarning.
print(f"\nds1_filtered_len: {len(ds1_filtered)}")

original: 14744
filtered: 14633
removed: 111
\ds1_filtered_len: 14633


  print(f"\ds1_filtered_len: {len(ds1_filtered)}")


In [None]:
def _build_sft_example(row):
    """Turn one dataset row into a ``{"prompt", "response"}`` SFT record.

    Args:
        row: Mapping with the keys ``url``, ``objective``,
            ``previous_actions``, ``observation``, ``current_action`` and
            ``raw_prediction``.

    Returns:
        dict: ``prompt`` is the shared ``instruction`` followed by the
        labelled task fields; ``response`` is the row's ``raw_prediction``.
    """
    previous = row["previous_actions"]
    # A list of actions is rendered one per line; anything else is stringified.
    prev_actions = "\n".join(previous) if isinstance(previous, list) else str(previous)

    input_text = (
        f"URL: {row['url']}\n"
        f"OBJECTIVE: {row['objective']}\n"
        f"PREVIOUS ACTION: {prev_actions}\n"
        f"CURRENT OBSERVATION: {row['observation']}\n"
        f"CURRENT ACTION: {row['current_action']}"
    )

    prompt = instruction + "\n\n Now, let's start the task. Here's the task for you: \n" + input_text

    return {
        "prompt": prompt,
        "response": row['raw_prediction'],
    }


# Walk the retained rows once, in `filtered_indices` order, instead of
# indexing `ds1[i]` separately for every field (each index call materialises
# the whole row again).
sft_data = [_build_sft_example(row) for row in ds1.select(filtered_indices)]


import pandas as pd

# Number of leading rows written to the training file; the rest form the test file.
# NOTE(review): the split is positional (no shuffling), so the test set is
# simply the tail of the filtered dataset. Confirm this is intended.
TRAIN_SIZE = 14500

df = pd.DataFrame(sft_data)

# Fail loudly rather than silently writing an empty test file when filtering
# leaves too few rows for the fixed split point.
if len(df) <= TRAIN_SIZE:
    raise ValueError(
        f"Only {len(df)} examples available; need more than {TRAIN_SIZE} "
        "to produce a non-empty test split."
    )

train_df = df.iloc[:TRAIN_SIZE]
test_df = df.iloc[TRAIN_SIZE:]

train_df.to_parquet("./sft_train.parquet", index=False)
print("saved to: ./sft_train.parquet")
test_df.to_parquet("./sft_test.parquet", index=False)
print("saved to: ./sft_test.parquet")

saved to: ./sft_train.parquet
saved to: ./sft_test.parquet
