In [18]:
from datasets import load_dataset
from transformers import AutoTokenizer
import pandas as pd
df = pd.read_json("./sft_annotations_gemini.jsonl", lines=True)

In [19]:
len(df)

14744

In [20]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

token_counts = []

In [21]:
instruction="""You are an intelligent agent that predicts next state from given current action on a webpage, with your own logical reasoning.

Here's the information you'll have:
CURRENT OBSERVATION: The current web page's accessibility tree, a simplified representation of the webpage, providing key information.
CURRENT ACTION: This is the current action that you performed to achieve the user's objective in the current web page's accessibility tree.
The format of actions can fall into several categories:

Page Operation Actions:
```click [id]```: This action clicks on an element with a specific id on the webpage.
```type [id] [content]```: Use this to type the content into the field with id. By default, the 'Enter' key is pressed after typing unless press_enter_after is set to 0, i.e., ```type [id] [content] [0]```.
```hover [id]```: Hover over an element with id.
```press [key_comb]```: Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
```scroll [down]``` or ```scroll [up]```: Scroll the page up or down.

Tab Management Actions:
```new_tab```: Open a new, empty browser tab.
```tab_focus [tab_index]```: Switch the browser's focus to a specific tab using its index.
```close_tab```: Close the currently active tab.

URL Navigation Actions:
```goto [url]```: Navigate to a specific URL.
```go_back```: Navigate to the previously viewed page.
```go_forward```: Navigate to the next page (if a previous 'go_back' action was performed)

CompletionAction:
 `stop[answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide
 the answer in the bracket.

You are required to predict the new changes that will occur on the webpage after the operation is performed. To be successful, it is very important to understand the effect of the current action on the next state of the webpage.

Follow the following rules for next state prediction:
1. In the **[Rationale]** part, identify and list the specific changes in the [accessibility tree] caused by the current action in natural language. Use a numbered list to detail these changes (e.g., items added, updated, or deleted).
2. Then, generate the [Next State] description of the next web state based on the changed parts. 
3. Follow the Output Format structure below strictly.

Response Format:
[Rationale]
Key changes in the accessibility tree based on [description of the action] would include:
1. [Logic for change 1]
2. [Logic for change 2]
...

[Next State] The expected effect is that:
1. [Detail of the next state 1]
2. [Detail of the next state 2]
3. [Detail of the next state 3]
...
"""

In [22]:
token_counts = []
for i in range(len(df)):
    text = instruction + df.iloc[i]['current_observation'] + df.iloc[i]['current_action'] + df.iloc[i]['generated_annotation']
    tokens = tokenizer.encode(text)
    token_counts.append(len(tokens))
    
    if (i + 1) % 1000 == 0:
        print(f"Processed: {i+1} items")

import numpy as np

print(f"\nToken counts distribution:")
print(f"  < 5000: {sum(1 for x in token_counts if x < 5000)}")
print(f"  > 5000: {sum(1 for x in token_counts if x >= 5000)}")

Processed: 1000 items
Processed: 2000 items
Processed: 3000 items
Processed: 4000 items
Processed: 5000 items
Processed: 6000 items
Processed: 7000 items
Processed: 8000 items
Processed: 9000 items
Processed: 10000 items
Processed: 11000 items
Processed: 12000 items
Processed: 13000 items
Processed: 14000 items

Token counts distribution:
  < 5000: 11587
  > 5000: 3157


In [23]:
filtered_indices = [i for i, count in enumerate(token_counts) if count <= 5000]

print(f"original: {len(df)}")
print(f"filtered: {len(filtered_indices)}")
print(f"removed: {len(df) - len(filtered_indices)}")

ds1_filtered = df.iloc[filtered_indices]

print(f"ds1_filtered_len: {len(ds1_filtered)}")

original: 14744
filtered: 11589
removed: 3155
ds1_filtered_len: 11589


In [None]:
sft_data = []

for i in filtered_indices:
    
    input_text = f"""CURRENT OBSERVATION: {df.iloc[i]['current_observation']}
CURRENT ACTION: {df.iloc[i]['current_action']}"""

    prompt = instruction + "\n\n Now, let's start the task. Here's the task for you: \n" + input_text

    sft_data.append({
        "prompt": prompt,
        "response": df.iloc[i]['generated_annotation']
    })

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df = pd.DataFrame(sft_data)


df[:11000].to_parquet("./sft_train.parquet", index=False)
print("saved to: ./sft_train.parquet")
df[11000:].to_parquet("./sft_test.parquet", index=False)
print("saved to: ./sft_test.parquet")

saved to: ./sft_train.parquet
saved to: ./sft_test.parquet


In [15]:
import pandas as pd
token_counts = []
df = pd.read_parquet('sft_train.parquet')

for i in range(len(df)):
    text = df.iloc[i]['prompt'] + df.iloc[i]['response']
    tokens = tokenizer.encode(text)
    token_counts.append(len(tokens))

print(pd.Series(token_counts).describe())

count    14500.000000
mean      3445.090759
std       1690.634378
min        724.000000
25%       1709.000000
50%       3656.000000
75%       4783.000000
max       8110.000000
dtype: float64
