In [1]:
import gc
import glob
import json
import math
import os

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import DebertaTokenizer, DebertaForSequenceClassification

In [4]:
# Unpack the checkpoint archive into the working directory; per the listing
# printed below, it extracts to deberta_results/checkpoint-4000/.
!tar -xvf  'checkpoint-4000.tar'

deberta_results/checkpoint-4000/
deberta_results/checkpoint-4000/model.safetensors
deberta_results/checkpoint-4000/config.json
deberta_results/checkpoint-4000/trainer_state.json


In [2]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer comes from the base DeBERTa vocabulary; the classifier weights come
# from a local checkpoint directory.
# NOTE(review): the archive unpacked earlier in this notebook lists
# deberta_results/checkpoint-4000/, whereas the path below points at
# deberta_results/deberta_ckpt/checkpoint-58000 — confirm this is the intended
# checkpoint and that it exists on a fresh run.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
model = DebertaForSequenceClassification.from_pretrained('deberta_results/deberta_ckpt/checkpoint-58000')
model = model.to(device)

def tokenize_function(text):
    """Encode `text` with the notebook-level `tokenizer` as PyTorch tensors.

    `text` is forwarded unchanged to the tokenizer (callers in this notebook
    pass a list of prompt strings). Inputs are padded with the 'max_length'
    strategy and truncated, so every row of the returned batch has the same
    length.
    """
    encoding_options = dict(padding='max_length', truncation=True, return_tensors='pt')
    return tokenizer(text, **encoding_options)

In [17]:
def filter_page(html_page: str, objective: str, top_k=(1, 5, 10, 50), batch_size=None):
    """Rank the non-empty lines of a page by the classifier's index-1 logit.

    The page is split on newlines; each stripped, non-empty line is paired with
    the objective in a prompt and scored by the notebook-level `model`. The
    logit at column 1 of the model output is used as the relevance score
    (treated here as the "positive" class).

    Args:
        html_page: Page text, one candidate element per line.
        objective: Task description inserted into every prompt.
        top_k: Iterable of cut-offs; one result entry is produced per value.
        batch_size: Number of prompts scored per forward pass. When None, the
            notebook-level `batch_size` is used if it is defined, else 32.

    Returns:
        A list of `(k, elements)` tuples, where `elements` holds at most `k`
        page lines ordered from highest to lowest score. For a page with no
        non-empty lines every `elements` list is empty.
    """
    if batch_size is None:
        # Backward compatible with callers that only set the global in a later cell.
        batch_size = globals().get('batch_size', 32)

    elements = [el.strip() for el in html_page.split('\n')]
    elements = [el for el in elements if el]
    if not elements:
        # Nothing to score: avoid running the model and calling len() on None.
        return [(k, []) for k in top_k]

    prompts = [f'Objective: {objective}.\nElement: {element}' for element in elements]

    # Scores are accumulated on the host as plain floats so that sorting does
    # not index and compare device tensors one element at a time.
    scores = []
    with torch.no_grad():
        for j in range(0, len(prompts), batch_size):
            ex = tokenize_function(prompts[j:j+batch_size]).to(device)
            out = model(**ex)
            scores.extend(out.logits[:, 1].cpu().tolist())
            # Release the batch eagerly to keep peak GPU memory low.
            del out, ex
            gc.collect()
            torch.cuda.empty_cache()

    # Sort once (stable, descending) and slice for each requested k.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    return [(k, [elements[i] for i in ranked[:k]]) for k in top_k]

In [19]:
output_file = 'mind2web.txt'
file_path = 'data/mind2web.csv'
df = pd.read_csv(file_path)
# Rows at or after this index (the last 20% of the frame) are the held-out split.
val_start_idx = math.floor(len(df) * 0.8)
batch_size = 32  # Uses ~5GB VRAM

with open(output_file, 'w', encoding='utf-8') as wf:
    # Walk the held-out rows from the last row back to val_start_idx.
    # The previous indexing (len(df)-i-1 for i in [val_start_idx, len(df)))
    # visited rows 0 .. len(df)-val_start_idx-1, i.e. the *first* 20% of the
    # frame rather than the validation rows; the iteration count is unchanged.
    for row_idx in tqdm(range(len(df) - 1, val_start_idx - 1, -1)):
        ex = df.iloc[row_idx]
        action_string = ex['ACTION']
        objective = ex['OBJECTIVE']
        wf.write(f'{row_idx}, Task: {objective}; Action: {action_string}\n')
        results = filter_page(ex['OBSERVATION'], objective)
        for k, top_k_labels in results:
            wf.write(f'k={k}: {top_k_labels}\n')
        wf.write('=' * 50 + '\n')

  0%|          | 0/1383 [00:00<?, ?it/s]

## Run on WebArena

In [20]:
# Load the WebArena task list and index each task's intent by its task id.
with open('data/webarena_test.json', 'r') as f:
    webarena_data = json.load(f)

id2objective = {task['task_id']: task['intent'] for task in webarena_data}

In [23]:
base_dir = 'data/webarena_acc_tree'
pattern = os.path.join(base_dir, 'render_*_tree_0.txt')
output_file = 'webarena_results.txt'


def _task_id_from_path(path):
    """Return the integer between the first two underscores of the file name."""
    # basename() handles whichever path separator glob returns on this platform.
    return int(os.path.basename(path).split('_')[1])


# glob returns files in arbitrary order; sort by task id so the output file is
# reproducible from run to run.
tree_files = sorted(glob.glob(pattern), key=_task_id_from_path)

with open(output_file, 'w', encoding='utf-8') as output:
    for file_path in tqdm(tree_files):
        task_id = _task_id_from_path(file_path)
        objective = id2objective[task_id]
        # filter_page splits on newlines and drops blank lines itself, so the
        # raw file content can be passed through as-is.
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        output.write(f"{task_id} Task: {objective}\n")
        results = filter_page(html_content, objective)
        for k, top_k_labels in results:
            output.write(f"k={k}: {top_k_labels}\n")
        output.write('=' * 50 + '\n')

  0%|          | 0/96 [00:00<?, ?it/s]