In [1]:
import pandas as pd
import re
from tqdm.auto import tqdm

In [2]:
# CSV export to load; later cells read its OBSERVATION, ACTION and OBJECTIVE
# columns.
file_path = 'data/mind2web.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Example texts and the index of the source row each one came from, kept as
# parallel lists for every (split, class) combination.
positive_examples, positive_examples_id = [], []
negative_examples, negative_examples_id = [], []
val_positive_examples, val_positive_examples_id = [], []
val_negative_examples, val_negative_examples_id = [], []

# Rows positioned before this cutoff (the first 80%) feed the training lists,
# the remaining rows feed the validation lists.
train_cutoff = len(df) * 0.8

# (is_train, is_positive) -> (texts, source row ids)
buckets = {
    (True, True): (positive_examples, positive_examples_id),
    (True, False): (negative_examples, negative_examples_id),
    (False, True): (val_positive_examples, val_positive_examples_id),
    (False, False): (val_negative_examples, val_negative_examples_id),
}

rows = zip(df['OBSERVATION'], df['ACTION'], df['OBJECTIVE'])
for row_idx, (observation, action, goal) in enumerate(rows):
    # One candidate element per observation line.
    candidates = [line.strip() for line in observation.split('\n')]

    # The first "[<digits>]" in the action names the ground-truth element;
    # rows whose action has no such id contribute no examples at all.
    id_match = re.search(r'\[(\d+)\]', action)
    if id_match is None:
        continue
    target_tag = f'[{id_match.group(1)}]'

    in_train = row_idx < train_cutoff
    for candidate in candidates:
        # A candidate is positive when its text contains the bracketed id.
        texts, ids = buckets[in_train, target_tag in candidate]
        texts.append(f'Objective: {goal}.\nElement: {candidate}')
        ids.append(row_idx)

print(len(positive_examples), len(negative_examples), len(val_positive_examples), len(val_negative_examples))
print(positive_examples[0])
print(negative_examples[0])

5532 529483 1382 133051
Objective: Check for pickup restaurant available in Boston, NY on March 18, 5pm with just one guest.
Element: [168] combobox '' hasPopup: menu expanded: False
Objective: Check for pickup restaurant available in Boston, NY on March 18, 5pm with just one guest.
Element: [1] RootWebArea '' focused: True


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

def _labeled_frame(pos_ids, pos_texts, neg_ids, neg_texts):
    """Stack positives (label 1) on top of negatives (label -1) in one frame.

    Columns: 'id' (source row index), 'text' (example text), 'label'.
    """
    return pd.DataFrame({
        'id': [*pos_ids, *neg_ids],
        'text': [*pos_texts, *neg_texts],
        'label': [1] * len(pos_texts) + [-1] * len(neg_texts),
    })


# NOTE(review): negatives are labelled -1 here and later cells select them by
# that value, while a 2-class classification head expects class indices 0/1 --
# confirm the labels are remapped before they reach the loss.
train_df = _labeled_frame(
    positive_examples_id, positive_examples,
    negative_examples_id, negative_examples,
)
val_df = _labeled_frame(
    val_positive_examples_id, val_positive_examples,
    val_negative_examples_id, val_negative_examples,
)

# Shuffle the training rows (frac=1.0 keeps every row). The validation frame
# is used in full and in its original order.
train_df = train_df.sample(frac=1.0).reset_index(drop=True)
print(len(train_df), len(val_df))

train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)

535015 134433


# Model Training

In [5]:
from transformers import DebertaTokenizer

# Tokenizer matching the 'microsoft/deberta-base' checkpoint.
tokenizer = DebertaTokenizer.from_pretrained(
    pretrained_model_name_or_path='microsoft/deberta-base'
)

In [6]:
from transformers import DebertaTokenizer
from datasets import Dataset
import random
from datasets import concatenate_datasets

# (Re)load the tokenizer for the 'microsoft/deberta-base' checkpoint; the
# tokenize_function defined in this cell reads it as a global.
tokenizer = DebertaTokenizer.from_pretrained(
    pretrained_model_name_or_path='microsoft/deberta-base'
)

def tokenize_function(examples):
    """Tokenize the 'text' column of a batch with the global ``tokenizer``.

    Args:
        examples: mapping with a 'text' entry holding the batch of strings.

    Returns:
        Whatever the tokenizer returns for the batch, requested with
        ``padding='max_length'`` and ``truncation=True``.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')

# Wrap both pandas frames as Hugging Face datasets ...
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

# ... then tokenize each split in batches.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/535015 [00:00<?, ? examples/s]

Map:   0%|          | 0/134433 [00:00<?, ? examples/s]

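Balance the dataset: pair the positive examples with equally sized random subsets of the (far more numerous) negative examples.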

In [7]:
def _is_positive(example):
    """True for rows labelled 1 (the ground-truth element)."""
    return example['label'] == 1


def _is_negative(example):
    """True for rows labelled -1 (any other element)."""
    return example['label'] == -1


# Split the tokenized training set by class.
positive_dataset = tokenized_train_dataset.filter(_is_positive)
negative_dataset = tokenized_train_dataset.filter(_is_negative)

Filter:   0%|          | 0/535015 [00:00<?, ? examples/s]

Filter:   0%|          | 0/535015 [00:00<?, ? examples/s]

In [8]:
def create_negative_subsets(dataset, subset_size, num_subsets):
    """Draw ``num_subsets`` random subsets of ``subset_size`` rows each.

    Rows are sampled without replacement inside a subset; the subsets are
    drawn independently of one another, so a row may appear in several.

    Args:
        dataset: object supporting ``len()`` and ``select(indices)``.
        subset_size: number of rows per subset.
        num_subsets: number of subsets to draw.

    Returns:
        List of the ``dataset.select(...)`` results, one per subset.
    """
    row_indices = range(len(dataset))
    return [
        dataset.select(random.sample(row_indices, subset_size))
        for _ in range(num_subsets)
    ]

positive_count = len(positive_dataset)
negative_count = len(negative_dataset)
# How many positive-sized negative subsets to draw (whole multiples only).
sampling_ratio = negative_count // positive_count

negative_subsets = create_negative_subsets(negative_dataset, positive_count, sampling_ratio)

# Each balanced round holds every positive plus one negative subset, shuffled.
balanced_datasets = []
for negatives in negative_subsets:
    balanced_round = concatenate_datasets([positive_dataset, negatives])
    balanced_datasets.append(balanced_round.shuffle())

# Merge all rounds and shuffle once more across them.
combined_balanced_dataset = concatenate_datasets(balanced_datasets).shuffle()

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


## Train the model

In [9]:
from transformers import DebertaTokenizer, DebertaForSequenceClassification

# Tokenizer and sequence classifier from the same base checkpoint; the model
# is created with a two-way classification head.
tokenizer = DebertaTokenizer.from_pretrained(
    pretrained_model_name_or_path='microsoft/deberta-base'
)
model = DebertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='microsoft/deberta-base',
    num_labels=2,
)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.weight', 'classifier.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import Trainer
from sklearn.metrics import recall_score
import numpy as np
from torch.utils.tensorboard import SummaryWriter

class CustomTrainer(Trainer):
    """Trainer whose ``evaluate`` adds grouped recall@k to the usual metrics.

    Evaluation rows are grouped by their ``id`` column. Inside each group the
    rows are ranked by the class-1 logit, and a group counts as recalled at k
    when one of its k highest-scoring rows has label 1. Groups that contain no
    label-1 row are left out of the denominator.
    """

    # Cut-offs k for which recall@k is reported.
    recall_cutoffs = (5, 10, 50)

    @staticmethod
    def _recall_at_k(group_ids, scores, labels, cutoffs):
        """Return ``{'r@<k>': recall}`` for every k in ``cutoffs``.

        Args:
            group_ids: group key of every row.
            scores: ranking score of every row (higher ranks first).
            labels: label of every row; 1 marks a positive.
            cutoffs: iterable of k values.

        Returns:
            Dict mapping ``'r@<k>'`` to the fraction of positive-containing
            groups with a positive among their top-k rows, or 0.0 when no
            group contains a positive.
        """
        grouped = {}
        for group_id, score, label in zip(group_ids, scores, labels):
            grouped.setdefault(group_id, []).append((score, label))

        # Rank every group once and reuse the ranking for all cut-offs. The
        # sort is stable, so ties keep their original row order.
        ranked_labels = []
        for pairs in grouped.values():
            ordered = [
                label
                for _, label in sorted(pairs, key=lambda pair: pair[0], reverse=True)
            ]
            if 1 in ordered:
                ranked_labels.append(ordered)

        recalls = {}
        for k in cutoffs:
            hits = sum(1 for ordered in ranked_labels if 1 in ordered[:k])
            # Without any positive group there is nothing to recall; report
            # 0.0 instead of dividing by zero.
            recalls[f'r@{k}'] = hits / len(ranked_labels) if ranked_labels else 0.0
        return recalls

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        """Run prediction on the evaluation set and report recall@k.

        Args:
            eval_dataset: dataset to evaluate; falls back to
                ``self.eval_dataset`` when None. It must expose an ``id``
                column aligned with the prediction order.
            ignore_keys: forwarded to ``self.predict``.
            metric_key_prefix: forwarded to ``self.predict``.

        Returns:
            Dict holding the metrics reported by ``self.predict`` plus one
            ``'r@<k>'`` entry per value in ``recall_cutoffs``.
        """
        # Explicit None check: a dataset's truthiness is its length, so `or`
        # would silently swap an empty dataset for the default one.
        if eval_dataset is None:
            eval_dataset = self.eval_dataset

        predictions = self.predict(eval_dataset, ignore_keys, metric_key_prefix)

        recall_scores = self._recall_at_k(
            eval_dataset['id'],
            predictions.predictions[:, 1],  # class-1 logit is the ranking score
            predictions.label_ids,
            self.recall_cutoffs,
        )

        if self.args.logging_dir is not None:
            tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
            try:
                for name, score in recall_scores.items():
                    tb_writer.add_scalar(f"eval/{name}", score, self.state.global_step)
                tb_writer.flush()
            finally:
                tb_writer.close()

        # Keep the metrics computed by predict() (loss, compute_metrics output,
        # timings). Best-checkpoint selection looks metrics up by name in this
        # dict (e.g. the prefixed loss), so returning only the recall scores
        # makes that lookup fail.
        output = dict(predictions.metrics or {})
        output.update(recall_scores)
        return output


In [11]:
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute (and print) classification accuracy for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        ``{'accuracy': <accuracy_score of labels vs. predicted classes>}``.
    """
    logits, labels = eval_pred
    accuracy = accuracy_score(labels, logits.argmax(axis=-1))
    print(accuracy)
    return {'accuracy': accuracy}


training_args = TrainingArguments(
    # --- outputs and logging ---
    output_dir='./deberta_results',   # logs and model checkpoints
    logging_dir='./logs',             # TensorBoard event files
    report_to="tensorboard",
    logging_steps=100,                # log training metrics every 100 steps
    # --- optimisation schedule ---
    num_train_epochs=3,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=500,                 # warmup steps for the LR scheduler
    weight_decay=0.001,
    bf16=True,
    # --- batching ---
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # --- evaluation and checkpoint selection ---
    evaluation_strategy="steps",      # evaluate on a step schedule ...
    eval_steps=500,                   # ... every 500 steps
    load_best_model_at_end=True,      # reload the best checkpoint after training
)

def collate_with_binary_labels(features):
    """Collate a batch after mapping its labels onto the class indices 0/1.

    The datasets mark negatives with -1, but a classifier built with
    ``num_labels=2`` expects class indices in [0, num_labels - 1]. A -1 target
    is outside that range, so negatives would not produce a valid loss term
    (depending on the implementation they are skipped or raise an error).
    Every label other than 1 is therefore mapped to 0 before collation.

    Args:
        features: list of per-example dicts, optionally with a 'label' key.

    Returns:
        The batch built by ``transformers.default_data_collator``.
    """
    # Local import keeps this cell self-contained.
    from transformers import default_data_collator

    remapped = [
        {**feature, 'label': 1 if feature['label'] == 1 else 0}
        if 'label' in feature else feature
        for feature in features
    ]
    return default_data_collator(remapped)


trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=combined_balanced_dataset,   # balanced training data
    eval_dataset=tokenized_val_dataset,        # full validation data
    data_collator=collate_with_binary_labels,  # -1 negatives become class 0
    compute_metrics=compute_metrics
)

In [None]:
# Launch fine-tuning with the trainer configured in the previous cell.
trainer.train()

Step,Training Loss,Validation Loss
