In [1]:
import pandas as pd
import re
from tqdm.auto import tqdm
import random
# !pip install scikit-learn

In [2]:
from transformers import DebertaTokenizer, T5Tokenizer
# tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
file_path = 'data/mind2web.csv'
df = pd.read_csv(file_path)

negative_examples = []
positive_examples = []
negative_examples_id = []
positive_examples_id = []
val_negative_examples = []
val_positive_examples = []
val_negative_examples_id = []
val_positive_examples_id = []

for i in range(len(df)):
    ex = df.iloc[i]
    elements = [el.strip() for el in ex['OBSERVATION'].split('\n')]
    action_string = ex['ACTION']
    objective = ex['OBJECTIVE']
    
    match = re.search(r'\[(\d+)\]', action_string)
    groundtruth_id = match.group(1) if match else None
    if groundtruth_id:
        gt_id_formatted = f'[{groundtruth_id}]'
        for x in elements:
            if gt_id_formatted in x:
                # positive_examples.append((objective, x))
                if i < len(df) * 0.8:
                    positive_examples.append(f'Objective: {objective}.\nElement: {x}')
                    positive_examples_id.append(i)
                else:
                    val_positive_examples.append(f'Objective: {objective}.\nElement: {x}')
                    val_positive_examples_id.append(i)
            else:
                if i < len(df) * 0.8:
                    negative_examples.append(f'Objective: {objective}.\nElement: {x}')
                    negative_examples_id.append(i)
                else:
                    val_negative_examples.append(f'Objective: {objective}.\nElement: {x}')
                    val_negative_examples_id.append(i)

print(len(positive_examples), len(negative_examples), len(val_positive_examples), len(val_negative_examples))
print(positive_examples[0])
print(negative_examples[0])

5532 529483 1382 133051
Objective: Check for pickup restaurant available in Boston, NY on March 18, 5pm with just one guest.
Element: [168] combobox '' hasPopup: menu expanded: False
Objective: Check for pickup restaurant available in Boston, NY on March 18, 5pm with just one guest.
Element: [1] RootWebArea '' focused: True


In [4]:
import pandas as pd

# Create DataFrame
train_df = pd.DataFrame(data = {
    'id': positive_examples_id + negative_examples_id,
    'text': positive_examples + negative_examples,
    'label': [1] * len(positive_examples) + [-1] * len(negative_examples)
})
val_df = pd.DataFrame(data = {
    'id': val_positive_examples_id + val_negative_examples_id,
    'text': val_positive_examples + val_negative_examples,
    'label': [1] * len(val_positive_examples) + [-1] * len(val_negative_examples)
})

# TODO(jykoh): Change to 1.0 later
train_df = train_df.sample(frac=1.0).reset_index(drop=True)  # Shuffle
val_df = val_df#.sample(frac=0.005).reset_index(drop=True)
print(len(train_df), len(val_df))

train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)

535015 134433


In [20]:
filtered_ids = (val_df['id'].unique())[:100]
val_df = val_df[val_df['id'].isin(filtered_ids)]

In [21]:
from transformers import DebertaTokenizer
from datasets import Dataset
import random
from datasets import concatenate_datasets

# Function to tokenize a batch of texts
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

# Convert your pandas dataframes to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Tokenize the data
# tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

# positive_dataset = tokenized_train_dataset.filter(lambda example: example['label'] == 1)
# negative_dataset = tokenized_train_dataset.filter(lambda example: example['label'] == -1)

# # Function to create subsets of the negative dataset
# def create_negative_subsets(dataset, subset_size, num_subsets):
#     subsets = []
#     for _ in range(num_subsets):
#         # Randomly sample without replacement
#         sampled_indices = random.sample(range(len(dataset)), subset_size)
#         subsets.append(dataset.select(sampled_indices))
#     return subsets

# positive_count, negative_count = len(positive_dataset), len(negative_dataset)
# sampling_ratio = negative_count // positive_count

# negative_subsets = create_negative_subsets(negative_dataset, positive_count, sampling_ratio)
# balanced_datasets = [concatenate_datasets([positive_dataset, neg_subset]).shuffle() for neg_subset in negative_subsets]
# combined_balanced_dataset = concatenate_datasets(balanced_datasets).shuffle()

# combined_balanced_dataset.save_to_disk('train_t5.hf')
tokenized_val_dataset.save_to_disk('val_t5_sampled.hf')

Map:   0%|          | 0/9672 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9672 [00:00<?, ? examples/s]

In [22]:
from datasets import load_from_disk
combined_balanced_dataset = load_from_disk('train_t5.hf')
tokenized_val_dataset = load_from_disk('val_t5_sampled.hf')

  table = cls._concat_blocks(blocks, axis=0)


# Model Training

## Train the model

In [23]:
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from transformers import T5Tokenizer, T5ForSequenceClassification

# tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
# model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=2)
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForSequenceClassification.from_pretrained('t5-base', num_labels=2)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.out_proj.bias', 'classification_head.dense.weight', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import Trainer
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from torch import nn
import torch

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss()
        # compute custom loss (suppose one has 3 labels with different weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), ((labels + 1) // 2).view(-1))
        return (loss, outputs) if return_outputs else loss

    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        eval_dataset = eval_dataset or self.eval_dataset
        # Call the original evaluate function
        predictions = self.predict(eval_dataset, ignore_keys, metric_key_prefix)
        # Convert logits to predicted labels
        if type(self.model) == T5ForSequenceClassification:
            predicted_labels = np.argmax(predictions.predictions[0], axis=1)
        else:
            predicted_labels = np.argmax(predictions.predictions, axis=1)
        loss_fct = nn.CrossEntropyLoss()

        # Aggregate predictions and true labels by ID
        id_recall = {}
        bce_loss = []
        for idx, id_ in enumerate(eval_dataset['id']):
            if id_ not in id_recall:
                id_recall[id_] = {'logits': [], 'preds': [], 'labels': []}

            if type(self.model) == T5ForSequenceClassification:
                id_recall[id_]['logits'].append(predictions.predictions[0][idx, 1])
                logits = predictions.predictions[0][idx]
            else:
                id_recall[id_]['logits'].append(predictions.predictions[idx, 1])
                logits = predictions.predictions[idx]
            id_recall[id_]['preds'].append(predicted_labels[idx])
            id_recall[id_]['labels'].append(predictions.label_ids[idx])

            label = predictions.label_ids[idx]
            bce_loss.append(loss_fct(torch.tensor([logits]), torch.tensor([int(label == 1)])))

        mean_bce_loss = torch.mean(torch.stack(bce_loss))

        output = {
            'eval_loss': mean_bce_loss.item()
        }
        # Compute recall for each ID
        for k in [1, 5, 10, 50]:
            total_recalled = 0
            total = 0
            total_recalled_random = 0
            for id_, data in id_recall.items():
                # Sort the logits and get top k indices
                top_k_indices = sorted(range(len(data['logits'])), key=lambda i: data['logits'][i], reverse=True)[:k]
                # Get the labels corresponding to the top k logits
                top_k_labels = [data['labels'][i] for i in top_k_indices]
                if 1 in top_k_labels:
                    total_recalled += 1
                if 1 in [data['labels'][i] for i in random.sample(range(0, len(data['labels'])), min(k, len(data['labels'])))]:
                    total_recalled_random += 1
                if 1 in data['labels']:
                    total += 1
            output[f'r@{k}'] = total_recalled / total
            output[f'r_rand@{k}'] = total_recalled_random / total

        if self.args.logging_dir is not None:
            tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
            # Log each ID's recall to TensorBoard
            print('Results for step', self.state.global_step)
            for k, score in output.items():
                print(f"eval/{k}:", score)
                tb_writer.add_scalar(f"eval/{k}", score, self.state.global_step)
            tb_writer.flush()
            tb_writer.close()

        # Add aggregated recall scores to output
        return output


In [26]:
from transformers import TrainingArguments
from transformers import Trainer
import datasets

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = logits.argmax(axis=-1)
#     print(accuracy_score(labels, predictions))
#     return {'accuracy': accuracy_score(labels, predictions)}


training_args = TrainingArguments(
    output_dir='./deberta_results',          # directory for storing logs and model checkpoints
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    learning_rate=2e-5,
    lr_scheduler_type= "cosine",
    weight_decay=0.001,               # strength of weight decay
    report_to="tensorboard",
    bf16=True,
    logging_steps=100,                # log model metrics every 'logging_steps' steps
    evaluation_strategy="steps",     # evaluation strategy to adopt during training
    eval_steps=2000,                  # number of steps to run evaluation
    save_steps=2000,
    eval_accumulation_steps=16,
    load_best_model_at_end=True      # load the best model when finished training
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=combined_balanced_dataset,  # Use your combined balanced dataset
    eval_dataset=tokenized_val_dataset,        # Validation dataset
    # compute_metrics=compute_metrics
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

Step,Training Loss,Validation Loss


  bce_loss.append(loss_fct(torch.tensor([logits]), torch.tensor([int(label == 1)])))


Results for step 2000
eval/eval_loss: 0.418666273355484
eval/r@1: 0.27
eval/r_rand@1: 0.01
eval/r@5: 0.67
eval/r_rand@5: 0.07
eval/r@10: 0.8
eval/r_rand@10: 0.06
eval/r@50: 0.99
eval/r_rand@50: 0.51


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Results for step 22000
eval/eval_loss: 0.2841876149177551
eval/r@1: 0.28
eval/r_rand@1: 0.01
eval/r@5: 0.68
eval/r_rand@5: 0.03
eval/r@10: 0.83
eval/r_rand@10: 0.1
eval/r@50: 0.99
eval/r_rand@50: 0.48
