In [None]:
from datasets import load_dataset

# Hub dataset repository -> domain token associated with that repository.
path_token_dict = {
    'GleghornLab/abstract_domain_copd': '[COPD]',
    'GleghornLab/abstract_domain_cvd': '[CVD]',
    'GleghornLab/abstract_domain_skincancer': '[CANCER]',
    'GleghornLab/abstract_domain_parasitic': '[PARASITIC]',
    'GleghornLab/abstract_domain_autoimmune': '[AUTOIMMUNE]'
}

# One-off maintenance step: exchange the 'valid' and 'test' splits of every
# dataset and push the result back to the same Hub repository.
# NOTE(review): this is not idempotent -- running the cell a second time swaps
# the splits back to their previous arrangement.
for path in path_token_dict:
    data = load_dataset(path)
    data['test'], data['valid'] = data['valid'], data['test']
    data.push_to_hub(path)

In [1]:
from transformers import AutoModel, AutoTokenizer

model_path = "answerdotai/ModernBERT-base"

# Load the tokenizer and the encoder from the same checkpoint.
# `model` is required by the later `add_new_tokens(model, tokenizer, ...)`
# call, so it has to be created here for the notebook to run top to bottom.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)


In [2]:
# Display the special tokens this tokenizer defines.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [4]:
# Encode/decode round trip: shows which special tokens encode() wraps around the text.
tokenizer.decode(tokenizer.encode("Hello, world!"))


'[CLS]Hello, world![SEP]'

In [None]:
# `mteb` must be imported in the notebook before it is used; without this
# import the cell raises NameError on a fresh kernel.
import mteb

# Build a custom benchmark from an explicit list of MTEB task names.
# NOTE(review): the benchmark is named 'MTEB(Medical)' but its only task is
# 'Banking77Classification' -- confirm this is the intended task list.
benchmark = mteb.Benchmark(
    name='MTEB(Medical)',
    tasks=mteb.get_tasks(
        tasks=['Banking77Classification']
    )
)
benchmark



In [None]:
# Display the tasks attached to the benchmark.
benchmark.tasks

In [8]:
from data.get_data import get_single_train_data, SimDataset
from models.utils import add_new_tokens

In [9]:



# Hub dataset repository -> domain token associated with that repository.
path_token_dict = {
    'GleghornLab/abstract_domain_copd': '[COPD]',
    'GleghornLab/abstract_domain_cvd': '[CVD]',
    'GleghornLab/abstract_domain_skincancer': '[CANCER]',
    'GleghornLab/abstract_domain_parasitic': '[PARASITIC]',
    'GleghornLab/abstract_domain_autoimmune': '[AUTOIMMUNE]'
}

# Domain token -> expert index, numbered 0..4 in the order the tokens are
# listed in path_token_dict.
token_expert_dict = {
    domain_token: expert_idx
    for expert_idx, domain_token in enumerate(path_token_dict.values())
}

# Hand the domain tokens to the project helper, which returns the (model,
# tokenizer) pair to use from here on.
# NOTE(review): presumably this extends the vocabulary/embeddings -- confirm
# against models.utils.add_new_tokens.
model, tokenizer = add_new_tokens(model, tokenizer, [*path_token_dict.values()])

In [5]:
# Build the dataset for a single repository (the CVD one) via the project helper.
# NOTE(review): `dataset` is reassigned by the later get_all_train_data(...) cell.
dataset = get_single_train_data(
    data_path='GleghornLab/abstract_domain_cvd',
    tokenizer=tokenizer,
    path_token_dict=path_token_dict,
    token_expert_dict=token_expert_dict,
    max_length=512,
    add_tokens=True
)


In [None]:
# Dataset repository ids, in the order they were declared.
list(path_token_dict)

In [11]:
import random
from typing import Any, Dict, List, Optional

import torch
from datasets import load_dataset
from torch.utils.data import Dataset as TorchDataset


def get_all_train_data(
    data_paths: List[str],
    tokenizer: Any,
    path_token_dict: Dict[str, str],
    token_expert_dict: Dict[str, int],
    max_length: int = 512,
    add_tokens: bool = True,
    max_samples_per_path: Optional[int] = 100,
    seed: int = 42,
):
    """Build one shuffled SimDataset from the train splits of several datasets.

    For every path in ``data_paths`` the 'train' split is loaded, its 'a' and
    'b' columns are collected as document pairs, and each pair is tagged with
    the expert index looked up via ``path_token_dict`` -> ``token_expert_dict``.
    All pairs are then shuffled together with a fixed seed.

    Args:
        data_paths: Dataset paths to load; each must be a key of
            ``path_token_dict``.
        tokenizer: Passed through unchanged to ``SimDataset``.
        path_token_dict: Maps a dataset path to its domain token.
        token_expert_dict: Maps a domain token to its expert index.
        max_length: Passed through unchanged to ``SimDataset``.
        add_tokens: Passed through unchanged to ``SimDataset``.
        max_samples_per_path: Upper bound on the rows taken from the start of
            each train split. Splits shorter than the bound are used in full.
            ``None`` disables the bound.
        seed: Seed for the shuffle. The default reproduces the ordering of
            ``random.seed(42); random.shuffle(...)``.

    Returns:
        The ``SimDataset`` constructed from the shuffled pairs.

    Raises:
        KeyError: If a path or its domain token is missing from the mappings.
        ValueError: If ``max_samples_per_path`` is negative, or if no pairs
            were loaded at all.
    """
    if max_samples_per_path is not None and max_samples_per_path < 0:
        raise ValueError("max_samples_per_path must be non-negative or None")

    entries = []
    for path in data_paths:
        domain_token = path_token_dict[path]
        expert_assignment = token_expert_dict[domain_token]
        data = load_dataset(path, split='train')
        if max_samples_per_path is not None:
            # Clamp to the split size so short splits do not raise on select().
            data = data.select(range(min(max_samples_per_path, len(data))))
        entries.extend(
            (a_doc, b_doc, expert_assignment)
            for a_doc, b_doc in zip(data['a'], data['b'])
        )

    if not entries:
        raise ValueError(
            "No training pairs were loaded; check data_paths and max_samples_per_path."
        )

    # A private generator gives the same permutation as seeding the module-level
    # RNG, without resetting global random state for the rest of the session.
    random.Random(seed).shuffle(entries)
    all_a_documents, all_b_documents, all_expert_assignments = zip(*entries)

    return SimDataset(
        a_documents=all_a_documents,
        b_documents=all_b_documents,
        expert_assignments=all_expert_assignments,
        domain_tokens=list(path_token_dict.values()),
        tokenizer=tokenizer,
        max_length=max_length,
        add_tokens=add_tokens,
    )




In [12]:
# Assemble the shuffled multi-domain dataset from every configured repository.
dataset = get_all_train_data(
    data_paths=[*path_token_dict],
    tokenizer=tokenizer,
    path_token_dict=path_token_dict,
    token_expert_dict=token_expert_dict,
    max_length=512,
    add_tokens=True,
)

In [None]:
# Print the first 100 samples: entries whose key contains 'doc' are shown as
# key + decoded text, every other entry is printed as-is.
for sample_idx in range(100):
    sample = dataset[sample_idx]
    for key, value in sample.items():
        if 'doc' not in key:
            print(value)
            continue
        print(key)
        print(tokenizer.decode(value['input_ids'][0]))