In [81]:
import sys
import logging

import datasets
import peft
from datasets import load_dataset
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

Set up which datasets to use

In [147]:
training_dataset_path = "data/CDR_TrainingSet.json" # "./data/NCBItrainset_corpus.json"
test_dataset_path = "data/CDR_TestSet.json" # "./data/NCBItestset_corpus.json"
dev_dataset_path = "data/CDR_DevelopmentSet.json" # "./data/NCBIdevelopset_corpus.json"

In [154]:
system_prompt_path = "data/CDR_system_prompt.txt"

In [111]:
training_config = {
    "fp16": True,
    "do_eval": True,
    "evaluation_strategy": "steps",
    "eval_steps": 20,
    "learning_rate": 2e-4,
    "log_level": "info",
    "logging_steps": 5,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "max_steps": 20,
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 1,
    "per_device_train_batch_size": 1,
    "remove_unused_columns": True,
    "save_steps": 20,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 4,
    "warmup_ratio": 0.05,
    }

In [112]:
peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)

[INFO|training_args.py:2169] 2024-11-27 21:48:14,555 >> PyTorch: setting up devices
[INFO|training_args.py:1844] 2024-11-27 21:48:14,673 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


logger = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process a small summary
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")


In [113]:
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map=None
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

[INFO|configuration_utils.py:679] 2024-11-27 21:48:17,943 >> loading configuration file config.json from cache at /home/martinh2k3/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:679] 2024-11-27 21:48:18,094 >> loading configuration file config.json from cache at /home/martinh2k3/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:746] 2024-11-27 21:48:18,095 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4800] 2024-11-27 21:48:19,849 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4808] 2024-11-27 21:48:19,850 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
[INFO|configuration_utils.py:1051] 2024-11-27 21:48:19,997 >> loading configuration file generation_config.json from cache at /home/martinh2k3/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/generation_config.json
[INFO|configuration_utils.py:1096] 2024-11-27 21:48:19,998 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}

[INFO|tokenization_utils_base.py:2211] 2024-11-27 21:4

Load datasets:

In [150]:
raw_train = datasets.load_dataset("json", data_files=training_dataset_path, download_mode="force_redownload")["train"]
raw_test = datasets.load_dataset("json", data_files=test_dataset_path, download_mode="force_redownload")["train"]
raw_dev = datasets.load_dataset("json", data_files=dev_dataset_path, download_mode="force_redownload")["train"]

Using custom data configuration default-39b1d5b4b4c2982b


2024-11-27 23:21:44 - INFO - datasets.builder - Using custom data configuration default-39b1d5b4b4c2982b


Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


2024-11-27 23:21:44 - INFO - datasets.info - Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


2024-11-27 23:21:44 - INFO - datasets.builder - Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


Downloading and preparing dataset json/default to /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


2024-11-27 23:21:44 - INFO - datasets.builder - Downloading and preparing dataset json/default to /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


Downloading took 0.0 min


2024-11-27 23:21:44 - INFO - datasets.download.download_manager - Downloading took 0.0 min


Checksum Computation took 0.0 min


2024-11-27 23:21:44 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min


Generating train split


2024-11-27 23:21:44 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-11-27 23:21:44 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


2024-11-27 23:21:44 - INFO - datasets.builder - Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


Using custom data configuration default-92ae54dd2cb89b78


2024-11-27 23:21:44 - INFO - datasets.builder - Using custom data configuration default-92ae54dd2cb89b78


Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


2024-11-27 23:21:44 - INFO - datasets.info - Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


2024-11-27 23:21:44 - INFO - datasets.builder - Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


Downloading and preparing dataset json/default to /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


2024-11-27 23:21:44 - INFO - datasets.builder - Downloading and preparing dataset json/default to /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


Downloading took 0.0 min


2024-11-27 23:21:44 - INFO - datasets.download.download_manager - Downloading took 0.0 min


Checksum Computation took 0.0 min


2024-11-27 23:21:44 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min


Generating train split


2024-11-27 23:21:44 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-11-27 23:21:44 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


2024-11-27 23:21:44 - INFO - datasets.builder - Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


Using custom data configuration default-5867b7293cfc4e97


2024-11-27 23:21:44 - INFO - datasets.builder - Using custom data configuration default-5867b7293cfc4e97


Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


2024-11-27 23:21:44 - INFO - datasets.info - Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


Overwrite dataset info from restored data version if exists.


2024-11-27 23:21:44 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092


2024-11-27 23:21:44 - INFO - datasets.info - Loading Dataset info from /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092


Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


2024-11-27 23:21:44 - INFO - datasets.builder - Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


Downloading and preparing dataset json/default (download: 1.11 MiB, generated: 1.03 MiB, post-processed: Unknown size, total: 2.15 MiB) to /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


2024-11-27 23:21:44 - INFO - datasets.builder - Downloading and preparing dataset json/default (download: 1.11 MiB, generated: 1.03 MiB, post-processed: Unknown size, total: 2.15 MiB) to /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


Downloading took 0.0 min


2024-11-27 23:21:44 - INFO - datasets.download.download_manager - Downloading took 0.0 min


Checksum Computation took 0.0 min


2024-11-27 23:21:44 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min


Generating train split


2024-11-27 23:21:44 - INFO - datasets.builder - Generating train split


Generating train split:   0%|          | 0/2339 [00:00<?, ? examples/s]

All the splits matched successfully.


2024-11-27 23:21:44 - INFO - datasets.utils.info_utils - All the splits matched successfully.


Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


2024-11-27 23:21:44 - INFO - datasets.builder - Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


In [151]:
raw_train[18]

{'user': "In this model of chronic renal failure\nTreatment of Crohn's disease with fusidic acid: an antibiotic with immunosuppressive properties similar to cyclosporin.Fusidic acid is an antibiotic with T-cell specific immunosuppressive effects similar to those of cyclosporin.",
 'assistant': '[{"category": Disease, "entity": chronic renal failure}, {"category": Disease, "entity": Crohn\'s disease}, {"category": Chemical, "entity": fusidic acid}, {"category": Chemical, "entity": cyclosporin}, {"category": Chemical, "entity": cyclosporin}]'}

Tokenize input into correct format

Load the system prompt

In [155]:
with open(system_prompt_path, "r") as f:
    system_prompt = f.read()

In [156]:
system_prompt

'Please identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results strictly in JSON format, without any delimiters, following a similar structure to the example result provided.\nExample user input and assistant response:\nUser:\nNaloxone reverses the antihypertensive effect of clonidine.\nAssistant:\n[{"category": "Chemical", "entity": "Naloxone"}, {"category": "Chemical", "entity": "clonidine"}]'

In [157]:
def apply_chat_template(example, tokenizer):
    """
    Convert the system, input, and output fields into a formatted chat-like text.
    """
    # Combine the fields into a structured chat format
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": example["user"]},
        {"role": "assistant", "content": example["assistant"]}
    ]
    # Use the tokenizer's chat template to create formatted text
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return example

column_names = list(raw_train.features)

# Apply processing to each dataset
processed_train = raw_train.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)
processed_test = raw_test.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)
processed_dev = raw_dev.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)

Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00000_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00000_of_00010.arrow


Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00001_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00001_of_00010.arrow


Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00002_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00002_of_00010.arrow


Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00003_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00003_of_00010.arrow


Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00004_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00004_of_00010.arrow


Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00005_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00005_of_00010.arrow


Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00006_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00006_of_00010.arrow


Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00007_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00007_of_00010.arrow


Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00008_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00008_of_00010.arrow


Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00009_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00009_of_00010.arrow


Spawning 10 processes


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Spawning 10 processes


Applying chat template to train_sft (num_proc=10):   0%|          | 0/2331 [00:00<?, ? examples/s]

Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00000_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00000_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00001_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00001_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00002_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00002_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00003_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00003_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00004_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00004_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00005_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00005_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00006_of_00010.arrow


2024-11-27 23:25:34 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00006_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00007_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00007_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00008_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00008_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00009_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-39b1d5b4b4c2982b/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-3dbb71085a3838a9_00009_of_00010.arrow


Concatenating 10 shards


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Concatenating 10 shards


Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00000_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00000_of_00010.arrow


Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00001_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00001_of_00010.arrow


Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00002_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00002_of_00010.arrow


Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00003_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00003_of_00010.arrow


Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00004_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00004_of_00010.arrow


Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00005_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00005_of_00010.arrow


Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00006_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00006_of_00010.arrow


Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00007_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00007_of_00010.arrow


Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00008_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00008_of_00010.arrow


Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00009_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00009_of_00010.arrow


Spawning 10 processes


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Spawning 10 processes


Applying chat template to train_sft (num_proc=10):   0%|          | 0/2420 [00:00<?, ? examples/s]

Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00000_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00000_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00001_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00001_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00002_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00002_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00003_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00003_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00004_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00004_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00005_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00005_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00006_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00006_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00007_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00007_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00008_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00008_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00009_of_00010.arrow


2024-11-27 23:25:35 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-92ae54dd2cb89b78/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-d5b8c5c5ce03349c_00009_of_00010.arrow


Concatenating 10 shards


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Concatenating 10 shards


Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00000_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00000_of_00010.arrow


Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00001_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00001_of_00010.arrow


Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00002_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00002_of_00010.arrow


Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00003_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00003_of_00010.arrow


Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00004_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00004_of_00010.arrow


Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00005_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00005_of_00010.arrow


Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00006_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00006_of_00010.arrow


Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00007_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00007_of_00010.arrow


Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00008_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00008_of_00010.arrow


Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00009_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00009_of_00010.arrow


Spawning 10 processes


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Spawning 10 processes


Applying chat template to train_sft (num_proc=10):   0%|          | 0/2339 [00:00<?, ? examples/s]

Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00000_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00000_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00001_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00001_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00002_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00002_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00003_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00003_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00004_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00004_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00005_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00005_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00006_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00006_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00007_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00007_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00008_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00008_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00009_of_00010.arrow


2024-11-27 23:25:36 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-5867b7293cfc4e97/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-94fd2afff26524cc_00009_of_00010.arrow


Concatenating 10 shards


2024-11-27 23:25:37 - INFO - datasets.arrow_dataset - Concatenating 10 shards


In [158]:
processed_train[89]["text"]

'<|system|>\nPlease identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results strictly in JSON format, without any delimiters, following a similar structure to the example result provided.\nExample user input and assistant response:\nUser:\nNaloxone reverses the antihypertensive effect of clonidine.\nAssistant:\n[{"category": "Chemical", "entity": "Naloxone"}, {"category": "Chemical", "entity": "clonidine"}]<|end|>\n<|user|>\nThe results suggest that rigidity, which is assumed to be due to an action of morphine in the striatum, can be antagonized by another process leading to dopaminergic activation in the striatum.Nevertheless, there occurs some real tolerance to this effect.<|end|>\n<|assistant|>\

In [118]:
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train,
    eval_dataset=processed_dev,
    max_seq_length=4,
    dataset_text_field="text",
    tokenizer=tokenizer
)
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("train", metrics)
trainer.save_metrics("eval", metrics)
trainer.save_state()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
[INFO|training_args.py:2169] 2024-11-27 21:51:04,334 >> PyTorch: setting up devices


Map:   0%|          | 0/2061 [00:00<?, ? examples/s]

Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-8ed097bbc980fbd8/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-47008cd06c9cf839.arrow


2024-11-27 21:51:05 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-8ed097bbc980fbd8/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-47008cd06c9cf839.arrow


Map:   0%|          | 0/344 [00:00<?, ? examples/s]

Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-6ba14887e953e60f/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-924df7d47e0cc06c.arrow


2024-11-27 21:51:05 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-6ba14887e953e60f/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-924df7d47e0cc06c.arrow


[INFO|trainer.py:699] 2024-11-27 21:51:05,920 >> Using auto half precision backend
[INFO|trainer.py:2314] 2024-11-27 21:51:06,334 >> ***** Running training *****
[INFO|trainer.py:2315] 2024-11-27 21:51:06,335 >>   Num examples = 2,061
[INFO|trainer.py:2316] 2024-11-27 21:51:06,336 >>   Num Epochs = 1
[INFO|trainer.py:2317] 2024-11-27 21:51:06,336 >>   Instantaneous batch size per device = 1
[INFO|trainer.py:2320] 2024-11-27 21:51:06,337 >>   Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2321] 2024-11-27 21:51:06,338 >>   Gradient Accumulation steps = 4
[INFO|trainer.py:2322] 2024-11-27 21:51:06,338 >>   Total optimization steps = 20
[INFO|trainer.py:2323] 2024-11-27 21:51:06,341 >>   Number of trainable parameters = 25,165,824


Step,Training Loss,Validation Loss
20,0.0,0.0


[INFO|trainer.py:4128] 2024-11-27 21:51:37,888 >> 
***** Running Evaluation *****
[INFO|trainer.py:4130] 2024-11-27 21:51:37,889 >>   Num examples = 344
[INFO|trainer.py:4133] 2024-11-27 21:51:37,890 >>   Batch size = 1
[INFO|trainer.py:3812] 2024-11-27 21:52:05,043 >> Saving model checkpoint to ./checkpoint_dir/checkpoint-20
[INFO|configuration_utils.py:679] 2024-11-27 21:52:05,357 >> loading configuration file config.json from cache at /home/martinh2k3/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:746] 2024-11-27 21:52:05,360 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.

***** train metrics *****
  epoch                    =     0.0388
  total_flos               =     6701GF
  train_loss               =     1.7224
  train_runtime            = 0:00:59.53
  train_samples_per_second =      1.344
  train_steps_per_second   =      0.336
***** eval metrics *****
  epoch                    =     0.0388
  total_flos               =     6701GF
  train_loss               =     1.7224
  train_runtime            = 0:00:59.53
  train_samples_per_second =      1.344
  train_steps_per_second   =      0.336


In [119]:
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

[INFO|configuration_utils.py:414] 2024-11-27 22:22:09,232 >> Configuration saved in ./trained_model/config.json
[INFO|configuration_utils.py:865] 2024-11-27 22:22:09,234 >> Configuration saved in ./trained_model/generation_config.json
[INFO|modeling_utils.py:3043] 2024-11-27 22:22:19,441 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at ./trained_model/model.safetensors.index.json.
[INFO|tokenization_utils_base.py:2646] 2024-11-27 22:22:19,444 >> tokenizer config file saved in ./trained_model/tokenizer_config.json
[INFO|tokenization_utils_base.py:2655] 2024-11-27 22:22:19,446 >> Special tokens file saved in ./trained_model/special_tokens_map.json


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/tokenizer.model',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

In [121]:
def prepare_for_inference(user_input: str, system_prompt=None):
    if not system_prompt:
        system_prompt = """Please identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results strictly in JSON format, without any delimiters, following a similar structure to the example result provided.
Example user input and assistant response:
User:
A common human skin tumour is caused by activating mutations in beta-catenin.
Assistant:
[{ \"category\": \"DiseaseClass\", \"entity\": \"skin tumour\" }]"""
    prompt_data = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input}
    ]
    return  tokenizer.apply_chat_template(
        prompt_data, tokenize=False, add_generation_prompt="<|assistant|>" 
    )


In [130]:
processed_input = prepare_for_inference("We examined whether the WD gene ATP7B was also causative for CT by investigating the chromosomal co-localization of ATP7B and C04107, using fluorescence in situ hybridization (FISH).C04107 is an anonymous microsatellite marker closely linked to CT.")
processed_input

'<|system|>\nPlease identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results strictly in JSON format, without any delimiters, following a similar structure to the example result provided.\nExample user input and assistant response:\nUser:\nA common human skin tumour is caused by activating mutations in beta-catenin.\nAssistant:\n[{ "category": "DiseaseClass", "entity": "skin tumour" }]<|end|>\n<|user|>\nWe examined whether the WD gene ATP7B was also causative for CT by investigating the chromosomal co-localization of ATP7B and C04107, using fluorescence in situ hybridization (FISH).C04107 is an anonymous microsatellite marker closely linked to CT.<|end|>\n<|assistant|>\n'

In [123]:
inputs = tokenizer(processed_input, return_tensors="pt").to("cuda")
outputs = model.generate(inputs["input_ids"], max_new_tokens=1000, num_return_sequences=1, temperature=0.7)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)



Please identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results strictly in JSON format, without any delimiters, following a similar structure to the example result provided.
Example user input and assistant response:
User:
A common human skin tumour is caused by activating mutations in beta-catenin.
Assistant:
[{ "category": "DiseaseClass", "entity": "skin tumour" }] In most people with red-blood-cell glucose-6-phosphate dehydrogenase (G6PD) deficiency, the enzyme-deficient phenotype is only moderately expressed in nucleated cells. [{"category": "DiseaseClass", "entity": "red-blood-cell glucose-6-phosphate dehydrogenase (G6PD) deficiency"}, {"category": "Modifier", "entity": "nucleated cells"}]


In [131]:
from transformers import pipeline

nlp = pipeline("text-generation", model=model, tokenizer=tokenizer, device='cuda')
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.2,
}

output = nlp(processed_input, **generation_args)

In [133]:
output[0]["generated_text"]

' [\n  {\n    "category": "DiseaseClass",\n    "entity": "Wilson disease"\n  },\n  {\n    "category": "DiseaseClass",\n    "entity": "Congenital toxoplasmosis"\n  },\n  {\n    "category": "Gene",\n    "entity": "ATP7B"\n  },\n  {\n    "category": "Marker",\n    "entity": "C04107"\n  }\n]'

In [125]:
peft_model = peft.PeftModel.from_pretrained(model, "checkpoint_dir/checkpoint-20")
peft_pipeline = pipeline("text-generation", model=peft_model, tokenizer=tokenizer, device='cuda')

[ERROR|base.py:1149] 2024-11-27 22:23:52,393 >> The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2F

In [127]:
output = peft_pipeline(processed_input, **generation_args)
output[0]["generated_text"]

' [{"category": "DiseaseClass", "entity": "red-blood-cell glucose-6-phosphate dehydrogenase (G6PD) deficiency"}, {"category": "Modifier", "entity": "nucleated cells"}]'