In [7]:
import sys
import logging

import datasets
from datasets import load_dataset
from peft import LoraConfig
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

In [4]:
training_config = {
    "bf16": True,
    "do_eval": False,
    "learning_rate": 5.0e-06,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "max_steps": 120,
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 4,
    "per_device_train_batch_size": 4,
    "remove_unused_columns": True,
    "save_steps": 100,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.2,
    }

In [5]:
peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)

In [8]:
logger = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process a small summary
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")


2024-11-21 18:11:56 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_st

In [66]:
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map=None
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

[INFO|configuration_utils.py:679] 2024-11-21 20:57:10,500 >> loading configuration file config.json from cache at /home/martinh2k3/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:679] 2024-11-21 20:57:10,691 >> loading configuration file config.json from cache at /home/martinh2k3/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:746] 2024-11-21 20:57:10,693 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4800] 2024-11-21 20:57:11,479 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4808] 2024-11-21 20:57:11,480 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
[INFO|configuration_utils.py:1051] 2024-11-21 20:57:11,654 >> loading configuration file generation_config.json from cache at /home/martinh2k3/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/generation_config.json
[INFO|configuration_utils.py:1096] 2024-11-21 20:57:11,655 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}

[INFO|tokenization_utils_base.py:2211] 2024-11-21 20:5

Load datasets:

In [127]:
raw_train = datasets.load_dataset("json", data_files="./data/NCBItrainset_corpus.json", download_mode="force_redownload")["train"]
raw_test = datasets.load_dataset("json", data_files="./data/NCBItestset_corpus.json", download_mode="force_redownload")["train"]
raw_dev = datasets.load_dataset("json", data_files="./data/NCBIdevelopset_corpus.json", download_mode="force_redownload")["train"]

Using custom data configuration default-a428603bbc82149e


2024-11-21 22:58:05 - INFO - datasets.builder - Using custom data configuration default-a428603bbc82149e


Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


2024-11-21 22:58:05 - INFO - datasets.info - Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


2024-11-21 22:58:05 - INFO - datasets.builder - Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


Downloading and preparing dataset json/default to /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


2024-11-21 22:58:05 - INFO - datasets.builder - Downloading and preparing dataset json/default to /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


Downloading took 0.0 min


2024-11-21 22:58:05 - INFO - datasets.download.download_manager - Downloading took 0.0 min


Checksum Computation took 0.0 min


2024-11-21 22:58:05 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min


Generating train split


2024-11-21 22:58:05 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-11-21 22:58:05 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


2024-11-21 22:58:05 - INFO - datasets.builder - Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


Using custom data configuration default-716ccb4327edc3cc


2024-11-21 22:58:05 - INFO - datasets.builder - Using custom data configuration default-716ccb4327edc3cc


Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


2024-11-21 22:58:05 - INFO - datasets.info - Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


2024-11-21 22:58:05 - INFO - datasets.builder - Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


Downloading and preparing dataset json/default to /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


2024-11-21 22:58:05 - INFO - datasets.builder - Downloading and preparing dataset json/default to /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


Downloading took 0.0 min


2024-11-21 22:58:05 - INFO - datasets.download.download_manager - Downloading took 0.0 min


Checksum Computation took 0.0 min


2024-11-21 22:58:05 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min


Generating train split


2024-11-21 22:58:05 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-11-21 22:58:05 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


2024-11-21 22:58:05 - INFO - datasets.builder - Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


Using custom data configuration default-0d3ee0f7bc793fac


2024-11-21 22:58:05 - INFO - datasets.builder - Using custom data configuration default-0d3ee0f7bc793fac


Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


2024-11-21 22:58:05 - INFO - datasets.info - Loading Dataset Infos from /home/martinh2k3/anaconda3/envs/bp/lib/python3.12/site-packages/datasets/packaged_modules/json


Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


2024-11-21 22:58:05 - INFO - datasets.builder - Generating dataset json (/home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092)


Downloading and preparing dataset json/default to /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


2024-11-21 22:58:05 - INFO - datasets.builder - Downloading and preparing dataset json/default to /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092...


Downloading took 0.0 min


2024-11-21 22:58:05 - INFO - datasets.download.download_manager - Downloading took 0.0 min


Checksum Computation took 0.0 min


2024-11-21 22:58:05 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min


Generating train split


2024-11-21 22:58:05 - INFO - datasets.builder - Generating train split


Generating train split: 0 examples [00:00, ? examples/s]

Unable to verify splits sizes.


2024-11-21 22:58:05 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.


Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


2024-11-21 22:58:05 - INFO - datasets.builder - Dataset json downloaded and prepared to /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092. Subsequent calls will reuse this data.


In [128]:
raw_train[0]

{'system': 'Please identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results in JSON format, following a similar structure to the example result provided.\n\n    Example sentence and results:\n    "A common human skin tumour is caused by activating mutations in beta-catenin."\n\n    "\\"Results\\": [\n        { \\"category\\": \\"DiseaseClass\\", \\"entity\\": \\"skin tumour\\" }\n    ]"\n    ',
 'user': 'A common human skin tumour is caused by activating mutations in beta-catenin.WNT signalling orchestrates a number of developmental programs.In response to this stimulus, cytoplasmic beta-catenin (encoded by CTNNB1) is stabilized, enabling downstream transcriptional activation by members of the LEF/

Tokenize input into correct format

In [129]:
def apply_chat_template(example, tokenizer):
    """
    Convert the system, input, and output fields into a formatted chat-like text.
    """
    # Combine the fields into a structured chat format
    messages = [
        {"role": "system", "content": example["system"] if example["system"] else ""},
        {"role": "user", "content": example["user"] if example["user"] else ""},
        {"role": "assistant", "content": example["assistant"] if example["assistant"] else ""}
    ]
    # Use the tokenizer's chat template to create formatted text
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return example

column_names = list(raw_train.features)

# Apply processing to each dataset
processed_train = raw_train.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)
processed_test = raw_test.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)
processed_dev = raw_dev.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)

Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00000_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00000_of_00010.arrow


Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00001_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00001_of_00010.arrow


Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00002_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00002_of_00010.arrow


Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00003_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00003_of_00010.arrow


Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00004_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00004_of_00010.arrow


Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00005_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00005_of_00010.arrow


Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00006_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00006_of_00010.arrow


Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00007_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00007_of_00010.arrow


Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00008_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00008_of_00010.arrow


Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00009_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00009_of_00010.arrow


Spawning 10 processes


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Spawning 10 processes


Applying chat template to train_sft (num_proc=10):   0%|          | 0/2061 [00:00<?, ? examples/s]

Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00000_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00000_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00001_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00001_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00002_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00002_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00003_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00003_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00004_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00004_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00005_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00005_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00006_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00006_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00007_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00007_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00008_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00008_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00009_of_00010.arrow


2024-11-21 22:58:18 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-2991b260959a2bd0_00009_of_00010.arrow


Concatenating 10 shards


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Concatenating 10 shards


Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00000_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00000_of_00010.arrow


Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00001_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00001_of_00010.arrow


Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00002_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00002_of_00010.arrow


Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00003_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00003_of_00010.arrow


Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00004_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00004_of_00010.arrow


Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00005_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00005_of_00010.arrow


Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00006_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00006_of_00010.arrow


Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00007_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00007_of_00010.arrow


Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00008_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00008_of_00010.arrow


Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00009_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00009_of_00010.arrow


Spawning 10 processes


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Spawning 10 processes


Applying chat template to train_sft (num_proc=10):   0%|          | 0/382 [00:00<?, ? examples/s]

Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00000_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00000_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00001_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00001_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00002_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00002_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00003_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00003_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00004_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00004_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00005_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00005_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00006_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00006_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00007_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00007_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00009_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00009_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00008_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-9a7294531412b931_00008_of_00010.arrow


Concatenating 10 shards


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Concatenating 10 shards


Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00000_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #0 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00000_of_00010.arrow


Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00001_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #1 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00001_of_00010.arrow


Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00002_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #2 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00002_of_00010.arrow


Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00003_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #3 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00003_of_00010.arrow


Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00004_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #4 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00004_of_00010.arrow


Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00005_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #5 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00005_of_00010.arrow


Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00006_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #6 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00006_of_00010.arrow


Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00007_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #7 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00007_of_00010.arrow


Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00008_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #8 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00008_of_00010.arrow


Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00009_of_00010.arrow


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Process #9 will write at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00009_of_00010.arrow


Spawning 10 processes


2024-11-21 22:58:19 - INFO - datasets.arrow_dataset - Spawning 10 processes


Applying chat template to train_sft (num_proc=10):   0%|          | 0/344 [00:00<?, ? examples/s]

Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00000_of_00010.arrow


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00000_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00001_of_00010.arrow


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00001_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00002_of_00010.arrow


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00002_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00003_of_00010.arrow


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00003_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00004_of_00010.arrow


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00004_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00005_of_00010.arrow


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00005_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00006_of_00010.arrow


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00006_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00007_of_00010.arrow


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00007_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00008_of_00010.arrow


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00008_of_00010.arrow


Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00009_of_00010.arrow


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-0d3ee0f7bc793fac/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-73136685d4784318_00009_of_00010.arrow


Concatenating 10 shards


2024-11-21 22:58:20 - INFO - datasets.arrow_dataset - Concatenating 10 shards


In [131]:
processed_train[89]["text"]

'<|system|>\nPlease identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results in JSON format, following a similar structure to the example result provided.\n\n    Example sentence and results:\n    "A common human skin tumour is caused by activating mutations in beta-catenin."\n\n    "\\"Results\\": [\n        { \\"category\\": \\"DiseaseClass\\", \\"entity\\": \\"skin tumour\\" }\n    ]"\n    <|end|>\n<|user|>\nFabry disease (FD) (angiokeratoma corporis diffusum) is an X-linked inborn error of glycosphingolipid metabolism caused by defects in the lysosomal alpha-galactosidase A gene (GLA).The enzymatic defect leads to the systemic accumulation of neutral glycosphingolipids with terminal alpha-galac

In [132]:
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train,
    eval_dataset=processed_test,
    max_seq_length=4,
    dataset_text_field="text",
    tokenizer=tokenizer
)
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
[INFO|training_args.py:2169] 2024-11-21 22:58:48,645 >> PyTorch: setting up devices


Map:   0%|          | 0/2061 [00:00<?, ? examples/s]

Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-4cb83d9e7887f779.arrow


2024-11-21 22:58:50 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-a428603bbc82149e/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-4cb83d9e7887f779.arrow


Map:   0%|          | 0/382 [00:00<?, ? examples/s]

Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-890c869d430509b9.arrow


2024-11-21 22:58:51 - INFO - datasets.arrow_dataset - Caching processed dataset at /home/martinh2k3/.cache/huggingface/datasets/json/default-716ccb4327edc3cc/0.0.0/f4e89e8750d5d5ffbef2c078bf0ddfedef29dc2faff52a6255cf513c05eb1092/cache-890c869d430509b9.arrow


[INFO|trainer.py:699] 2024-11-21 22:58:51,185 >> Using auto half precision backend
[INFO|trainer.py:2314] 2024-11-21 22:58:52,152 >> ***** Running training *****
[INFO|trainer.py:2315] 2024-11-21 22:58:52,153 >>   Num examples = 2,061
[INFO|trainer.py:2316] 2024-11-21 22:58:52,154 >>   Num Epochs = 1
[INFO|trainer.py:2317] 2024-11-21 22:58:52,155 >>   Instantaneous batch size per device = 4
[INFO|trainer.py:2320] 2024-11-21 22:58:52,155 >>   Total train batch size (w. parallel, distributed & accumulation) = 4
[INFO|trainer.py:2321] 2024-11-21 22:58:52,156 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:2322] 2024-11-21 22:58:52,157 >>   Total optimization steps = 516
[INFO|trainer.py:2323] 2024-11-21 22:58:52,160 >>   Number of trainable parameters = 25,165,824


Step,Training Loss
20,5.5203
40,5.2206
60,4.1472
80,2.0951
100,0.5132
120,0.0018
140,0.0001
160,0.0
180,0.0
200,0.0


[INFO|trainer.py:3812] 2024-11-21 22:59:29,811 >> Saving model checkpoint to ./checkpoint_dir/checkpoint-100
[INFO|configuration_utils.py:679] 2024-11-21 22:59:30,240 >> loading configuration file config.json from cache at /home/martinh2k3/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:746] 2024-11-21 22:59:30,244 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embedding

***** train metrics *****
  epoch                    =        1.0
  total_flos               =   172647GF
  train_loss               =     0.6782
  train_runtime            = 0:03:15.17
  train_samples_per_second =      10.56
  train_steps_per_second   =      2.644


In [64]:
print(model.config.attention_type)

AttributeError: 'Phi3Config' object has no attribute 'attention_type'

In [65]:
print(model.config)

Phi3Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 4096,
  "model_type": "phi3",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "pad_token_id": 32000,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 2047,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_versi

In [72]:
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

[INFO|configuration_utils.py:414] 2024-11-21 21:16:13,310 >> Configuration saved in ./trained_model/config.json
[INFO|configuration_utils.py:865] 2024-11-21 21:16:13,314 >> Configuration saved in ./trained_model/generation_config.json
[INFO|modeling_utils.py:3043] 2024-11-21 21:16:21,510 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at ./trained_model/model.safetensors.index.json.
[INFO|tokenization_utils_base.py:2646] 2024-11-21 21:16:21,512 >> tokenizer config file saved in ./trained_model/tokenizer_config.json
[INFO|tokenization_utils_base.py:2655] 2024-11-21 21:16:21,513 >> Special tokens file saved in ./trained_model/special_tokens_map.json


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/tokenizer.json')

In [133]:
input_text = raw_train[0]['system'][:-1]+"\n"+"Somatic-cell selection is a major determinant of the blood-cell phenotype in heterozygotes for glucose-6-phosphate dehydrogenase mutations causing severe enzyme deficiency.X-chromosome inactivation in mammals is regarded as an essentially random process, but the resulting somatic-cell mosaicism creates the opportunity for cell selection."

In [136]:
def prepare_for_inference(user_input: str, system_prompt=None):
    if not system_prompt:
        system_prompt = "Please identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results in JSON format, following a similar structure to the example result provided.\n\nExample sentence and results:\n\"A common human skin tumour is caused by activating mutations in beta-catenin.\"\n\n\"Results\": [\n{ \"category\": \"DiseaseClass\", \"entity\": \"skin tumour\" }\n]\n"
    prompt_data = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input}
    ]
    return  tokenizer.apply_chat_template(
        prompt_data, tokenize=False, add_generation_prompt="<|assistant|>" 
    )


In [137]:
processed_input = prepare_for_inference("Somatic-cell selection is a major determinant of the blood-cell phenotype in heterozygotes for glucose-6-phosphate dehydrogenase mutations causing severe enzyme deficiency.X-chromosome inactivation in mammals is regarded as an essentially random process, but the resulting somatic-cell mosaicism creates the opportunity for cell selection.")
processed_input

'<|system|>\nPlease identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results in JSON format, following a similar structure to the example result provided.\n\nExample sentence and results:\n"A common human skin tumour is caused by activating mutations in beta-catenin."\n\n"Results": [\n{ "category": "DiseaseClass", "entity": "skin tumour" }\n]\n<|end|>\n<|user|>\nSomatic-cell selection is a major determinant of the blood-cell phenotype in heterozygotes for glucose-6-phosphate dehydrogenase mutations causing severe enzyme deficiency.X-chromosome inactivation in mammals is regarded as an essentially random process, but the resulting somatic-cell mosaicism creates the opportunity for cell selection.<|e

In [139]:
inputs = tokenizer(processed_input, return_tensors="pt").to("cuda")
outputs = model.generate(inputs["input_ids"], max_new_tokens=1000, num_return_sequences=1, temperature=0.7)

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Please identify all the named entities mentioned in the input sentence provided below. Use only the categories: SpecificDisease, DiseaseClass, CompositeMention, and Modifier. Remember, some terms might refer to broader disease classes, while others are specific diseases or composite mentions involving multiple diseases. You should only output the results in JSON format, following a similar structure to the example result provided.

Example sentence and results:
"A common human skin tumour is caused by activating mutations in beta-catenin."

"Results": [
{ "category": "DiseaseClass", "entity": "skin tumour" }
]
 Somatic-cell selection is a major determinant of the blood-cell phenotype in heterozygotes for glucose-6-phosphate dehydrogenase mutations causing severe enzyme deficiency.X-chromosome inactivation in mammals is regarded as an essentially random process, but the resulting somatic-cell mosaicism creates the opportunity for cell selection. "Results": [
{ "category": "DiseaseClass"