In [1]:
import os
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)

from trl import SFTTrainer, SFTConfig

from src.utils.config_loader import load_config

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### Set up file paths

In [2]:
config = load_config("nuextract_config.json")

In [3]:
train_dataset_path = config.get("train_dataset_path")
test_dataset_path = config.get("test_dataset_path")
dev_dataset_path = config.get("dev_dataset_path")

In [4]:
system_prompt_path = config.get("system_prompt_path")

In [5]:
checkpoint_path = config.get("checkpoint_path")

### Load datasets and system prompt:

In [6]:
raw_train = load_dataset("json", data_files=train_dataset_path, download_mode="force_redownload")["train"]
raw_test = load_dataset("json", data_files=test_dataset_path, download_mode="force_redownload")["train"]
raw_dev = load_dataset("json", data_files=dev_dataset_path, download_mode="force_redownload")["train"]

Generating train split:   0%|          | 0/2332 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/2422 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/2341 [00:00<?, ? examples/s]

### Load model and tokenizer
We need to load it before processing the data, as we are going to use the tokenizer to format the data.

In [7]:
model_path = config.get("model")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
model = prepare_model_for_kbit_training(model)

### Change the dataset format to chat-like text

In [9]:
system_prompt = config.get("system_prompt")

In [10]:
def apply_chat_template(sample, tokenizer, include_response=True):
    message = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sample["user"]}
    ]
    if include_response:
        message.append({"role": "assistant", "content": sample["assistant"]})

    # Use the tokenizer's chat template to create formatted text
    message = tokenizer.apply_chat_template(
        message, tokenize=False, add_generation_prompt=False
    )
    return tokenizer(message, padding=True, truncation=True)

In [12]:
# Apply processing to each dataset
processed_train = raw_train.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
)
processed_test = raw_test.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
)
processed_dev = raw_dev.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
)

Map:   0%|          | 0/2332 [00:00<?, ? examples/s]

Map:   0%|          | 0/2422 [00:00<?, ? examples/s]

Map:   0%|          | 0/2341 [00:00<?, ? examples/s]

### Train

In [17]:
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        print(name)

model.layers.0.self_attn.o_proj
model.layers.0.self_attn.qkv_proj
model.layers.0.mlp.gate_up_proj
model.layers.0.mlp.down_proj
model.layers.1.self_attn.o_proj
model.layers.1.self_attn.qkv_proj
model.layers.1.mlp.gate_up_proj
model.layers.1.mlp.down_proj
model.layers.2.self_attn.o_proj
model.layers.2.self_attn.qkv_proj
model.layers.2.mlp.gate_up_proj
model.layers.2.mlp.down_proj
model.layers.3.self_attn.o_proj
model.layers.3.self_attn.qkv_proj
model.layers.3.mlp.gate_up_proj
model.layers.3.mlp.down_proj
model.layers.4.self_attn.o_proj
model.layers.4.self_attn.qkv_proj
model.layers.4.mlp.gate_up_proj
model.layers.4.mlp.down_proj
model.layers.5.self_attn.o_proj
model.layers.5.self_attn.qkv_proj
model.layers.5.mlp.gate_up_proj
model.layers.5.mlp.down_proj
model.layers.6.self_attn.o_proj
model.layers.6.self_attn.qkv_proj
model.layers.6.mlp.gate_up_proj
model.layers.6.mlp.down_proj
model.layers.7.self_attn.o_proj
model.layers.7.self_attn.qkv_proj
model.layers.7.mlp.gate_up_proj
model.layers.

In [13]:
peft_config = LoraConfig(
    r=4,
    lora_alpha=32,
    target_modules=['gate_up_proj', 'down_proj', 'qkv_proj', 'o_proj'],
    lora_dropout=0.3935,
    bias="none",
    task_type="CAUSAL_LM"
)

In [14]:
trainer = SFTTrainer(
    model=model,
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=2,
        learning_rate=0.0009371,
        max_seq_length=4,
        bf16=True,
        optim="adamw_8bit",
        lr_scheduler_type="cosine",
        warmup_ratio=0.5784,
        logging_steps=20,
        save_strategy="epoch",
        output_dir=checkpoint_path,
        eval_accumulation_steps=30,
    ),
    train_dataset=processed_train,#processed_long_train,
    eval_dataset=processed_dev,
    # eval_steps=200,
    peft_config=peft_config,
    tokenizer=tokenizer
)
trainer.train()



[2025-04-26 16:57:09,024] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/martinh2k3/a

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss
20,2.2367
40,1.03
60,0.6399
80,0.6205
100,0.6217
120,0.6002
140,0.6086
160,0.6011
180,0.5965
200,0.6066


  return fn(*args, **kwargs)


TrainOutput(global_step=582, training_loss=0.6431682560452071, metrics={'train_runtime': 1475.8346, 'train_samples_per_second': 3.16, 'train_steps_per_second': 0.394, 'total_flos': 2.72605633403904e+16, 'train_loss': 0.6431682560452071, 'epoch': 1.9965694682675816})

In [20]:
torch.cuda.empty_cache()

In [21]:
trainer.evaluate()

{'eval_loss': 0.5881975293159485,
 'eval_runtime': 211.2715,
 'eval_samples_per_second': 11.081,
 'eval_steps_per_second': 1.387,
 'epoch': 2.490566037735849}


{'eval_loss': 0.4607459306716919,
 'eval_runtime': 308.9566,
 'eval_samples_per_second': 7.577,
 'eval_steps_per_second': 0.948,
 'epoch': 3.979416809605489}

In [22]:
trainer.evaluate()

{'eval_loss': 0.4607459306716919,
 'eval_runtime': 295.9207,
 'eval_samples_per_second': 7.911,
 'eval_steps_per_second': 0.99,
 'epoch': 3.979416809605489}

In [16]:
config

{'model': 'numind/NuExtract-1.5',
 'train_dataset_path': '/home/martinh2k3/PycharmProjects/bp_wsl/data/CDR_TrainingSetNE.json',
 'test_dataset_path': '/home/martinh2k3/PycharmProjects/bp_wsl/data/CDR_TestSetNE.json',
 'dev_dataset_path': '/home/martinh2k3/PycharmProjects/bp_wsl/data/CDR_DevelopmentSetNE.json',
 'model_dir_path': '/home/martinh2k3/PycharmProjects/bp_wsl/models/',
 'system_prompt': '# Role:\nExtract entities based on template. Don\'t include reasoning.\n# Template:\n{"Chemicals": [], "Diseases": []}\n# Example:\n## Input:\nHowever, studies into the dose necessary to combating scopolamine. Effects of uninephrectomy and high protein feeding on lithium-induced chronic renal failure in rats.\n## Output:\n{"Chemicals": ["scopolamine", "lithium"], "Diseases": ["chronic renal failure"]}',
 'checkpoint_path': '/home/martinh2k3/PycharmProjects/bp_wsl/checkpoint_dir/',
 'result_separator': '\n$SEP$\n',
 'litellm_url': 'http://147.175.151.44/',
 'deepseek_url': 'https://api.deepsee

In [17]:
trainer.save_model(config.get("model_dir_path") + "/nuextract")