In [1]:
import os
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)

from trl import SFTTrainer, SFTConfig

from src.utils.config_loader import load_config

# Restrict this process to GPU 0.
# NOTE(review): this assignment runs after `import torch`; confirm it still
# takes effect before CUDA is first initialised in this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### Set up file paths

In [2]:
# Project configuration; the dataset paths, system prompt path and base model
# name used below are all read from it.
config = load_config()

In [3]:
# Locations of the train / test / dev JSON files, as listed in the config.
train_dataset_path, test_dataset_path, dev_dataset_path = (
    config.get(key)
    for key in ("train_dataset_path", "test_dataset_path", "dev_dataset_path")
)

In [4]:
# File holding the system prompt text that is prepended to every conversation.
system_prompt_path = config.get("system_prompt_path")

In [5]:
# Directory where the trainer writes checkpoints.
# NOTE(review): hard-coded scratch directory; the config lookup is commented
# out — confirm which location is intended before a real run.
checkpoint_path = "../temp_checkpoint_dir"  # config.get("checkpoint_path")

### Load datasets and system prompt:

In [6]:
# Each JSON file is loaded as its own dataset; `load_dataset` exposes a lone
# data file under the "train" key, hence the indexing for all three splits.
raw_train, raw_test, raw_dev = (
    load_dataset("json", data_files=path, download_mode="force_redownload")["train"]
    for path in (train_dataset_path, test_dataset_path, dev_dataset_path)
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

### Load model and tokenizer
We need to load them before processing the data, because the tokenizer is used to format the data.

In [7]:
# Base checkpoint to fine-tune, taken from the project configuration.
model_path = config.get("main_model")

# 4-bit NF4 quantisation with double quantisation and a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantised base model, placed on the available device(s) automatically.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Tokenizer belonging to the same checkpoint; it also provides the chat template.
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Run PEFT's k-bit training preparation on the quantised model before the
# LoRA adapters are attached by the trainer.
model = prepare_model_for_kbit_training(model)

### Change the dataset format to chat-like text

In [9]:
# Read the system prompt with an explicit encoding so the text does not depend
# on the platform's default locale.
with open(system_prompt_path, encoding="utf-8") as f:
    system_prompt = f.read()

In [10]:
def apply_chat_template(sample, tokenizer, include_response=True, system_message=None):
    """Render one dataset row as chat-formatted text and tokenize it.

    Args:
        sample: Row with a ``"user"`` field and, when ``include_response`` is
            true, an ``"assistant"`` field.
        tokenizer: Object exposing ``apply_chat_template`` and ``__call__``.
        include_response: Append the assistant turn (training / evaluation
            rows). When false, the text is rendered with the generation prompt
            so it ends where the assistant is expected to start answering.
        system_message: System prompt to use. ``None`` falls back to the
            notebook-level ``system_prompt``; an empty string omits the
            system turn.

    Returns:
        The result of calling ``tokenizer`` on the rendered chat text.
    """
    if system_message is None:
        # Default to the prompt loaded earlier in the notebook.
        system_message = system_prompt

    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": sample["user"]})
    if include_response:
        messages.append({"role": "assistant", "content": sample["assistant"]})

    # Without an assistant turn the template must emit the generation prompt,
    # otherwise the rendered text stops right after the user message.
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=not include_response
    )
    return tokenizer(text, padding=True, truncation=True)

In [11]:
# Apply processing to each dataset
# Chat-format and tokenize every split with the same function and tokenizer.
processed_train, processed_test, processed_dev = (
    split.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})
    for split in (raw_train, raw_test, raw_dev)
)

Map:   0%|          | 0/2332 [00:00<?, ? examples/s]

Map:   0%|          | 0/2422 [00:00<?, ? examples/s]

Map:   0%|          | 0/2341 [00:00<?, ? examples/s]

### Train

In [12]:
# LoRA adapter settings for causal-LM fine-tuning: rank 16 with alpha 16,
# 5% dropout, no bias parameters trained.
# NOTE(review): "base_layer" is listed alongside the projection module names —
# confirm it is really meant to be a LoRA target.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["gate_up_proj", "base_layer", "down_proj", "qkv_proj", "o_proj"],
)

In [13]:
# Hyper-parameters for the supervised fine-tuning run.
training_args = SFTConfig(
    output_dir=checkpoint_path,
    num_train_epochs=2.5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 samples per optimizer step per device
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    optim="adamw_8bit",
    bf16=True,
    # NOTE(review): a 4-token limit looks like a placeholder or typo — confirm
    # the intended maximum sequence length.
    max_seq_length=4,
    logging_steps=10,
    save_strategy="epoch",
    eval_accumulation_steps=50,
)

# Train on the processed train split, evaluate on the dev split; the LoRA
# configuration is handed to the trainer together with the base model.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_train,
    eval_dataset=processed_dev,
    peft_config=peft_config,
    tokenizer=tokenizer,
)
trainer.train()

[2025-04-10 21:08:49,298] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/martinh2k3/anaconda3/envs/bp/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/martinh2k3/a

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss
10,2.1641
20,1.7526
30,1.085
40,0.683
50,0.5931
60,0.5739
70,0.5739
80,0.5651
90,0.5578
100,0.5669


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=363, training_loss=0.6297916853723448, metrics={'train_runtime': 1363.6046, 'train_samples_per_second': 4.275, 'train_steps_per_second': 0.266, 'total_flos': 3.925345895261798e+16, 'train_loss': 0.6297916853723448, 'epoch': 2.490566037735849})

In [14]:
# Evaluate on the dataset passed as `eval_dataset` (the processed dev split).
trainer.evaluate()

{'eval_loss': 0.5289265513420105,
 'eval_runtime': 183.3295,
 'eval_samples_per_second': 12.769,
 'eval_steps_per_second': 1.598,
 'epoch': 2.490566037735849}


{'eval_loss': 0.4607459306716919,
 'eval_runtime': 308.9566,
 'eval_samples_per_second': 7.577,
 'eval_steps_per_second': 0.948,
 'epoch': 3.979416809605489}

In [20]:
# Override the eval_accumulation_steps=50 given at construction for the next
# evaluation run.
# NOTE(review): presumably done to lower GPU memory use during evaluation — confirm.
trainer.args.eval_accumulation_steps=1

In [21]:
# Release cached, unused GPU memory held by PyTorch before evaluating again.
torch.cuda.empty_cache()

In [22]:
# Re-run evaluation after changing eval_accumulation_steps and clearing the cache.
trainer.evaluate()

{'eval_loss': 0.4607459306716919,
 'eval_runtime': 295.9207,
 'eval_samples_per_second': 7.911,
 'eval_steps_per_second': 0.99,
 'epoch': 3.979416809605489}

In [16]:
# Save through the trainer so the PEFT-wrapped model it trained is written
# out (LoRA adapter weights and config). `model` here is the base model object
# that was handed to SFTTrainer, not the PEFT wrapper, so calling
# `model.save_pretrained` does not produce a loadable adapter checkpoint.
trainer.save_model(checkpoint_path)

### Inference

In [17]:
# Adapter checkpoint written by the trainer that is used for inference.
adapter_path = f"{checkpoint_path}/checkpoint-145"

# Kept under its own name so the project `config` loaded at the top of the
# notebook is not overwritten (earlier cells still call `config.get(...)`).
adapter_config = PeftConfig.from_pretrained(adapter_path)

# Reload the quantised base model the adapter was trained on.
model = AutoModelForCausalLM.from_pretrained(
    adapter_config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(adapter_config.base_model_name_or_path)
# Use the unknown token as the padding token for generation.
tokenizer.pad_token = tokenizer.unk_token

# Attach the trained LoRA adapter to the freshly loaded base model.
model = PeftModel.from_pretrained(model, adapter_path)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
def prepare_for_inference(user_input: str, system_prompt: str = system_prompt) -> str:
    """Build the chat-formatted prompt string for a single user message.

    Args:
        user_input: Text of the user turn.
        system_prompt: System prompt to prepend; an empty string omits the
            system turn. Defaults to the notebook-level prompt as it was when
            this function was defined.

    Returns:
        The rendered (untokenized) prompt, ending with the generation prompt
        so the model continues as the assistant.
    """
    prompt_data = []
    if system_prompt:
        prompt_data.append({"role": "system", "content": system_prompt})
    prompt_data.append({"role": "user", "content": user_input})
    # Call the public chat-template API rather than a private tokenizer method.
    return tokenizer.apply_chat_template(
        prompt_data, tokenize=False, add_generation_prompt=True
    )

In [19]:
# Example input paragraph used to try out the fine-tuned model.
sentence: str = "A random paragraph can also be an excellent way for a writer to tackle writers' block. Writing block can often happen due to being stuck with a current project that the writer is trying to complete. By inserting a completely random paragraph from which to begin, it can take down some of the issues that may have been causing the writers' block in the first place. Another productive way, other than using xanax, to use this tool to begin a daily writing routine."

In [20]:
# Keyword arguments forwarded to the text-generation pipeline call:
# cap the output at 150 new tokens and return only the generated continuation.
generation_args = dict(max_new_tokens=150, return_full_text=False)

In [23]:
# Wrap the adapter-augmented model in a text-generation pipeline, render the
# example sentence as a chat prompt, and print the model's continuation.
peft_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = prepare_for_inference(sentence)
output = peft_pipeline(prompt, **generation_args)
print(output[0]["generated_text"])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausa

 [{"category": "Chemical", "entity": "xanax"}]
