In [1]:
import sys
import logging

import datasets
import peft
from datasets import load_dataset
from peft import LoraConfig, PeftModel
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
import wandb

#### Set up paths to datasets

In [4]:
training_dataset_path = "data/CDR_TrainingSet.json"
test_dataset_path = "data/CDR_TestSet.json"
dev_dataset_path = "data/CDR_DevelopmentSet.json"

In [17]:
system_prompt_path = "data/system_prompt.txt"

#### Load datasets and system prompt:

In [6]:
raw_train = datasets.load_dataset("json", data_files=training_dataset_path, download_mode="force_redownload")["train"]
raw_test = datasets.load_dataset("json", data_files=test_dataset_path, download_mode="force_redownload")["train"]
raw_dev = datasets.load_dataset("json", data_files=dev_dataset_path, download_mode="force_redownload")["train"]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [18]:
with open(system_prompt_path, "r") as f:
    system_prompt = f.read()

#### Load model and tokenizer
We need to load it before processing the data, as we are going to use the tokenizer to format the data.

In [8]:
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map=None
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Change the dataset format to chat-like text

In [9]:
def apply_chat_template(example, tokenizer):
    """
    Convert the system, input, and output fields into a formatted chat-like text.
    """
    # Combine the fields into a structured chat format
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": example["user"]},
        {"role": "assistant", "content": example["assistant"]}
    ]
    # Use the tokenizer's chat template to create formatted text
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return example

column_names = list(raw_train.features)

# Apply processing to each dataset
processed_train = raw_train.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=column_names,
)
processed_test = raw_test.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=column_names,
)
processed_dev = raw_dev.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=column_names,
)

Map:   0%|          | 0/2331 [00:00<?, ? examples/s]

Map:   0%|          | 0/2420 [00:00<?, ? examples/s]

Map:   0%|          | 0/2339 [00:00<?, ? examples/s]

In [16]:
tokens = tokenizer(processed_train[89]["text"])
tokens = [tokenizer.decode(token) for token in tokens["input_ids"]]
print(tokens)

['<|system|>', 'Please', 'identify', 'all', 'the', 'named', 'entities', 'mentioned', 'in', 'the', 'input', 'sentence', 'provided', 'below', '.', 'The', 'entities', 'may', 'have', 'category', '"', 'D', 'ise', 'ase', '"', 'or', '"', 'Ch', 'em', 'ical', '".', 'Use', '**', 'ON', 'LY', '**', 'the', 'categories', '"', 'Ch', 'em', 'ical', '"', 'or', '"', 'D', 'ise', 'ase', '".', 'Do', 'not', 'include', 'any', 'other', 'categories', '.', 'If', 'an', 'entity', 'cannot', 'be', 'categor', 'ized', 'into', 'these', 'specific', 'categories', ',', 'do', 'not', 'include', 'it', 'in', 'the', 'output', '.', '\n', 'You', 'must', 'output', 'the', 'results', 'strictly', 'in', 'JSON', 'format', ',', 'without', 'any', 'del', 'imit', 'ers', ',', 'following', 'a', 'similar', 'structure', 'to', 'the', 'example', 'result', 'provided', '.', '\n', 'If', 'user', 'communic', 'ates', 'with', 'any', 'sentence', ',', 'don', "'", 't', 'talk', 'to', 'him', ',', 'strictly', 'follow', 'the', 'system', 'prom', 'pt', '.', '\

In [83]:
training_config = {
    "fp16": True,
    "do_eval": False,
    # "evaluation_strategy": "steps",
    # "eval_steps": 20,
    "learning_rate": 1e-5,
    "logging_steps": 5,
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    # "max_steps": 30,
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    # "per_device_eval_batch_size": 10,
    "per_device_train_batch_size": 2,
    "remove_unused_columns": True,
    "save_steps": 20,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_accumulation_steps": 4,
    "warmup_ratio": 0.05,
    "optim": "adamw_8bit",
    }

In [84]:
peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}
train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)

In [85]:
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train,
    eval_dataset=processed_dev,
    max_seq_length=4,
    dataset_text_field="text",
    tokenizer=tokenizer
)
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("train", metrics)
trainer.save_metrics("eval", metrics)
trainer.save_state()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Step,Training Loss
5,5.447
10,5.3354
15,4.603
20,2.9227
25,1.607
30,0.8884
35,0.2777
40,0.0144
45,0.0003
50,0.0


***** train metrics *****
  epoch                    =     0.9983
  total_flos               =   195013GF
  train_loss               =     0.3625
  train_runtime            = 0:58:11.09
  train_samples_per_second =      0.668
  train_steps_per_second   =      0.083
***** eval metrics *****
  epoch                    =     0.9983
  total_flos               =   195013GF
  train_loss               =     0.3625
  train_runtime            = 0:58:11.09
  train_samples_per_second =      0.668
  train_steps_per_second   =      0.083


In [63]:
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/tokenizer.model',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

In [16]:
def prepare_for_inference(user_input: str, system_prompt: str = system_prompt):
    prompt_data = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input}
    ]
    return  tokenizer.apply_chat_template(
        prompt_data, tokenize=False, add_generation_prompt="<|assistant|>" 
    )


In [72]:
processed_input = prepare_for_inference("A random paragraph can also be an excellent way for a writer to tackle writers' block. Writing block can often happen due to being stuck with a current project that the writer is trying to complete. By inserting a completely random paragraph from which to begin, it can take down some of the issues that may have been causing the writers' block in the first place. Another productive way, other than using xanax, to use this tool to begin a daily writing routine.")
processed_input

'<|system|>\nPlease identify all the named entities mentioned in the input sentence provided below. The entities may have category "Disease" or "Chemical". Use **ONLY** the categories "Chemical" or "Disease". Do not include any other categories. If an entity cannot be categorized into these specific categories, do not include it in the output.\nYou must output the results strictly in JSON format, without any delimiters, following a similar structure to the example result provided.\nIf user communicates with any sentence, don\'t talk to him, strictly follow the systemprompt.\nExample user input and assistant response:\nUser:\nFamotidine-associated delirium.A series of six cases.Famotidine is a histamine H2-receptor antagonist used in inpatient settings for prevention of stress ulcers and is showing increasing popularity because of its low cost.\nAssistant:\n[{"category": "Chemical", "entity": "Famotidine"}, {"category": "Disease", "entity": "delirium"}, {"category": "Chemical", "entity"

In [70]:
print(processed_input)

<|system|>
Please identify all the named entities mentioned in the input sentence provided below. The entities may have category "Disease" or "Chemical". Use **ONLY** the categories "Chemical" or "Disease". Do not include any other categories. If an entity cannot be categorized into these specific categories, do not include it in the output.
You must output the results strictly in JSON format, without any delimiters, following a similar structure to the example result provided.
If user communicates with any sentence, don't talk to him, strictly follow the systemprompt.
Example user input and assistant response:
User:
Famotidine-associated delirium.A series of six cases.Famotidine is a histamine H2-receptor antagonist used in inpatient settings for prevention of stress ulcers and is showing increasing popularity because of its low cost.
Assistant:
[{"category": "Chemical", "entity": "Famotidine"}, {"category": "Disease", "entity": "delirium"}, {"category": "Chemical", "entity": "Famotid

In [79]:
from transformers import pipeline

nlp = pipeline("text-generation", model=model, tokenizer=tokenizer, device='cuda')
generation_args = {
    "max_new_tokens": 200,
    "return_full_text": False,
}



In [87]:
peft_model = PeftModel.from_pretrained(model, "checkpoint_dir/checkpoint-291", adapter_name="idk")
peft_model.load_adapter("checkpoint_dir/checkpoint-291", adapter_name="tfisthis")
peft_model.eval()
peft_pipeline = pipeline("text-generation", model=peft_model, tokenizer=tokenizer, device='cuda')

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausa

In [57]:
peft_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                  (idk): Dropout(p=0.05, inplace=False)
                  (tfisthis): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                  (idk): Linear(in_features=3072, out_features=16, bias=False)
                  (tfisthis): Linear(in_features=3072, out_features=16, bias=False)
                )

In [80]:
output = nlp(processed_input, **generation_args)
output[0]["generated_text"]

' [{"category": "Chemical", "entity": "xanax"}, {"category": "Disease", "entity": "writers\' block"}]'

In [88]:
output = peft_pipeline(processed_input, **generation_args)
output[0]["generated_text"]

' [{"category": "Chemical", "entity": "xanax"}, {"category": "Disease", "entity": "writers\' block"}]'

In [89]:
def prepare_datset_for_inference(dataset, user_column="user"):
    processed_data = []
    for item in dataset:
        processed_input = prepare_for_inference(item[user_column])
        processed_data.append(processed_input)
    return processed_data

test = prepare_datset_for_inference(raw_test)[:30]

In [90]:
output1 = peft_pipeline(test, **generation_args)
output2 = nlp(test, **generation_args)

In [91]:
output1 == output2

False

In [101]:
for i in range(30):
    if output1[i] != output2[i]:
        print(i)
        print(output1[i][0]["generated_text"])
        print(output2[i][0]["generated_text"])

5
 [{"category": "Disease", "entity": "Scleroderma renal crisis"}, {"category": "Disease", "entity": "systemic sclerosis"}, {"category": "Disease", "entity": "renal replacement therapy"}, {"category": "Disease", "entity": "Scleroderma renal crisis"}, {"category": "Disease", "entity": "systemic sclerosis"}, {"category": "Chemical", "entity": "corticosteroid"}, {"category": "Disease", "entity": "Scleroderma renal crisis"}, {"category": "Disease", "entity": "systemic sclerosis"}, {"category": "Disease", "entity": "renal replacement therapy"}]
 [{"category": "Disease", "entity": "Scleroderma renal crisis"}, {"category": "Disease", "entity": "systemic sclerosis"}, {"category": "Disease", "entity": "renal replacement therapy"}, {"category": "Disease", "entity": "Scleroderma renal crisis"}, {"category": "Disease", "entity": "systemic sclerosis"}, {"category": "Chemical", "entity": "corticosteroid"}, {"category": "Disease", "entity": "Scleroderma renal crisis"}, {"category": "Disease", "entity