In [38]:
!pip3 install -q -U trl==0.7.11
!pip3 install -q -U transformers==4.38.2
!pip3 install -q -U datasets==2.18.0
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.9.0
!pip3 install -q -U accelerate==0.27.2

  pid, fd = os.forkpty()


In [39]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from sklearn.model_selection import train_test_split

In [40]:
lora_config = LoraConfig(
    r=6,
    lora_alpha = 8,
    lora_dropout = 0.05,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [None]:
BASE_MODEL = "unsloth/gemma-2-2b-it"

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="cuda:0", quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.padding_side = 'right'

In [None]:
from datasets import Dataset
import pandas as pd

# CSV 파일 경로 지정
file_path = '/kaggle/input/drug-recommend-dataset/Drug_Data.csv'

# CSV 파일 읽기
df = pd.read_csv(file_path)

# Drug_Review 열이 문자열인지 확인
print(df['Drug_Review'].dtype == 'object')

# 데이터프레임 출력
print(df.head())


In [None]:
print(len(df))
print(type(df))

train_temp, test_temp = train_test_split(df, test_size=0.2, random_state=42)
print(len(train_temp["Drug_Review"]), len(test_temp["Drug_Review"]))
print(type(train_temp))
train_data = Dataset.from_pandas(train_temp)
test_data = Dataset.from_pandas(test_temp)
print(type(train_data))
print(type(test_data))

In [None]:
doc = train_data["Drug_Review"][0]
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
messages = [
    {
        "role": "user",
        "content": "I am in this situation, please recommend the right medication for me:\n\n{}".format(doc)
    }
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [None]:
prompt

In [None]:
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)

In [14]:
print(outputs[0]["generated_text"][len(prompt):])

I understand you're looking for medication advice, but I'm an AI and cannot provide medical recommendations. 

It's great that you found relief with Zoloft for your OCD and depression. However, weight gain and other side effects are common with some antidepressants, and it's important to discuss them with your doctor. 

**Here's what I can suggest:**

* **Talk to your doctor:** The most important step is to have an open and honest conversation with your doctor. They can:
    * **Assess your individual situation:** They'll consider your specific symptoms, medical history, and lifestyle to determine the best course of treatment.
    * **Discuss your concerns:**  Explain your experience with weight gain, vivid dreams, and the need to stop Zoloft. 
    * **Explore alternative options:** There are many different types of antidepressants, and your doctor can help you find one that's right for you. 
    * **Adjust your current medication:** They might be able to adjust your dosage or switch y

In [15]:
def generate_prompt(data):
    prompt_list = []
    drug_review = data["Drug_Review"]
    drug_name = data["drugName"]
    for i in range(len(data)):
        prompt_list.append(r"""<bos><start_of_turn>user
I am in this situation, please recommend the right medication for me:

{}<end_of_turn>
<start_of_turn>model

We've analyzed your symptoms and think "{}" may help. We recommend that you consult a healthcare professional for more accurate information, as we may be giving you incorrect information.
<end_of_turn><eos>""".format(drug_review[i], drug_name[i]))
    return prompt_list

In [21]:
print(generate_prompt(train_data)[0])

<bos><start_of_turn>user
I am in this situation, please recommend the right medication for me:

"I suffer from moderate to severe OCD (with mild depression due to it) and I was prescribed Zoloft. It helped me to control my obsessive thoughts, I was able to get fully-rested sleeps and I was more social and outgoing.  Unfortunately, I stayed on it for only 6 months before weaning myself off of it due to weight gain (20 lbs) in a short amount of time and my lifestyle was a lot more active when I started taking it too, I also had very vivid and detailed dreams, which wasn't horrible, but it was nice to be able to fall asleep easier."<end_of_turn>
<start_of_turn>model

We've analyzed your symptoms and think "Zoloft" may help. We recommend that you consult a healthcare professional for more accurate information, as we may be giving you incorrect information.
<end_of_turn><eos>


In [17]:
tokenizer.padding_side = 'right'
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="outputs",
        num_train_epochs = 10,
        max_steps=300, ###
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_steps=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,
    formatting_func=generate_prompt,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/43012 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [18]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
100,1.7532
200,0.9726
300,0.5155


TrainOutput(global_step=300, training_loss=1.0804122924804687, metrics={'train_runtime': 1752.4645, 'train_samples_per_second': 1.369, 'train_steps_per_second': 0.171, 'total_flos': 5909838446582784.0, 'train_loss': 1.0804122924804687, 'epoch': 7.792207792207792})

In [39]:
ADAPTER_MODEL = "lora_adapter"

trainer.model.save_pretrained(ADAPTER_MODEL)

In [41]:
!ls -alh lora_adapter

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='cuda:0', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='cuda:0', torch_dtype=torch.float16)

model = model.merge_and_unload()
model.save_pretrained('gemma-2b-it-sum-drug_recommend')

total 15M
drwxr-xr-x 2 root root 4.0K Oct  3 08:59 .
drwxr-xr-x 6 root root 4.0K Oct  3 09:07 ..
-rw-r--r-- 1 root root 5.0K Oct  3 09:54 README.md
-rw-r--r-- 1 root root  722 Oct  3 09:54 adapter_config.json
-rw-r--r-- 1 root root  15M Oct  3 09:54 adapter_model.safetensors


In [45]:
!ls -alh ./gemma-2b-it-sum-drug_recommend

  pid, fd = os.forkpty()


total 4.9G
drwxr-xr-x 2 root root 4.0K Oct  3 08:59 .
drwxr-xr-x 6 root root 4.0K Oct  3 09:07 ..
-rw-r--r-- 1 root root  912 Oct  3 09:55 config.json
-rw-r--r-- 1 root root  209 Oct  3 09:55 generation_config.json
-rw-r--r-- 1 root root 4.7G Oct  3 09:55 model-00001-of-00002.safetensors
-rw-r--r-- 1 root root 230M Oct  3 09:55 model-00002-of-00002.safetensors
-rw-r--r-- 1 root root  24K Oct  3 09:55 model.safetensors.index.json


In [42]:
doc = test_data["Drug_Review"][10]
label = test_data["drugName"][10]
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
messages = [
    {
        "role": "user",
        "content": "I am in this situation, please recommend the right medication for me:\n\n{}".format(doc)
    }
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [43]:
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.2,
    top_k=50,
    top_p=0.95,
    add_special_tokens=True
)
print(outputs[0]["generated_text"][len(prompt):])

We've analyzed your symptoms and think "Celecoxib" may help. We recommend that you consult a healthcare professional for more accurate information, as we may be giving you incorrect information.



In [44]:
print(doc)
print("")
print(label)
print("\n")
print(outputs[0]["generated_text"][len(prompt):])

"Hi everyone,
Dealing with knee osteoarthritis for years. Pain got so bad that I could only do minimal exercise and was waking up from pain. Ibuprofen does help, maybe a 50% pain inhibition taking 6 pills a day. With Celebrex, 200mg/ twice daily, I am having about 90% pain reduction. I am now biking daily, no problem going up and down stairs. This is a miracle drug. Not a cure, but close!"

Celecoxib


We've analyzed your symptoms and think "Celecoxib" may help. We recommend that you consult a healthcare professional for more accurate information, as we may be giving you incorrect information.

