In [1]:
%%capture
# !pip install unsloth
# # Also get the latest nightly Unsloth!
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2024-12-23 23:31:03.611245: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-23 23:31:03.626463: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734964263.643818 2215677 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734964263.649064 2215677 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-23 23:31:03.667962: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Unsloth: OpenAI failed to import - ignoring for now.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA A100-PCIE-40GB. Max memory: 39.394 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
    #                  "gate_proj", "up_proj", "down_proj",],
    target_modules = ["q_proj", "k_proj", "v_proj"],
    lora_alpha = 16,
    lora_dropout = 0.05, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.12.4 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [4]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("passionMan/diabetes_v3", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

README.md:   0%|          | 0.00/607 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/22.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/5.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29785 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6245 [00:00<?, ? examples/s]

Map:   0%|          | 0/29785 [00:00<?, ? examples/s]

In [6]:
dataset[0]

{'dataset': 'bionli',
 'split_data': 'train',
 'task': 'nli',
 'instruction': 'Read the given premise and hypothesis. Decide if the hypothesis logically follows from the premise.',
 'input': '[PRE] Obesity and age are risk factors for feline diabetes. This study aimed to test the hypothesis that age, long-term obesity, and dietary composition would lead to peripheral and hepatorenal insulin resistance, indicated by higher endogenous glucose production (EGP) in the fasted and postprandial state, higher blood glucose and insulin, and higher leptin, free thyroxine, and lower adiponectin concentrations. Using triple tracer-(2)H(2)O, [U-(13)C(3)] propionate, and [3,4-(13)C(2)] glucose infusion, and indirect calorimetry-we investigated carbohydrate and fat metabolic pathways in overnight-fasted neutered cats (13 young lean, 12 old lean, and 12 old obese), each fed three different diets (high protein with and without polyunsaturated fatty acids, and high carbohydrate) in a crossover design. E

In [22]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from datasets import Dataset, concatenate_datasets
import pandas as pd
import numpy as np

# 리스트 형태의 dataset을 Pandas DataFrame으로 변환
dataset_df = pd.DataFrame(dataset)

# 긴 데이터와 짧은 데이터 분리
long_data = dataset_df[dataset_df['dataset'].isin(['icliniq', 'pubmed'])].reset_index(drop=True)  # 인덱스 제거
short_data = dataset_df[~dataset_df['dataset'].isin(['icliniq', 'pubmed'])].reset_index(drop=True)  # 인덱스 제거

# 데이터셋 생성
long_dataset = Dataset.from_pandas(long_data)
short_dataset = Dataset.from_pandas(short_data)

# 데이터 결합
combined_dataset = concatenate_datasets([short_dataset, long_dataset])

# 샘플링 가중치 설정
# Hugging Face Dataset에서 map 함수가 dict를 반환하도록 수정
def compute_weights(example):
    return {"weight": 0.1 if example['dataset'] in ['icliniq', 'pubmed'] else 0.9}

# map 함수로 가중치 리스트 생성
weighted_dataset = combined_dataset.map(compute_weights)

# 가중치 추출 및 정규화
weights = np.array(weighted_dataset["weight"])
weights = weights / weights.sum()  # 가중치 합이 1이 되도록 정규화

# 데이터 재샘플링
weighted_indices = np.random.choice(len(combined_dataset), size=len(combined_dataset), p=weights)
resampled_dataset = combined_dataset.select(weighted_indices)

# 학습 설정
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=resampled_dataset,
    dataset_text_field="text",
    max_seq_length=2048,  # 긴 데이터도 처리 가능하도록 설정
    dataset_num_proc = 16,  # 현재 2에서 증가

    args=TrainingArguments(
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        warmup_steps=50,
        num_train_epochs=2,  # 2 epochs 설정
        learning_rate=3e-5,  # 낮은 학습률로 균형 유지
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        optim="adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        logging_steps=50,
        seed=3407,
        save_steps=200,
        output_dir="outputs/weighted_sampling",
        report_to="none",
    ),
)



Map:   0%|          | 0/29785 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map (num_proc=16):   0%|          | 0/29785 [00:00<?, ? examples/s]

In [23]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 29,785 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 32 | Gradient Accumulation steps = 1
\        /    Total batch size = 32 | Total steps = 1,862
 "-____-"     Number of trainable parameters = 4,718,592


Step,Training Loss
50,2.2862
100,1.5594
150,1.1465
200,1.0484
250,0.9801
300,0.8851
350,0.882
400,0.9025
450,0.8756
500,0.8744


In [15]:
## inference

In [None]:

from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/data/jaesung/llm_for_diabetes/src/model/outputs/checkpoint-3214",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [33]:


FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


inputs = tokenizer(
[
    alpaca_prompt.format(
        "Recommend a daily diet that includes a specific ingredient.",

        "Create a diet that includes baby bok choy(roots trimmed and roughly chopped).",

        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2048)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Recommend a daily diet that includes a specific ingredient.

### Input:
Create a diet that includes baby bok choy(roots trimmed and roughly chopped).

### Response:
{"Breakfast": "Strawberry Yogurt Parfait", "Lunch": "Turkey Meatball \u201cWonton\u201d Soup with Bok Choy & Carrots", "Dinner": "Roasted Cherry Tomatoes with Rosemary & Garlic"}<|end_of_text|>


In [36]:
list(set(dataset['dataset']))

['pubmed',
 'biorel',
 'medmcqa',
 'bionli',
 'diabetes_food_hub',
 'medqa',
 'icliniq']

In [None]:
from datasets import load_dataset

# 1. 데이터셋 로드
dataset_name = "passionMan/diabetes_v3"
train_dataset = load_dataset(dataset_name, split="train")
test_dataset = load_dataset(dataset_name, split="test")

# 2. 중복 확인을 위한 train 데이터의 (input, output) 세트 생성
train_pairs = set((row["input"], row["output"]) for row in train_dataset)

# 3. 각 task별 샘플 개수 설정
task_sample_limits = {
    'qa_subjective': 30,
    'alternative_diet': 50,
    'daily_diets': 50,
    'ie_extract_relation': 100,
    'nli': 100,
    "summarization": 30,
    "qa_objective_medqa": 100,
    "qa_objective_medmcqa": 50,
}
default_sample_limit = 100  # 나머지 task는 100개씩 샘플링

# 4. task별 샘플링 데이터 초기화
task_sampled_data = {}
# task_counts를 task_sample_limits로 초기화
task_counts = {task: 0 for task in task_sample_limits}

# 5. Task별 샘플링
for row in test_dataset:
    task = row["task"]
    dataset_value = row.get("dataset")  # dataset 컬럼 값 확인
    input_output_pair = (row["input"], row["output"])

    # qa_objective 세분화
    if task == "qa_objective":
        if dataset_value == "medqa":
            task = "qa_objective_medqa"  # medqa로 세분화
        elif dataset_value == "medmcqa":
            task = "qa_objective_medmcqa"  # medmcqa로 세분화
        else:
            continue  # medqa, medmcqa가 아니면 건너뛰기

    # 해당 task의 샘플링 제한 확인
    sample_limit = task_sample_limits.get(task, default_sample_limit)

    # 샘플링 조건 확인
    if task_counts[task] < sample_limit and input_output_pair not in train_pairs:
        if task not in task_sampled_data:
            task_sampled_data[task] = []
        task_sampled_data[task].append(row)
        task_counts[task] += 1

    # 모든 task의 샘플링이 완료되면 종료
    if all(task_counts[task] >= task_sample_limits.get(task, default_sample_limit) for task in task_sample_limits):
        break

# 6. 결과 출력 및 확인
for task, samples in task_sampled_data.items():
    print(f"Task: {task}, Sampled: {len(samples)}")
    for sample in samples[:5]:  # 첫 5개 샘플만 출력
        print(sample)

# 7. 필요 시 샘플링된 데이터 저장
from datasets import DatasetDict
sampled_dataset = DatasetDict({task: samples for task, samples in task_sampled_data.items()})

# 저장 (필요시 주석 제거)
# sampled_dataset.save_to_disk("sampled_test_dataset")


Task: nli, Sampled: 100
{'dataset': 'bionli', 'split_data': 'test', 'task': 'nli', 'instruction': 'Evaluate if the hypothesis can be inferred from the premise. Label it as entailment, contradiction, or neutral.', 'input': "[PRE] To investigate the mechanism of thyroid hormone action on pulmonary surfactant synthesis, we characterized the effect of triiodothyronine on phosphatidylcholine synthesis in cultured fetal rabbit lung. Since glucocorticoids stimulate surfactant synthesis and reduce the incidence of Respiratory Distress Syndrome in premature infants, we also examined the interaction of triiodothyronine and dexamethasone. The rate of choline incorporation into phosphatidylcholine was determined in organ cultures of rabbit lung maintained in serum-free Waymouth's medium. In 23-d lung cultured for 72 h, the increase in choline incorporation with triiodothyronine alone, dexamethasone alone, and triiodothyronine plus dexamethasone was 50, 62, and 161%, respectively. Both triiodothyro

In [37]:
import json
import re
from tqdm import tqdm
from transformers import TextStreamer

output_file = "inference_results_1224.jsonl"

# 1. `tqdm`을 사용하여 진행 상태 표시
total_samples = sum(len(samples) for samples in sampled_dataset.values())
with open(output_file, 'w') as f_out:
    with tqdm(total=total_samples, desc="Processing samples", unit="sample") as pbar:
        for task, sample in sampled_dataset.items():
            for samp in sample:
                # 데이터셋 이름에 따른 context length 설정
                dataset_name = samp.get("dataset", "")
                max_new_tokens = 8192 if dataset_name in ["pubmed", "icliniq"] else 2048

                # 입력 토큰 생성
                inputs = tokenizer(
                    [
                        alpaca_prompt.format(
                            samp['instruction'],  # instruction
                            samp['input'],  # input
                            "",  # output
                        )
                    ], return_tensors="pt"
                ).to("cuda")

                # TextStreamer 설정
                text_streamer = TextStreamer(tokenizer)
                
                # 모델 생성 및 출력
                output_tensor = model.generate(
                    **inputs, 
                    max_new_tokens=max_new_tokens
                )
                model_output = tokenizer.decode(output_tensor[0], skip_special_tokens=True)

                # `### Response:` 뒤의 텍스트 추출
                response_text = None
                response_match = re.search(r"### Response:\s*(.+)", model_output, re.DOTALL)
                if response_match:
                    response_text = response_match.group(1).strip()
                else:
                    response_text = "No valid response found"

                # 모델 출력 결과를 samp에 추가
                samp['model_output'] = response_text

                # JSONL 형식으로 저장
                f_out.write(json.dumps(samp, ensure_ascii=False) + "\n")
                
                # tqdm 진행 상태 업데이트
                pbar.update(1)


Processing samples: 100%|██████████| 530/530 [1:45:09<00:00, 11.90s/sample]   
