In [1]:
import os

# Expose only GPU 0 to this kernel. This is set before any CUDA-using library
# is imported in the cells below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from huggingface_hub import logout, notebook_login
# logout()  # uncomment to log out of the Hugging Face Hub before logging in again
# Show the interactive Hugging Face login widget inside the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset

# Load every split of the seed instruction dataset from the Hugging Face Hub.
# No `split` is passed, so the result is a DatasetDict (see the output below).
seed = load_dataset("passionMan/real_seed_IFD_rIFD14")
# Bare expression: display the dataset summary (splits, features, row counts).
seed

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'ifd_loss'],
        num_rows: 15156
    })
})

In [3]:
from unsloth import FastLanguageModel
import torch

# Loader settings; `max_seq_length` is reused by the trainer cell further down.
max_seq_length = 4096  # Any value works; Unsloth handles RoPE scaling internally.
dtype = None           # None = auto-detect. float16 for Tesla T4/V100, bfloat16 for Ampere+.
load_in_4bit = False   # True enables 4-bit quantization to cut memory usage.

# Reference list of pre-quantized 4-bit checkpoints published by Unsloth
# (faster download, lower memory). It is not used by the loader call below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
]  # More models at https://huggingface.co/unsloth

# Load the base model and its tokenizer with the settings above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-31 13:30:28 __init__.py:190] Automatically detected platform cuda.
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A100-PCIE-40GB. Max memory: 39.394 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
# LoRA adapter settings, collected in one place so they are easy to scan and tune.
lora_kwargs = dict(
    r=16,                      # LoRA rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # attention projections only
    lora_alpha=32,
    lora_dropout=0.05,         # any value is supported, but Unsloth's fast path needs 0
                               # (see the performance-hit warning in this cell's output)
    bias="none",               # any value is supported, but "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,          # rank-stabilized LoRA is available but not used
    loftq_config=None,         # LoftQ is available but not used
)

# Rebinds `model` to the adapter-wrapped model.
# NOTE(review): re-running this cell without reloading the base model would pass
# an already-wrapped model back in; confirm that is acceptable.
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.2.12 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [5]:
# Alpaca-style prompt template with three positional `{}` slots, filled via
# `alpaca_prompt.format(instruction, input, response)`. For inference the
# response slot is filled with "" so the model continues after "### Response:".
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to every training example; without it the
# fine-tuned model never learns where a response ends and keeps generating.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of dataset rows into Alpaca-formatted training strings.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" sequences (the shape `Dataset.map(..., batched=True)`
            passes in).

    Returns:
        A dict with a single "text" key holding one formatted prompt per row,
        each terminated with ``EOS_TOKEN``.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in rows
        ],
    }

from datasets import load_dataset

# Load the training split and add the "text" column consumed by the trainer.
dataset = load_dataset("passionMan/real_seed_IFD_rIFD14", split="train").map(
    formatting_prompts_func,
    batched=True,
)

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation / logging / checkpointing settings for the fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,   # 16 x 4 = 64 examples per optimizer step
    warmup_steps=150,                # previously 50
    num_train_epochs=2,
    # max_steps=60,                  # alternative: cap by step count for a quick run
    learning_rate=2e-5,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=100,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="outputs/real_seed_IFD_rIFD14",
    report_to="none",                # set to e.g. "wandb" to enable experiment tracking
    save_steps=100,
)

# Supervised fine-tuning over the pre-rendered "text" column of `dataset`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=8,
    packing=False,                   # packing can make training faster for short sequences
    args=training_args,
)

Applying chat template to train dataset (num_proc=8):   0%|          | 0/15156 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=8):   0%|          | 0/15156 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=8):   0%|          | 0/15156 [00:00<?, ? examples/s]

In [7]:
# Run the fine-tuning job and keep the object returned by `train()` for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 15,156 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 16 | Gradient Accumulation steps = 4
\        /    Total batch size = 64 | Total steps = 474
 "-____-"     Number of trainable parameters = 9,175,040


Step,Training Loss
100,2.04
200,1.5631
300,1.4654
400,1.4407


In [8]:
import json
import time
import random
from datasets import load_dataset
from collections import defaultdict

### ✅ Load the evaluation data from the Hugging Face Hub
# NOTE: this rebinds `dataset`, which earlier in the notebook held the training split.
dataset_name = "passionMan/test_dataset10"
dataset = load_dataset(dataset_name, split="test")  # load the 'test' split

### ✅ JSONL writer (used to persist evaluation results)
def save_to_jsonl(file_path, data):
    """Append one record to a JSON Lines file.

    The parent directory is created on demand. Without this, a missing output
    directory makes every append raise, and the caller's broad ``except``
    turns that into a skipped sample, so results are silently lost.

    Args:
        file_path: Destination ``.jsonl`` path; opened in append mode.
        data: JSON-serialisable object written as a single line.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If ``data`` is not JSON-serialisable.
    """
    import os  # local import so this block does not depend on another cell's imports

    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # ensure_ascii=False keeps non-ASCII text (e.g. Korean) human-readable in the file.
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")

### ✅ Model response generation
def generate_response(instruction_text, input_text, max_new_tokens=128):
    """Generate the model's response for one (instruction, input) pair.

    The pair is rendered with ``alpaca_prompt`` (empty response slot), passed
    to ``model.generate`` on CUDA, and only the newly generated tokens are
    decoded and returned.

    Fixes over the previous version:
      * Truncation now shortens ``input_text`` itself. Previously the first N
        tokens of the *whole prompt* were decoded into ``input_text`` and the
        template was applied again, which made the prompt longer, not shorter.
      * The prompt budget reserves room for ``max_new_tokens``.
      * The response is taken from the generated token ids and decoded with
        ``skip_special_tokens=True``, so it neither depends on splitting at the
        last "### Response:" marker nor leaks end-of-text markers into results.

    Args:
        instruction_text: Text for the "### Instruction:" slot.
        input_text: Text for the "### Input:" slot; truncated from the end if
            the prompt would not fit.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The stripped response text, or ``None`` if the prompt cannot be made
        to fit or any exception occurs (the error is printed).
    """
    try:
        FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

        # Maximum number of positions reported by the model config (fallback 4096).
        max_input_length = getattr(model.config, "max_position_embeddings", 4096)
        # Prompt and generated tokens share the same window, so reserve space for generation.
        prompt_budget = max(max_input_length - max_new_tokens, 1)

        model_inputs = tokenizer(
            alpaca_prompt.format(instruction_text, input_text, ""),
            return_tensors="pt",
        )
        input_length = model_inputs["input_ids"].shape[1]

        # If the prompt is too long, drop the overflow from the end of the input field only,
        # keeping the template, the instruction and the "### Response:" header intact.
        if input_length > prompt_budget:
            overflow = input_length - prompt_budget
            input_ids = tokenizer(input_text, add_special_tokens=False)["input_ids"]
            keep = len(input_ids) - overflow
            if keep <= 0:
                print(f"[WARNING] Prompt of {input_length} tokens cannot fit in {prompt_budget} tokens even without the input field.")
                return None

            print(f"[WARNING] Truncating input from {input_length} to {prompt_budget} tokens.")
            input_text = tokenizer.decode(input_ids[:keep], skip_special_tokens=True)
            # Re-tokenising decoded text may differ from the budget by a few tokens.
            model_inputs = tokenizer(
                alpaca_prompt.format(instruction_text, input_text, ""),
                return_tensors="pt",
            )
            input_length = model_inputs["input_ids"].shape[1]

        outputs = model.generate(
            **model_inputs.to("cuda"),
            max_new_tokens=max_new_tokens,
            use_cache=True,
        )

        # The returned sequence starts with the prompt tokens; everything after is the response.
        generated_ids = outputs[0, input_length:]
        return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip this sample.
        print(f"[ERROR] Exception in response generation: {str(e)}")
        return None

# ✅ Output path for the generated responses (JSON Lines, appended to on every run)
output_json_path = "response/real_seed_IFD_rIFD14.jsonl"

# Maximum number of evaluation samples taken per task. The earlier comments
# said 30, but the code has always taken 100.
MAX_SAMPLES_PER_TASK = 100

# ✅ Group the evaluation items by task, preserving dataset order
grouped_data = defaultdict(list)
for item in dataset:
    grouped_data[item["task"]].append(item)

# ✅ Build the evaluation set: the first MAX_SAMPLES_PER_TASK items of each task
sampled_data = []
for task, samples in grouped_data.items():
    sampled_data.extend(samples[:MAX_SAMPLES_PER_TASK])

# ✅ Task groups that decide the generation budget. They are loop-invariant,
# so they are defined once here and not rebuilt on every iteration.
short_context_tasks = {"qa1", "qa2", "qa3", "nli", "re", "re2"}  # 32 new tokens
medium_context_tasks = {"ie"}  # 32 new tokens (see NOTE below)
long_context_tasks = {"summarization", "generation", "daily_diets", "alternative_diet"}  # 1024 new tokens

# ✅ Start the evaluation run
start_time = time.time()
total_samples = len(sampled_data)

for idx, item in enumerate(sampled_data):
    sample_start_time = time.time()

    input_text = item.get("input", "")
    instruction = item.get("instruction", "")
    task = item.get("task", "").lower()

    # ✅ Choose the generation length from the task type
    if task in short_context_tasks:
        max_new_tokens = 32
    elif task in medium_context_tasks:
        # NOTE(review): the original comment on the medium group said 128, but the
        # value has been 32 (same as the short group). Left unchanged because it
        # affects results and the `model_output_<n>` key; confirm the intended value.
        max_new_tokens = 32
    elif task in long_context_tasks:
        max_new_tokens = 1024
    else:
        max_new_tokens = 128  # default for unknown tasks

    try:
        model_output = generate_response(instruction, input_text, max_new_tokens)

        if model_output is not None:
            # Copy so the source item is not mutated; the key records the token budget used.
            output_data = item.copy()
            output_data.update({f"model_output_{max_new_tokens}": model_output})
            save_to_jsonl(output_json_path, output_data)
        else:
            print(f"[WARNING] Skipping sample {idx+1}/{total_samples} due to length limit or generation failure.")

    except Exception as e:
        # Best-effort: a failure on one sample (e.g. while saving) must not abort the run.
        print(f"[ERROR] Skipping sample {idx+1}/{total_samples} due to unexpected error: {str(e)}")

    # Running-average ETA over the samples processed so far.
    elapsed_time = time.time() - start_time
    avg_time_per_sample = elapsed_time / (idx + 1)
    remaining_samples = total_samples - (idx + 1)
    estimated_remaining_time = remaining_samples * avg_time_per_sample

    print(f"[{idx+1}/{total_samples}] Sample processed in {time.time() - sample_start_time:.2f}s, ETA: {estimated_remaining_time/60:.2f} min")

print(f"\nAll samples processed. Total time: {(time.time() - start_time)/60:.2f} min")


[1/900] Sample processed in 5.47s, ETA: 81.94 min
[2/900] Sample processed in 4.84s, ETA: 77.13 min
[3/900] Sample processed in 4.81s, ETA: 75.31 min
[4/900] Sample processed in 3.43s, ETA: 69.24 min
[5/900] Sample processed in 5.36s, ETA: 71.33 min
[6/900] Sample processed in 5.72s, ETA: 73.59 min
[7/900] Sample processed in 6.72s, ETA: 77.29 min
[8/900] Sample processed in 3.51s, ETA: 74.07 min
[9/900] Sample processed in 1.98s, ETA: 69.03 min
[10/900] Sample processed in 3.46s, ETA: 67.18 min
[11/900] Sample processed in 3.75s, ETA: 66.05 min
[12/900] Sample processed in 4.09s, ETA: 65.52 min
[13/900] Sample processed in 8.52s, ETA: 70.10 min
[14/900] Sample processed in 2.46s, ETA: 67.62 min
[15/900] Sample processed in 5.49s, ETA: 68.44 min
[16/900] Sample processed in 8.78s, ETA: 72.18 min
[17/900] Sample processed in 2.69s, ETA: 70.18 min
[18/900] Sample processed in 6.65s, ETA: 71.64 min
[19/900] Sample processed in 7.06s, ETA: 73.25 min
[20/900] Sample processed in 2.20s, ETA:

In [1]:
import os

# Expose only GPU 1 to this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [1]:
import os

# Expose only GPU 1; set before unsloth / torch are imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from unsloth import FastLanguageModel

import torch

# Loader settings for the inference session.
max_seq_length = 4096  # Any value works; Unsloth handles RoPE scaling internally.
dtype = None           # None = auto-detect. float16 for Tesla T4/V100, bfloat16 for Ampere+.
load_in_4bit = True    # 4-bit quantization to reduce memory usage. Can be False.

# Reference list of pre-quantized 4-bit checkpoints published by Unsloth
# (faster download, lower memory). It is not used by the loader call below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
]  # More models at https://huggingface.co/unsloth

# Load a locally saved fine-tuning checkpoint.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run elsewhere without changing it.
model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",  # alternative: the base model
    model_name="/data/jaesung/llm_for_diabetes/src/trial7/main_table/llama_8b/outputs/last_seed/checkpoint-458",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models like meta-llama/Llama-2-7b-hf
)


# Alpaca-style prompt template with three positional `{}` slots, filled via
# `alpaca_prompt.format(instruction, input, response)`. For inference the
# response slot is filled with "" so the model continues after "### Response:".
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-26 17:53:33 __init__.py:190] Automatically detected platform cuda.
==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A100-PCIE-40GB. Max memory: 39.394 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.2.12 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [2]:
import torch
from transformers import TextStreamer

# Seed torch's RNG before sampling (do_sample=True below).
torch.manual_seed(1000)

FastLanguageModel.for_inference(model)

# Instructions tried previously:
#   "Can you recommend a daily diet recipe for a diabetic patient? Please provide detailed information on breakfast, lunch, and dinner."
#   "Can you recommend a daily diet recipe? Please provide detailed information on breakfast, lunch, and dinner."
#   "Prepare a healthy daily meal plan for a female aged 35 with 1400 kcal, including portion sizes in grams."
# NOTE(review): the active instruction contains misspellings ("diebeic patien").
# It is kept verbatim because it matches the recorded output; confirm whether
# the typos are intentional.
demo_instruction = "Prepare a healthy daily meal plan for diebeic patien, including portion sizes in grams."

# Empty input and response slots: the model continues after "### Response:".
demo_prompt = alpaca_prompt.format(demo_instruction, "", "")
inputs = tokenizer([demo_prompt], return_tensors="pt").to("cuda")

# Stream tokens to the cell output as they are generated.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=1024, do_sample=True)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Prepare a healthy daily meal plan for diebeic patien, including portion sizes in grams.

### Input:


### Response:
Breakfast: 1 cup of steel-cut oats with 1/2 cup of nonfat milk and 1 tablespoon of ground flaxseed. Lunch: 3 ounces of grilled chicken breast with 1 cup of steamed broccoli and 1/2 cup of brown rice. Snack: 1 small apple with 1 tablespoon of almond butter. Dinner: 4 ounces of grilled salmon with 1 cup of steamed asparagus and 1/2 cup of quinoa.<|end_of_text|>


In [None]:
import torch
from transformers import TextStreamer

# Seed torch's RNG before sampling (do_sample=True below).
torch.manual_seed(400)

FastLanguageModel.for_inference(model)

demo_instruction = "Can you recommend a daily diet recipe for a diabetic patient? Please provide detailed information on breakfast, lunch, and dinner."

# Empty input and response slots: the model continues after "### Response:".
demo_prompt = alpaca_prompt.format(demo_instruction, "", "")
inputs = tokenizer([demo_prompt], return_tensors="pt").to("cuda")

# Stream tokens to the cell output as they are generated.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=1024, do_sample=True)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Can you recommend a daily diet recipe for a diabetic patient? Please provide detailed information on breakfast, lunch, and dinner.

### Input:


### Response:


Diabetes is a metabolic disorder that affects how the body uses food for energy. It occurs when the body cannot produce enough insulin or when the body cannot properly use the insulin it produces. Insulin is a hormone that helps the body use glucose, a type of sugar, for energy. People with diabetes must carefully manage their blood sugar levels to avoid complications such as heart disease, stroke, and nerve damage.

A healthy diet for a diabetic patient should include plenty of vegetables, fruits, whole grains, lean proteins, and healthy fats. It should also limit processed foods, sugary drinks, and refined carbohydrates. A typical daily diet for a diabetic patient might look like this:

Breakfast: oatmeal with fresh berries, a glass of milk, and a cup of green tea
Lunch: grilled chicken breast with steamed vegetables, brown rice, and a glass of water
Dinner: baked fish with roasted vegetables, quinoa, and a small glass of red wine
Snacks: a handful of nuts, a piece of fruit, or a sma