In [1]:
import os

# Restrict this process to the GPU with index 1. This is done in the very first
# cell so the variable is already set when torch / unsloth are imported in the
# following cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [6]:
from unsloth import FastLanguageModel
import torch
# --- Loading options ---------------------------------------------------------
# Sequence length handed to Unsloth when the model is loaded.
max_seq_length = 2048
# None asks Unsloth to auto-detect the dtype (the upstream template suggests
# float16 for Tesla T4 / V100 and bfloat16 for Ampere or newer).
dtype = None
# Load 4-bit quantized weights to reduce memory usage; may be set to False.
load_in_4bit = True

# Pre-quantized 4-bit checkpoints listed by the Unsloth template.
# Reference only: nothing in this cell reads this list.
fourbit_models = [
    # Llama 3.1
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral Nemo (12B)
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    # Mistral v0.3
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi-3 / Phi-3.5
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]  # More models at https://huggingface.co/unsloth

# Load the 4-bit Llama-3.1-8B base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models such as meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A100-PCIE-40GB. Max memory: 39.394 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
# Alpaca-style template with three positional slots: instruction, input, response.
# NOTE(review): unlike the training template defined later in this notebook
# (which ends right at the response slot), this variant puts a newline after
# the response slot, so a prompt formatted with an empty response ends in a
# blank line.
alpaca_prompt = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}\n"
)

# End-of-sequence token string of the loaded tokenizer. Nothing else in this
# cell reads it.
EOS_TOKEN = tokenizer.eos_token

In [3]:
import json
import time
import random
from datasets import load_dataset
from collections import defaultdict

### ✅ Load the evaluation data from the Hugging Face Hub
dataset_name = "passionMan/test_dataset4"
dataset = load_dataset(dataset_name, split="test")  # load the 'test' split

### ✅ JSONL writer (used to store evaluation results)
def save_to_jsonl(file_path, data):
    """Append ``data`` to ``file_path`` as one JSON line.

    Args:
        file_path: Path of the JSONL file. It is opened in append mode with
            UTF-8 encoding, so existing content is kept.
        data: JSON-serializable object written on its own line.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``).
    """
    with open(file_path, mode="a", encoding="utf-8") as sink:
        line = json.dumps(data, ensure_ascii=False)
        sink.write(f"{line}\n")

### ✅ Model response generation
def generate_response(instruction_text, input_text, max_new_tokens=128):
    """Generate the model's answer for one Alpaca-formatted sample.

    The prompt is built from whatever ``alpaca_prompt`` is currently bound at
    module level, with the response slot left empty.

    Args:
        instruction_text: Text placed in the instruction slot of the template.
        input_text: Text placed in the input slot. If the full prompt does not
            fit, only this part is shortened (from its end); the template and
            the instruction are left intact.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The generated text with special tokens removed and surrounding
        whitespace stripped, or ``None`` if anything raised during
        tokenization or generation (the error is printed).
    """
    try:
        # Called per sample, as in the original code; assumed to be harmless
        # when the model is already in inference mode.
        FastLanguageModel.for_inference(model)

        # Context window reported by the model config (4096 if it is missing),
        # minus the room that generation itself needs.
        context_limit = getattr(model.config, "max_position_embeddings", 4096)
        prompt_budget = max(1, context_limit - max_new_tokens)

        def _encode(text):
            # Tokenize the full prompt with `text` in the input slot.
            prompt = alpaca_prompt.format(instruction_text, text, "")
            return tokenizer(prompt, return_tensors="pt")

        encoded = _encode(input_text)
        prompt_length = encoded["input_ids"].shape[1]

        if prompt_length > prompt_budget:
            print(f"[WARNING] Truncating input from {prompt_length} to {prompt_budget} tokens.")
            # Shorten only the input text. Slicing the tokens of the whole
            # prompt would cut off the "### Response:" suffix, and feeding the
            # decoded prompt back in as the input (what the previous version
            # did) made the prompt longer instead of shorter.
            input_ids = tokenizer(input_text, add_special_tokens=False)["input_ids"]
            # Decoding and re-tokenizing need not reproduce the exact token
            # count, so repeat until the prompt fits or no input is left.
            while prompt_length > prompt_budget and len(input_ids) > 0:
                overflow = prompt_length - prompt_budget
                input_ids = input_ids[: max(0, len(input_ids) - overflow)]
                input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
                encoded = _encode(input_text)
                prompt_length = encoded["input_ids"].shape[1]

        outputs = model.generate(
            **encoded.to("cuda"),
            max_new_tokens=max_new_tokens,
            use_cache=True,
        )

        # The returned sequence starts with the prompt tokens; keep only what
        # was generated after them. This avoids splitting on "### Response:"
        # (which loses text when the model repeats that header) and lets the
        # tokenizer drop every special token, not just "<|eot_id|>".
        new_tokens = outputs[0][prompt_length:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    except Exception as e:
        # Best-effort by design: the caller skips samples that return None.
        print(f"[ERROR] Exception in response generation: {str(e)}")
        return None

# ✅ Where the evaluation results are appended
output_json_path = "/data/jaesung/llm_for_diabetes/src/trial3/model_response/model_output_zero_shot.jsonl"

# ✅ Bucket the dataset rows by their "task" field
grouped_data = defaultdict(list)
for record in dataset:
    grouped_data[record["task"]].append(record)

# ✅ Evaluation subset: the first 30 rows (indices 0-29) of every task, at most
sampled_data = [
    record
    for task_records in grouped_data.values()
    for record in task_records[:30]
]

# ✅ Start of the evaluation run
total_samples = len(sampled_data)
start_time = time.time()

# ✅ Generation budget per task family. Tasks listed in neither set fall back
# to 128 new tokens, the same budget as the short-answer tasks, so only
# membership in `long_context_tasks` changes the outcome.
short_context_tasks = {"qa1", "qa2", "qa3", "nli", "ie", "re"}  # 128 new tokens
long_context_tasks = {"summarization", "generation", "daily_diets", "alternative_diet"}  # 1024 new tokens

for idx, item in enumerate(sampled_data):
    sample_start_time = time.time()

    instruction = item.get("instruction", "")
    input_text = item.get("input", "")
    task = item.get("task", "").lower()

    max_new_tokens = 1024 if task in long_context_tasks else 128

    try:
        model_output = generate_response(instruction, input_text, max_new_tokens)

        if model_output is None:
            print(f"[WARNING] Skipping sample {idx+1}/{total_samples} due to length limit or generation failure.")
        else:
            # Store the answer next to the original fields; the key records
            # which token budget produced it.
            output_data = item.copy()
            output_data[f"model_output_{max_new_tokens}"] = model_output
            save_to_jsonl(output_json_path, output_data)

    except Exception as e:
        print(f"[ERROR] Skipping sample {idx+1}/{total_samples} due to unexpected error: {str(e)}")

    # ETA from the running average over all samples processed so far.
    elapsed_time = time.time() - start_time
    avg_time_per_sample = elapsed_time / (idx + 1)
    remaining_samples = total_samples - (idx + 1)
    estimated_remaining_time = remaining_samples * avg_time_per_sample

    print(f"[{idx+1}/{total_samples}] Sample processed in {time.time() - sample_start_time:.2f}s, ETA: {estimated_remaining_time/60:.2f} min")

print(f"\nAll samples processed. Total time: {(time.time() - start_time)/60:.2f} min")


[1/240] Sample processed in 5.81s, ETA: 23.16 min
[2/240] Sample processed in 2.89s, ETA: 17.27 min
[3/240] Sample processed in 3.62s, ETA: 16.23 min
[4/240] Sample processed in 3.63s, ETA: 15.69 min
[5/240] Sample processed in 3.62s, ETA: 15.33 min
[6/240] Sample processed in 0.09s, ETA: 12.78 min
[7/240] Sample processed in 3.63s, ETA: 12.92 min
[8/240] Sample processed in 3.23s, ETA: 12.82 min
[9/240] Sample processed in 1.94s, ETA: 12.18 min
[10/240] Sample processed in 3.62s, ETA: 12.30 min
[11/240] Sample processed in 0.98s, ETA: 11.47 min
[12/240] Sample processed in 0.26s, ETA: 10.55 min
[13/240] Sample processed in 0.13s, ETA: 9.73 min
[14/240] Sample processed in 3.63s, ETA: 9.98 min
[15/240] Sample processed in 3.64s, ETA: 10.18 min
[16/240] Sample processed in 0.19s, ETA: 9.55 min
[17/240] Sample processed in 3.66s, ETA: 9.74 min
[18/240] Sample processed in 0.35s, ETA: 9.23 min
[19/240] Sample processed in 0.13s, ETA: 8.73 min
[20/240] Sample processed in 0.16s, ETA: 8.29 

In [3]:
# Alpaca-style training template with three positional slots: instruction,
# input, response. It ends right at the response slot (no trailing newline).
alpaca_prompt = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

# End-of-sequence token string of the loaded tokenizer; formatting_prompts_func
# below appends it to every formatted training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of rows into full training texts.

    Args:
        examples: Mapping with parallel sequences under the keys
            "instruction", "input" and "output".

    Returns:
        ``{"text": [...]}`` with one string per row: the module-level
        ``alpaca_prompt`` filled with that row's three fields, followed by
        ``EOS_TOKEN``.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            # EOS marks the end of the answer in every training example.
            alpaca_prompt.format(instruction, context, answer) + EOS_TOKEN
            for instruction, context, answer in rows
        ],
    }

from datasets import load_dataset
# Load the training split, then run formatting_prompts_func over it in batches;
# that function returns a "text" field holding prompt + answer + EOS per row.
# NOTE: this rebinds `dataset`, the same name an earlier cell uses for the
# evaluation split.
dataset = load_dataset("passionMan/dataset_cherry", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [4]:
# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Relies on the `alpaca_prompt` template already defined in an earlier cell.
inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Please recommend a diet for diabetic patients.",  # instruction
            "",  # input: no extra context
            "",  # response slot stays empty so the model fills it in
        )
    ],
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

# Stream tokens as they are generated; the returned ids are discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please recommend a diet for diabetic patients.

### Input:


### Response:

1. A diabetic diet is a diet that can help you manage your blood glucose level. It should be low in fat, high in fiber, and rich in complex carbohydrates. It should also be low in calories and high in protein. You should avoid eating foods that are high in sugar and refined carbohydrates, such as white bread, pasta, and rice. Instead, opt for whole grains, fruits, and vegetables. You should also avoid eating foods that are high in saturated fat, such as red meat, butter, and cheese. Instead, opt for lean proteins, such as chicken and fish. You should also avoid eating foods that are high


In [9]:
import torch
import faiss
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# 2️⃣ Load the food table and the FAISS index
df = pd.read_csv("processed_food_data.csv")  # food records; search_food reads the title / description / tags columns
index = faiss.read_index("food_faiss.index")  # prebuilt FAISS index; presumably label i refers to row i of `df` — TODO confirm
embedding_model = SentenceTransformer("jhgan/ko-sbert-nli")  # sentence-embedding model used to encode queries (Korean SBERT per the original comment)

# 3️⃣ Food lookup (FAISS)
def search_food(query, top_k=3):
    """Embed ``query`` and return the most similar foods as prompt-ready text.

    Uses the module-level ``embedding_model``, ``index`` and ``df``; the labels
    returned by the index are used as positional row numbers into ``df``.

    Args:
        query: Free-text question to search for.
        top_k: Number of neighbours requested from the FAISS index.

    Returns:
        One line per hit, formatted as ``"<title>: <description> (tags: <tags>)"``
        and joined with newlines. Fewer than ``top_k`` lines (possibly an empty
        string) are returned when the index yields fewer neighbours.
    """
    query_embedding = embedding_model.encode([query], convert_to_tensor=True).cpu().numpy()
    _, indices = index.search(query_embedding, top_k)

    # FAISS fills missing neighbours with the label -1. Passing -1 to
    # `df.iloc` would silently select the LAST row, so drop those labels.
    hit_rows = [int(label) for label in indices[0] if label >= 0]
    results = df.iloc[hit_rows][["title", "description", "tags"]]

    # Render each hit as one line of context for the prompt.
    return "\n".join(
        f"{title}: {description} (tags: {tags})"
        for title, description, tags in results.itertuples(index=False, name=None)
    )

# 4️⃣ User question and retrieval
query = "Please recommend a diet for diabetic patients."
search_results = search_food(query)

# 5️⃣ Build the LLM prompt
# Four-section template with three slots: instruction, input, retrieved context.
# It ends right after "### Response:" so the model continues from there.
# NOTE(review): this rebinds the module-level `alpaca_prompt` that
# generate_response and formatting_prompts_func read; re-running those after
# this cell would use this template instead of the original one.
alpaca_prompt = "\n".join([
    "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
    "",
    "### Instruction:",
    "{}",
    "",
    "### Input:",
    "{}",
    "",
    "### Context:",
    "{}",
    "",
    "### Response:",
])

# The instruction is spelled out literally, the same question is also passed as
# the input, and the retrieved foods go into the context slot.
prompt_text = alpaca_prompt.format(
    "Please recommend a diet for diabetic patients.",
    query,
    search_results,
)

# 6️⃣ Generate an answer grounded in the retrieved information
inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

# `TextStreamer` is not imported in this cell; it comes from the earlier
# inference cell (`from transformers import TextStreamer`).
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)


<|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Please recommend a diet for diabetic patients.

### Input:
Please recommend a diet for diabetic patients.

### Context:
Chicken Apple Crunch Salad: This savory and sweet chicken apple crunch salad will delight your taste buds by pairing fresh flavors with nutrition. Chicken apple crunch salad is delicious and light, good for lunch, dinner, or a protein-filled snack. It has been modified for the dialysis diet to encourage healthy eating and reduce food-related stress. (tags: ['CKD Non-Dialysis', 'CKD Dialysis', 'Kidney-Friendly', 'Kid Friendly', 'Main Dish', 'Budget Friendly', 'Dinner', 'Lunch', 'Quick & Easy'])
Broccoli and Apple Salad: This kidney-friendly recipe is a kid favorite. Dice, chop, and stir—that’s all you need to create broccoli and apple salad. This healthy recipe is low in sodium and high in flavor. Caution: this recipe contains wal