In [1]:
# huggingface model load

import transformers
import torch
tokenizer = transformers.LlamaTokenizer.from_pretrained('chaoyi-wu/PMC_LLAMA_7B')
model = transformers.LlamaForCausalLM.from_pretrained('chaoyi-wu/PMC_LLAMA_7B')
model.cuda()  # move the model to GPU

tokenizer.pad_token = tokenizer.eos_token  # 또는 tokenizer.add_special_tokens({'pad_token': '[PAD]'})


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
2025-02-06 22:41:38.424362: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-06 22:41:38.438839: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory 

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# prompt_input = (
#     'Below is an instruction that describes a task, paired with an input that provides further context.'
#     'Write a response that appropriately completes the request.\n\n'
#     '### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:'
# )

# example = {
#     "instruction": "You're a doctor, kindly address the medical queries according to the patient's account. Answer with the best option directly.",
#     "input": (
#         "###Question: A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. "
#         "She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. "
#         "She otherwise feels well and is followed by a doctor for her pregnancy. "
#         "Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air."
#         "Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. "
#         "Which of the following is the best treatment for this patient?"
#         "###Options: A. Ampicillin B. Ceftriaxone C. Doxycycline D. Nitrofurantoin"
#     )
# }
# input_str = [prompt_input.format_map(example)]

# model_inputs = tokenizer(
#     input_str,
#     return_tensors='pt',
#     padding=True,
# )

# # print( f"\033[32mmodel_inputs\033[0m: { model_inputs }" )


# topk_output = model.generate(
#     model_inputs.input_ids.cuda(),
#     max_new_tokens=1000,
#     top_k=5
# )
# output_str = tokenizer.batch_decode(topk_output)
# print('model predict: ', output_str[0])

In [3]:
import random
import numpy as np
import pandas as pd  # pandas 임포트 추가
from datasets import load_dataset, Dataset, DatasetDict

# 시드 값 설정
SEED = 42

# Python의 random 모듈 시드 고정
random.seed(SEED)

# NumPy 시드 고정
np.random.seed(SEED)

# 1. 데이터셋 로드
dataset_name = "passionMan/diabetes_v14"
train_dataset = load_dataset(dataset_name, split="train")
test_dataset = load_dataset(dataset_name, split="test")

# 2. 중복 확인을 위한 train 데이터의 (input, output) 세트 생성
train_pairs = set((row["input"], row["output"]) for row in train_dataset)

# 3. 각 task별 샘플 개수 설정
task_sample_limits = {
    'qa1': 25,
    'qa2': 25,
    'qa3': 25,
    'nli': 25,
    're': 25,
    'ie': 25,
    "summarization": 25,
    "generation": 25,
    'alternative_diet': 25,
    'daily_diets': 25,
}
default_sample_limit = 50  # 나머지 task는 100개씩 샘플링

# 4. task별 샘플링 데이터 초기화
task_sampled_data = {}
# task_counts를 task_sample_limits로 초기화
task_counts = {task: 0 for task in task_sample_limits}

# 5. Task별 샘플링
# 테스트 데이터셋을 셔플링하여 랜덤 샘플링 보장
test_dataset = test_dataset.shuffle(seed=SEED)

for row in test_dataset:
    task = row["task"]
    dataset_value = row.get("dataset")  # dataset 컬럼 값 확인
    input_output_pair = (row["input"], row["output"])

    # qa_objective 세분화
    if task == "qa_objective":
        if dataset_value == "medqa":
            task = "qa_objective_1"  # medqa로 세분화
        elif dataset_value == "medmcqa":
            task = "qa_objective_2"  # medmcqa로 세분화
        else:
            continue  # medqa, medmcqa가 아니면 건너뛰기

    # 해당 task의 샘플링 제한 확인
    sample_limit = task_sample_limits.get(task, default_sample_limit)

    # 샘플링 조건 확인
    if task_counts.get(task, 0) < sample_limit and input_output_pair not in train_pairs:
        if task not in task_sampled_data:
            task_sampled_data[task] = []
        task_sampled_data[task].append(row)
        task_counts[task] += 1

    # 모든 task의 샘플링이 완료되면 종료
    if all(task_counts.get(task, 0) >= task_sample_limits.get(task, default_sample_limit) for task in task_sample_limits):
        break

# 6. 결과 출력 및 확인
for task, samples in task_sampled_data.items():
    print(f"Task: {task}, Sampled: {len(samples)}")
    for sample in samples[:5]:  # 첫 5개 샘플만 출력
        print(sample)

# 7. 필요 시 샘플링된 데이터 저장
# pandas를 사용하여 리스트의 딕셔너리를 데이터프레임으로 변환 후 Dataset으로 변환
sampled_dataset = DatasetDict({
    task: Dataset.from_pandas(pd.DataFrame(samples)) for task, samples in task_sampled_data.items()
})

# 저장 (필요시 주석 제거)
# sampled_dataset.save_to_disk("sampled_test_dataset")


Task: daily_diets, Sampled: 25
{'dataset': 'diabetes_food_hub', 'split_data': 'test', 'task': 'daily_diets', 'instruction': 'Design a daily dietary plan featuring a particular ingredient.', 'input': 'Create a diet that includes Chinese five-spice powder.', 'output': '{"Breakfast": "Spinach, Tomato and Feta Cheese Baked Egg", "Lunch": "Turkey Meatball \\u201cWonton\\u201d Soup with Bok Choy & Carrots", "Dinner": "Roasted Sweet Potatoes with Lemon-Dill Yogurt Sauce"}', 'text_length': 305, '__index_level_0__': 28288}
{'dataset': 'diabetes_food_hub', 'split_data': 'test', 'task': 'daily_diets', 'instruction': 'Create a daily meal plan that incorporates a given ingredient.', 'input': 'Create a diet that includes lower sodium soy sauce.', 'output': '{"Breakfast": "Curried Chickpea Stew with Roasted Vegetables", "Lunch": "Cranberry Almond Spinach Salad", "Dinner": "Turkey Meatball \\u201cWonton\\u201d Soup with Bok Choy & Carrots"}', 'text_length': 285, '__index_level_0__': 27690}
{'dataset':

In [6]:
import json
import re
import torch
from tqdm import tqdm
from transformers import TextStreamer

# 모델을 inference 모드로 설정
model.eval()

# 출력 파일 설정
output_file = "inference_results_pmc.jsonl"

# Task별 `max_new_tokens` 설정
task_max_new_tokens = {
    "alternative_diet": 512,
    "daily_diets": 1024,
    "dfh_info": 256,
    "generation": 2048,
    "ie": 128,
    "nli": 32,
    "qa1": 64,
    "qa2": 64,
    "qa3": 64,
    "re": 32,
    "summarization": 1024,
}

# 새로운 프롬프트 적용
prompt_input = (
    'Below is an instruction that describes a task, paired with an input that provides further context.\n'
    'Write a response that appropriately completes the request.\n\n'
    '### Instruction:\n{instruction}\n\n'
    '{input}\n\n'  # 기존 형식을 유지하면서 `input` 그대로 삽입
    '### Response:'
)

# 전체 샘플 개수
total_samples = sum(len(samples) for samples in sampled_dataset.values())

# 진행 상태 표시 (`tqdm`)
with open(output_file, 'w', encoding="utf-8") as f_out:
    with tqdm(total=total_samples, desc="Processing samples", unit="sample") as pbar:
        for task, sample in sampled_dataset.items():
            for samp in sample:
                try:
                    # Task별 `max_new_tokens` 설정
                    task_name = samp.get("task", "")
                    max_new_tokens = task_max_new_tokens.get(task_name, 8)  # 기본값 8

                    # 새로운 프롬프트 적용
                    prompt_text = prompt_input.format(
                        instruction=samp['instruction'],
                        input=samp['input']
                    )

                    # 입력 토큰 생성
                    inputs = tokenizer(
                        [prompt_text],
                        return_tensors="pt",
                        padding=True,  # 패딩 적용
                    )

                    # `input_ids`와 `attention_mask`만 GPU로 이동
                    input_ids = inputs.input_ids.cuda()
                    attention_mask = inputs.attention_mask.cuda()

                    # 모델 생성
                    output_tensor = model.generate(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        max_new_tokens=max_new_tokens,
                        temperature=0.7,  # 답변의 다양성과 논리성 조정
                        top_p=0.9,  # nucleus sampling
                        top_k=50,  # Top-k 필터링
                        repetition_penalty=1.2,  # 반복 방지
                        do_sample=True  # 확률적 샘플링 활성화
                    )

                    # 출력 디코딩
                    model_output = tokenizer.batch_decode(output_tensor, skip_special_tokens=True)[0]

                    # `### Response:` 뒤의 텍스트 추출
                    response_text = "No valid response found"
                    response_match = re.search(r"### Response:\s*(.+)", model_output, re.DOTALL)
                    if response_match:
                        response_text = response_match.group(1).strip()

                    # 모델 출력 결과를 샘플 데이터에 추가
                    samp['model_output'] = response_text

                    # JSONL 형식으로 저장
                    f_out.write(json.dumps(samp, ensure_ascii=False) + "\n")

                except Exception as e:
                    print(f"❌ Error processing sample {samp}: {str(e)}")

                # tqdm 진행 상태 업데이트
                pbar.update(1)


Processing samples:   8%|▊         | 21/250 [10:37<1:52:39, 29.52s/sample]

In [5]:
# data load

In [9]:
import json
import pandas as pd

file_path = "/data/jaesung/llm_for_diabetes/src/model/inference_results_pmc.jsonl"

data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)

In [10]:
# medqa

import pandas as pd
qa_objective_df = df[df['task']=='qa1']

# 'output'와 'model_output'에서 A), B), C), D)만 추출
qa_objective_df['output_label'] = qa_objective_df['output'].str.extract(r'(A\)|B\)|C\)|D\))')
qa_objective_df['model_output_label'] = qa_objective_df['model_output'].str.extract(r'(A\)|B\)|C\)|D\))')

# 두 컬럼 비교하여 맞은 경우를 계산
qa_objective_df['correct'] = qa_objective_df['output_label'] == qa_objective_df['model_output_label']

# Accuracy 계산
accuracy = qa_objective_df['correct'].mean()

print(f"Accuracy: {accuracy:.2%}")


Accuracy: 8.00%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_objective_df['output_label'] = qa_objective_df['output'].str.extract(r'(A\)|B\)|C\)|D\))')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_objective_df['model_output_label'] = qa_objective_df['model_output'].str.extract(r'(A\)|B\)|C\)|D\))')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_ob

In [11]:
# medmcqa

import pandas as pd
qa_subjective_df = df[df['task']=='qa2']

# 'output'와 'model_output'에서 A), B), C), D)만 추출
qa_subjective_df['output_label'] = qa_subjective_df['output'].str.extract(r'(A\)|B\)|C\)|D\))')
qa_subjective_df['model_output_label'] = qa_subjective_df['model_output'].str.extract(r'(A\)|B\)|C\)|D\))')

# 두 컬럼 비교하여 맞은 경우를 계산
qa_subjective_df['correct'] = qa_subjective_df['output_label'] == qa_subjective_df['model_output_label']

# Accuracy 계산
accuracy = qa_subjective_df['correct'].mean()

print(f"Accuracy: {accuracy:.2%}")

Accuracy: 40.00%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_subjective_df['output_label'] = qa_subjective_df['output'].str.extract(r'(A\)|B\)|C\)|D\))')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_subjective_df['model_output_label'] = qa_subjective_df['model_output'].str.extract(r'(A\)|B\)|C\)|D\))')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q

In [12]:
# pubmedqa

def token_overlap(output, model_output):
    # 토큰화
    output_tokens = set(output.split())
    model_output_tokens = set(model_output.split())
    
    # 공통 토큰 개수 계산
    common_tokens = output_tokens.intersection(model_output_tokens)
    
    # 일치 비율 계산
    return len(common_tokens) / len(output_tokens) if len(output_tokens) > 0 else 0

qa_descriptive_df = df[df['task'] == 'qa3']

# 일치 비율 계산 및 저장
qa_descriptive_df['token_match_score'] = qa_descriptive_df.apply(lambda row: token_overlap(row['output'], row['model_output']), axis=1)

# 평균 점수를 정확도로 간주
accuracy = qa_descriptive_df['token_match_score'].mean()

print(f"Token Match Accuracy: {accuracy:.2%}")


Token Match Accuracy: 0.00%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_descriptive_df['token_match_score'] = qa_descriptive_df.apply(lambda row: token_overlap(row['output'], row['model_output']), axis=1)


In [13]:
# nli

nli_df = df[df['task'] == 'nli']

correct_predictions = (nli_df['output'] == nli_df['model_output']).sum()
total_predictions = len(nli_df) 

nli_acc = correct_predictions / total_predictions

print(nli_acc)  # bioinstruct - 0.33 

0.0


In [14]:
# re

import pandas as pd

df_re = df[df['task'] == 're']

# Precision, Recall, F1 계산 함수 (전체)
def calculate_total_metrics(output_col, model_output_col):
    total_true_positive = 0
    total_false_positive = 0
    total_false_negative = 0

    for output, model_output in zip(output_col, model_output_col):
        # ','로 구분된 문자열을 집합으로 변환
        output_set = set(output.split(', '))
        model_output_set = set(model_output.split(', '))

        # 교집합, 정답의 크기, 모델 예측의 크기 계산
        true_positive = len(output_set & model_output_set)
        false_positive = len(model_output_set - output_set)
        false_negative = len(output_set - model_output_set)

        # 누적 합산
        total_true_positive += true_positive
        total_false_positive += false_positive
        total_false_negative += false_negative

    # 총 Precision, Recall, F1 계산
    precision = total_true_positive / (total_true_positive + total_false_positive) if (total_true_positive + total_false_positive) > 0 else 0
    recall = total_true_positive / (total_true_positive + total_false_negative) if (total_true_positive + total_false_negative) > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

# 메트릭 계산
total_precision, total_recall, total_f1 = calculate_total_metrics(df_re['output'], df_re['model_output'])

# 결과 출력
print(f"Total Precision: {total_precision:.4f}")
print(f"Total Recall: {total_recall:.4f}")
print(f"Total F1-Score: {total_f1:.4f}")


Total Precision: 0.0000
Total Recall: 0.0000
Total F1-Score: 0.0000


In [15]:
# ie

from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

# 데이터프레임 예시 (df_ie 가 주어진 데이터프레임)
df_ie = df[df['task'] == 'ie']

df_ie["output"] = df_ie["output"].str.lower().str.split(", ")
df_ie["model_output"] = df_ie["model_output"].str.lower().str.split(", ")

# Precision, Recall, F1-score 계산 함수
def calculate_scores(y_true, y_pred):
    all_precisions = []
    all_recalls = []
    all_f1s = []
    
    for true_vals, pred_vals in zip(y_true, y_pred):
        true_set = set(true_vals) if isinstance(true_vals, list) else set()
        pred_set = set(pred_vals) if isinstance(pred_vals, list) else set()

        TP = len(true_set & pred_set)  # True Positives (정답과 예측이 일치하는 것)
        FP = len(pred_set - true_set)  # False Positives (예측했지만 정답이 아닌 것)
        FN = len(true_set - pred_set)  # False Negatives (정답이지만 예측하지 못한 것)

        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1s.append(f1)

    return sum(all_precisions) / len(all_precisions), sum(all_recalls) / len(all_recalls), sum(all_f1s) / len(all_f1s)

# Precision, Recall, F1-score 계산
precision, recall, f1 = calculate_scores(df_ie["output"], df_ie["model_output"])

# 결과 출력
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Precision: 0.0137
Recall: 0.0213
F1-score: 0.0160


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ie["output"] = df_ie["output"].str.lower().str.split(", ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ie["model_output"] = df_ie["model_output"].str.lower().str.split(", ")


In [16]:
# summarization
import openai
import pandas as pd
import re
from sklearn.metrics import f1_score
from bert_score import score as bert_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# BLEURT 모델 로드
bleurt_model_name = "Elron/bleurt-large-512"
tokenizer = AutoTokenizer.from_pretrained(bleurt_model_name)
bleurt_model = AutoModelForSequenceClassification.from_pretrained(bleurt_model_name)
bleurt_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bleurt_model = bleurt_model.to(device)

MAX_CONTEXT_LENGTH = 8192

# GPT-4 평가 함수
def evaluate_with_gpt4(input_text, model_output, true_output):
    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    prompt = f"""
    You are tasked with evaluating the quality of a QA model's responses based on the following metrics:
    1. **Coherence**: Does the model's response logically align with the context provided in the input?
    2. **Completeness**: Does the model's response sufficiently answer the question in the input?
    3. **Naturalness**: Does the model's response sound fluent and human-like?

    **Input**:
    {input_text}

    **Model's Response**:
    {model_output}

    **True Answer**:
    {true_output}

    Please rate each metric on a scale from 1 to 5. 
    Example response format:
    - Coherence: X.X
    - Completeness: X.X
    - Naturalness: X.X
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for Summarization models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

# GPT-4 점수 추출
def extract_scores(evaluation):
    if evaluation is None:
        return {"Coherence": 0.0, "Completeness": 0.0, "Naturalness": 0.0}
    coherence = re.search(r"Coherence: (\d\.\d)", evaluation)
    completeness = re.search(r"Completeness: (\d\.\d)", evaluation)
    naturalness = re.search(r"Naturalness: (\d\.\d)", evaluation)
    
    return {
        "Coherence": float(coherence.group(1)) if coherence else 0.0,
        "Completeness": float(completeness.group(1)) if completeness else 0.0,
        "Naturalness": float(naturalness.group(1)) if naturalness else 0.0
    }

# BLEURT 점수 계산
def calculate_bleurt(y_true, y_pred):
    inputs = tokenizer(y_pred, y_true, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        scores = bleurt_model(**inputs).logits

    if scores.numel() == 1:
        return float(scores.squeeze().item())  
    return [float(score) for score in scores.squeeze().tolist()]  

# BLEURT 및 BERTScore 계산
def calculate_bleurt_and_bertscore(y_true, y_pred):
    bleurt_score_value = calculate_bleurt(y_true, y_pred)
    _, _, bert_f1 = bert_score(y_pred, y_true, lang="en", rescale_with_baseline=True)
    bert_f1_avg = sum(bert_f1) / len(bert_f1) if len(bert_f1) > 0 else 0

    return {
        "BLEURT": bleurt_score_value if isinstance(bleurt_score_value, float) else sum(bleurt_score_value) / len(bleurt_score_value),
        "BERTScore_F1": bert_f1_avg
    }

# 점수 정규화 함수
def normalize_scores(df, column):
    if column not in df.columns:
        print(f"Warning: Column {column} not found in DataFrame. Skipping normalization.")
        return df
    df[column] = df[column].apply(lambda x: float(x) if isinstance(x, torch.Tensor) else x)
    min_val, max_val = df[column].min(), df[column].max()
    df[column] = df[column].apply(lambda x: (x - min_val) / (max_val - min_val) if max_val > min_val else 0.5)
    return df

qa_df = df[df['task'] == 'summarization']
results = []

for _, row in qa_df.iterrows():
    input_text, model_output, true_output = row['input'], row['model_output'], row['output']
    
    evaluation = evaluate_with_gpt4(input_text, model_output, true_output)
    print(f"Evaluation result:\n{evaluation}")
    
    scores = extract_scores(evaluation)
    metric_scores = calculate_bleurt_and_bertscore([true_output], [model_output])

    results.append({
        "input": input_text,
        "model_output": model_output,
        "true_output": true_output,
        "evaluation": evaluation,
        "Coherence": scores["Coherence"],
        "Completeness": scores["Completeness"],
        "Naturalness": scores["Naturalness"],
        "BLEURT": metric_scores.get("BLEURT", 0.0),  # 기본값 설정
        "BERTScore_F1": metric_scores.get("BERTScore_F1", 0.0)
    })

evaluation_df = pd.DataFrame(results)

# 'BLEURT' 컬럼이 존재하는지 확인 후 정규화 수행
evaluation_df = normalize_scores(evaluation_df, "BLEURT")
evaluation_df = normalize_scores(evaluation_df, "BERTScore_F1")

average_scores = evaluation_df[["Coherence", "Completeness", "Naturalness", "BLEURT", "BERTScore_F1"]].mean()
print("평균 점수:")
print(average_scores)


Evaluation result:
Based on the provided context and the model's response being irrelevant and repetitive, I would evaluate the model as follows:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

The model's response lacks coherence, completeness, and naturalness as it does not address the content of the input or provide a meaningful response to the task.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 5.0
- Completeness: 1.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input, here is the evaluation of the model's response:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

Explanation:
- The model's response does not demonstrate coherence as it does not align logically with the context of the input. It simply acknowledges a comment without addressing the main topic of impaired fasting glucose among perinatally HIV-infected adolescents and youths in Dar es Salaam, Tanzania.
- The response lacks completeness as it fails to sufficiently answer the question or provide any meaningful information related to the study discussed in the input.
- In terms of naturalness, the response is short and generic, lacking human-like fluency or relevance to the topic at hand.

Overall, the model's response falls short in terms of coherence, completeness, and naturalness.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 5.0
- Completeness: 4.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 5.0
- Completeness: 4.0
- Naturalness: 5.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
This response lacks any actual content to evaluate based on the provided input. Therefore, I cannot rate the model's performance on Coherence, Completeness, and Naturalness. The model did not generate a response that can be assessed based on the given metrics.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the given context and the model's response, here is the evaluation for each metric:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

Overall, the model's response scored poorly on all metrics as it completely failed to provide a relevant and complete answer in a natural, human-like manner.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the evaluation of the model's response, here are the ratings for each metric:

- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 4.0

Overall, the model's response aligns logically with the context provided, sufficiently answers the question, and sounds relatively fluent and human-like.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided information, here is the evaluation of the QA model's responses:

- Coherence: 2.5
- Completeness: 2.0
- Naturalness: 3.0

Overall, the model's responses partially address the comments provided but lack depth and specificity. The responses appear somewhat coherent with the input context but are incomplete in addressing the specific details requested. The naturalness of the responses is reasonable, sounding polite and professional. Further improvement is needed to enhance completeness and address the specific requests for additional details.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the given input and the model's response, here is the evaluation of the QA model's responses:

- Coherence: 2.0
- Completeness: 1.5
- Naturalness: 2.0

Explanation:

- **Coherence (2.0)**: The model's response partially aligns with the context provided in the input by acknowledging the study's aim to identify risk factors for diabetes in chronic pancreatitis. However, there is repetition in the response which affects the coherence.

- **Completeness (1.5)**: The model's response lacks completeness as it fails to provide a detailed or comprehensive answer to the input. It does not discuss specific risk factors or outcomes mentioned in the original text.

- **Naturalness (2.0)**: The response is not very natural as it appears to be repetitive and does not present information in a fluent or human-like manner. The repetition of the same content negatively impacts the naturalness of the response.

Overall, the model's response needs improvement in terms of cohere

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided information, here is the evaluation for the model's response:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

The model's response lacks coherence, completeness, and naturalness as it repeatedly provides the same instruction without generating a meaningful summary of the article.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 5.0
- Completeness: 3.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
I would evaluate the model's response based on the given metrics as follows:

- Coherence: 2.0
The response does not align logically with the context provided in the input about the epidemiology and clinical features of infectious pathology in patients with diabetes mellitus.

- Completeness: 1.5
The response does not sufficiently answer the question in the input about the risk of community-acquired pneumonia in patients with diabetes mellitus.

- Naturalness: 3.0
The response sounds relatively fluent and could be considered somewhat human-like in tone.

Overall, the model's response lacks coherence and completeness, but it is moderately natural in terms of language flow.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided information, here is the evaluation of the model's responses:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

The model's responses did not address the input question at all, leading to a low score across all metrics.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 1.0
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
**Summary Evaluation**:
- Coherence: 5.0
- Completeness: 4.5
- Naturalness: 5.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the evaluation of the model's response, the ratings are as follows:

- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input and model responses, here is the evaluation of the QA model's responses:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

The model response provided ("Thank you for your comment. We have revised the manuscript to address your concerns.") does not align with the input question about hyperglycemia, symptoms, and symptom clusters in colorectal cancer survivors with type 2 diabetes. It lacks both coherence and completeness as it does not sufficiently answer the input question. Additionally, the response does not exhibit naturalness as it does not sound fluent or human-like. 

Overall, the model's response is not relevant or appropriate in the context of the input question, and it fails to meet the criteria for coherence, completeness, and naturalness.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input and the model's responses, here is the evaluation for the QA model:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

Overall, the model's responses are not relevant or useful in answering the questions posed by the comments. The responses lack coherence, completeness, and naturalness as they do not provide any meaningful information related to the input context.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided information, here is the evaluation of the model's response for the QA task:

- Coherence: 2.0
  - The response touches on the burden of disease but fails to address the details mentioned in the input about the specific diseases, population demographics, and other key aspects related to fruit and vegetable intake.

- Completeness: 1.5
  - The response lacks completeness as it only briefly mentions the estimated deaths and DALYs without providing a comprehensive overview of the burden of disease attributable to low fruit and vegetable intake in South Africa for the specified years.

- Naturalness: 3.0
  - The response is relatively fluent and coherent in terms of language and structure, but it lacks depth and specificity in addressing the query.

Overall, the model's response falls short in coherence and completeness while achieving moderate naturalness. Additional improvements are needed to enhance the model's performance in providing accurate a

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input, model's responses, and true answer, here is the evaluation of the QA model's responses:

- Coherence: 5.0
- Completeness: 1.0
- Naturalness: 2.0

Explanation:
1. **Coherence (5.0)**: The responses consistently address the comment made by providing the same generic statement, which aligns with the context. The responses are logically coherent with the input.
   
2. **Completeness (1.0)**: The responses lack completeness as they repeatedly provide a vague statement without actually addressing the specific request for more information about the compounds in SJHK. The responses are not sufficiently answering the question.

3. **Naturalness (2.0)**: The responses lack variety and creativity, with the same generic phrase being repeated multiple times. This lack of originality and variation affects the naturalness of the responses, making them sound robotic and non-human-like.

Overall, while the responses are coherent, they severely lack comple

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
**True Answer**:

- Objective: The objective of the study was to investigate the causes of lipohypertrophy in insulin-requiring subjects with type 2 diabetes and to assess the educational deficiencies in healthcare providers regarding correct injection techniques.
- Methods: The study surveyed 1160 insulin-requiring subjects with type 2 diabetes and conducted physical examinations to identify lipohypertrophy. Educational and injection behavior differences were analyzed between subjects with and without lipohypertrophy.
- Results: The study revealed significant educational gaps with many patients not receiving proper training on injection techniques from healthcare providers. Patients with lipohypertrophy often relied on peers for education, highlighting the need for repeated education for doctors and nurses.
- Conclusion: The findings emphasize the urgent need to educate healthcare professionals on correct insulin injection techniques to enhance patient knowledge and

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


평균 점수:
Coherence       2.600000
Completeness    1.900000
Naturalness     2.320000
BLEURT          0.529392
BERTScore_F1    0.888975
dtype: float64


In [13]:
# generation

import openai
import pandas as pd
import re
from sklearn.metrics import f1_score
from bert_score import score as bert_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# BLEURT 모델 로드
bleurt_model_name = "Elron/bleurt-large-512"
tokenizer = AutoTokenizer.from_pretrained(bleurt_model_name)
bleurt_model = AutoModelForSequenceClassification.from_pretrained(bleurt_model_name)
bleurt_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bleurt_model = bleurt_model.to(device)

MAX_CONTEXT_LENGTH = 8192

# GPT-4 평가 함수
def evaluate_with_gpt4(input_text, model_output, true_output):
    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    prompt = f"""
    You are tasked with evaluating the quality of a QA model's responses based on the following metrics:
    1. **Coherence**: Does the model's response logically align with the context provided in the input?
    2. **Completeness**: Does the model's response sufficiently answer the question in the input?
    3. **Naturalness**: Does the model's response sound fluent and human-like?

    **Input**:
    {input_text}

    **Model's Response**:
    {model_output}

    **True Answer**:
    {true_output}

    Please rate each metric on a scale from 1 to 5. 
    Example response format:
    - Coherence: X.X
    - Completeness: X.X
    - Naturalness: X.X
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for Summarization models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

# GPT-4 점수 추출
def extract_scores(evaluation):
    if evaluation is None:
        return {"Coherence": 0.0, "Completeness": 0.0, "Naturalness": 0.0}
    coherence = re.search(r"Coherence: (\d\.\d)", evaluation)
    completeness = re.search(r"Completeness: (\d\.\d)", evaluation)
    naturalness = re.search(r"Naturalness: (\d\.\d)", evaluation)
    
    return {
        "Coherence": float(coherence.group(1)) if coherence else 0.0,
        "Completeness": float(completeness.group(1)) if completeness else 0.0,
        "Naturalness": float(naturalness.group(1)) if naturalness else 0.0
    }

# BLEURT 점수 계산
def calculate_bleurt(y_true, y_pred):
    inputs = tokenizer(y_pred, y_true, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        scores = bleurt_model(**inputs).logits

    if scores.numel() == 1:
        return float(scores.squeeze().item())  
    return [float(score) for score in scores.squeeze().tolist()]  

# BLEURT 및 BERTScore 계산
def calculate_bleurt_and_bertscore(y_true, y_pred):
    bleurt_score_value = calculate_bleurt(y_true, y_pred)
    _, _, bert_f1 = bert_score(y_pred, y_true, lang="en", rescale_with_baseline=True)
    bert_f1_avg = sum(bert_f1) / len(bert_f1) if len(bert_f1) > 0 else 0

    return {
        "BLEURT": bleurt_score_value if isinstance(bleurt_score_value, float) else sum(bleurt_score_value) / len(bleurt_score_value),
        "BERTScore_F1": bert_f1_avg
    }

# 점수 정규화 함수
def normalize_scores(df, column):
    if column not in df.columns:
        print(f"Warning: Column {column} not found in DataFrame. Skipping normalization.")
        return df
    df[column] = df[column].apply(lambda x: float(x) if isinstance(x, torch.Tensor) else x)
    min_val, max_val = df[column].min(), df[column].max()
    df[column] = df[column].apply(lambda x: (x - min_val) / (max_val - min_val) if max_val > min_val else 0.5)
    return df

qa_df = df[df['task'] == 'generation']
results = []

for _, row in qa_df.iterrows():
    input_text, model_output, true_output = row['input'], row['model_output'], row['output']
    
    evaluation = evaluate_with_gpt4(input_text, model_output, true_output)
    print(f"Evaluation result:\n{evaluation}")
    
    scores = extract_scores(evaluation)
    metric_scores = calculate_bleurt_and_bertscore([true_output], [model_output])

    results.append({
        "input": input_text,
        "model_output": model_output,
        "true_output": true_output,
        "evaluation": evaluation,
        "Coherence": scores["Coherence"],
        "Completeness": scores["Completeness"],
        "Naturalness": scores["Naturalness"],
        "BLEURT": metric_scores.get("BLEURT", 0.0),  # 기본값 설정
        "BERTScore_F1": metric_scores.get("BERTScore_F1", 0.0)
    })

evaluation_df = pd.DataFrame(results)

# 'BLEURT' 컬럼이 존재하는지 확인 후 정규화 수행
evaluation_df = normalize_scores(evaluation_df, "BLEURT")
evaluation_df = normalize_scores(evaluation_df, "BERTScore_F1")

average_scores = evaluation_df[["Coherence", "Completeness", "Naturalness", "BLEURT", "BERTScore_F1"]].mean()
print("평균 점수:")
print(average_scores)


Evaluation result:
- Coherence: 3.5
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 2.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Certainly! Here is the evaluation of the QA model's response based on the given metrics:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

Explanation:
- **Coherence (1.0)**: The model's response lacks coherence as it repeatedly states "I have seen your query" without providing any meaningful information or addressing the context provided in the input.
- **Completeness (1.0)**: The response is completely incomplete as it fails to address any of the questions or concerns raised in the input, providing no helpful information to the user.
- **Naturalness (1.0)**: The response lacks naturalness as it consists of repetitive and nonsensical phrases that do not resemble human-like communication.

Overall, the QA model's response performs poorly on all evaluated metrics, indicating a significant room for improvement in generating relevant and coherent responses.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input and true answer, here is the evaluation of the QA model's response:

- Coherence: 1.0
- Completeness: 3.0
- Naturalness: 1.0

The model's response lacks coherence as it veers off into generic information about erectile dysfunction without directly addressing the specific questions asked in the input. It also includes repetitive and irrelevant information. In terms of completeness, the response addresses some aspects of the query but fails to directly answer the questions about diagnosis and treatment options for erectile dysfunction. Additionally, the response lacks naturalness as it feels robotic, overly verbose, and repetitive.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided metrics, here is the evaluation of the model's response:

- Coherence: 1.0
- Completeness: 2.0
- Naturalness: 1.0

Overall Assessment:
The model's response scored low in coherence and naturalness, as it repetitively lists the patient's history without providing a clear and concise response. In terms of completeness, while the model captures some of the patient's symptoms and history, it lacks a clear and actionable treatment plan as present in the true answer. Further improvement is needed to enhance the quality of the response.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 2.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the given input and the model's response, here is the evaluation:

- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 2.5

The model's response shows moderate coherence with some alignment to the symptoms described in the input. However, the response lacks completeness in addressing all the specific details mentioned by the patient. Additionally, the response appears somewhat robotic and lacks the natural flow of human-like language.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 2.0
- Naturalness: 1.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 1.5
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 1.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 3.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


평균 점수:
Coherence       3.020000
Completeness    2.700000
Naturalness     2.580000
BLEURT          0.542391
BERTScore_F1    0.614151
dtype: float64


In [14]:
# daily diets

import openai
import pandas as pd
import re
from tqdm import tqdm

from dotenv import load_dotenv
import os

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

MAX_CONTEXT_LENGTH = 8192

def evaluate_with_gpt4(input_text, model_output, true_output, task_type):
    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    if task_type == "daily_diets":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:
        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Nutritional Adequacy**: Does the meal align with the nutritional goals in the input, considering reasonable flexibility and practical applicability in real-life scenarios?
        5. **Caloric Balance**: Are the recommended meals well-balanced in terms of calories?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Nutritional Adequacy: X.X
        - Caloric Balance: X.X
        """
    elif task_type == "alternative_diets":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:
        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Improvement**: Does the recommended meal address the shortcomings of the previous meal?
        5. **Suitability**: Is the recommended meal suitable for a diabetes patient?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Improvement: X.X
        - Suitability: X.X
        """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for meal recommendation models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

# 점수 추출 함수
def extract_scores(evaluation, task_type):
    if evaluation is None:
        if task_type == "daily_diets":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Nutritional Adequacy": 0.0,
                "Caloric Balance": 0.0
            }
        elif task_type == "alternative_diets":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Improvement": 0.0,
                "Suitability": 0.0
            }

    scores = {}
    if task_type == "daily_diets":
        metrics = ["Coherence", "Completeness", "Naturalness", "Nutritional Adequacy", "Caloric Balance"]
    elif task_type == "alternative_diets":
        metrics = ["Coherence", "Completeness", "Naturalness", "Improvement", "Suitability"]

    for metric in metrics:
        match = re.search(fr"{metric}: (\d\.\d)", evaluation)
        scores[metric] = float(match.group(1)) if match else 0.0

    return scores

results = []

daily_df = df[(df['task'] == 'daily_diets') & (df['output'].str.contains('"Breakfast"'))]

for _, row in tqdm(daily_df.iterrows(), total=len(daily_df), desc="Evaluating daily diets"):
    evaluation = evaluate_with_gpt4(row['input'], row['model_output'], row['output'], "daily_diets")
    scores = extract_scores(evaluation, "daily_diets")
    results.append({**row.to_dict(), **scores})

alternative_df = df[df['task'] == 'alternative_diets']

for _, row in tqdm(alternative_df.iterrows(), total=len(alternative_df), desc="Evaluating alternative diets"):
    evaluation = evaluate_with_gpt4(row['input'], row['model_output'], row['output'], "alternative_diets")
    scores = extract_scores(evaluation, "alternative_diets")
    results.append({**row.to_dict(), **scores})

evaluation_df = pd.DataFrame(results)

if "Nutritional Adequacy" in evaluation_df.columns and "Caloric Balance" in evaluation_df.columns:
    daily_avg = evaluation_df[evaluation_df['task'] == 'daily_diets'][[
        "Coherence", "Completeness", "Naturalness", 
        "Nutritional Adequacy", "Caloric Balance"
    ]].mean()
    print("Daily Diets Average Scores:")
    print(daily_avg)

if "Improvement" in evaluation_df.columns and "Suitability" in evaluation_df.columns:
    alternative_avg = evaluation_df[evaluation_df['task'] == 'alternative_diets'][[
        "Coherence", "Completeness", "Naturalness", 
        "Improvement", "Suitability"
    ]].mean()
    print("\nAlternative Diets Average Scores:")
    print(alternative_avg)
else:
    missing_columns = [col for col in ["Improvement", "Suitability"] if col not in evaluation_df.columns]
    print("\nAlternative Diets scores not available:")
    print(f"Missing columns: {missing_columns}")


Evaluating daily diets: 100%|██████████| 21/21 [00:20<00:00,  1.03it/s]
Evaluating alternative diets: 0it [00:00, ?it/s]

Daily Diets Average Scores:
Coherence               3.761905
Completeness            3.000000
Naturalness             3.880952
Nutritional Adequacy    3.000000
Caloric Balance         3.309524
dtype: float64

Alternative Diets scores not available:
Missing columns: ['Improvement', 'Suitability']





In [15]:
# alternative diets

import openai
import pandas as pd
import re
from tqdm import tqdm

from dotenv import load_dotenv
import os

load_dotenv()


openai.api_key = os.getenv("OPENAI_API_KEY")

MAX_CONTEXT_LENGTH = 8192

def evaluate_with_gpt4(input_text, model_output, true_output, task_type):

    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    if task_type == "daily_diets":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:
        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Nutritional Adequacy**: Does the meal response meet the nutritional goals mentioned in the input?
        5. **Caloric Balance**: Are the recommended meals well-balanced in terms of calories?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Nutritional Adequacy: X.X
        - Caloric Balance: X.X
        """
    elif task_type == "alternative_diet":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:

        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Improvement**: Assume that the recommended alternative meal is an improvement over the previous meal. Evaluate how effectively it builds upon and enhances the previous meal, even if the changes are small or subtle.
        5. **Suitability**: Is the recommended meal suitable for a diabetes patient?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Improvement: X.X
        - Suitability: X.X
        """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for meal recommendation models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

def extract_scores(evaluation, task_type):
    if evaluation is None:
        if task_type == "daily_diets":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Nutritional Adequacy": 0.0,
                "Caloric Balance": 0.0
            }
        elif task_type == "alternative_diet":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Improvement": 0.0,
                "Suitability": 0.0
            }

    scores = {}
    if task_type == "daily_diets":
        metrics = ["Coherence", "Completeness", "Naturalness", "Nutritional Adequacy", "Caloric Balance"]
    elif task_type == "alternative_diet":
        metrics = ["Coherence", "Completeness", "Naturalness", "Improvement", "Suitability"]

    for metric in metrics:
        match = re.search(fr"{metric}: (\d\.\d)", evaluation)
        scores[metric] = float(match.group(1)) if match else 0.0

    return scores

results = []

alternative_df = df[df['task'] == 'alternative_diet']

for _, row in tqdm(alternative_df.iterrows(), total=len(alternative_df), desc="Evaluating alternative diets"):
    evaluation = evaluate_with_gpt4(row['input'], row['model_output'], row['output'], "alternative_diet")
    scores = extract_scores(evaluation, "alternative_diet")
    results.append({**row.to_dict(), **scores})

evaluation_df = pd.DataFrame(results)

if all(col in evaluation_df.columns for col in ["Improvement", "Suitability"]):
    alternative_avg = evaluation_df[[
        "Coherence", "Completeness", "Naturalness", 
        "Improvement", "Suitability"
    ]].mean()
    print("\nAlternative Diets Average Scores:")
    print(alternative_avg)


Evaluating alternative diets:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluating alternative diets: 100%|██████████| 25/25 [00:23<00:00,  1.07it/s]


Alternative Diets Average Scores:
Coherence       4.32
Completeness    4.00
Naturalness     4.00
Improvement     3.32
Suitability     3.98
dtype: float64





In [16]:
import pandas as pd

dfh = pd.read_csv("/data/jaesung/llm_for_diabetes/src/data/data2_daily_diets/diabetes_food_hub_new_nutri_facts.csv")
dfh.head(2)

Unnamed: 0,title,description,prep_time,cook_time,servings,steps,tags,nutrition_facts,ingredients
0,Raspberry Swirl Frozen Yogurt Bark,Raspberry Swirl Frozen Yogurt Bark: Dive into ...,10 min,4 hr,6 Servings,['Cover a freezer-safe tray with parchment pap...,"['Kid Friendly', 'Vegetarian', 'Dessert', 'Sna...","{'Servings': '6 Servings', 'Serving Size': '1 ...","[{'label': 'Plain Nonfat Greek yogurt', 'us_me..."
1,Maple-Pumpkin Spice Oatmeal Cookies,Description not found,10 min,25 min,14 Servings,['Preheat the oven to 350 degrees F. Line two ...,"['Kid Friendly', 'Vegetarian', 'Snacks', 'Glut...","{'Servings': '14 Servings', 'Serving Size': '1...","[{'label': 'old-fashioned rolled oats', 'us_me..."


In [17]:
# daily diet - nutri score

import ast
import pandas as pd
import json
import re
import openai
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from dotenv import load_dotenv
import os
from tqdm import tqdm

# Load environment variables
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def extract_numeric_value(value):
    try:
        if isinstance(value, str):
            match = re.search(r"(\d+(\.\d+)?)", value)
            if match:
                return float(match.group(1))
        elif isinstance(value, (int, float)):
            return float(value)
    except Exception as e:
        print(f"Error in extract_numeric_value: {e}, value: {value}")
    return 0.0

def is_valid_meal_structure(json_string):
    try:
        data = json.loads(json_string)
        return all(key in data for key in ['Breakfast', 'Lunch', 'Dinner'])
    except (json.JSONDecodeError, TypeError):
        return False

def find_most_similar_row(title, dfh):
    try:
        dfh['title'] = dfh['title'].fillna('')  # Handle NaN values
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(dfh['title'])
        input_vector = vectorizer.transform([title])
        similarities = cosine_similarity(input_vector, tfidf_matrix)
        most_similar_idx = similarities.argmax()
        return dfh.iloc[most_similar_idx]
    except Exception as e:
        print(f"Error in find_most_similar_row: {e}, title: {title}")
        return None

def identify_fruit_veg(ingredients_list):
    try:
        prompt = f"Identify which items in the following ingredient list are fruits or vegetables:\n\n{ingredients_list}\n\nReturn only the names of items that are fruits or vegetables in a Python list format."
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an assistant identifying fruits and vegetables."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            temperature=0
        )
        fruits_vegetables = response['choices'][0]['message']['content']
        return ast.literal_eval(fruits_vegetables)
    except Exception as e:
        print(f"Error identifying fruits and vegetables: {e}")
        return []

def calculate_fruit_veg_points(ingredients, total_weight):
    try:
        ingredients_list = ast.literal_eval(ingredients)
        fruit_veg_labels = identify_fruit_veg(ingredients_list)

        fruit_veg_weight = 0
        for ingredient in ingredients_list:
            label = ingredient.get('label', '')
            weight = extract_numeric_value(ingredient.get('metric_measure', 0))
            if label in fruit_veg_labels:
                fruit_veg_weight += weight

        # 과일/채소 비율을 100g 기준으로 변환
        fruit_veg_ratio = (fruit_veg_weight / total_weight) * 100 if total_weight > 0 else 0

        if fruit_veg_ratio > 80:
            return 5
        elif fruit_veg_ratio > 60:
            return 2
        elif fruit_veg_ratio > 40:
            return 1
        else:
            return 0
    except Exception as e:
        print(f"Error calculating fruit_veg_points: {e}")
        return 0

def extract_nested_value(data, keys, default=0):
    try:
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, {})
            else:
                return default
        return extract_numeric_value(data) if isinstance(data, (int, float, str)) else default
    except Exception as e:
        print(f"Error in extract_nested_value: {e}, keys: {keys}, data: {data}")
        return default

def calculate_nutri_score(nutrition_facts, ingredients):
    try:
        if isinstance(nutrition_facts, str):
            nutrition_facts = ast.literal_eval(nutrition_facts)

        # 전체 무게 계산
        total_weight = sum(
            extract_numeric_value(ingredient.get('metric_measure', 0)) 
            for ingredient in ast.literal_eval(ingredients)
        )
        if total_weight == 0:
            print("Warning: Total weight is zero. Skipping calculation.")
            return None

        # 100g 기준으로 성분 정규화
        energy = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Calories']) / total_weight * 100
        saturated_fat = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Fat', 'Amount']) / total_weight * 100
        sugar = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Total Sugars']) / total_weight * 100
        sodium = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Sodium']) / total_weight * 100
        fiber = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Dietary Fiber']) / total_weight * 100
        protein = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Protein']) / total_weight * 100

        # Unfavorable points calculation
        energy_points = min(energy / 80, 800)
        saturated_fat_points = min(saturated_fat / 1, 10)
        sugar_points = min(sugar / 4.5, 45)
        sodium_points = min(sodium / 90, 900)

        unfavorable_points = energy_points + saturated_fat_points + sugar_points + sodium_points

        # Favorable points calculation
        fiber_points = min(fiber / 0.7, 3.5)
        protein_points = min(protein / 1.6, 8.0)
        fruit_veg_points = calculate_fruit_veg_points(ingredients, total_weight)

        favorable_points = fiber_points + protein_points + fruit_veg_points

        # Final Nutri-Score calculation
        total_score = unfavorable_points - favorable_points
        return total_score
    except Exception as e:
        print(f"Error in calculate_nutri_score: {e}, nutrition_facts: {nutrition_facts}")
        return None

def get_nutri_score_grade(score):
    if score <= -1:
        return "A"
    elif score <= 2:
        return "B"
    elif score <= 10:
        return "C"
    elif score <= 18:
        return "D"
    else:
        return "E"

def calculate_meal_nutri_score(meal_data, dfh):
    meal_scores = {}

    for meal, title in meal_data.items():
        matched_row = find_most_similar_row(title, dfh)
        if matched_row is None:
            continue

        nutrition_facts = matched_row['nutrition_facts']
        ingredients = matched_row['ingredients']
        score = calculate_nutri_score(nutrition_facts, ingredients)

        if score is None:
            print(f"Warning: Nutri-Score calculation failed for meal '{meal}' with title '{title}'.")
            grade = "N/A"
        else:
            grade = get_nutri_score_grade(score)

        meal_scores[meal] = {'score': score, 'grade': grade}

    return meal_scores

def calculate_scores_with_comparison(df, dfh):
    results = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        output_scores = {}
        model_scores = {}
        if is_valid_meal_structure(row.get('output', '')):
            output_data = json.loads(row['output'])
            output_scores = calculate_meal_nutri_score(output_data, dfh)
        if is_valid_meal_structure(row.get('model_output', '')):
            model_data = json.loads(row['model_output'])
            model_scores = calculate_meal_nutri_score(model_data, dfh)
        results.append({'row_index': idx, 'output_scores': output_scores, 'model_scores': model_scores})
    return results

def calculate_average_scores(results):
    """
    Calculate the average Nutri-Scores for outputs and model outputs.
    """
    output_total_score = 0
    model_total_score = 0
    output_count = 0
    model_count = 0

    for result in results:
        # Extract output scores
        for meal, score_data in result['output_scores'].items():
            if score_data['score'] is not None:
                output_total_score += score_data['score']
                output_count += 1

        # Extract model scores
        for meal, score_data in result['model_scores'].items():
            if score_data['score'] is not None:
                model_total_score += score_data['score']
                model_count += 1

    # Calculate averages
    output_avg = output_total_score / output_count if output_count > 0 else None
    model_avg = model_total_score / model_count if model_count > 0 else None

    return output_avg, model_avg


# 'daily_diets' task Nutri-Score calculation
filtered_df = df[df['task'] == 'daily_diets']
results = calculate_scores_with_comparison(filtered_df, dfh)

# Calculate overall averages
output_avg, model_avg = calculate_average_scores(results)

# Print results
print("=== Results for Each Row ===")
for result in results:
    print(f"Row Index: {result['row_index']}")
    print(f"Output Scores: {result['output_scores']}")
    print(f"Model Output Scores: {result['model_scores']}")
    print()

print("=== Overall Averages ===")
print(f"Output Average Nutri-Score: {output_avg}")
print(f"Model Output Average Nutri-Score: {model_avg}")


  0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [01:27<00:00,  3.52s/it]

=== Results for Each Row ===
Row Index: 0
Output Scores: {'Breakfast': {'score': 1.2832821300563242, 'grade': 'B'}, 'Lunch': {'score': 0.12827070932539675, 'grade': 'B'}, 'Dinner': {'score': 0.1844070961718014, 'grade': 'B'}}
Model Output Scores: {'Breakfast': {'score': 0.8727017632030825, 'grade': 'B'}, 'Lunch': {'score': -0.13559683963271407, 'grade': 'B'}, 'Dinner': {'score': 0.12827070932539675, 'grade': 'B'}}

Row Index: 1
Output Scores: {'Breakfast': {'score': 0.27903677232536306, 'grade': 'B'}, 'Lunch': {'score': 1.2434792311769467, 'grade': 'B'}, 'Dinner': {'score': 0.12827070932539675, 'grade': 'B'}}
Model Output Scores: {'Breakfast': {'score': 1.210600194254846, 'grade': 'B'}, 'Lunch': {'score': 0.12827070932539675, 'grade': 'B'}, 'Dinner': {'score': 0.12827070932539675, 'grade': 'B'}}

Row Index: 2
Output Scores: {'Breakfast': {'score': -0.06054961667206582, 'grade': 'B'}, 'Lunch': {'score': 1.4849885674520331, 'grade': 'B'}, 'Dinner': {'score': 0.12827070932539675, 'grade':




In [18]:
# alternative diet - nutri score

import ast
import pandas as pd
import json
import re
import openai
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from dotenv import load_dotenv
import os
from tqdm import tqdm

# Load environment variables
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def extract_numeric_value(value):
    try:
        if isinstance(value, str):
            match = re.search(r"(\d+(\.\d+)?)", value)
            if match:
                return float(match.group(1))
        elif isinstance(value, (int, float)):
            return float(value)
    except Exception as e:
        print(f"Error in extract_numeric_value: {e}, value: {value}")
    return 0.0

def is_valid_meal_structure(json_string):
    try:
        data = json.loads(json_string)
        return isinstance(data, dict)
    except (json.JSONDecodeError, TypeError):
        return False

def find_most_similar_row(title, dfh):
    try:
        dfh['title'] = dfh['title'].fillna('')  # Handle NaN values
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(dfh['title'])
        input_vector = vectorizer.transform([title])
        similarities = cosine_similarity(input_vector, tfidf_matrix)
        most_similar_idx = similarities.argmax()
        return dfh.iloc[most_similar_idx]
    except Exception as e:
        print(f"Error in find_most_similar_row: {e}, title: {title}")
        return None

def identify_fruit_veg(ingredients_list):
    try:
        prompt = f"Identify which items in the following ingredient list are fruits or vegetables:\n\n{ingredients_list}\n\nReturn only the names of items that are fruits or vegetables in a Python list format."
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an assistant identifying fruits and vegetables."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            temperature=0
        )
        fruits_vegetables = response['choices'][0]['message']['content']
        return ast.literal_eval(fruits_vegetables)
    except Exception as e:
        print(f"Error identifying fruits and vegetables: {e}")
        return []

def calculate_fruit_veg_points(ingredients, total_weight):
    try:
        ingredients_list = ast.literal_eval(ingredients)
        fruit_veg_labels = identify_fruit_veg(ingredients_list)

        fruit_veg_weight = 0
        for ingredient in ingredients_list:
            label = ingredient.get('label', '')
            weight = extract_numeric_value(ingredient.get('metric_measure', 0))
            if label in fruit_veg_labels:
                fruit_veg_weight += weight

        # 과일/채소 비율을 100g 기준으로 변환
        fruit_veg_ratio = (fruit_veg_weight / total_weight) * 100 if total_weight > 0 else 0

        if fruit_veg_ratio > 80:
            return 5
        elif fruit_veg_ratio > 60:
            return 2
        elif fruit_veg_ratio > 40:
            return 1
        else:
            return 0
    except Exception as e:
        print(f"Error calculating fruit_veg_points: {e}")
        return 0

def extract_nested_value(data, keys, default=0):
    try:
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, {})
            else:
                return default
        return extract_numeric_value(data)
    except Exception as e:
        print(f"Error in extract_nested_value: {e}, keys: {keys}, data: {data}")
        return default

def calculate_nutri_score(nutrition_facts, ingredients):
    try:
        if isinstance(nutrition_facts, str):
            nutrition_facts = ast.literal_eval(nutrition_facts)

        # 전체 무게 계산
        total_weight = sum(
            extract_numeric_value(ingredient.get('metric_measure', 0)) 
            for ingredient in ast.literal_eval(ingredients)
        )
        if total_weight == 0:
            print("Warning: Total weight is zero. Skipping calculation.")
            return None

        # 100g 기준으로 성분 정규화
        energy = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Calories']) / total_weight * 100
        saturated_fat = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Fat', 'Amount']) / total_weight * 100
        sugar = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Total Sugars']) / total_weight * 100
        sodium = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Sodium']) / total_weight * 100
        fiber = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Dietary Fiber']) / total_weight * 100
        protein = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Protein']) / total_weight * 100

        # Unfavorable points calculation
        energy_points = min(energy / 80, 800)
        saturated_fat_points = min(saturated_fat / 1, 10)
        sugar_points = min(sugar / 4.5, 45)
        sodium_points = min(sodium / 90, 900)

        unfavorable_points = energy_points + saturated_fat_points + sugar_points + sodium_points

        # Favorable points calculation
        fiber_points = min(fiber / 0.7, 3.5)
        protein_points = min(protein / 1.6, 8.0)
        fruit_veg_points = calculate_fruit_veg_points(ingredients, total_weight)

        favorable_points = fiber_points + protein_points + fruit_veg_points

        # Final Nutri-Score calculation
        total_score = unfavorable_points - favorable_points
        return total_score
    except Exception as e:
        print(f"Error in calculate_nutri_score: {e}, nutrition_facts: {nutrition_facts}")
        return None

def get_nutri_score_grade(score):
    if score <= -1:
        return "A"
    elif score <= 2:
        return "B"
    elif score <= 10:
        return "C"
    elif score <= 18:
        return "D"
    else:
        return "E"

def calculate_scores_with_comparison_no_meals(df, dfh):
    results = []
    output_scores_list = []
    model_output_scores_list = []

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        try:
            output_text = row.get('output', '')
            if output_text:
                matched_row = find_most_similar_row(output_text, dfh)
                if matched_row is not None:
                    nutrition_facts = matched_row['nutrition_facts']
                    ingredients = matched_row['ingredients']
                    output_score = calculate_nutri_score(nutrition_facts, ingredients)
                    output_scores_list.append(output_score)
                else:
                    output_score = None

            model_output_text = row.get('model_output', '')
            if model_output_text:
                matched_row = find_most_similar_row(model_output_text, dfh)
                if matched_row is not None:
                    nutrition_facts = matched_row['nutrition_facts']
                    ingredients = matched_row['ingredients']
                    model_output_score = calculate_nutri_score(nutrition_facts, ingredients)
                    model_output_scores_list.append(model_output_score)
                else:
                    model_output_score = None

            results.append({
                'row_index': idx,
                'output_score': output_score,
                'model_output_score': model_output_score
            })

        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            results.append({
                'row_index': idx,
                'output_score': None,
                'model_output_score': None
            })

    final_output_avg = sum(output_scores_list) / len(output_scores_list) if output_scores_list else None
    final_model_output_avg = sum(model_output_scores_list) / len(model_output_scores_list) if model_output_scores_list else None

    print(f"Output Average Nutri-Score: {final_output_avg}")
    print(f"Model Output Average Nutri-Score: {final_model_output_avg}")

    return results

# Execution
filtered_df = df[df['task'] == 'alternative_diet']
results = calculate_scores_with_comparison_no_meals(filtered_df, dfh)

# Print results
for result in results:
    print(result)


  0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [00:33<00:00,  1.35s/it]

Output Average Nutri-Score: 0.4576467924666062
Model Output Average Nutri-Score: 0.9385360360104872
{'row_index': 50, 'output_score': -0.5456349206349205, 'model_output_score': -0.2137724271482232}
{'row_index': 51, 'output_score': 0.32153783832762994, 'model_output_score': -1.8280321920168663}
{'row_index': 52, 'output_score': 1.2495974235104654, 'model_output_score': 0.08622999129328224}
{'row_index': 53, 'output_score': -0.2137724271482232, 'model_output_score': -1.8280321920168663}
{'row_index': 54, 'output_score': 0.09538432905267075, 'model_output_score': 0.08622999129328224}
{'row_index': 55, 'output_score': 0.09538432905267075, 'model_output_score': 0.04485645933014354}
{'row_index': 56, 'output_score': -4.595731219412345, 'model_output_score': 3.005339996080737}
{'row_index': 57, 'output_score': 0.32153783832762994, 'model_output_score': 0.9751322751322751}
{'row_index': 58, 'output_score': -0.04938140184041795, 'model_output_score': 0.08622999129328224}
{'row_index': 59, 'out


