In [30]:

from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA A100-PCIE-40GB. Max memory: 39.394 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [31]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


inputs = tokenizer(
[
    alpaca_prompt.format(
        "Recommend a daily diet that includes a specific ingredient.",

        "Create a diet that includes baby bok choy(roots trimmed and roughly chopped).",

        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2048)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Recommend a daily diet that includes a specific ingredient.

### Input:
Create a diet that includes baby bok choy(roots trimmed and roughly chopped).

### Response:
1. Wash the baby bok choy and trim the roots.
2. Chop the baby bok choy into bite-sized pieces.
3. In a large bowl, mix together the chopped baby bok choy, 1/2 cup of quinoa, and 1/4 cup of chickpeas.
4. Add 1/2 cup of water and stir well.
5. Cover the bowl with plastic wrap and refrigerate overnight.
6. The next day, heat the quinoa mixture in the microwave for 2-3 minutes, or until the quinoa is cooked through.
7. Serve the quinoa mixture with your favorite toppings, such as chopped tomatoes, chopped onions, and chopped cilantro.<|end_of_text|>


In [32]:
import json
import pandas as pd

file_path = "/data/jaesung/llm_for_diabetes/src/model/inference_results_0120.jsonl"

data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)

In [33]:
df.head(2)

Unnamed: 0,dataset,split_data,task,instruction,input,output,__index_level_0__,model_output
0,diabetes_food_hub,test,daily_diets,Recommend a daily diet based on the given nutr...,Ensure the daily carbohydrate intake does not ...,"{""Breakfast"": ""Sun-Kissed Health Salad"", ""Lunc...",28967,"{""Breakfast"": ""Peach & Cream Smoothie"", ""Lunch..."
1,diabetes_food_hub,test,daily_diets,Recommend a daily diet based on the given nutr...,Ensure the daily carbohydrate intake does not ...,"{""Breakfast"": ""Tofu and Vegetable Skewers"", ""L...",27867,"{""Breakfast"": ""Peach & Cream Smoothie"", ""Lunch..."


In [34]:
df['task'].unique()

array(['daily_diets', 'qa_objective_3', 'alternative_diet',
       'ie_extract_relation', 'qa_objective_1', 'qa_objective_2', 'nli',
       'summarization', 'generation'], dtype=object)

In [6]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import torch

# 데이터프레임 로드 (예제)
df = df[df['task']=='qa_objective_3']

# 빈 output 및 nutrition 컬럼 추가
df["output"] = ""
df["nutrition_info"] = ""

# Alpaca 스타일 프롬프트 설정
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
"""

# 모델을 사용해 output 및 영양소 정보 생성
def generate_output_and_nutrition(row):
    prompt = alpaca_prompt.format(row["instruction"], row["input"])
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Text Streamer 사용하여 결과 생성
    text_streamer = TextStreamer(tokenizer)
    output = model.generate(**inputs, max_new_tokens=20, streamer=text_streamer)

    # 생성된 텍스트 디코딩 및 처리
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    return decoded_output

# 데이터프레임에 output 및 nutrition 정보 생성 후 저장
for index, row in df.iterrows():
    output = generate_output_and_nutrition(row)
    df.at[index, "model_output"] = output
# 결과 확인 및 저장
# df.to_csv("output_with_diet_and_nutrition.csv", index=False)
# print("Output saved to 'output_with_diet_and_nutrition.csv'")


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Choose the correct anser (Yes, No, or Maybe) for the given question based on the proviced context.

### Input:
Question: Is qTc interval prolongation independently associated with severe hypoglycemic attacks in type 1 diabetes from the EURODIAB IDDM complications study? Context: Our aim was to assess whether severe hypoglycemic attacks were cross-sectionally associated with abnormalities of the QTc interval in type 1 diabetic patients. The study included 3,248 type 1 diabetic patients from the EURODIAB IDDM Complications Study. Severe hypoglycemia was defined as an attack serious enough to require the help of another person. A corrected QTc interval (QTc) >0.44 s was considered abnormally prolonged. Nineteen percent of patients declared one to two attacks, and 13.2% of patients had three or more 

In [7]:

def token_overlap(output, model_output):
    # 토큰화
    output_tokens = set(output.split())
    model_output_tokens = set(model_output.split())
    
    # 공통 토큰 개수 계산
    common_tokens = output_tokens.intersection(model_output_tokens)
    
    # 일치 비율 계산
    return len(common_tokens) / len(output_tokens) if len(output_tokens) > 0 else 0

qa_descriptive_df = df[df['task'] == 'qa_objective_3']

# 일치 비율 계산 및 저장
qa_descriptive_df['token_match_score'] = qa_descriptive_df.apply(lambda row: token_overlap(row['output'], row['model_output']), axis=1)

# 평균 점수를 정확도로 간주
accuracy = qa_descriptive_df['token_match_score'].mean()

print(f"Token Match Accuracy: {accuracy:.2%}")



Token Match Accuracy: 0.00%


In [86]:
# qa_subjective

import openai
import pandas as pd
import re
from sklearn.metrics import f1_score
from bert_score import score as bert_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

from dotenv import load_dotenv
import os

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

bleurt_model_name = "Elron/bleurt-large-512"
tokenizer = AutoTokenizer.from_pretrained(bleurt_model_name)
bleurt_model = AutoModelForSequenceClassification.from_pretrained(bleurt_model_name)
bleurt_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bleurt_model = bleurt_model.to(device)

MAX_CONTEXT_LENGTH = 8192

def evaluate_with_gpt4(input_text, model_output, true_output):
    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    prompt = f"""
    You are tasked with evaluating the quality of a QA model's responses based on the following metrics:
    1. **Coherence**: Does the model's response logically align with the context provided in the input?
    2. **Completeness**: Does the model's response sufficiently answer the question in the input?
    3. **Naturalness**: Does the model's response sound fluent and human-like?

    **Input**:
    {input_text}

    **Model's Response**:
    {model_output}

    **True Answer**:
    {true_output}

    Please rate each metric on a scale from 1 to 5. 
    Example response format:
    - Coherence: X.X
    - Completeness: X.X
    - Naturalness: X.X
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for QA models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

def extract_scores(evaluation):
    if evaluation is None:
        return {
            "Coherence": 0.0,
            "Completeness": 0.0,
            "Naturalness": 0.0
        }
    coherence = re.search(r"Coherence: (\d\.\d)", evaluation)
    completeness = re.search(r"Completeness: (\d\.\d)", evaluation)
    naturalness = re.search(r"Naturalness: (\d\.\d)", evaluation)
    
    return {
        "Coherence": float(coherence.group(1)) if coherence else 0.0,
        "Completeness": float(completeness.group(1)) if completeness else 0.0,
        "Naturalness": float(naturalness.group(1)) if naturalness else 0.0
    }

def calculate_concept_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average="macro")


def calculate_bleurt(y_true, y_pred):
    inputs = tokenizer(y_pred, y_true, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        scores = bleurt_model(**inputs).logits

    if scores.numel() == 1:
        return float(scores.squeeze().item()) 
    return [float(score) for score in scores.squeeze().tolist()] 

def calculate_bleurt_and_bertscore(y_true, y_pred):
    bleurt_score_value = calculate_bleurt(y_true, y_pred)

    _, _, bert_f1 = bert_score(y_pred, y_true, lang="en", rescale_with_baseline=True)

    bert_f1 = [float(score) for score in bert_f1]

    return {
        "BLEURT": bleurt_score_value if isinstance(bleurt_score_value, float) else sum(bleurt_score_value) / len(bleurt_score_value),
        "BERTScore_F1": sum(bert_f1) / len(bert_f1)
    }

def normalize_scores(df, column):

    df[column] = df[column].apply(
        lambda x: float(x) if isinstance(x, torch.Tensor) else x
    )
    min_val = df[column].min()
    max_val = df[column].max()

    df[column] = df[column].apply(
        lambda x: (x - min_val) / (max_val - min_val) if max_val > min_val else 0.5
    )
    return df

qa_df = df[df['task'] == 'generation']

results = []

for _, row in qa_df.iterrows():
    input_text = row['input']
    model_output = row['model_output']
    true_output = row['output']
    
    evaluation = evaluate_with_gpt4(input_text, model_output, true_output)
    print(f"Evaluation result:\n{evaluation}")
    
    scores = extract_scores(evaluation)
    
    concept_f1 = calculate_concept_f1([true_output], [model_output])
    
    metric_scores = calculate_bleurt_and_bertscore([true_output], [model_output])

    results.append({
        "input": input_text,
        "model_output": model_output,
        "true_output": true_output,
        "evaluation": evaluation,
        "Coherence": scores["Coherence"],
        "Completeness": scores["Completeness"],
        "Naturalness": scores["Naturalness"],
        "Concept_F1": concept_f1,
        "BLEURT": metric_scores["BLEURT"],
        "BERTScore_F1": metric_scores["BERTScore_F1"]
    })

evaluation_df = pd.DataFrame(results)

evaluation_df = normalize_scores(evaluation_df, "BLEURT")
evaluation_df = normalize_scores(evaluation_df, "BERTScore_F1")

average_scores = evaluation_df[["Coherence", "Completeness", "Naturalness", "Concept_F1", "BLEURT", "BERTScore_F1"]].mean()

print("평균 점수:")
print(average_scores)

Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input and model's response, here is the evaluation of the QA model's response:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

Overall, the model's response lacks coherence, completeness, and naturalness. The response provided does not address the patient's query and seems to be a placeholder text instead of a meaningful medical advice.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 2.0
- Naturalness: 1.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input and the model's response, here is the evaluation for the QA model:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

The model's response lacks coherence, completeness, and naturalness as it does not provide any relevant information or answer to the patient's query. It seems to be outputting gibberish rather than a meaningful medical response.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided information, here is the evaluation of the model's response:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

The model's response completely fails to address the input query and provide a relevant answer. It appears to be generating irrelevant repetitive text rather than a meaningful medical response. The coherence, completeness, and naturalness are all very low as the response does not align with the context, answer the question, or sound human-like at all.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Sure, I can evaluate the model's response based on the provided metrics. 

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

Overall, the model's response did not provide any meaningful information related to the patient's query. The response lacks coherence, completeness, and naturalness. The model simply repeated the input without generating a relevant medical response.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.5
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input and model's response, here is the evaluation:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

The model's response does not exhibit coherence, completeness, or naturalness as it simply repeats the same irrelevant phrase multiple times instead of generating a comprehensive medical response based on the patient's query.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 3.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Here is the evaluation based on the provided metrics:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input and the model's response, here is the evaluation for the QA model:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

The model's response does not align at all with the context provided in the input, it does not provide any meaningful information related to the patient's query, and the response is not at all natural or human-like in its structure.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input and model's response, I would rate the QA model as follows:

- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0

The model's response lacks coherence, as it does not provide a relevant or coherent answer to the patient's query about a renal cyst. The response is also incomplete and does not address the patient's concerns or questions. Furthermore, the response lacks naturalness and fluency, as it consists of repetitive and nonsensical phrases.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


평균 점수:
Coherence       1.040000
Completeness    1.060000
Naturalness     1.010000
Concept_F1      0.000000
BLEURT          0.549018
BERTScore_F1    0.500000
dtype: float64




In [35]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import torch

# 데이터프레임 로드 (예제)
df = df[(df['task']=='daily_diets')| (df['task']=='alternative_diet')]

# 빈 output 및 nutrition 컬럼 추가
df["output"] = ""
df["nutrition_info"] = ""

# Alpaca 스타일 프롬프트 설정
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. If the output includes a diet, ensure to include detailed nutritional information (calories, sugar, sodium, fiber, protein, fat) for the diet as well.

### Instruction:
{}

### Input:
{}

### Response:
"""

# 모델을 사용해 output 및 영양소 정보 생성
def generate_output_and_nutrition(row):
    prompt = alpaca_prompt.format(row["instruction"], row["input"])
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Text Streamer 사용하여 결과 생성
    text_streamer = TextStreamer(tokenizer)
    output = model.generate(**inputs, max_new_tokens=200, streamer=text_streamer)

    # 생성된 텍스트 디코딩 및 처리
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # "Diet:"와 "Nutrition:" 구분
    if "Nutrition:" in decoded_output:
        diet_part, nutrition_part = decoded_output.split("Nutrition:", 1)
        diet = diet_part.strip().split("### Response:")[-1].strip()
        nutrition = nutrition_part.strip()
    else:
        diet = decoded_output.split("### Response:")[-1].strip()
        nutrition = "Nutrition information not provided."

    return diet, nutrition

# 데이터프레임에 output 및 nutrition 정보 생성 후 저장
for index, row in df.iterrows():
    diet, nutrition = generate_output_and_nutrition(row)
    df.at[index, "output"] = diet
    df.at[index, "nutrition_info"] = nutrition

# 결과 확인 및 저장
# df.to_csv("output_with_diet_and_nutrition.csv", index=False)
# print("Output saved to 'output_with_diet_and_nutrition.csv'")


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. If the output includes a diet, ensure to include detailed nutritional information (calories, sugar, sodium, fiber, protein, fat) for the diet as well.

### Instruction:
Recommend a daily diet based on the given nutritional goals.

### Input:
Ensure the daily carbohydrate intake does not exceed 108.0g, protein intake is at least 36.0g, and fat intake does not exceed 39.0g.

### Response:
A healthy 

diet includes a variety of foods that provide essential nutrients. The recommended daily intake of carbohydrates, proteins, and fats can vary depending on factors such as age, gender, activity level, and overall health status. However, some general guidelines can be provided to help individuals meet their nutritional goals.

For carbohydrates, it is recommended to consume between 45% and 65% of total daily calories from carbohydrates. This translates to approximately 225 to 325 grams of carbohydrates per day for a 2000-calorie diet. To ensure a balanced intake of carbohydrates, it is important to include a variety of sources such as whole grains, fruits, vegetables, and legumes.

Protein intake should be between 10% and 35% of total daily calories. This translates to approximately 50 to 175 grams of protein per day for a 2000-calorie diet. Sources of high-quality protein include lean meats, poultry, fish, eggs, dairy products, beans, and nuts.

Fat
<|begin_of_text|>Below is an instruct

In [43]:
df[(df['task']=='alternative_diet')]['output'].tolist()

['A healthy breakfast, such as oatmeal with fruit and nuts, can provide a balanced meal. The oatmeal is a good source of fiber and the fruit and nuts provide essential vitamins and minerals. Additionally, the oatmeal is a complex carbohydrate, which provides sustained',
 'A small apple and a handful of sunflower seeds.\n\n### Explanation:\nRadish, raw contains 25 calories, 0.0g sugar, 0mg sodium, 2g fiber, 1g protein, and 0.0g fat.\nA small apple and a handful of sunflower seeds contain 65 calories, 15.0g sugar',
 'A banana',
 '- A salad with lettuce, cucumber, tomato, and carrots\n- A grilled chicken breast with a side of steamed broccoli\n- A grilled salmon fillet with a side of roasted sweet potato\n- A bowl of vegetable soup with a side of whole wheat bread\n- A grilled steak with a side of baked potato\n- A bowl of lentil soup with a side of whole wheat bread\n- A grilled chicken breast with a side of steamed vegetables\n- A grilled salmon fillet with a side of roasted sweet potat

In [44]:
df[df['task']=='alternative_diet'].head()

Unnamed: 0,dataset,split_data,task,instruction,input,output,__index_level_0__,model_output,nutrition_info
100,diabetes_food_hub,test,alternative_diet,"Based on the previous meal, suggest the next m...","Pastry, puff, custard or cream filled, iced or...","A healthy breakfast, such as oatmeal with frui...",29737,Greek Chicken Pita with Vegetable Salad is rec...,Nutrition information not provided.
101,diabetes_food_hub,test,alternative_diet,"Based on the previous meal, suggest the next m...","Radish, raw",A small apple and a handful of sunflower seeds...,30452,Mediterranean Chicken Pita is recommended. The...,Nutrition information not provided.
102,diabetes_food_hub,test,alternative_diet,"Based on the previous meal, suggest the next m...",Peanut butter,A banana,30040,Greek Vegetable Pita is recommended. The reaso...,Nutrition information not provided.
103,diabetes_food_hub,test,alternative_diet,"Based on the previous meal, suggest the next m...","Cheese, Monterey","- A salad with lettuce, cucumber, tomato, and ...",29963,Grilled Lime Chicken Fajitas is recommended. T...,Nutrition information not provided.
104,diabetes_food_hub,test,alternative_diet,"Based on the previous meal, suggest the next m...",Soy sauce,A small bowl of edamame beans.,30513,Greek Vegetable Pita is recommended. The reaso...,Nutrition information not provided.


: 

In [67]:
df['output'].iloc[1]

'Ensure the daily carbohydrate intake does not exceed 88.0g, protein intake is at least 36.0g, and fat intake does not exceed 22.0g.\n\n### Explanation:\nEnsure the daily carbohydrate intake does not exceed 88.0g, protein intake is at least 36.0g, and fat intake does not exceed 22.0g.'

In [68]:
df.head()

Unnamed: 0,dataset,split_data,task,instruction,input,output,__index_level_0__,model_output,nutrition_info
0,diabetes_food_hub,test,daily_diets,Recommend a daily diet based on the given nutr...,Ensure the daily carbohydrate intake does not ...,Recommend a daily diet based on the given nutr...,28967,"{""Breakfast"": ""Peach & Cream Smoothie"", ""Lunch...",Nutrition information not provided.
1,diabetes_food_hub,test,daily_diets,Recommend a daily diet based on the given nutr...,Ensure the daily carbohydrate intake does not ...,Ensure the daily carbohydrate intake does not ...,27867,"{""Breakfast"": ""Peach & Cream Smoothie"", ""Lunch...",Nutrition information not provided.
2,diabetes_food_hub,test,daily_diets,Analyze and summarize the nutritional content ...,"{""Breakfast"": ""Turkey Sausage and Egg Casserol...","The daily diet consists of 1,040 calories, 39 ...",28731,"{""Daily Total"": {""Calories"": 530.0, ""Carbohydr...",Nutrition information not provided.
3,diabetes_food_hub,test,daily_diets,Analyze and summarize the nutritional content ...,"{""Breakfast"": ""Dark Chocolate Zucchini Bread S...",,28651,"{""Daily Total"": {""Calories"": 610.0, ""Carbohydr...",Nutrition information not provided.
4,diabetes_food_hub,test,daily_diets,Recommend a daily diet that includes a specifi...,Create a diet that includes black pepper(for t...,Recommend a daily diet that includes black pep...,28903,"{""Breakfast"": ""Raspberry Swirl Frozen Yogurt B...",Nutrition information not provided.


In [69]:
import pandas as pd
import openai
from statistics import mean
from dotenv import load_dotenv
import os

# 환경 변수 로드
load_dotenv(dotenv_path="/data/jaesung/llm_for_diabetes/.env")
openai.api_key = os.getenv("OPENAI_API_KEY")

df = df[df['task']=='daily_diets']

# Nutri-score 계산 함수
def calculate_nutri_score(calories, sugar, sodium, fiber, fat, protein):
    # Negative points
    negative_points = 0
    if calories > 800:
        negative_points += 10
    elif calories > 720:
        negative_points += 9
    elif calories > 640:
        negative_points += 8
    elif calories > 560:
        negative_points += 7
    elif calories > 480:
        negative_points += 6
    elif calories > 400:
        negative_points += 5
    elif calories > 320:
        negative_points += 4
    elif calories > 240:
        negative_points += 3
    elif calories > 160:
        negative_points += 2
    elif calories > 80:
        negative_points += 1

    if sugar > 45:
        negative_points += 10
    elif sugar > 40:
        negative_points += 9
    elif sugar > 36:
        negative_points += 8
    elif sugar > 31:
        negative_points += 7
    elif sugar > 27:
        negative_points += 6
    elif sugar > 22.5:
        negative_points += 5
    elif sugar > 18:
        negative_points += 4
    elif sugar > 13.5:
        negative_points += 3
    elif sugar > 9:
        negative_points += 2
    elif sugar > 4.5:
        negative_points += 1

    if fat > 10:
        negative_points += 10
    elif fat > 9:
        negative_points += 9
    elif fat > 8:
        negative_points += 8
    elif fat > 7:
        negative_points += 7
    elif fat > 6:
        negative_points += 6
    elif fat > 5:
        negative_points += 5
    elif fat > 4:
        negative_points += 4
    elif fat > 3:
        negative_points += 3
    elif fat > 2:
        negative_points += 2
    elif fat > 1:
        negative_points += 1

    if sodium > 900:
        negative_points += 10
    elif sodium > 810:
        negative_points += 9
    elif sodium > 720:
        negative_points += 8
    elif sodium > 630:
        negative_points += 7
    elif sodium > 540:
        negative_points += 6
    elif sodium > 450:
        negative_points += 5
    elif sodium > 360:
        negative_points += 4
    elif sodium > 270:
        negative_points += 3
    elif sodium > 180:
        negative_points += 2
    elif sodium > 90:
        negative_points += 1

    # Positive points
    positive_points = 0
    if fiber > 4.7:
        positive_points += 5
    elif fiber > 3.5:
        positive_points += 4
    elif fiber > 2.8:
        positive_points += 3
    elif fiber > 2.1:
        positive_points += 2
    elif fiber > 1.4:
        positive_points += 1

    if protein > 8:
        positive_points += 5
    elif protein > 6.4:
        positive_points += 4
    elif protein > 4.8:
        positive_points += 3
    elif protein > 3.2:
        positive_points += 2
    elif protein > 1.6:
        positive_points += 1

    # Nutri-score 계산
    nutri_score = negative_points - positive_points
    return nutri_score

# GPT를 통해 영양소 데이터 추론 함수
def infer_nutritional_data(output):
    prompt = (
        f"Please estimate the nutritional values for the following dietary recommendation. Return only numerical values without units or text for calories, sugar, sodium, fiber, fat, and protein:\n\n"
        f"Diet Recommendation:\n"
        f"{output}\n\n"
        f"Return the result in the format:\n"
        f"Calories: <value>\n"
        f"Sugar: <value>\n"
        f"Sodium: <value>\n"
        f"Fiber: <value>\n"
        f"Fat: <value>\n"
        f"Protein: <value>\n"
    )
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[{"role": "system", "content": "You are a helpful assistant."},
                      {"role": "user", "content": prompt}],
            max_tokens=150,
            temperature=0.5
        )
        evaluation = response["choices"][0]["message"]["content"]
        values = {}
        for line in evaluation.split("\n"):
            if "Calories" in line:
                values["calories"] = float(line.split(":")[1].strip())
            elif "Sugar" in line:
                values["sugar"] = float(line.split(":")[1].strip())
            elif "Sodium" in line:
                values["sodium"] = float(line.split(":")[1].strip())
            elif "Fiber" in line:
                values["fiber"] = float(line.split(":")[1].strip())
            elif "Fat" in line:
                values["fat"] = float(line.split(":")[1].strip())
            elif "Protein" in line:
                values["protein"] = float(line.split(":")[1].strip())
        return values
    except Exception as e:
        print(f"Error during GPT inference: {e}")
        return {"calories": 0.0, "sugar": 0.0, "sodium": 0.0, "fiber": 0.0, "fat": 0.0, "protein": 0.0}

# GPT를 통해 coherence, completeness, naturalness 평가 요청
def evaluate_output(output):
    prompt = (
        f"Please evaluate the following dietary recommendation on the following criteria:\n\n"
        f"1. **Coherence** (1-5): How logically and meaningfully connected the meals are. "
        f"Rate 1 if the meals are completely disconnected, and 5 if they are logically consistent and flow well.\n"
        f"2. **Completeness** (1-5): How complete the meal plan is, considering all meals and balance. "
        f"Rate 1 if essential components are missing, and 5 if all elements are balanced and present.\n"
        f"3. **Naturalness** (1-5): How practical and realistic the meal plan sounds for daily consumption. "
        f"Rate 1 if it is impractical, and 5 if it is highly practical and realistic.\n\n"
        f"Diet Recommendation:\n"
        f"{output}\n\n"
        f"Return the result in the exact format:\n"
        f"Coherence: <score>\n"
        f"Completeness: <score>\n"
        f"Naturalness: <score>\n"
    )
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[{"role": "system", "content": "You are a helpful assistant."},
                      {"role": "user", "content": prompt}],
            max_tokens=150,
            temperature=0.7
        )
        evaluation = response["choices"][0]["message"]["content"]
        scores = {}
        for line in evaluation.split("\n"):
            if "Coherence" in line:
                scores["Coherence"] = float(line.split(":")[1].strip())
            elif "Completeness" in line:
                scores["Completeness"] = float(line.split(":")[1].strip())
            elif "Naturalness" in line:
                scores["Naturalness"] = float(line.split(":")[1].strip())
        return scores
    except Exception as e:
        print(f"Error during GPT evaluation: {e}")
        return {"Coherence": None, "Completeness": None, "Naturalness": None}

# 데이터프레임에 평가 점수 추가
df["Coherence"] = None
df["Completeness"] = None
df["Naturalness"] = None
df["Nutri-score"] = None

for index, row in df.iterrows():
    # Coherence, Completeness, Naturalness 계산
    scores = evaluate_output(row["output"])
    df.at[index, "Coherence"] = scores.get("Coherence", None)
    df.at[index, "Completeness"] = scores.get("Completeness", None)
    df.at[index, "Naturalness"] = scores.get("Naturalness", None)

    # Nutri-score 계산을 위한 GPT 영양소 추론
    nutrition = infer_nutritional_data(row["output"])
    calories = nutrition.get("calories", 0.0)
    sugar = nutrition.get("sugar", 0.0)
    sodium = nutrition.get("sodium", 0.0)
    fiber = nutrition.get("fiber", 0.0)
    fat = nutrition.get("fat", 0.0)
    protein = nutrition.get("protein", 0.0)

    nutri_score = calculate_nutri_score(calories, sugar, sodium, fiber, fat, protein)
    df.at[index, "Nutri-score"] = nutri_score

# 평균 점수 계산
def calculate_averages(df):
    averages = {
        "Coherence Average": df["Coherence"].mean(),
        "Completeness Average": df["Completeness"].mean(),
        "Naturalness Average": df["Naturalness"].mean(),
        "Nutri-score Average": df["Nutri-score"].mean()
    }
    return averages

averages = calculate_averages(df)
print("Average Scores:", averages)

# 결과 저장
# df.to_csv("evaluated_meal_plans.csv", index=False)
# print("Evaluation saved to 'evaluated_meal_plans.csv'")


Average Scores: {'Coherence Average': 3.66, 'Completeness Average': 3.4, 'Naturalness Average': 2.76, 'Nutri-score Average': 16.92}


In [2]:
import json

file_path = "/data/jaesung/llm_for_diabetes/src/model/inference_results_0130.jsonl"

data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

In [3]:
import pandas as pd

df = pd.DataFrame(data)

In [4]:
df.head(2)

Unnamed: 0,dataset,split_data,task,instruction,input,output,__index_level_0__,model_output
0,diabetes_food_hub,test,alternative_diet,"Based on the previous meal, suggest the next m...","Cheese, cream",Herbed Soft Scrambled Eggs on Toast is recomme...,36153,Grilled Lime Chicken Fajitas is recommended. T...
1,diabetes_food_hub,test,alternative_diet,"Based on the previous meal, suggest the next m...","Turkey, ground",Almost Smooth Salsa is recommended. The reason...,36229,Grilled Lime Chicken Fajitas is recommended. T...


In [5]:
df['task'].unique()

array(['alternative_diet', 'daily_diets', 'qa_objective_1',
       'summarization', 'relation_extraction', 'qa_objective_3',
       'qa_objective_2', 'generation', 'nli'], dtype=object)

In [6]:
# medqa

import pandas as pd
qa_objective_df = df[df['task']=='qa_objective_1']

# 'output'와 'model_output'에서 A), B), C), D)만 추출
qa_objective_df['output_label'] = qa_objective_df['output'].str.extract(r'(A\)|B\)|C\)|D\))')
qa_objective_df['model_output_label'] = qa_objective_df['model_output'].str.extract(r'(A\)|B\)|C\)|D\))')

# 두 컬럼 비교하여 맞은 경우를 계산
qa_objective_df['correct'] = qa_objective_df['output_label'] == qa_objective_df['model_output_label']

# Accuracy 계산
accuracy = qa_objective_df['correct'].mean()

print(f"Accuracy: {accuracy:.2%}")


Accuracy: 42.00%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_objective_df['output_label'] = qa_objective_df['output'].str.extract(r'(A\)|B\)|C\)|D\))')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_objective_df['model_output_label'] = qa_objective_df['model_output'].str.extract(r'(A\)|B\)|C\)|D\))')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_ob

In [7]:
# medmcqa

import pandas as pd
qa_subjective_df = df[df['task']=='qa_objective_2']

# 'output'와 'model_output'에서 A), B), C), D)만 추출
qa_subjective_df['output_label'] = qa_subjective_df['output'].str.extract(r'(A\)|B\)|C\)|D\))')
qa_subjective_df['model_output_label'] = qa_subjective_df['model_output'].str.extract(r'(A\)|B\)|C\)|D\))')

# 두 컬럼 비교하여 맞은 경우를 계산
qa_subjective_df['correct'] = qa_subjective_df['output_label'] == qa_subjective_df['model_output_label']

# Accuracy 계산
accuracy = qa_subjective_df['correct'].mean()

print(f"Accuracy: {accuracy:.2%}")

Accuracy: 40.00%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_subjective_df['output_label'] = qa_subjective_df['output'].str.extract(r'(A\)|B\)|C\)|D\))')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_subjective_df['model_output_label'] = qa_subjective_df['model_output'].str.extract(r'(A\)|B\)|C\)|D\))')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q

In [8]:
# medqa

def token_overlap(output, model_output):
    # 토큰화
    output_tokens = set(output.split())
    model_output_tokens = set(model_output.split())
    
    # 공통 토큰 개수 계산
    common_tokens = output_tokens.intersection(model_output_tokens)
    
    # 일치 비율 계산
    return len(common_tokens) / len(output_tokens) if len(output_tokens) > 0 else 0

qa_descriptive_df = df[df['task'] == 'qa_objective_1']


# 일치 비율 계산 및 저장
qa_descriptive_df['token_match_score'] = qa_descriptive_df.apply(lambda row: token_overlap(row['output'], row['model_output']), axis=1)

# 평균 점수를 정확도로 간주
accuracy = qa_descriptive_df['token_match_score'].mean()

print(f"Token Match Accuracy: {accuracy:.2%}")


Token Match Accuracy: 47.97%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_descriptive_df['token_match_score'] = qa_descriptive_df.apply(lambda row: token_overlap(row['output'], row['model_output']), axis=1)


In [9]:
# medmcqa

def token_overlap(output, model_output):
    # 토큰화
    output_tokens = set(output.split())
    model_output_tokens = set(model_output.split())
    
    # 공통 토큰 개수 계산
    common_tokens = output_tokens.intersection(model_output_tokens)
    
    # 일치 비율 계산
    return len(common_tokens) / len(output_tokens) if len(output_tokens) > 0 else 0

qa_descriptive_df = df[df['task'] == 'qa_objective_2']

# 일치 비율 계산 및 저장
qa_descriptive_df['token_match_score'] = qa_descriptive_df.apply(lambda row: token_overlap(row['output'], row['model_output']), axis=1)

# 평균 점수를 정확도로 간주
accuracy = qa_descriptive_df['token_match_score'].mean()

print(f"Token Match Accuracy: {accuracy:.2%}")


Token Match Accuracy: 43.67%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_descriptive_df['token_match_score'] = qa_descriptive_df.apply(lambda row: token_overlap(row['output'], row['model_output']), axis=1)


In [10]:
# pubmedqa

def token_overlap(output, model_output):
    # 토큰화
    output_tokens = set(output.split())
    model_output_tokens = set(model_output.split())
    
    # 공통 토큰 개수 계산
    common_tokens = output_tokens.intersection(model_output_tokens)
    
    # 일치 비율 계산
    return len(common_tokens) / len(output_tokens) if len(output_tokens) > 0 else 0

qa_descriptive_df = df[df['task'] == 'qa_objective_3']

# 일치 비율 계산 및 저장
qa_descriptive_df['token_match_score'] = qa_descriptive_df.apply(lambda row: token_overlap(row['output'], row['model_output']), axis=1)

# 평균 점수를 정확도로 간주
accuracy = qa_descriptive_df['token_match_score'].mean()

print(f"Token Match Accuracy: {accuracy:.2%}")


Token Match Accuracy: 90.00%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qa_descriptive_df['token_match_score'] = qa_descriptive_df.apply(lambda row: token_overlap(row['output'], row['model_output']), axis=1)


In [11]:
# nli

nli_df = df[df['task'] == 'nli']

correct_predictions = (nli_df['output'] == nli_df['model_output']).sum()
total_predictions = len(nli_df) 

nli_acc = correct_predictions / total_predictions

print(nli_acc)  # bioinstruct - 0.33 

0.6


In [11]:
import pandas as pd

df_ie = df[df['task'] == 'relation_extraction']

# Precision, Recall, F1 계산 함수 (전체)
def calculate_total_metrics(output_col, model_output_col):
    total_true_positive = 0
    total_false_positive = 0
    total_false_negative = 0

    for output, model_output in zip(output_col, model_output_col):
        # ','로 구분된 문자열을 집합으로 변환
        output_set = set(output.split(', '))
        model_output_set = set(model_output.split(', '))

        # 교집합, 정답의 크기, 모델 예측의 크기 계산
        true_positive = len(output_set & model_output_set)
        false_positive = len(model_output_set - output_set)
        false_negative = len(output_set - model_output_set)

        # 누적 합산
        total_true_positive += true_positive
        total_false_positive += false_positive
        total_false_negative += false_negative

    # 총 Precision, Recall, F1 계산
    precision = total_true_positive / (total_true_positive + total_false_positive) if (total_true_positive + total_false_positive) > 0 else 0
    recall = total_true_positive / (total_true_positive + total_false_negative) if (total_true_positive + total_false_negative) > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

# 메트릭 계산
total_precision, total_recall, total_f1 = calculate_total_metrics(df_ie['output'], df_ie['model_output'])

# 결과 출력
print(f"Total Precision: {total_precision:.4f}")
print(f"Total Recall: {total_recall:.4f}")
print(f"Total F1-Score: {total_f1:.4f}")


Total Precision: 0.9400
Total Recall: 0.9400
Total F1-Score: 0.9400


In [12]:
def calculate_partial_metrics(df):
    precisions, recalls, f1s = [], [], []

    for index, row in df.iterrows():
        # output과 model_output을 리스트로 변환
        true_entities = set(row["output"].split(", "))
        predicted_entities = set(row["model_output"].split(", "))

        # 부분 매칭 허용
        matched = sum(
            int(any(te in pe or pe in te for te in true_entities)) for pe in predicted_entities
        )
        y_true = len(true_entities)
        y_pred = len(predicted_entities)

        # Precision, Recall, F1 계산
        precision = matched / y_pred if y_pred > 0 else 0
        recall = matched / y_true if y_true > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    # 평균 값 계산
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    avg_f1 = sum(f1s) / len(f1s)

    return avg_precision, avg_recall, avg_f1

df = df_ie
# 계산 실행
precision, recall, f1 = calculate_partial_metrics(df)
print(f"Partial Precision: {precision:.2f}")
print(f"Partial Recall: {recall:.2f}")
print(f"Partial F1 Score: {f1:.2f}")



Partial Precision: 0.94
Partial Recall: 0.94
Partial F1 Score: 0.94


In [14]:
# ie - ie_extract_relation

from sklearn.metrics import precision_score, recall_score, f1_score


ie_extract_relation = df[df['task'] == 'relation_extraction']

y_true = ie_extract_relation['output']
y_pred = ie_extract_relation['model_output']

precision = precision_score(y_true, y_pred, average='micro')
recall = recall_score(y_true, y_pred, average='micro')
f1 = f1_score(y_true, y_pred, average='micro')

print(f"precision: {precision}") 
print(f"recall: {recall}")
print(f"f1: {f1}")


precision: 0.94
recall: 0.94
f1: 0.94


In [12]:
df[df['task']=='summarization'].head(5)

Unnamed: 0,dataset,split_data,task,instruction,input,output,__index_level_0__,model_output
350,pubmed,test,summarization,Write a summary of the article that captures t...,the online version of this article ( doi:10.10...,introductionfew studies have investigated the ...,26988,the online version of this article ( doi:10.10...
351,pubmed,test,summarization,Write a summary of the article that captures t...,"heart disease , breast cancer , osteoporosis ,...",women 's health has been threatened by various...,26897,"Resveratrol ( trans-3,4',5-trihydroxystilbene ..."
352,pubmed,test,summarization,Write a summary of the article that captures t...,diabetes mellitus type 2 ( t2 dm ) is a chroni...,[ purpose ] obesity and hyperglycemia play rol...,26972,diabetes mellitus type 2 ( t2 dm ) is a chroni...
353,pubmed,test,summarization,Write a summary of the article that captures t...,"polyneuropathy , organomegaly , endocrinopathy...","key clinical messagepolyneuropathy , organomeg...",26907,This response is generated by an automated mac...
354,pubmed,test,summarization,Write a summary of the article that captures t...,doctor - patient communication is considered a...,backgrounddoctor - patient communication is an...,26845,doctor - patient communication is considered a...


In [13]:
import openai
import pandas as pd
import re
from sklearn.metrics import f1_score
from bert_score import score as bert_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# BLEURT 모델 로드
bleurt_model_name = "Elron/bleurt-large-512"
tokenizer = AutoTokenizer.from_pretrained(bleurt_model_name)
bleurt_model = AutoModelForSequenceClassification.from_pretrained(bleurt_model_name)
bleurt_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bleurt_model = bleurt_model.to(device)

MAX_CONTEXT_LENGTH = 8192

# GPT-4 평가 함수
def evaluate_with_gpt4(input_text, model_output, true_output):
    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    prompt = f"""
    You are tasked with evaluating the quality of a QA model's responses based on the following metrics:
    1. **Coherence**: Does the model's response logically align with the context provided in the input?
    2. **Completeness**: Does the model's response sufficiently answer the question in the input?
    3. **Naturalness**: Does the model's response sound fluent and human-like?

    **Input**:
    {input_text}

    **Model's Response**:
    {model_output}

    **True Answer**:
    {true_output}

    Please rate each metric on a scale from 1 to 5. 
    Example response format:
    - Coherence: X.X
    - Completeness: X.X
    - Naturalness: X.X
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for Summarization models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

# GPT-4 점수 추출
def extract_scores(evaluation):
    if evaluation is None:
        return {"Coherence": 0.0, "Completeness": 0.0, "Naturalness": 0.0}
    coherence = re.search(r"Coherence: (\d\.\d)", evaluation)
    completeness = re.search(r"Completeness: (\d\.\d)", evaluation)
    naturalness = re.search(r"Naturalness: (\d\.\d)", evaluation)
    
    return {
        "Coherence": float(coherence.group(1)) if coherence else 0.0,
        "Completeness": float(completeness.group(1)) if completeness else 0.0,
        "Naturalness": float(naturalness.group(1)) if naturalness else 0.0
    }

# BLEURT 점수 계산
def calculate_bleurt(y_true, y_pred):
    inputs = tokenizer(y_pred, y_true, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        scores = bleurt_model(**inputs).logits

    if scores.numel() == 1:
        return float(scores.squeeze().item())  
    return [float(score) for score in scores.squeeze().tolist()]  

# BLEURT 및 BERTScore 계산
def calculate_bleurt_and_bertscore(y_true, y_pred):
    bleurt_score_value = calculate_bleurt(y_true, y_pred)
    _, _, bert_f1 = bert_score(y_pred, y_true, lang="en", rescale_with_baseline=True)
    bert_f1_avg = sum(bert_f1) / len(bert_f1) if len(bert_f1) > 0 else 0

    return {
        "BLEURT": bleurt_score_value if isinstance(bleurt_score_value, float) else sum(bleurt_score_value) / len(bleurt_score_value),
        "BERTScore_F1": bert_f1_avg
    }

# 점수 정규화 함수
def normalize_scores(df, column):
    if column not in df.columns:
        print(f"Warning: Column {column} not found in DataFrame. Skipping normalization.")
        return df
    df[column] = df[column].apply(lambda x: float(x) if isinstance(x, torch.Tensor) else x)
    min_val, max_val = df[column].min(), df[column].max()
    df[column] = df[column].apply(lambda x: (x - min_val) / (max_val - min_val) if max_val > min_val else 0.5)
    return df

qa_df = df[df['task'] == 'summarization']
results = []

for _, row in qa_df.iterrows():
    input_text, model_output, true_output = row['input'], row['model_output'], row['output']
    
    evaluation = evaluate_with_gpt4(input_text, model_output, true_output)
    print(f"Evaluation result:\n{evaluation}")
    
    scores = extract_scores(evaluation)
    metric_scores = calculate_bleurt_and_bertscore([true_output], [model_output])

    results.append({
        "input": input_text,
        "model_output": model_output,
        "true_output": true_output,
        "evaluation": evaluation,
        "Coherence": scores["Coherence"],
        "Completeness": scores["Completeness"],
        "Naturalness": scores["Naturalness"],
        "BLEURT": metric_scores.get("BLEURT", 0.0),  # 기본값 설정
        "BERTScore_F1": metric_scores.get("BERTScore_F1", 0.0)
    })

evaluation_df = pd.DataFrame(results)

# 'BLEURT' 컬럼이 존재하는지 확인 후 정규화 수행
evaluation_df = normalize_scores(evaluation_df, "BLEURT")
evaluation_df = normalize_scores(evaluation_df, "BERTScore_F1")

average_scores = evaluation_df[["Coherence", "Completeness", "Naturalness", "BLEURT", "BERTScore_F1"]].mean()
print("평균 점수:")
print(average_scores)


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 2.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 5.0
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input, here is the evaluation of the QA model's response:

- Coherence: 2.0
The response lacks coherence as it does not align logically with the context provided in the input. It goes off on a tangent discussing different aspects of diabetes treatment and fails to directly address the content of the input.

- Completeness: 1.5
The response is not sufficiently complete as it does not effectively answer the question in the input. It focuses more on unrelated details and opinions rather than addressing the main points of the information provided.

- Naturalness: 2.0
The response lacks naturalness as it sounds repetitive and lacks variety in sentence structure. The content feels forced and does not flow smoothly, detracting from its human-like quality.

Overall, the QA model's response needs improvement in coherence, completeness, and naturalness to better align with the input and provide a more accurate and well-structured answer.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the given metrics, here is the evaluation of the QA model's response:

- Coherence: 2.0
- Completeness: 2.5
- Naturalness: 2.0

The model's response somewhat aligns with the context provided, but it lacks depth and relevant details. The response is partially complete in addressing the question but falls short in providing a comprehensive answer. In terms of naturalness, the response lacks fluency and human-like quality.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 5.0
- Completeness: 4.0
- Naturalness: 4.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 1.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 2.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.5 
- Completeness: 2.0 
- Naturalness: 1.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 2.0
- Naturalness: 1.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 1.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


평균 점수:
Coherence       3.535714
Completeness    3.190476
Naturalness     3.119048
BLEURT          0.480314
BERTScore_F1    0.506394
dtype: float64


In [14]:
import openai
import pandas as pd
import re
from sklearn.metrics import f1_score
from bert_score import score as bert_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# BLEURT 모델 로드
bleurt_model_name = "Elron/bleurt-large-512"
tokenizer = AutoTokenizer.from_pretrained(bleurt_model_name)
bleurt_model = AutoModelForSequenceClassification.from_pretrained(bleurt_model_name)
bleurt_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bleurt_model = bleurt_model.to(device)

MAX_CONTEXT_LENGTH = 8192

# GPT-4 평가 함수
def evaluate_with_gpt4(input_text, model_output, true_output):
    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    prompt = f"""
    You are tasked with evaluating the quality of a QA model's responses based on the following metrics:
    1. **Coherence**: Does the model's response logically align with the context provided in the input?
    2. **Completeness**: Does the model's response sufficiently answer the question in the input?
    3. **Naturalness**: Does the model's response sound fluent and human-like?

    **Input**:
    {input_text}

    **Model's Response**:
    {model_output}

    **True Answer**:
    {true_output}

    Please rate each metric on a scale from 1 to 5. 
    Example response format:
    - Coherence: X.X
    - Completeness: X.X
    - Naturalness: X.X
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for Summarization models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

# GPT-4 점수 추출
def extract_scores(evaluation):
    if evaluation is None:
        return {"Coherence": 0.0, "Completeness": 0.0, "Naturalness": 0.0}
    coherence = re.search(r"Coherence: (\d\.\d)", evaluation)
    completeness = re.search(r"Completeness: (\d\.\d)", evaluation)
    naturalness = re.search(r"Naturalness: (\d\.\d)", evaluation)
    
    return {
        "Coherence": float(coherence.group(1)) if coherence else 0.0,
        "Completeness": float(completeness.group(1)) if completeness else 0.0,
        "Naturalness": float(naturalness.group(1)) if naturalness else 0.0
    }

# BLEURT 점수 계산
def calculate_bleurt(y_true, y_pred):
    inputs = tokenizer(y_pred, y_true, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        scores = bleurt_model(**inputs).logits

    if scores.numel() == 1:
        return float(scores.squeeze().item())  
    return [float(score) for score in scores.squeeze().tolist()]  

# BLEURT 및 BERTScore 계산
def calculate_bleurt_and_bertscore(y_true, y_pred):
    bleurt_score_value = calculate_bleurt(y_true, y_pred)
    _, _, bert_f1 = bert_score(y_pred, y_true, lang="en", rescale_with_baseline=True)
    bert_f1_avg = sum(bert_f1) / len(bert_f1) if len(bert_f1) > 0 else 0

    return {
        "BLEURT": bleurt_score_value if isinstance(bleurt_score_value, float) else sum(bleurt_score_value) / len(bleurt_score_value),
        "BERTScore_F1": bert_f1_avg
    }

# 점수 정규화 함수
def normalize_scores(df, column):
    if column not in df.columns:
        print(f"Warning: Column {column} not found in DataFrame. Skipping normalization.")
        return df
    df[column] = df[column].apply(lambda x: float(x) if isinstance(x, torch.Tensor) else x)
    min_val, max_val = df[column].min(), df[column].max()
    df[column] = df[column].apply(lambda x: (x - min_val) / (max_val - min_val) if max_val > min_val else 0.5)
    return df

qa_df = df[df['task'] == 'generation']
results = []

for _, row in qa_df.iterrows():
    input_text, model_output, true_output = row['input'], row['model_output'], row['output']
    
    evaluation = evaluate_with_gpt4(input_text, model_output, true_output)
    print(f"Evaluation result:\n{evaluation}")
    
    scores = extract_scores(evaluation)
    metric_scores = calculate_bleurt_and_bertscore([true_output], [model_output])

    results.append({
        "input": input_text,
        "model_output": model_output,
        "true_output": true_output,
        "evaluation": evaluation,
        "Coherence": scores["Coherence"],
        "Completeness": scores["Completeness"],
        "Naturalness": scores["Naturalness"],
        "BLEURT": metric_scores.get("BLEURT", 0.0),  # 기본값 설정
        "BERTScore_F1": metric_scores.get("BERTScore_F1", 0.0)
    })

evaluation_df = pd.DataFrame(results)

# 'BLEURT' 컬럼이 존재하는지 확인 후 정규화 수행
evaluation_df = normalize_scores(evaluation_df, "BLEURT")
evaluation_df = normalize_scores(evaluation_df, "BERTScore_F1")

average_scores = evaluation_df[["Coherence", "Completeness", "Naturalness", "BLEURT", "BERTScore_F1"]].mean()
print("평균 점수:")
print(average_scores)


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 3.5
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 2.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 2.0
- Naturalness: 1.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 1.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 1.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 2.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided input and the comparison between the model's response and the true answer, here are the ratings for each metric:

- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 3.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the metrics provided, here is the evaluation of the model's response:

- Coherence: 3.0
The response is somewhat coherent with the input context as it addresses the issue of erectile dysfunction and its connection to atherosclerosis and high cholesterol. However, the repetitive phrases towards the end detract from overall coherence.

- Completeness: 2.5
The response partially answers the question by discussing the relationship between atherosclerosis, high cholesterol, and erectile dysfunction. However, it lacks specific information on whether the condition can be managed through Siddha medicines, as requested in the input.

- Naturalness: 2.0
The response lacks naturalness and fluency due to the repetitive nature of some phrases. It does not sound human-like and feels more like a generic recommendation rather than a personalized response.

Overall, the model's response needs improvement in completeness, coherence, and naturalness to provide a more effective

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 2.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 2.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided metrics and the input and model's response, here is the evaluation of the QA model's responses:

- **Coherence: 4.5**
  - The model's response is highly coherent with the input context provided. It explains the left ventricular diastolic dysfunction in detail and how it can lead to serious complications if left untreated. The information provided aligns well with the user's concerns about the severity and implications of the condition.

- **Completeness: 4.0**
  - The model's response is quite comprehensive in covering various aspects of left ventricular diastolic dysfunction, including its stages, symptoms, treatments, and the importance of consulting healthcare providers. However, it lacks specific questions related to the type of dysfunction, ejection fraction, and the request for the 2D echo report made in the true answer. Therefore, it falls slightly short in completeness.

- **Naturalness: 4.0**
  - The response sounds fluent, informative,

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 2.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


평균 점수:
Coherence       3.083333
Completeness    2.928571
Naturalness     2.809524
BLEURT          0.527128
BERTScore_F1    0.700717
dtype: float64


In [13]:
df[df['task']=='generation'].head(10)

Unnamed: 0,dataset,split_data,task,instruction,input,output,__index_level_0__,model_output
400,icliniq,test,generation,Generate a comprehensive medical response base...,"Hello doctor,I am a male aged 29 years 3 month...",Hello. I understand your concern. Your HbA1c r...,26185,"Hello,Thank you for writing to us. The best te..."
401,icliniq,test,generation,Generate a comprehensive medical response base...,"Hello doctor,I am super scared. I am 30 years ...",Hi. There can be many idiopathic causes of a n...,26168,"Hello, thank you for sharing your concerns wit..."
402,icliniq,test,generation,Generate a comprehensive medical response base...,"Hi doctor,I am 49 years old male. For the past...","Hi, Welcome to Chat Doctor forum. I need a few...",26182,"Hi, I would recommend you to stop taking the a..."
403,icliniq,test,generation,Generate a comprehensive medical response base...,"Hi doctor,My mother got a sudden heart pain. I...","Hi, Welcome to Chat Doctor forum. An angiogram...",26181,"Dear patient, thank you for sharing your conce..."
404,icliniq,test,generation,Generate a comprehensive medical response base...,"Hello doctor,Four months ago, I consulted you ...","Hello. Looking at the overall picture, the pro...",26207,"Dear Mr. XXXXXXX, Thank you for your query. I ..."
405,icliniq,test,generation,Generate a comprehensive medical response base...,"Hi doctor,I pee very often (even though I am y...","Hi, Welcome to Chat Doctor forum. You should r...",26208,"Yes, urination is very frequent. It is called ..."
406,icliniq,test,generation,Generate a comprehensive medical response base...,"Hello doctor,I am a diabetes patient. My sugar...","Hello, Welcome to Chat Doctor forum. Ketones u...",26210,"Hello, I am Dr. Rahul. I would suggest you to ..."
407,icliniq,test,generation,Generate a comprehensive medical response base...,"Hello doctor,I feel listless and tired when I ...","Hello, Welcome to Chat Doctor forum. The cause...",26204,"Dear Patient, I would like to know about your ..."
408,icliniq,test,generation,Generate a comprehensive medical response base...,"Hi doctor,I am a 58 year old female. I am suff...",Hello. I had verified your report (attachment ...,26152,Thank you for sharing the reports. I reviewed ...
409,icliniq,test,generation,Generate a comprehensive medical response base...,"Hello doctor, I have PCOS and irregular period...",Hello. Diane-35 is taken for 21 consecutive da...,26160,"Hi, It is unlikely that you are pregnant. Howe..."


In [25]:
import re
from sklearn.metrics import accuracy_score

def extract_choice(text):
    match = re.search(r'\b([A-D])\b', text)
    return match.group(1) if match else None

df['output_parsed'] = df['output'].apply(extract_choice)
df['model_output_parsed'] = df['model_output'].apply(extract_choice)

filtered_df = df.dropna(subset=['output_parsed', 'model_output_parsed'])

accuracy = accuracy_score(filtered_df['output_parsed'], filtered_df['model_output_parsed'])

print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.3900


In [15]:
# daily diets

import openai
import pandas as pd
import re
from tqdm import tqdm

from dotenv import load_dotenv
import os

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

MAX_CONTEXT_LENGTH = 8192

def evaluate_with_gpt4(input_text, model_output, true_output, task_type):
    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    if task_type == "daily_diets":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:
        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Nutritional Adequacy**: Does the meal align with the nutritional goals in the input, considering reasonable flexibility and practical applicability in real-life scenarios?
        5. **Caloric Balance**: Are the recommended meals well-balanced in terms of calories?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Nutritional Adequacy: X.X
        - Caloric Balance: X.X
        """
    elif task_type == "alternative_diets":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:
        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Improvement**: Does the recommended meal address the shortcomings of the previous meal?
        5. **Suitability**: Is the recommended meal suitable for a diabetes patient?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Improvement: X.X
        - Suitability: X.X
        """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for meal recommendation models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

# 점수 추출 함수
def extract_scores(evaluation, task_type):
    if evaluation is None:
        if task_type == "daily_diets":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Nutritional Adequacy": 0.0,
                "Caloric Balance": 0.0
            }
        elif task_type == "alternative_diets":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Improvement": 0.0,
                "Suitability": 0.0
            }

    scores = {}
    if task_type == "daily_diets":
        metrics = ["Coherence", "Completeness", "Naturalness", "Nutritional Adequacy", "Caloric Balance"]
    elif task_type == "alternative_diets":
        metrics = ["Coherence", "Completeness", "Naturalness", "Improvement", "Suitability"]

    for metric in metrics:
        match = re.search(fr"{metric}: (\d\.\d)", evaluation)
        scores[metric] = float(match.group(1)) if match else 0.0

    return scores

results = []

daily_df = df[(df['task'] == 'daily_diets') & (df['output'].str.contains('"Breakfast"'))]

for _, row in tqdm(daily_df.iterrows(), total=len(daily_df), desc="Evaluating daily diets"):
    evaluation = evaluate_with_gpt4(row['input'], row['model_output'], row['output'], "daily_diets")
    scores = extract_scores(evaluation, "daily_diets")
    results.append({**row.to_dict(), **scores})

alternative_df = df[df['task'] == 'alternative_diets']

for _, row in tqdm(alternative_df.iterrows(), total=len(alternative_df), desc="Evaluating alternative diets"):
    evaluation = evaluate_with_gpt4(row['input'], row['model_output'], row['output'], "alternative_diets")
    scores = extract_scores(evaluation, "alternative_diets")
    results.append({**row.to_dict(), **scores})

evaluation_df = pd.DataFrame(results)

if "Nutritional Adequacy" in evaluation_df.columns and "Caloric Balance" in evaluation_df.columns:
    daily_avg = evaluation_df[evaluation_df['task'] == 'daily_diets'][[
        "Coherence", "Completeness", "Naturalness", 
        "Nutritional Adequacy", "Caloric Balance"
    ]].mean()
    print("Daily Diets Average Scores:")
    print(daily_avg)

if "Improvement" in evaluation_df.columns and "Suitability" in evaluation_df.columns:
    alternative_avg = evaluation_df[evaluation_df['task'] == 'alternative_diets'][[
        "Coherence", "Completeness", "Naturalness", 
        "Improvement", "Suitability"
    ]].mean()
    print("\nAlternative Diets Average Scores:")
    print(alternative_avg)
else:
    missing_columns = [col for col in ["Improvement", "Suitability"] if col not in evaluation_df.columns]
    print("\nAlternative Diets scores not available:")
    print(f"Missing columns: {missing_columns}")


Evaluating daily diets: 100%|██████████| 43/43 [00:38<00:00,  1.11it/s]
Evaluating alternative diets: 0it [00:00, ?it/s]

Daily Diets Average Scores:
Coherence               3.476744
Completeness            2.906977
Naturalness             3.813953
Nutritional Adequacy    2.767442
Caloric Balance         3.151163
dtype: float64

Alternative Diets scores not available:
Missing columns: ['Improvement', 'Suitability']





In [16]:
# alternative diets

import openai
import pandas as pd
import re
from tqdm import tqdm

from dotenv import load_dotenv
import os

load_dotenv()


openai.api_key = os.getenv("OPENAI_API_KEY")

MAX_CONTEXT_LENGTH = 8192

def evaluate_with_gpt4(input_text, model_output, true_output, task_type):

    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    if task_type == "daily_diets":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:
        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Nutritional Adequacy**: Does the meal response meet the nutritional goals mentioned in the input?
        5. **Caloric Balance**: Are the recommended meals well-balanced in terms of calories?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Nutritional Adequacy: X.X
        - Caloric Balance: X.X
        """
    elif task_type == "alternative_diet":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:

        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Improvement**: Assume that the recommended alternative meal is an improvement over the previous meal. Evaluate how effectively it builds upon and enhances the previous meal, even if the changes are small or subtle.
        5. **Suitability**: Is the recommended meal suitable for a diabetes patient?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Improvement: X.X
        - Suitability: X.X
        """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for meal recommendation models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

def extract_scores(evaluation, task_type):
    if evaluation is None:
        if task_type == "daily_diets":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Nutritional Adequacy": 0.0,
                "Caloric Balance": 0.0
            }
        elif task_type == "alternative_diet":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Improvement": 0.0,
                "Suitability": 0.0
            }

    scores = {}
    if task_type == "daily_diets":
        metrics = ["Coherence", "Completeness", "Naturalness", "Nutritional Adequacy", "Caloric Balance"]
    elif task_type == "alternative_diet":
        metrics = ["Coherence", "Completeness", "Naturalness", "Improvement", "Suitability"]

    for metric in metrics:
        match = re.search(fr"{metric}: (\d\.\d)", evaluation)
        scores[metric] = float(match.group(1)) if match else 0.0

    return scores

results = []

alternative_df = df[df['task'] == 'alternative_diet']

for _, row in tqdm(alternative_df.iterrows(), total=len(alternative_df), desc="Evaluating alternative diets"):
    evaluation = evaluate_with_gpt4(row['input'], row['model_output'], row['output'], "alternative_diet")
    scores = extract_scores(evaluation, "alternative_diet")
    results.append({**row.to_dict(), **scores})

evaluation_df = pd.DataFrame(results)

if all(col in evaluation_df.columns for col in ["Improvement", "Suitability"]):
    alternative_avg = evaluation_df[[
        "Coherence", "Completeness", "Naturalness", 
        "Improvement", "Suitability"
    ]].mean()
    print("\nAlternative Diets Average Scores:")
    print(alternative_avg)


Evaluating alternative diets: 100%|██████████| 50/50 [00:46<00:00,  1.07it/s]


Alternative Diets Average Scores:
Coherence       4.37
Completeness    3.94
Naturalness     4.15
Improvement     3.45
Suitability     4.05
dtype: float64





In [None]:
# qa_subjective

import openai
import pandas as pd
import re
from sklearn.metrics import f1_score
from bert_score import score as bert_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

from dotenv import load_dotenv
import os

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

bleurt_model_name = "Elron/bleurt-large-512"
tokenizer = AutoTokenizer.from_pretrained(bleurt_model_name)
bleurt_model = AutoModelForSequenceClassification.from_pretrained(bleurt_model_name)
bleurt_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bleurt_model = bleurt_model.to(device)

MAX_CONTEXT_LENGTH = 8192

def evaluate_with_gpt4(input_text, model_output, true_output):
    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    prompt = f"""
    You are tasked with evaluating the quality of a QA model's responses based on the following metrics:
    1. **Coherence**: Does the model's response logically align with the context provided in the input?
    2. **Completeness**: Does the model's response sufficiently answer the question in the input?
    3. **Naturalness**: Does the model's response sound fluent and human-like?

    **Input**:
    {input_text}

    **Model's Response**:
    {model_output}

    **True Answer**:
    {true_output}

    Please rate each metric on a scale from 1 to 5. 
    Example response format:
    - Coherence: X.X
    - Completeness: X.X
    - Naturalness: X.X
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for QA models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

def extract_scores(evaluation):
    if evaluation is None:
        return {
            "Coherence": 0.0,
            "Completeness": 0.0,
            "Naturalness": 0.0
        }
    coherence = re.search(r"Coherence: (\d\.\d)", evaluation)
    completeness = re.search(r"Completeness: (\d\.\d)", evaluation)
    naturalness = re.search(r"Naturalness: (\d\.\d)", evaluation)
    
    return {
        "Coherence": float(coherence.group(1)) if coherence else 0.0,
        "Completeness": float(completeness.group(1)) if completeness else 0.0,
        "Naturalness": float(naturalness.group(1)) if naturalness else 0.0
    }

def calculate_concept_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average="macro")


def calculate_bleurt(y_true, y_pred):
    inputs = tokenizer(y_pred, y_true, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        scores = bleurt_model(**inputs).logits

    if scores.numel() == 1:
        return float(scores.squeeze().item()) 
    return [float(score) for score in scores.squeeze().tolist()] 

def calculate_bleurt_and_bertscore(y_true, y_pred):
    bleurt_score_value = calculate_bleurt(y_true, y_pred)

    _, _, bert_f1 = bert_score(y_pred, y_true, lang="en", rescale_with_baseline=True)

    bert_f1 = [float(score) for score in bert_f1]

    return {
        "BLEURT": bleurt_score_value if isinstance(bleurt_score_value, float) else sum(bleurt_score_value) / len(bleurt_score_value),
        "BERTScore_F1": sum(bert_f1) / len(bert_f1)
    }

def normalize_scores(df, column):

    df[column] = df[column].apply(
        lambda x: float(x) if isinstance(x, torch.Tensor) else x
    )
    min_val = df[column].min()
    max_val = df[column].max()

    df[column] = df[column].apply(
        lambda x: (x - min_val) / (max_val - min_val) if max_val > min_val else 0.5
    )
    return df

qa_df = df[df['task'] == 'qa_subjective']

results = []

for _, row in qa_df.iterrows():
    input_text = row['input']
    model_output = row['model_output']
    true_output = row['output']
    
    evaluation = evaluate_with_gpt4(input_text, model_output, true_output)
    print(f"Evaluation result:\n{evaluation}")
    
    scores = extract_scores(evaluation)
    
    concept_f1 = calculate_concept_f1([true_output], [model_output])
    
    metric_scores = calculate_bleurt_and_bertscore([true_output], [model_output])

    results.append({
        "input": input_text,
        "model_output": model_output,
        "true_output": true_output,
        "evaluation": evaluation,
        "Coherence": scores["Coherence"],
        "Completeness": scores["Completeness"],
        "Naturalness": scores["Naturalness"],
        "Concept_F1": concept_f1,
        "BLEURT": metric_scores["BLEURT"],
        "BERTScore_F1": metric_scores["BERTScore_F1"]
    })

evaluation_df = pd.DataFrame(results)

evaluation_df = normalize_scores(evaluation_df, "BLEURT")
evaluation_df = normalize_scores(evaluation_df, "BERTScore_F1")

average_scores = evaluation_df[["Coherence", "Completeness", "Naturalness", "Concept_F1", "BLEURT", "BERTScore_F1"]].mean()

print("평균 점수:")
print(average_scores)

Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.3


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 2.5
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 2.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 1.0
- Naturalness: 1.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 3.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 1.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 1.0
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided scenario, here is the evaluation of the QA model's response:

- **Coherence**: 1.0  
The model's response lacks coherence with the context provided in the input. While the input detailed a specific case of an individual experiencing erectile dysfunction and seeking guidance, the model's response lists a plethora of potential causes without addressing the individual's specific situation.

- **Completeness**: 1.0  
The model's response is not complete as it fails to sufficiently answer the specific questions and concerns raised in the input. It provides a broad list of potential causes without addressing the individual's medical history, symptoms, or the effects of the medication they are currently taking.

- **Naturalness**: 1.0  
The response lacks naturalness as it repeats a long list of potential causes in a repetitive and unnatural manner. The response sounds robotic, lacks empathy, and does not engage with the individual seeking help.

Overa

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 2.5
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 2.0
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 4.5
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 4.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 2.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 3.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 2.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


평균 점수:
Coherence       3.310000
Completeness    2.920000
Naturalness     3.180000
Concept_F1      0.000000
BLEURT          0.515577
BERTScore_F1    0.639133
dtype: float64


In [9]:
## nutri score, HEI

df[df['task']=='daily_diets'].head(2)

Unnamed: 0,dataset,split_data,task,instruction,input,output,__index_level_0__,model_output,output_parsed,model_output_parsed
430,diabetes_food_hub,test,daily_diets,Recommend a daily diet based on the given nutr...,Ensure the daily carbohydrate intake does not ...,"{""Breakfast"": ""Chicken and Cucumber Lettuce Wr...",32166,"{""Breakfast"": ""Turkey Sausage and Egg Casserol...",,
431,diabetes_food_hub,test,daily_diets,Recommend a daily diet that includes a specifi...,Create a diet that includes Parmesan cheese(gr...,"{""Breakfast"": ""Chicken and Cucumber Lettuce Wr...",32167,"{""Breakfast"": ""Turkey Meatball \u201cWonton\u2...",,


In [10]:
df[df['task']=='daily_diets'].iloc[1]['model_output']

'{"Breakfast": "Turkey Meatball \\u201cWonton\\u201d Soup with Bok Choy & Carrots", "Lunch": "Peach & Cream Smoothie", "Dinner": "Curried Chickpea Stew with Roasted Vegetables"}'

In [17]:
import pandas as pd

dfh = pd.read_csv("/data/jaesung/llm_for_diabetes/src/data/data2_daily_diets/diabetes_food_hub_new_nutri_facts.csv")
dfh.head(2)

Unnamed: 0,title,description,prep_time,cook_time,servings,steps,tags,nutrition_facts,ingredients
0,Raspberry Swirl Frozen Yogurt Bark,Raspberry Swirl Frozen Yogurt Bark: Dive into ...,10 min,4 hr,6 Servings,['Cover a freezer-safe tray with parchment pap...,"['Kid Friendly', 'Vegetarian', 'Dessert', 'Sna...","{'Servings': '6 Servings', 'Serving Size': '1 ...","[{'label': 'Plain Nonfat Greek yogurt', 'us_me..."
1,Maple-Pumpkin Spice Oatmeal Cookies,Description not found,10 min,25 min,14 Servings,['Preheat the oven to 350 degrees F. Line two ...,"['Kid Friendly', 'Vegetarian', 'Snacks', 'Glut...","{'Servings': '14 Servings', 'Serving Size': '1...","[{'label': 'old-fashioned rolled oats', 'us_me..."


In [18]:
print(dfh['nutrition_facts'].iloc[0])

{'Servings': '6 Servings', 'Serving Size': '1 slice (4×4 inch square)', 'Amount per Serving': {'Calories': '70', 'Total Fat': {'Amount': '1g', 'Saturated Fat': '1g', 'Trans Fat': '0g'}, 'Cholesterol': '5mg', 'Sodium': '30mg', 'Total Carbohydrates': {'Amount': '19g', 'Dietary Fiber': '1g', 'Total Sugars': '4g', 'Added Sugars': '0g'}, 'Protein': '8g', 'Potassium': '140mg', 'Phosphorus': None}}


In [19]:
print(type(dfh['nutrition_facts'].iloc[0]))

<class 'str'>


In [20]:
dfh['ingredients'].iloc[0]

"[{'label': 'Plain Nonfat Greek yogurt', 'us_measure': '2 cup', 'metric_measure': '473 ml'}, {'label': 'pure vanilla extract', 'us_measure': '1 tsp', 'metric_measure': '5 g'}, {'label': 'Splenda® Multi-Use Syrup(divided)', 'us_measure': '1/3 cup', 'metric_measure': '78 ml'}, {'label': 'fresh or frozen raspberries', 'us_measure': '1/2 cup', 'metric_measure': '118 ml'}, {'label': 'Fresh Blueberries', 'us_measure': '1/2 cup', 'metric_measure': '118 ml'}, {'label': 'unsweetened coconut flakes', 'us_measure': '1 tbsp', 'metric_measure': '15 ml'}]"

In [21]:
dfh.columns

Index(['title', 'description', 'prep_time', 'cook_time', 'servings', 'steps',
       'tags', 'nutrition_facts', 'ingredients'],
      dtype='object')

In [22]:
dfh.head(2)

Unnamed: 0,title,description,prep_time,cook_time,servings,steps,tags,nutrition_facts,ingredients
0,Raspberry Swirl Frozen Yogurt Bark,Raspberry Swirl Frozen Yogurt Bark: Dive into ...,10 min,4 hr,6 Servings,['Cover a freezer-safe tray with parchment pap...,"['Kid Friendly', 'Vegetarian', 'Dessert', 'Sna...","{'Servings': '6 Servings', 'Serving Size': '1 ...","[{'label': 'Plain Nonfat Greek yogurt', 'us_me..."
1,Maple-Pumpkin Spice Oatmeal Cookies,Description not found,10 min,25 min,14 Servings,['Preheat the oven to 350 degrees F. Line two ...,"['Kid Friendly', 'Vegetarian', 'Snacks', 'Glut...","{'Servings': '14 Servings', 'Serving Size': '1...","[{'label': 'old-fashioned rolled oats', 'us_me..."


In [23]:
dfh['ingredients'].iloc[5]

"[{'label': 'canned no-salt added tomato sauce(for the BBQ sauce)', 'us_measure': '1/2 cup', 'metric_measure': '118 ml'}, {'label': 'Splenda® Monk Fruit Granulated Sweetener(for the BBQ sauce)', 'us_measure': '3 tbsp', 'metric_measure': '44 ml'}, {'label': 'Apple Cider Vinegar(for the BBQ sauce)', 'us_measure': '1 tbsp', 'metric_measure': '15 ml'}, {'label': 'Worcestershire sauce(for the BBQ sauce)', 'us_measure': '1 tbsp', 'metric_measure': '15 ml'}, {'label': 'Yellow Mustard(for the BBQ sauce)', 'us_measure': '2 tsp', 'metric_measure': '10 g'}, {'label': 'liquid smoke(for the BBQ sauce)', 'us_measure': '1/2 tsp', 'metric_measure': '2 g'}, {'label': 'onion powder(divided use)', 'us_measure': '1 1/4 tsp', 'metric_measure': '6 g'}, {'label': 'garlic powder(divided use)', 'us_measure': '1 tsp', 'metric_measure': '5 g'}, {'label': 'celery seed(for the BBQ sauce)', 'us_measure': '1/8 tsp', 'metric_measure': '1 g'}, {'label': 'Kosher Salt(divided use)', 'us_measure': '1/4 tsp', 'metric_meas

In [24]:
dfh['tags'].iloc[0]

"['Kid Friendly', 'Vegetarian', 'Dessert', 'Snacks']"

In [25]:
# data2 - nutri score

import ast
import pandas as pd
import json
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_numeric_value(value):
    try:
        if isinstance(value, str):
            match = re.search(r"(\d+(\.\d+)?)", value)
            if match:
                return float(match.group(1))
        elif isinstance(value, (int, float)):
            return float(value)
    except Exception as e:
        print(f"Error in extract_numeric_value: {e}, value: {value}")
    return 0.0

def is_valid_meal_structure(json_string):
    try:
        data = json.loads(json_string)
        return all(key in data for key in ['Breakfast', 'Lunch', 'Dinner'])
    except (json.JSONDecodeError, TypeError):
        return False

def find_most_similar_row(title, dfh):
    try:
        dfh['title'] = dfh['title'].fillna('')  # NaN 처리
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(dfh['title'])
        input_vector = vectorizer.transform([title])
        similarities = cosine_similarity(input_vector, tfidf_matrix)
        most_similar_idx = similarities.argmax()
        return dfh.iloc[most_similar_idx]
    except Exception as e:
        print(f"Error in find_most_similar_row: {e}, title: {title}")
        return None

def extract_nested_value(data, keys, default=0):
    try:
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, {})
            else:
                return default
        return extract_numeric_value(data)
    except Exception as e:
        print(f"Error in extract_nested_value: {e}, keys: {keys}, data: {data}")
        return default

def calculate_nutri_score(nutrition_facts):
    try:
        if isinstance(nutrition_facts, str):
            nutrition_facts = ast.literal_eval(nutrition_facts)

        energy = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Calories'])
        saturated_fat = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Fat', 'Amount'])
        sugar = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Total Sugars'])
        sodium = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Sodium'])
        fiber = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Dietary Fiber'])
        protein = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Protein'])

        # 불리한 점수 계산
        energy_points = min(energy / 80, 800)
        saturated_fat_points = min(saturated_fat / 1, 10)
        sugar_points = min(sugar / 4.5, 45)
        sodium_points = min(sodium / 90, 900)

        unfavorable_points = energy_points + saturated_fat_points + sugar_points + sodium_points

        # 유리한 점수 계산
        fiber_points = min(fiber / 0.7, 3.5)
        protein_points = min(protein / 1.6, 8.0)
        fruit_veg_points = 0  # 과일/야채 정보가 없으므로 기본값 사용

        favorable_points = fiber_points + protein_points + fruit_veg_points

        # 최종 Nutri-Score 계산
        total_score = unfavorable_points - favorable_points
        return total_score
    except Exception as e:
        print(f"Error in calculate_nutri_score: {e}, nutrition_facts: {nutrition_facts}")
        return None

def get_nutri_score_grade(score):
    """
    Nutri-Score 등급 계산.
    """
    if score <= -1:
        return "A"
    elif score <= 2:
        return "B"
    elif score <= 10:
        return "C"
    elif score <= 18:
        return "D"
    else:
        return "E"

def calculate_meal_nutri_score(meal_data, dfh):
    meal_scores = {}

    for meal, title in meal_data.items():
        matched_row = find_most_similar_row(title, dfh)
        if matched_row is None:
            continue

        nutrition_facts = matched_row['nutrition_facts']
        score = calculate_nutri_score(nutrition_facts)
        grade = get_nutri_score_grade(score)

        meal_scores[meal] = {
            'score': score,
            'grade': grade
        }

    return meal_scores

def calculate_scores_with_comparison(df, dfh):
    """
    output 및 model_output Nutri-Score를 계산하고 비교.
    """
    results = []
    output_scores_list = []
    model_output_scores_list = []

    for idx, row in df.iterrows():
        try:
            if is_valid_meal_structure(row.get('output', '')):
                output_data = json.loads(row['output'])
                output_scores = calculate_meal_nutri_score(output_data, dfh)
                output_avg_score = sum([meal['score'] for meal in output_scores.values() if meal['score'] is not None]) / len(output_scores)
                output_scores_list.append(output_avg_score)
            else:
                output_avg_score = None

            if is_valid_meal_structure(row.get('model_output', '')):
                model_output_data = json.loads(row['model_output'])
                model_output_scores = calculate_meal_nutri_score(model_output_data, dfh)
                model_output_avg_score = sum([meal['score'] for meal in model_output_scores.values() if meal['score'] is not None]) / len(model_output_scores)
                model_output_scores_list.append(model_output_avg_score)
            else:
                model_output_avg_score = None

            results.append({
                'row_index': idx,
                'output_avg_score': output_avg_score,
                'model_output_avg_score': model_output_avg_score
            })

        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            results.append({
                'row_index': idx,
                'output_avg_score': None,
                'model_output_avg_score': None
            })

    final_output_avg = sum(output_scores_list) / len(output_scores_list) if output_scores_list else None
    final_model_output_avg = sum(model_output_scores_list) / len(model_output_scores_list) if model_output_scores_list else None

    print(f"Output 평균 Nutri-Score: {final_output_avg}")
    print(f"Model Output 평균 Nutri-Score: {final_model_output_avg}")

    return results

filtered_df = df[df['task'] == 'daily_diets'] 
results = calculate_scores_with_comparison(filtered_df, dfh)

for result in results:
    print(result)


Output 평균 Nutri-Score: 5.573909499200197
Model Output 평균 Nutri-Score: 6.3296942291128335
{'row_index': 50, 'output_avg_score': 2.8386243386243386, 'model_output_avg_score': 4.587962962962964}
{'row_index': 51, 'output_avg_score': 7.407407407407407, 'model_output_avg_score': 1.7923280423280425}
{'row_index': 52, 'output_avg_score': 3.396759259259259, 'model_output_avg_score': 3.3247354497354507}
{'row_index': 53, 'output_avg_score': 2.7268518518518525, 'model_output_avg_score': 0.9325396825396824}
{'row_index': 54, 'output_avg_score': 1.48558201058201, 'model_output_avg_score': 7.513888888888889}
{'row_index': 55, 'output_avg_score': None, 'model_output_avg_score': None}
{'row_index': 56, 'output_avg_score': 3.247354497354498, 'model_output_avg_score': 4.587962962962964}
{'row_index': 57, 'output_avg_score': 4.622354497354498, 'model_output_avg_score': 6.504629629629631}
{'row_index': 58, 'output_avg_score': 6.909722222222221, 'model_output_avg_score': 1.6818783068783076}
{'row_index': 

In [26]:
df[df['task'] == 'alternative_diet'].head(2)

Unnamed: 0,dataset,split_data,task,instruction,input,output,__index_level_0__,model_output
0,diabetes_food_hub,test,alternative_diet,"Based on the previous meal, suggest the next m...","Cheese, cream",Herbed Soft Scrambled Eggs on Toast is recomme...,36153,Grilled Lime Chicken Fajitas is recommended. T...
1,diabetes_food_hub,test,alternative_diet,"Based on the previous meal, suggest the next m...","Turkey, ground",Almost Smooth Salsa is recommended. The reason...,36229,Grilled Lime Chicken Fajitas is recommended. T...


In [27]:
df[df['task']=='alternative_diet']['model_output'].iloc[2]

'Mediterranean Lentil Soup is recommended. The reason is Mediterranean Lentil Soup complements Ginger root, pickled as it provides a good source of protein, fiber, and potassium which are lacking in the pickled ginger root. However, caution should be taken with the sodium content in the soup, as it may be high for some individuals.'

In [28]:
# nutri score - alternative diet

import ast
import pandas as pd
import json
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_numeric_value(value):
    try:
        if isinstance(value, str):
            match = re.search(r"(\d+(\.\d+)?)", value)
            if match:
                return float(match.group(1))
        elif isinstance(value, (int, float)):
            return float(value)
    except Exception as e:
        print(f"Error in extract_numeric_value: {e}, value: {value}")
    return 0.0

def is_valid_meal_structure(json_string):
    try:
        data = json.loads(json_string)
        return isinstance(data, dict)
    except (json.JSONDecodeError, TypeError):
        return False

def find_most_similar_row(title, dfh):
    try:
        dfh['title'] = dfh['title'].fillna('')  # NaN 처리
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(dfh['title'])
        input_vector = vectorizer.transform([title])
        similarities = cosine_similarity(input_vector, tfidf_matrix)
        most_similar_idx = similarities.argmax()
        return dfh.iloc[most_similar_idx]
    except Exception as e:
        print(f"Error in find_most_similar_row: {e}, title: {title}")
        return None

def extract_nested_value(data, keys, default=0):
    try:
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, {})
            else:
                return default
        return extract_numeric_value(data)
    except Exception as e:
        print(f"Error in extract_nested_value: {e}, keys: {keys}, data: {data}")
        return default

def calculate_nutri_score(nutrition_facts):
    try:
        if isinstance(nutrition_facts, str):
            nutrition_facts = ast.literal_eval(nutrition_facts)

        energy = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Calories'])
        saturated_fat = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Fat', 'Amount'])
        sugar = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Total Sugars'])
        sodium = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Sodium'])
        fiber = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Dietary Fiber'])
        protein = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Protein'])

        energy_points = min(energy / 80, 800)
        saturated_fat_points = min(saturated_fat / 1, 10)
        sugar_points = min(sugar / 4.5, 45)
        sodium_points = min(sodium / 90, 900)

        unfavorable_points = energy_points + saturated_fat_points + sugar_points + sodium_points

        fiber_points = min(fiber / 0.7, 3.5)
        protein_points = min(protein / 1.6, 8.0)
        fruit_veg_points = 0  # 과일/야채 정보가 없으므로 기본값 사용

        favorable_points = fiber_points + protein_points + fruit_veg_points

        total_score = unfavorable_points - favorable_points
        return total_score
    except Exception as e:
        print(f"Error in calculate_nutri_score: {e}, nutrition_facts: {nutrition_facts}")
        return None

def get_nutri_score_grade(score):
    if score <= -1:
        return "A"
    elif score <= 2:
        return "B"
    elif score <= 10:
        return "C"
    elif score <= 18:
        return "D"
    else:
        return "E"

def extract_recommended_text(text):
    """
    'is recommended' 앞의 텍스트를 추출합니다.
    """
    try:
        if isinstance(text, str):
            recommended_text = text.split('is recommended')[0].strip()
            return recommended_text
        return None
    except Exception as e:
        print(f"Error in extract_recommended_text: {e}, text: {text}")
        return None

def calculate_scores_with_comparison_no_meals(df, dfh):
    """
    output 및 model_output에서 'is recommended' 앞의 텍스트를 추출하여 Nutri-Score를 계산하고 비교합니다.
    """
    results = []
    output_scores_list = []
    model_output_scores_list = []

    for idx, row in df.iterrows():
        try:
            output_text = extract_recommended_text(row.get('output', ''))
            if output_text:
                matched_row = find_most_similar_row(output_text, dfh)
                if matched_row is not None:
                    nutrition_facts = matched_row['nutrition_facts']
                    output_score = calculate_nutri_score(nutrition_facts)
                    output_scores_list.append(output_score)
                else:
                    output_score = None
            else:
                output_score = None

            model_output_text = extract_recommended_text(row.get('model_output', ''))
            if model_output_text:
                matched_row = find_most_similar_row(model_output_text, dfh)
                if matched_row is not None:
                    nutrition_facts = matched_row['nutrition_facts']
                    model_output_score = calculate_nutri_score(nutrition_facts)
                    model_output_scores_list.append(model_output_score)
                else:
                    model_output_score = None
            else:
                model_output_score = None

            results.append({
                'row_index': idx,
                'output_score': output_score,
                'model_output_score': model_output_score
            })

        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            results.append({
                'row_index': idx,
                'output_score': None,
                'model_output_score': None
            })

    final_output_avg = sum(output_scores_list) / len(output_scores_list) if output_scores_list else None
    final_model_output_avg = sum(model_output_scores_list) / len(model_output_scores_list) if model_output_scores_list else None

    print(f"Output 평균 Nutri-Score: {final_output_avg}")
    print(f"Model Output 평균 Nutri-Score: {final_model_output_avg}")

    return results

filtered_df = df[df['task'] == 'alternative_diet']
results = calculate_scores_with_comparison_no_meals(filtered_df, dfh)

for result in results:
    print(result)


Output 평균 Nutri-Score: 4.4262460317460315
Model Output 평균 Nutri-Score: 5.688353174603175
{'row_index': 0, 'output_score': 2.9722222222222214, 'model_output_score': -0.27777777777777857}
{'row_index': 1, 'output_score': 1.9097222222222223, 'model_output_score': -0.27777777777777857}
{'row_index': 2, 'output_score': 0.5694444444444444, 'model_output_score': 0.7083333333333339}
{'row_index': 3, 'output_score': 1.9097222222222223, 'model_output_score': 3.6567460317460316}
{'row_index': 4, 'output_score': 7.270833333333334, 'model_output_score': 3.6567460317460316}
{'row_index': 5, 'output_score': 11.472222222222221, 'model_output_score': 6.416666666666666}
{'row_index': 6, 'output_score': 3.4484126984126977, 'model_output_score': 6.416666666666666}
{'row_index': 7, 'output_score': 2.9722222222222214, 'model_output_score': -0.27777777777777857}
{'row_index': 8, 'output_score': 0.5694444444444444, 'model_output_score': 11.472222222222221}
{'row_index': 9, 'output_score': 1.2361111111111116, 

In [29]:
import ast
import pandas as pd
import json
import re
import openai
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from dotenv import load_dotenv
import os
from tqdm import tqdm

# Load environment variables
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def extract_numeric_value(value):
    try:
        if isinstance(value, str):
            match = re.search(r"(\d+(\.\d+)?)", value)
            if match:
                return float(match.group(1))
        elif isinstance(value, (int, float)):
            return float(value)
    except Exception as e:
        print(f"Error in extract_numeric_value: {e}, value: {value}")
    return 0.0

def is_valid_meal_structure(json_string):
    try:
        data = json.loads(json_string)
        return all(key in data for key in ['Breakfast', 'Lunch', 'Dinner'])
    except (json.JSONDecodeError, TypeError):
        return False

def find_most_similar_row(title, dfh):
    try:
        dfh['title'] = dfh['title'].fillna('')  # Handle NaN values
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(dfh['title'])
        input_vector = vectorizer.transform([title])
        similarities = cosine_similarity(input_vector, tfidf_matrix)
        most_similar_idx = similarities.argmax()
        return dfh.iloc[most_similar_idx]
    except Exception as e:
        print(f"Error in find_most_similar_row: {e}, title: {title}")
        return None

def identify_fruit_veg(ingredients_list):
    try:
        prompt = f"Identify which items in the following ingredient list are fruits or vegetables:\n\n{ingredients_list}\n\nReturn only the names of items that are fruits or vegetables in a Python list format."
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an assistant identifying fruits and vegetables."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            temperature=0
        )
        fruits_vegetables = response['choices'][0]['message']['content']
        return ast.literal_eval(fruits_vegetables)
    except Exception as e:
        print(f"Error identifying fruits and vegetables: {e}")
        return []

def calculate_fruit_veg_points(ingredients):
    try:
        ingredients_list = ast.literal_eval(ingredients)
        fruit_veg_labels = identify_fruit_veg(ingredients_list)

        total_weight = 0
        fruit_veg_weight = 0

        for ingredient in ingredients_list:
            label = ingredient.get('label', '')
            weight = extract_numeric_value(ingredient.get('metric_measure', 0))
            total_weight += weight
            if label in fruit_veg_labels:
                fruit_veg_weight += weight

        fruit_veg_ratio = (fruit_veg_weight / total_weight) * 100 if total_weight > 0 else 0

        if fruit_veg_ratio > 80:
            return 5
        elif fruit_veg_ratio > 60:
            return 2
        elif fruit_veg_ratio > 40:
            return 1
        else:
            return 0
    except Exception as e:
        print(f"Error calculating fruit_veg_points: {e}")
        return 0

def extract_nested_value(data, keys, default=0):
    try:
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, {})
            else:
                return default
        return extract_numeric_value(data) if isinstance(data, (int, float, str)) else default
    except Exception as e:
        print(f"Error in extract_nested_value: {e}, keys: {keys}, data: {data}")
        return default

def calculate_nutri_score(nutrition_facts, ingredients):
    try:
        if isinstance(nutrition_facts, str):
            nutrition_facts = ast.literal_eval(nutrition_facts)

        energy = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Calories'])
        saturated_fat = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Fat', 'Amount'])
        sugar = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Total Sugars'])
        sodium = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Sodium'])
        fiber = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Dietary Fiber'])
        protein = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Protein'])

        # Unfavorable points calculation
        energy_points = min(energy / 80, 800)
        saturated_fat_points = min(saturated_fat / 1, 10)
        sugar_points = min(sugar / 4.5, 45)
        sodium_points = min(sodium / 90, 900)

        unfavorable_points = energy_points + saturated_fat_points + sugar_points + sodium_points

        # Favorable points calculation
        fiber_points = min(fiber / 0.7, 3.5)
        protein_points = min(protein / 1.6, 8.0)
        fruit_veg_points = calculate_fruit_veg_points(ingredients)

        favorable_points = fiber_points + protein_points + fruit_veg_points

        # Final Nutri-Score calculation
        total_score = unfavorable_points - favorable_points
        return total_score
    except Exception as e:
        print(f"Error in calculate_nutri_score: {e}, nutrition_facts: {nutrition_facts}")
        return None

def get_nutri_score_grade(score):
    if score <= -1:
        return "A"
    elif score <= 2:
        return "B"
    elif score <= 10:
        return "C"
    elif score <= 18:
        return "D"
    else:
        return "E"

def calculate_meal_nutri_score(meal_data, dfh):
    meal_scores = {}

    for meal, title in meal_data.items():
        matched_row = find_most_similar_row(title, dfh)
        if matched_row is None:
            continue

        nutrition_facts = matched_row['nutrition_facts']
        ingredients = matched_row['ingredients']
        score = calculate_nutri_score(nutrition_facts, ingredients)

        if score is None:
            print(f"Warning: Nutri-Score calculation failed for meal '{meal}' with title '{title}'.")
            grade = "N/A"
        else:
            grade = get_nutri_score_grade(score)

        meal_scores[meal] = {'score': score, 'grade': grade}

    return meal_scores

def calculate_scores_with_comparison(df, dfh):
    results = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        output_scores = {}
        model_scores = {}
        if is_valid_meal_structure(row.get('output', '')):
            output_data = json.loads(row['output'])
            output_scores = calculate_meal_nutri_score(output_data, dfh)
        if is_valid_meal_structure(row.get('model_output', '')):
            model_data = json.loads(row['model_output'])
            model_scores = calculate_meal_nutri_score(model_data, dfh)
        results.append({'row_index': idx, 'output_scores': output_scores, 'model_scores': model_scores})
    return results

def calculate_average_scores(results):
    """
    Calculate the average Nutri-Scores for outputs and model outputs.
    """
    output_total_score = 0
    model_total_score = 0
    output_count = 0
    model_count = 0

    for result in results:
        # Extract output scores
        for meal, score_data in result['output_scores'].items():
            if score_data['score'] is not None:
                output_total_score += score_data['score']
                output_count += 1

        # Extract model scores
        for meal, score_data in result['model_scores'].items():
            if score_data['score'] is not None:
                model_total_score += score_data['score']
                model_count += 1

    # Calculate averages
    output_avg = output_total_score / output_count if output_count > 0 else None
    model_avg = model_total_score / model_count if model_count > 0 else None

    return output_avg, model_avg


# 'daily_diets' task Nutri-Score calculation
filtered_df = df[df['task'] == 'daily_diets']
results = calculate_scores_with_comparison(filtered_df, dfh)

# Calculate overall averages
output_avg, model_avg = calculate_average_scores(results)

# Print results
print("=== Results for Each Row ===")
for result in results:
    print(f"Row Index: {result['row_index']}")
    print(f"Output Scores: {result['output_scores']}")
    print(f"Model Output Scores: {result['model_scores']}")
    print()

print("=== Overall Averages ===")
print(f"Output Average Nutri-Score: {output_avg}")
print(f"Model Output Average Nutri-Score: {model_avg}")


  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [02:37<00:00,  3.15s/it]

=== Results for Each Row ===
Row Index: 50
Output Scores: {'Breakfast': {'score': 1.0694444444444429, 'grade': 'B'}, 'Lunch': {'score': 10.777777777777779, 'grade': 'D'}, 'Dinner': {'score': -3.3313492063492065, 'grade': 'A'}}
Model Output Scores: {'Breakfast': {'score': -4.541666666666666, 'grade': 'A'}, 'Lunch': {'score': 10.777777777777779, 'grade': 'D'}, 'Dinner': {'score': 2.5277777777777786, 'grade': 'C'}}

Row Index: 51
Output Scores: {'Breakfast': {'score': 7.763888888888889, 'grade': 'C'}, 'Lunch': {'score': 11.930555555555554, 'grade': 'D'}, 'Dinner': {'score': 2.5277777777777786, 'grade': 'C'}}
Model Output Scores: {'Breakfast': {'score': -4.541666666666666, 'grade': 'A'}, 'Lunch': {'score': 3.904761904761905, 'grade': 'C'}, 'Dinner': {'score': -0.9861111111111116, 'grade': 'B'}}

Row Index: 52
Output Scores: {'Breakfast': {'score': -1.0277777777777777, 'grade': 'A'}, 'Lunch': {'score': 7.162499999999998, 'grade': 'C'}, 'Dinner': {'score': 4.055555555555557, 'grade': 'C'}}
M




In [30]:
import ast
import pandas as pd
import json
import re
import openai
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")


def extract_numeric_value(value):
    try:
        if isinstance(value, str):
            match = re.search(r"(\d+(\.\d+)?)", value)
            if match:
                return float(match.group(1))
        elif isinstance(value, (int, float)):
            return float(value)
    except Exception as e:
        print(f"Error in extract_numeric_value: {e}, value: {value}")
    return 0.0


def is_valid_meal_structure(json_string):
    try:
        data = json.loads(json_string)
        return isinstance(data, dict)
    except (json.JSONDecodeError, TypeError):
        return False


def find_most_similar_row(title, dfh):
    try:
        dfh['title'] = dfh['title'].fillna('')  # Handle NaN values
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(dfh['title'])
        input_vector = vectorizer.transform([title])
        similarities = cosine_similarity(input_vector, tfidf_matrix)
        most_similar_idx = similarities.argmax()
        return dfh.iloc[most_similar_idx]
    except Exception as e:
        print(f"Error in find_most_similar_row: {e}, title: {title}")
        return None


def identify_fruit_veg(ingredients_list):
    """
    Use GPT-3.5-turbo to identify fruits and vegetables in the ingredient list.
    """
    try:
        prompt = f"Identify which items in the following ingredient list are fruits or vegetables:\n\n{ingredients_list}\n\nReturn only the names of items that are fruits or vegetables in a Python list format."
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an assistant identifying fruits and vegetables."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            temperature=0
        )
        fruits_vegetables = response['choices'][0]['message']['content']
        return ast.literal_eval(fruits_vegetables)
    except Exception as e:
        print(f"Error identifying fruits and vegetables: {e}")
        return []


def calculate_fruit_veg_points(ingredients):
    """
    Calculate fruit and vegetable ratio and return points.
    """
    try:
        ingredients_list = ast.literal_eval(ingredients)
        fruit_veg_labels = identify_fruit_veg(ingredients_list)

        total_weight = 0
        fruit_veg_weight = 0

        for ingredient in ingredients_list:
            label = ingredient.get('label', '')
            weight = extract_numeric_value(ingredient.get('metric_measure', 0))
            total_weight += weight
            if label in fruit_veg_labels:
                fruit_veg_weight += weight

        fruit_veg_ratio = (fruit_veg_weight / total_weight) * 100 if total_weight > 0 else 0

        if fruit_veg_ratio > 80:
            return 5
        elif fruit_veg_ratio > 60:
            return 2
        elif fruit_veg_ratio > 40:
            return 1
        else:
            return 0
    except Exception as e:
        print(f"Error calculating fruit_veg_points: {e}")
        return 0


def extract_nested_value(data, keys, default=0):
    try:
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, {})
            else:
                return default
        return extract_numeric_value(data)
    except Exception as e:
        print(f"Error in extract_nested_value: {e}, keys: {keys}, data: {data}")
        return default


def calculate_nutri_score(nutrition_facts, ingredients):
    try:
        if isinstance(nutrition_facts, str):
            nutrition_facts = ast.literal_eval(nutrition_facts)

        energy = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Calories'])
        saturated_fat = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Fat', 'Amount'])
        sugar = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Total Sugars'])
        sodium = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Sodium'])
        fiber = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Dietary Fiber'])
        protein = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Protein'])

        # Unfavorable points calculation
        energy_points = min(energy / 80, 800)
        saturated_fat_points = min(saturated_fat / 1, 10)
        sugar_points = min(sugar / 4.5, 45)
        sodium_points = min(sodium / 90, 900)

        unfavorable_points = energy_points + saturated_fat_points + sugar_points + sodium_points

        # Favorable points calculation
        fiber_points = min(fiber / 0.7, 3.5)
        protein_points = min(protein / 1.6, 8.0)
        fruit_veg_points = calculate_fruit_veg_points(ingredients)

        favorable_points = fiber_points + protein_points + fruit_veg_points

        # Final Nutri-Score calculation
        total_score = unfavorable_points - favorable_points
        return total_score
    except Exception as e:
        print(f"Error in calculate_nutri_score: {e}, nutrition_facts: {nutrition_facts}")
        return None


def get_nutri_score_grade(score):
    if score <= -1:
        return "A"
    elif score <= 2:
        return "B"
    elif score <= 10:
        return "C"
    elif score <= 18:
        return "D"
    else:
        return "E"


def extract_recommended_text(text):
    """
    Extract the text before 'is recommended'.
    """
    try:
        if isinstance(text, str):
            recommended_text = text.split('is recommended')[0].strip()
            return recommended_text
        return None
    except Exception as e:
        print(f"Error in extract_recommended_text: {e}, text: {text}")
        return None


def calculate_scores_with_comparison_no_meals(df, dfh):
    """
    Calculate and compare Nutri-Scores for 'output' and 'model_output'.
    """
    results = []
    output_scores_list = []
    model_output_scores_list = []

    for idx, row in tqdm(df.iterrows()):
        try:
            output_text = extract_recommended_text(row.get('output', ''))
            if output_text:
                matched_row = find_most_similar_row(output_text, dfh)
                if matched_row is not None:
                    nutrition_facts = matched_row['nutrition_facts']
                    ingredients = matched_row['ingredients']
                    output_score = calculate_nutri_score(nutrition_facts, ingredients)
                    output_scores_list.append(output_score)
                else:
                    output_score = None
            else:
                output_score = None

            model_output_text = extract_recommended_text(row.get('model_output', ''))
            if model_output_text:
                matched_row = find_most_similar_row(model_output_text, dfh)
                if matched_row is not None:
                    nutrition_facts = matched_row['nutrition_facts']
                    ingredients = matched_row['ingredients']
                    model_output_score = calculate_nutri_score(nutrition_facts, ingredients)
                    model_output_scores_list.append(model_output_score)
                else:
                    model_output_score = None
            else:
                model_output_score = None

            results.append({
                'row_index': idx,
                'output_score': output_score,
                'model_output_score': model_output_score
            })

        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            results.append({
                'row_index': idx,
                'output_score': None,
                'model_output_score': None
            })

    final_output_avg = sum(output_scores_list) / len(output_scores_list) if output_scores_list else None
    final_model_output_avg = sum(model_output_scores_list) / len(model_output_scores_list) if model_output_scores_list else None

    print(f"Output Average Nutri-Score: {final_output_avg}")
    print(f"Model Output Average Nutri-Score: {final_model_output_avg}")

    return results


# Execution
filtered_df = df[df['task'] == 'alternative_diet']
results = calculate_scores_with_comparison_no_meals(filtered_df, dfh)

# Print results
for result in results:
    print(result)


0it [00:00, ?it/s]

50it [01:13,  1.48s/it]

Output Average Nutri-Score: 3.506246031746031
Model Output Average Nutri-Score: 5.528353174603175
{'row_index': 0, 'output_score': -2.0277777777777786, 'model_output_score': -0.27777777777777857}
{'row_index': 1, 'output_score': 1.9097222222222223, 'model_output_score': -0.27777777777777857}
{'row_index': 2, 'output_score': 0.5694444444444444, 'model_output_score': 0.7083333333333339}
{'row_index': 3, 'output_score': 1.9097222222222223, 'model_output_score': 3.6567460317460316}
{'row_index': 4, 'output_score': 7.270833333333334, 'model_output_score': 3.6567460317460316}
{'row_index': 5, 'output_score': 10.472222222222221, 'model_output_score': 6.416666666666666}
{'row_index': 6, 'output_score': -1.5515873015873023, 'model_output_score': 6.416666666666666}
{'row_index': 7, 'output_score': -2.0277777777777786, 'model_output_score': -0.27777777777777857}
{'row_index': 8, 'output_score': 0.5694444444444444, 'model_output_score': 10.472222222222221}
{'row_index': 9, 'output_score': 1.236111




In [31]:
import ast
import pandas as pd
import json
import re
import openai
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from dotenv import load_dotenv
import os
from tqdm import tqdm

# Load environment variables
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def extract_numeric_value(value):
    try:
        if isinstance(value, str):
            match = re.search(r"(\d+(\.\d+)?)", value)
            if match:
                return float(match.group(1))
        elif isinstance(value, (int, float)):
            return float(value)
    except Exception as e:
        print(f"Error in extract_numeric_value: {e}, value: {value}")
    return 0.0

def is_valid_meal_structure(json_string):
    try:
        data = json.loads(json_string)
        return all(key in data for key in ['Breakfast', 'Lunch', 'Dinner'])
    except (json.JSONDecodeError, TypeError):
        return False

def find_most_similar_row(title, dfh):
    try:
        dfh['title'] = dfh['title'].fillna('')  # Handle NaN values
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(dfh['title'])
        input_vector = vectorizer.transform([title])
        similarities = cosine_similarity(input_vector, tfidf_matrix)
        most_similar_idx = similarities.argmax()
        return dfh.iloc[most_similar_idx]
    except Exception as e:
        print(f"Error in find_most_similar_row: {e}, title: {title}")
        return None

def identify_fruit_veg(ingredients_list):
    try:
        prompt = f"Identify which items in the following ingredient list are fruits or vegetables:\n\n{ingredients_list}\n\nReturn only the names of items that are fruits or vegetables in a Python list format."
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an assistant identifying fruits and vegetables."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            temperature=0
        )
        fruits_vegetables = response['choices'][0]['message']['content']
        return ast.literal_eval(fruits_vegetables)
    except Exception as e:
        print(f"Error identifying fruits and vegetables: {e}")
        return []

def calculate_fruit_veg_points(ingredients, total_weight):
    try:
        ingredients_list = ast.literal_eval(ingredients)
        fruit_veg_labels = identify_fruit_veg(ingredients_list)

        fruit_veg_weight = 0
        for ingredient in ingredients_list:
            label = ingredient.get('label', '')
            weight = extract_numeric_value(ingredient.get('metric_measure', 0))
            if label in fruit_veg_labels:
                fruit_veg_weight += weight

        # 과일/채소 비율을 100g 기준으로 변환
        fruit_veg_ratio = (fruit_veg_weight / total_weight) * 100 if total_weight > 0 else 0

        if fruit_veg_ratio > 80:
            return 5
        elif fruit_veg_ratio > 60:
            return 2
        elif fruit_veg_ratio > 40:
            return 1
        else:
            return 0
    except Exception as e:
        print(f"Error calculating fruit_veg_points: {e}")
        return 0

def extract_nested_value(data, keys, default=0):
    try:
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, {})
            else:
                return default
        return extract_numeric_value(data) if isinstance(data, (int, float, str)) else default
    except Exception as e:
        print(f"Error in extract_nested_value: {e}, keys: {keys}, data: {data}")
        return default

def calculate_nutri_score(nutrition_facts, ingredients):
    try:
        if isinstance(nutrition_facts, str):
            nutrition_facts = ast.literal_eval(nutrition_facts)

        # 전체 무게 계산
        total_weight = sum(
            extract_numeric_value(ingredient.get('metric_measure', 0)) 
            for ingredient in ast.literal_eval(ingredients)
        )
        if total_weight == 0:
            print("Warning: Total weight is zero. Skipping calculation.")
            return None

        # 100g 기준으로 성분 정규화
        energy = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Calories']) / total_weight * 100
        saturated_fat = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Fat', 'Amount']) / total_weight * 100
        sugar = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Total Sugars']) / total_weight * 100
        sodium = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Sodium']) / total_weight * 100
        fiber = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Dietary Fiber']) / total_weight * 100
        protein = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Protein']) / total_weight * 100

        # Unfavorable points calculation
        energy_points = min(energy / 80, 800)
        saturated_fat_points = min(saturated_fat / 1, 10)
        sugar_points = min(sugar / 4.5, 45)
        sodium_points = min(sodium / 90, 900)

        unfavorable_points = energy_points + saturated_fat_points + sugar_points + sodium_points

        # Favorable points calculation
        fiber_points = min(fiber / 0.7, 3.5)
        protein_points = min(protein / 1.6, 8.0)
        fruit_veg_points = calculate_fruit_veg_points(ingredients, total_weight)

        favorable_points = fiber_points + protein_points + fruit_veg_points

        # Final Nutri-Score calculation
        total_score = unfavorable_points - favorable_points
        return total_score
    except Exception as e:
        print(f"Error in calculate_nutri_score: {e}, nutrition_facts: {nutrition_facts}")
        return None

def get_nutri_score_grade(score):
    if score <= -1:
        return "A"
    elif score <= 2:
        return "B"
    elif score <= 10:
        return "C"
    elif score <= 18:
        return "D"
    else:
        return "E"

def calculate_meal_nutri_score(meal_data, dfh):
    meal_scores = {}

    for meal, title in meal_data.items():
        matched_row = find_most_similar_row(title, dfh)
        if matched_row is None:
            continue

        nutrition_facts = matched_row['nutrition_facts']
        ingredients = matched_row['ingredients']
        score = calculate_nutri_score(nutrition_facts, ingredients)

        if score is None:
            print(f"Warning: Nutri-Score calculation failed for meal '{meal}' with title '{title}'.")
            grade = "N/A"
        else:
            grade = get_nutri_score_grade(score)

        meal_scores[meal] = {'score': score, 'grade': grade}

    return meal_scores

def calculate_scores_with_comparison(df, dfh):
    results = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        output_scores = {}
        model_scores = {}
        if is_valid_meal_structure(row.get('output', '')):
            output_data = json.loads(row['output'])
            output_scores = calculate_meal_nutri_score(output_data, dfh)
        if is_valid_meal_structure(row.get('model_output', '')):
            model_data = json.loads(row['model_output'])
            model_scores = calculate_meal_nutri_score(model_data, dfh)
        results.append({'row_index': idx, 'output_scores': output_scores, 'model_scores': model_scores})
    return results

def calculate_average_scores(results):
    """
    Calculate the average Nutri-Scores for outputs and model outputs.
    """
    output_total_score = 0
    model_total_score = 0
    output_count = 0
    model_count = 0

    for result in results:
        # Extract output scores
        for meal, score_data in result['output_scores'].items():
            if score_data['score'] is not None:
                output_total_score += score_data['score']
                output_count += 1

        # Extract model scores
        for meal, score_data in result['model_scores'].items():
            if score_data['score'] is not None:
                model_total_score += score_data['score']
                model_count += 1

    # Calculate averages
    output_avg = output_total_score / output_count if output_count > 0 else None
    model_avg = model_total_score / model_count if model_count > 0 else None

    return output_avg, model_avg


# 'daily_diets' task Nutri-Score calculation
filtered_df = df[df['task'] == 'daily_diets']
results = calculate_scores_with_comparison(filtered_df, dfh)

# Calculate overall averages
output_avg, model_avg = calculate_average_scores(results)

# Print results
print("=== Results for Each Row ===")
for result in results:
    print(f"Row Index: {result['row_index']}")
    print(f"Output Scores: {result['output_scores']}")
    print(f"Model Output Scores: {result['model_scores']}")
    print()

print("=== Overall Averages ===")
print(f"Output Average Nutri-Score: {output_avg}")
print(f"Model Output Average Nutri-Score: {model_avg}")


100%|██████████| 50/50 [02:44<00:00,  3.29s/it]

=== Results for Each Row ===
Row Index: 50
Output Scores: {'Breakfast': {'score': -0.21290860331956285, 'grade': 'B'}, 'Lunch': {'score': 0.12827070932539675, 'grade': 'B'}, 'Dinner': {'score': -0.41280659310399087, 'grade': 'B'}}
Model Output Scores: {'Breakfast': {'score': -5.636214630779849, 'grade': 'A'}, 'Lunch': {'score': 0.12827070932539675, 'grade': 'B'}, 'Dinner': {'score': 0.1844070961718014, 'grade': 'B'}}

Row Index: 51
Output Scores: {'Breakfast': {'score': 0.16996327971937708, 'grade': 'B'}, 'Lunch': {'score': 0.27903677232536306, 'grade': 'B'}, 'Dinner': {'score': 0.1844070961718014, 'grade': 'B'}}
Model Output Scores: {'Breakfast': {'score': -5.636214630779849, 'grade': 'A'}, 'Lunch': {'score': 0.18546853989891976, 'grade': 'B'}, 'Dinner': {'score': -1.93626407732553, 'grade': 'A'}}

Row Index: 52
Output Scores: {'Breakfast': {'score': -1.1194395453654709, 'grade': 'A'}, 'Lunch': {'score': 0.6065349544072949, 'grade': 'B'}, 'Dinner': {'score': -0.11593641401333699, 'gra




In [None]:
import ast
import pandas as pd
import json
import re
import openai
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from dotenv import load_dotenv
import os
from tqdm import tqdm

# Load environment variables
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def extract_numeric_value(value):
    try:
        if isinstance(value, str):
            match = re.search(r"(\d+(\.\d+)?)", value)
            if match:
                return float(match.group(1))
        elif isinstance(value, (int, float)):
            return float(value)
    except Exception as e:
        print(f"Error in extract_numeric_value: {e}, value: {value}")
    return 0.0

def is_valid_meal_structure(json_string):
    try:
        data = json.loads(json_string)
        return isinstance(data, dict)
    except (json.JSONDecodeError, TypeError):
        return False

def find_most_similar_row(title, dfh):
    try:
        dfh['title'] = dfh['title'].fillna('')  # Handle NaN values
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(dfh['title'])
        input_vector = vectorizer.transform([title])
        similarities = cosine_similarity(input_vector, tfidf_matrix)
        most_similar_idx = similarities.argmax()
        return dfh.iloc[most_similar_idx]
    except Exception as e:
        print(f"Error in find_most_similar_row: {e}, title: {title}")
        return None

def identify_fruit_veg(ingredients_list):
    try:
        prompt = f"Identify which items in the following ingredient list are fruits or vegetables:\n\n{ingredients_list}\n\nReturn only the names of items that are fruits or vegetables in a Python list format."
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an assistant identifying fruits and vegetables."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            temperature=0
        )
        fruits_vegetables = response['choices'][0]['message']['content']
        return ast.literal_eval(fruits_vegetables)
    except Exception as e:
        print(f"Error identifying fruits and vegetables: {e}")
        return []

def calculate_fruit_veg_points(ingredients, total_weight):
    try:
        ingredients_list = ast.literal_eval(ingredients)
        fruit_veg_labels = identify_fruit_veg(ingredients_list)

        fruit_veg_weight = 0
        for ingredient in ingredients_list:
            label = ingredient.get('label', '')
            weight = extract_numeric_value(ingredient.get('metric_measure', 0))
            if label in fruit_veg_labels:
                fruit_veg_weight += weight

        # 과일/채소 비율을 100g 기준으로 변환
        fruit_veg_ratio = (fruit_veg_weight / total_weight) * 100 if total_weight > 0 else 0

        if fruit_veg_ratio > 80:
            return 5
        elif fruit_veg_ratio > 60:
            return 2
        elif fruit_veg_ratio > 40:
            return 1
        else:
            return 0
    except Exception as e:
        print(f"Error calculating fruit_veg_points: {e}")
        return 0

def extract_nested_value(data, keys, default=0):
    try:
        for key in keys:
            if isinstance(data, dict):
                data = data.get(key, {})
            else:
                return default
        return extract_numeric_value(data)
    except Exception as e:
        print(f"Error in extract_nested_value: {e}, keys: {keys}, data: {data}")
        return default

def calculate_nutri_score(nutrition_facts, ingredients):
    try:
        if isinstance(nutrition_facts, str):
            nutrition_facts = ast.literal_eval(nutrition_facts)

        # 전체 무게 계산
        total_weight = sum(
            extract_numeric_value(ingredient.get('metric_measure', 0)) 
            for ingredient in ast.literal_eval(ingredients)
        )
        if total_weight == 0:
            print("Warning: Total weight is zero. Skipping calculation.")
            return None

        # 100g 기준으로 성분 정규화
        energy = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Calories']) / total_weight * 100
        saturated_fat = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Fat', 'Amount']) / total_weight * 100
        sugar = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Total Sugars']) / total_weight * 100
        sodium = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Sodium']) / total_weight * 100
        fiber = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Total Carbohydrates', 'Dietary Fiber']) / total_weight * 100
        protein = extract_nested_value(nutrition_facts, ['Amount per Serving', 'Protein']) / total_weight * 100

        # Unfavorable points calculation
        energy_points = min(energy / 80, 800)
        saturated_fat_points = min(saturated_fat / 1, 10)
        sugar_points = min(sugar / 4.5, 45)
        sodium_points = min(sodium / 90, 900)

        unfavorable_points = energy_points + saturated_fat_points + sugar_points + sodium_points

        # Favorable points calculation
        fiber_points = min(fiber / 0.7, 3.5)
        protein_points = min(protein / 1.6, 8.0)
        fruit_veg_points = calculate_fruit_veg_points(ingredients, total_weight)

        favorable_points = fiber_points + protein_points + fruit_veg_points

        # Final Nutri-Score calculation
        total_score = unfavorable_points - favorable_points
        return total_score
    except Exception as e:
        print(f"Error in calculate_nutri_score: {e}, nutrition_facts: {nutrition_facts}")
        return None

def get_nutri_score_grade(score):
    if score <= -1:
        return "A"
    elif score <= 2:
        return "B"
    elif score <= 10:
        return "C"
    elif score <= 18:
        return "D"
    else:
        return "E"

def calculate_scores_with_comparison_no_meals(df, dfh):
    results = []
    output_scores_list = []
    model_output_scores_list = []

    for idx, row in tqdm(df.iterrows(), total=len(df)):
        try:
            output_text = row.get('output', '')
            if output_text:
                matched_row = find_most_similar_row(output_text, dfh)
                if matched_row is not None:
                    nutrition_facts = matched_row['nutrition_facts']
                    ingredients = matched_row['ingredients']
                    output_score = calculate_nutri_score(nutrition_facts, ingredients)
                    output_scores_list.append(output_score)
                else:
                    output_score = None

            model_output_text = row.get('model_output', '')
            if model_output_text:
                matched_row = find_most_similar_row(model_output_text, dfh)
                if matched_row is not None:
                    nutrition_facts = matched_row['nutrition_facts']
                    ingredients = matched_row['ingredients']
                    model_output_score = calculate_nutri_score(nutrition_facts, ingredients)
                    model_output_scores_list.append(model_output_score)
                else:
                    model_output_score = None

            results.append({
                'row_index': idx,
                'output_score': output_score,
                'model_output_score': model_output_score
            })

        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            results.append({
                'row_index': idx,
                'output_score': None,
                'model_output_score': None
            })

    final_output_avg = sum(output_scores_list) / len(output_scores_list) if output_scores_list else None
    final_model_output_avg = sum(model_output_scores_list) / len(model_output_scores_list) if model_output_scores_list else None

    print(f"Output Average Nutri-Score: {final_output_avg}")
    print(f"Model Output Average Nutri-Score: {final_model_output_avg}")

    return results

# Execution
filtered_df = df[df['task'] == 'alternative_diet']
results = calculate_scores_with_comparison_no_meals(filtered_df, dfh)

# Print results
for result in results:
    print(result)


  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [01:11<00:00,  1.42s/it]

Output Average Nutri-Score: 1.139158091597086
Model Output Average Nutri-Score: 0.7815755065397028
{'row_index': 0, 'output_score': 1.2495974235104654, 'model_output_score': -1.8280321920168663}
{'row_index': 1, 'output_score': 2.357681755829904, 'model_output_score': -1.8280321920168663}
{'row_index': 2, 'output_score': 0.09538432905267075, 'model_output_score': -0.2137724271482232}
{'row_index': 3, 'output_score': 2.357681755829904, 'model_output_score': 0.9751322751322751}
{'row_index': 4, 'output_score': 0.43666214326836095, 'model_output_score': 0.9751322751322751}
{'row_index': 5, 'output_score': 0.9837745637035153, 'model_output_score': 1.0389211034967496}
{'row_index': 6, 'output_score': -4.595731219412345, 'model_output_score': 8.479166666666668}
{'row_index': 7, 'output_score': 1.2495974235104654, 'model_output_score': -1.8280321920168663}
{'row_index': 8, 'output_score': 0.09538432905267075, 'model_output_score': 0.9837745637035153}
{'row_index': 9, 'output_score': -0.315431




: 