In [7]:
import json

file_path = "/data/jaesung/llm_for_diabetes/src/model/inference_results_1224.jsonl"

data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line))

In [8]:
data

[{'dataset': 'bionli',
  'split_data': 'test',
  'task': 'nli',
  'instruction': 'Evaluate if the hypothesis can be inferred from the premise. Label it as entailment, contradiction, or neutral.',
  'input': "[PRE] To investigate the mechanism of thyroid hormone action on pulmonary surfactant synthesis, we characterized the effect of triiodothyronine on phosphatidylcholine synthesis in cultured fetal rabbit lung. Since glucocorticoids stimulate surfactant synthesis and reduce the incidence of Respiratory Distress Syndrome in premature infants, we also examined the interaction of triiodothyronine and dexamethasone. The rate of choline incorporation into phosphatidylcholine was determined in organ cultures of rabbit lung maintained in serum-free Waymouth's medium. In 23-d lung cultured for 72 h, the increase in choline incorporation with triiodothyronine alone, dexamethasone alone, and triiodothyronine plus dexamethasone was 50, 62, and 161%, respectively. Both triiodothyronine and dexame

In [9]:
import pandas as pd

df = pd.DataFrame(data)

In [10]:
df.head(2)

Unnamed: 0,dataset,split_data,task,instruction,input,output,__index_level_0__,model_output
0,bionli,test,nli,Evaluate if the hypothesis can be inferred fro...,[PRE] To investigate the mechanism of thyroid ...,contradiction,29785,entailment
1,bionli,test,nli,Classify the relationship between the premise ...,[PRE] Cold exposure reverses the diabetogenic ...,contradiction,29786,contradiction


In [11]:
df['task'].unique()

array(['nli', 'ie_extract_relation', 'qa_subjective', 'qa_objective',
       'summarization', 'daily_diets', 'alternative_diet'], dtype=object)

In [8]:
# nli

nli_df = df[df['task'] == 'nli']

correct_predictions = (nli_df['output'] == nli_df['model_output']).sum()
total_predictions = len(nli_df) 

nli_acc = correct_predictions / total_predictions

print(nli_acc)  # bioinstruct - 0.33 

0.6


In [13]:
# ie - ie_extract_relation
from sklearn.metrics import precision_score, recall_score, f1_score


ie_extract_relation = df[df['task'] == 'ie_extract_relation']

y_true = ie_extract_relation['output']
y_pred = ie_extract_relation['model_output']

precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

print(f"precision: {precision}") 
print(f"recall: {recall}")
print(f"f1: {f1}")


precision: 0.1259782608695652
recall: 0.18
f1: 0.1424948024948025


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# summarization

import openai
import pandas as pd
import re
from sklearn.metrics import f1_score
from bert_score import score as bert_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

import os
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

bleurt_model_name = "Elron/bleurt-large-512"
tokenizer = AutoTokenizer.from_pretrained(bleurt_model_name)
bleurt_model = AutoModelForSequenceClassification.from_pretrained(bleurt_model_name)
bleurt_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bleurt_model = bleurt_model.to(device)

MAX_CONTEXT_LENGTH = 8192

def evaluate_with_gpt4(input_text, model_output, true_output):

    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    prompt = f"""
    You are tasked with evaluating the quality of a QA model's responses based on the following metrics:
    1. **Coherence**: Does the model's response logically align with the context provided in the input?
    2. **Completeness**: Does the model's response sufficiently answer the question in the input?
    3. **Naturalness**: Does the model's response sound fluent and human-like?

    **Input**:
    {input_text}

    **Model's Response**:
    {model_output}

    **True Answer**:
    {true_output}

    Please rate each metric on a scale from 1 to 5. 
    Example response format:
    - Coherence: X.X
    - Completeness: X.X
    - Naturalness: X.X
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for Summarization models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

def extract_scores(evaluation):

    if evaluation is None:
        return {
            "Coherence": 0.0,
            "Completeness": 0.0,
            "Naturalness": 0.0
        }
    coherence = re.search(r"Coherence: (\d\.\d)", evaluation)
    completeness = re.search(r"Completeness: (\d\.\d)", evaluation)
    naturalness = re.search(r"Naturalness: (\d\.\d)", evaluation)
    
    return {
        "Coherence": float(coherence.group(1)) if coherence else 0.0,
        "Completeness": float(completeness.group(1)) if completeness else 0.0,
        "Naturalness": float(naturalness.group(1)) if naturalness else 0.0
    }

def calculate_concept_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average="micro")

def calculate_bleurt(y_true, y_pred):

    inputs = tokenizer(y_pred, y_true, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        scores = bleurt_model(**inputs).logits

    if scores.numel() == 1:
        return float(scores.squeeze().item())  
    return [float(score) for score in scores.squeeze().tolist()]  

def calculate_bleurt_and_bertscore(y_true, y_pred):

    bleurt_score_value = calculate_bleurt(y_true, y_pred)

    _, _, bert_f1 = bert_score(y_pred, y_true, lang="en", rescale_with_baseline=True)

    bert_f1 = [float(score) for score in bert_f1]

    return {
        "BLEURT": bleurt_score_value if isinstance(bleurt_score_value, float) else sum(bleurt_score_value) / len(bleurt_score_value),
        "BERTScore_F1": sum(bert_f1) / len(bert_f1)
    }

def normalize_scores(df, column):
    df[column] = df[column].apply(
        lambda x: float(x) if isinstance(x, torch.Tensor) else x
    )
    min_val = df[column].min()
    max_val = df[column].max()
    df[column] = df[column].apply(
        lambda x: (x - min_val) / (max_val - min_val) if max_val > min_val else 0.5
    )
    return df

qa_df = df[df['task'] == 'summarization']

results = []

for _, row in qa_df.iterrows():
    input_text = row['input']
    model_output = row['model_output']
    true_output = row['output']
    
    evaluation = evaluate_with_gpt4(input_text, model_output, true_output)
    print(f"Evaluation result:\n{evaluation}")
    
    scores = extract_scores(evaluation)
    
    concept_f1 = calculate_concept_f1([true_output], [model_output])
    
    metric_scores = calculate_bleurt_and_bertscore([true_output], [model_output])

    results.append({
        "input": input_text,
        "model_output": model_output,
        "true_output": true_output,
        "evaluation": evaluation,
        "Coherence": scores["Coherence"],
        "Completeness": scores["Completeness"],
        "Naturalness": scores["Naturalness"],
        "Concept_F1": concept_f1,
        "BLEURT": metric_scores["BLEURT"],
        "BERTScore_F1": metric_scores["BERTScore_F1"]
    })

evaluation_df = pd.DataFrame(results)

evaluation_df = normalize_scores(evaluation_df, "BLEURT")
evaluation_df = normalize_scores(evaluation_df, "BERTScore_F1")

average_scores = evaluation_df[["Coherence", "Completeness", "Naturalness", "Concept_F1", "BLEURT", "BERTScore_F1"]].mean()

print("평균 점수:")
print(average_scores)

Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 4.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
**Evaluation of Model's Response:**

- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0

Overall, the model's response is coherent with the input article, providing a detailed summary of the content. However, the response lacks completeness as it does not cover all key points and findings from the original text. Additionally, the naturalness of the response could be improved for better readability and fluency.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided inputs, here is the evaluation of the QA model's response:

- Coherence: 4.5
- Completeness: 3.0
- Naturalness: 4.0

Overall, the model's response is coherent and natural, but it lacks completeness as it focuses more on lifestyle factors and less on insulin resistance and hyperinsulinemia, which are crucial aspects of cancer development as highlighted in the true answer.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 2.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 5.0
- Completeness: 5.0
- Naturalness: 5.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.5
- Naturalness: 4.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 4.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 3.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 1.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4
- Naturalness: 4.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
  - The model's response does not provide a clear summary of the article, as it mentions the difficulty of summarizing without delving into the key findings or insights provided in the original text.

- Completeness: 2.5
  - The model's response lacks key details and important findings from the original article, such as the specific metabolic changes observed in patients undergoing bariatric surgery and the implications for weight loss and diabetes improvement.

- Naturalness: 3.0
  - The response is somewhat disjointed and lacks the flow expected in a human-like summary. It reads more like a vague comment rather than a coherent summary of the article.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


평균 점수:
Coherence       3.850000
Completeness    3.433333
Naturalness     3.483333
Concept_F1      0.000000
BLEURT          0.539845
BERTScore_F1    0.603795
dtype: float64


In [None]:
import re
from sklearn.metrics import accuracy_score

def extract_choice(text):
    match = re.search(r'\b([A-D])\b', text)
    return match.group(1) if match else None

df['output_parsed'] = df['output'].apply(extract_choice)
df['model_output_parsed'] = df['model_output'].apply(extract_choice)

filtered_df = df.dropna(subset=['output_parsed', 'model_output_parsed'])

accuracy = accuracy_score(filtered_df['output_parsed'], filtered_df['model_output_parsed'])

print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.4100


In [None]:
# daily diets

import openai
import pandas as pd
import re
from tqdm import tqdm

from dotenv import load_dotenv
import os

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

MAX_CONTEXT_LENGTH = 8192

def evaluate_with_gpt4(input_text, model_output, true_output, task_type):
    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    if task_type == "daily_diets":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:
        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Nutritional Adequacy**: Does the meal align with the nutritional goals in the input, considering reasonable flexibility and practical applicability in real-life scenarios?
        5. **Caloric Balance**: Are the recommended meals well-balanced in terms of calories?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Nutritional Adequacy: X.X
        - Caloric Balance: X.X
        """
    elif task_type == "alternative_diets":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:
        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Improvement**: Does the recommended meal address the shortcomings of the previous meal?
        5. **Suitability**: Is the recommended meal suitable for a diabetes patient?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Improvement: X.X
        - Suitability: X.X
        """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for meal recommendation models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

# 점수 추출 함수
def extract_scores(evaluation, task_type):
    if evaluation is None:
        if task_type == "daily_diets":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Nutritional Adequacy": 0.0,
                "Caloric Balance": 0.0
            }
        elif task_type == "alternative_diets":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Improvement": 0.0,
                "Suitability": 0.0
            }

    scores = {}
    if task_type == "daily_diets":
        metrics = ["Coherence", "Completeness", "Naturalness", "Nutritional Adequacy", "Caloric Balance"]
    elif task_type == "alternative_diets":
        metrics = ["Coherence", "Completeness", "Naturalness", "Improvement", "Suitability"]

    for metric in metrics:
        match = re.search(fr"{metric}: (\d\.\d)", evaluation)
        scores[metric] = float(match.group(1)) if match else 0.0

    return scores

results = []

daily_df = df[(df['task'] == 'daily_diets') & (df['output'].str.contains('"Breakfast"'))]

for _, row in tqdm(daily_df.iterrows(), total=len(daily_df), desc="Evaluating daily diets"):
    evaluation = evaluate_with_gpt4(row['input'], row['model_output'], row['output'], "daily_diets")
    scores = extract_scores(evaluation, "daily_diets")
    results.append({**row.to_dict(), **scores})

alternative_df = df[df['task'] == 'alternative_diets']

for _, row in tqdm(alternative_df.iterrows(), total=len(alternative_df), desc="Evaluating alternative diets"):
    evaluation = evaluate_with_gpt4(row['input'], row['model_output'], row['output'], "alternative_diets")
    scores = extract_scores(evaluation, "alternative_diets")
    results.append({**row.to_dict(), **scores})

evaluation_df = pd.DataFrame(results)

if "Nutritional Adequacy" in evaluation_df.columns and "Caloric Balance" in evaluation_df.columns:
    daily_avg = evaluation_df[evaluation_df['task'] == 'daily_diets'][[
        "Coherence", "Completeness", "Naturalness", 
        "Nutritional Adequacy", "Caloric Balance"
    ]].mean()
    print("Daily Diets Average Scores:")
    print(daily_avg)

if "Improvement" in evaluation_df.columns and "Suitability" in evaluation_df.columns:
    alternative_avg = evaluation_df[evaluation_df['task'] == 'alternative_diets'][[
        "Coherence", "Completeness", "Naturalness", 
        "Improvement", "Suitability"
    ]].mean()
    print("\nAlternative Diets Average Scores:")
    print(alternative_avg)
else:
    missing_columns = [col for col in ["Improvement", "Suitability"] if col not in evaluation_df.columns]
    print("\nAlternative Diets scores not available:")
    print(f"Missing columns: {missing_columns}")


Evaluating daily diets: 100%|██████████| 40/40 [00:34<00:00,  1.17it/s]
Evaluating alternative diets: 0it [00:00, ?it/s]

Daily Diets Average Scores:
Coherence               3.5750
Completeness            3.3250
Naturalness             3.9625
Nutritional Adequacy    3.0000
Caloric Balance         3.2250
dtype: float64

Alternative Diets scores not available:
Missing columns: ['Improvement', 'Suitability']





In [None]:
# alternative diets

import openai
import pandas as pd
import re
from tqdm import tqdm

from dotenv import load_dotenv
import os

load_dotenv()


openai.api_key = os.getenv("OPNEAI_API_KEY")

MAX_CONTEXT_LENGTH = 8192

def evaluate_with_gpt4(input_text, model_output, true_output, task_type):

    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    if task_type == "daily_diets":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:
        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Nutritional Adequacy**: Does the meal response meet the nutritional goals mentioned in the input?
        5. **Caloric Balance**: Are the recommended meals well-balanced in terms of calories?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Nutritional Adequacy: X.X
        - Caloric Balance: X.X
        """
    elif task_type == "alternative_diet":
        prompt = f"""
        You are tasked with evaluating the quality of a meal recommendation model's responses based on the following metrics:

        1. **Coherence**: Does the model's response logically align with the context provided in the input?
        2. **Completeness**: Does the model's response sufficiently answer the input request?
        3. **Naturalness**: Does the model's response sound fluent and human-like?
        4. **Improvement**: Assume that the recommended alternative meal is an improvement over the previous meal. Evaluate how effectively it builds upon and enhances the previous meal, even if the changes are small or subtle.
        5. **Suitability**: Is the recommended meal suitable for a diabetes patient?

        **Input**:
        {input_text}

        **Model's Response**:
        {model_output}

        **True Answer**:
        {true_output}

        Please rate each metric on a scale from 1 to 5. 
        Example response format:
        - Coherence: X.X
        - Completeness: X.X
        - Naturalness: X.X
        - Improvement: X.X
        - Suitability: X.X
        """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for meal recommendation models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

def extract_scores(evaluation, task_type):
    if evaluation is None:
        if task_type == "daily_diets":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Nutritional Adequacy": 0.0,
                "Caloric Balance": 0.0
            }
        elif task_type == "alternative_diet":
            return {
                "Coherence": 0.0,
                "Completeness": 0.0,
                "Naturalness": 0.0,
                "Improvement": 0.0,
                "Suitability": 0.0
            }

    scores = {}
    if task_type == "daily_diets":
        metrics = ["Coherence", "Completeness", "Naturalness", "Nutritional Adequacy", "Caloric Balance"]
    elif task_type == "alternative_diet":
        metrics = ["Coherence", "Completeness", "Naturalness", "Improvement", "Suitability"]

    for metric in metrics:
        match = re.search(fr"{metric}: (\d\.\d)", evaluation)
        scores[metric] = float(match.group(1)) if match else 0.0

    return scores

results = []

alternative_df = df[df['task'] == 'alternative_diet']

for _, row in tqdm(alternative_df.iterrows(), total=len(alternative_df), desc="Evaluating alternative diets"):
    evaluation = evaluate_with_gpt4(row['input'], row['model_output'], row['output'], "alternative_diet")
    scores = extract_scores(evaluation, "alternative_diet")
    results.append({**row.to_dict(), **scores})

evaluation_df = pd.DataFrame(results)

if all(col in evaluation_df.columns for col in ["Improvement", "Suitability"]):
    alternative_avg = evaluation_df[[
        "Coherence", "Completeness", "Naturalness", 
        "Improvement", "Suitability"
    ]].mean()
    print("\nAlternative Diets Average Scores:")
    print(alternative_avg)


Evaluating alternative diets:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating alternative diets: 100%|██████████| 50/50 [00:43<00:00,  1.16it/s]


Alternative Diets Average Scores:
Coherence       4.32
Completeness    3.85
Naturalness     4.02
Improvement     3.27
Suitability     3.90
dtype: float64





In [None]:
# qa_subjective

import openai
import pandas as pd
import re
from sklearn.metrics import f1_score
from bert_score import score as bert_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

from dotenv import load_dotenv
import os

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

bleurt_model_name = "Elron/bleurt-large-512"
tokenizer = AutoTokenizer.from_pretrained(bleurt_model_name)
bleurt_model = AutoModelForSequenceClassification.from_pretrained(bleurt_model_name)
bleurt_model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bleurt_model = bleurt_model.to(device)

MAX_CONTEXT_LENGTH = 8192

def evaluate_with_gpt4(input_text, model_output, true_output):
    input_text = input_text[:MAX_CONTEXT_LENGTH]
    model_output = model_output[:MAX_CONTEXT_LENGTH]
    true_output = true_output[:MAX_CONTEXT_LENGTH]

    prompt = f"""
    You are tasked with evaluating the quality of a QA model's responses based on the following metrics:
    1. **Coherence**: Does the model's response logically align with the context provided in the input?
    2. **Completeness**: Does the model's response sufficiently answer the question in the input?
    3. **Naturalness**: Does the model's response sound fluent and human-like?

    **Input**:
    {input_text}

    **Model's Response**:
    {model_output}

    **True Answer**:
    {true_output}

    Please rate each metric on a scale from 1 to 5. 
    Example response format:
    - Coherence: X.X
    - Completeness: X.X
    - Naturalness: X.X
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0125",
            messages=[
                {"role": "system", "content": "You are an expert evaluator for QA models."},
                {"role": "user", "content": prompt}
            ]
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        print("Error with GPT-4 API:", e)
        return None

def extract_scores(evaluation):
    if evaluation is None:
        return {
            "Coherence": 0.0,
            "Completeness": 0.0,
            "Naturalness": 0.0
        }
    coherence = re.search(r"Coherence: (\d\.\d)", evaluation)
    completeness = re.search(r"Completeness: (\d\.\d)", evaluation)
    naturalness = re.search(r"Naturalness: (\d\.\d)", evaluation)
    
    return {
        "Coherence": float(coherence.group(1)) if coherence else 0.0,
        "Completeness": float(completeness.group(1)) if completeness else 0.0,
        "Naturalness": float(naturalness.group(1)) if naturalness else 0.0
    }

def calculate_concept_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average="macro")


def calculate_bleurt(y_true, y_pred):
    inputs = tokenizer(y_pred, y_true, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        scores = bleurt_model(**inputs).logits

    if scores.numel() == 1:
        return float(scores.squeeze().item()) 
    return [float(score) for score in scores.squeeze().tolist()] 

def calculate_bleurt_and_bertscore(y_true, y_pred):
    bleurt_score_value = calculate_bleurt(y_true, y_pred)

    _, _, bert_f1 = bert_score(y_pred, y_true, lang="en", rescale_with_baseline=True)

    bert_f1 = [float(score) for score in bert_f1]

    return {
        "BLEURT": bleurt_score_value if isinstance(bleurt_score_value, float) else sum(bleurt_score_value) / len(bleurt_score_value),
        "BERTScore_F1": sum(bert_f1) / len(bert_f1)
    }

def normalize_scores(df, column):

    df[column] = df[column].apply(
        lambda x: float(x) if isinstance(x, torch.Tensor) else x
    )
    min_val = df[column].min()
    max_val = df[column].max()

    df[column] = df[column].apply(
        lambda x: (x - min_val) / (max_val - min_val) if max_val > min_val else 0.5
    )
    return df

qa_df = df[df['task'] == 'qa_subjective']

results = []

for _, row in qa_df.iterrows():
    input_text = row['input']
    model_output = row['model_output']
    true_output = row['output']
    
    evaluation = evaluate_with_gpt4(input_text, model_output, true_output)
    print(f"Evaluation result:\n{evaluation}")
    
    scores = extract_scores(evaluation)
    
    concept_f1 = calculate_concept_f1([true_output], [model_output])
    
    metric_scores = calculate_bleurt_and_bertscore([true_output], [model_output])

    results.append({
        "input": input_text,
        "model_output": model_output,
        "true_output": true_output,
        "evaluation": evaluation,
        "Coherence": scores["Coherence"],
        "Completeness": scores["Completeness"],
        "Naturalness": scores["Naturalness"],
        "Concept_F1": concept_f1,
        "BLEURT": metric_scores["BLEURT"],
        "BERTScore_F1": metric_scores["BERTScore_F1"]
    })

evaluation_df = pd.DataFrame(results)

evaluation_df = normalize_scores(evaluation_df, "BLEURT")
evaluation_df = normalize_scores(evaluation_df, "BERTScore_F1")

average_scores = evaluation_df[["Coherence", "Completeness", "Naturalness", "Concept_F1", "BLEURT", "BERTScore_F1"]].mean()

print("평균 점수:")
print(average_scores)

Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 1.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.3


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 2.5
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 2.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 1.0
- Naturalness: 1.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 4.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 3.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 3.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 4.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 1.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 1.0
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
Based on the provided scenario, here is the evaluation of the QA model's response:

- **Coherence**: 1.0  
The model's response lacks coherence with the context provided in the input. While the input detailed a specific case of an individual experiencing erectile dysfunction and seeking guidance, the model's response lists a plethora of potential causes without addressing the individual's specific situation.

- **Completeness**: 1.0  
The model's response is not complete as it fails to sufficiently answer the specific questions and concerns raised in the input. It provides a broad list of potential causes without addressing the individual's medical history, symptoms, or the effects of the medication they are currently taking.

- **Naturalness**: 1.0  
The response lacks naturalness as it repeats a long list of potential causes in a repetitive and unnatural manner. The response sounds robotic, lacks empathy, and does not engage with the individual seeking help.

Overa

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 2.5
- Naturalness: 2.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 4.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 2.0
- Naturalness: 2.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.0
- Completeness: 3.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 4.5
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.5
- Completeness: 4.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 2.0
- Completeness: 2.0
- Naturalness: 1.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 2.5
- Naturalness: 3.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 4.5
- Completeness: 3.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 3.5
- Completeness: 4.0
- Naturalness: 3.5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation result:
- Coherence: 1.0
- Completeness: 2.0
- Naturalness: 4.0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


평균 점수:
Coherence       3.310000
Completeness    2.920000
Naturalness     3.180000
Concept_F1      0.000000
BLEURT          0.515577
BERTScore_F1    0.639133
dtype: float64


In [14]:
## nutri score, HEI

df[df['task']=='daily_diets'].head(2)

Unnamed: 0,dataset,split_data,task,instruction,input,output,__index_level_0__,model_output
430,diabetes_food_hub,test,daily_diets,Recommend a daily diet based on the given nutr...,Ensure the daily carbohydrate intake does not ...,"{""Breakfast"": ""Chicken and Cucumber Lettuce Wr...",32166,"{""Breakfast"": ""Turkey Sausage and Egg Casserol..."
431,diabetes_food_hub,test,daily_diets,Recommend a daily diet that includes a specifi...,Create a diet that includes Parmesan cheese(gr...,"{""Breakfast"": ""Chicken and Cucumber Lettuce Wr...",32167,"{""Breakfast"": ""Turkey Meatball \u201cWonton\u2..."


In [23]:
df[df['task']=='daily_diets'].iloc[0]['model_output']

'{"Breakfast": "Turkey Sausage and Egg Casserole", "Lunch": "Curried Chickpea Stew with Roasted Vegetables", "Dinner": "Chicken and Cucumber Lettuce Wraps with Peanut Sauce"}'

In [15]:
import pandas as pd

dfh = pd.read_csv("/data/jaesung/llm_for_diabetes/src/data/data2_daily_diets/diabetes_food_hub.csv")
dfh.head(2)

Unnamed: 0.1,Unnamed: 0,title,description,prep_time,cook_time,servings,steps,tags,ingredients,nutrition_facts.Amount per serving,nutrition_facts.Total Fat,nutrition_facts.Cholesterol,nutrition_facts.Sodium,nutrition_facts.Total Carbohydrate,nutrition_facts.Protein,file_name
0,0,Chicken Apple Crunch Salad,This savory and sweet chicken apple crunch sal...,5 min,1 hr,5,['Cube cooked chicken. Dice apple and celery. ...,"['CKD Non-Dialysis', 'CKD Dialysis', 'Kidney-F...","[{'label': 'cooked chicken', 'us_measure': '2 ...",230,10g,65mg,210mg,15g,21g,kidney-friendly
1,1,Broccoli and Apple Salad,This kidney-friendly recipe is a kid favorite....,10 min,1 hr,8,['Add the remaining ingredients and coat with ...,"['CKD Non-Dialysis', 'CKD Dialysis', 'Kidney-F...","[{'label': 'Plain Nonfat Greek yogurt', 'us_me...",130,9g,9mg,70mg,12g,4g,kidney-friendly


In [16]:
dfh['ingredients'].iloc[0]

"[{'label': 'cooked chicken', 'us_measure': '2 cup', 'metric_measure': '473 ml'}, {'label': 'Gala apples', 'us_measure': '1 cup', 'metric_measure': '237 ml'}, {'label': 'celery', 'us_measure': '1/2 cup', 'metric_measure': '118 ml'}, {'label': 'scallions', 'us_measure': '2 tbsp', 'metric_measure': '30 ml'}, {'label': 'raisins', 'us_measure': '1/4 cup', 'metric_measure': '59 ml'}, {'label': 'light mayonnaise', 'us_measure': '1/3 cup', 'metric_measure': '78 ml'}, {'label': 'light sour cream', 'us_measure': '1 tbsp', 'metric_measure': '15 ml'}, {'label': 'lemon juice', 'us_measure': '1 tbsp', 'metric_measure': '15 ml'}, {'label': 'ground cinnamon', 'us_measure': '1/4 tsp', 'metric_measure': '1 g'}, {'label': 'black pepper', 'us_measure': '1/4 tsp', 'metric_measure': '1 g'}]"

In [29]:
dfh.columns

Index(['Unnamed: 0', 'title', 'description', 'prep_time', 'cook_time',
       'servings', 'steps', 'tags', 'ingredients',
       'nutrition_facts.Amount per serving', 'nutrition_facts.Total Fat',
       'nutrition_facts.Cholesterol', 'nutrition_facts.Sodium',
       'nutrition_facts.Total Carbohydrate', 'nutrition_facts.Protein',
       'file_name'],
      dtype='object')

In [None]:
import pandas as pd
import json
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# 데이터 유효성 검사 함수
# meal_data(JSON 문자열)에서 반드시 Breakfast, Lunch, Dinner 키가 존재해야 함
def is_valid_meal_structure(json_string):
    try:
        data = json.loads(json_string)
        return all(key in data for key in ['Breakfast', 'Lunch', 'Dinner'])
    except (json.JSONDecodeError, TypeError):
        return False

# 텍스트 유사도 기반으로 가장 유사한 행 찾기
# TF-IDF를 사용하여 dfh의 제목과 입력된 title 간 유사도를 계산
def find_most_similar_row(title, dfh):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(dfh['title'])
    input_vector = vectorizer.transform([title])
    similarities = cosine_similarity(input_vector, tfidf_matrix)
    most_similar_idx = similarities.argmax()
    return dfh.iloc[most_similar_idx]

# Nutri-Score를 계산하는 함수
def calculate_nutri_score(nutrition):
    try:
        # 에너지 점수 계산
        energy = nutrition.get('nutrition_facts.Energy (kJ)', 0)
        energy_points = min(energy / 335, 10)

        # 기타 불리한 점수 계산
        saturated_fat_points = min(nutrition.get('nutrition_facts.Total Fat', 0) / 1, 10)
        sugar_points = min(nutrition.get('nutrition_facts.Total Carbohydrate', 0) / 4.5, 10)
        sodium_points = min(nutrition.get('nutrition_facts.Sodium', 0) / 90, 10)

        unfavorable_points = energy_points + saturated_fat_points + sugar_points + sodium_points

        # 유리한 점수 계산
        fiber_points = 0  # 데이터에 섬유질 정보가 없을 경우 기본값 0
        protein_points = min(nutrition.get('nutrition_facts.Protein', 0) / 1.6, 5)
        fruit_veg_points = 0  # 과일/야채 정보가 없을 경우 기본값 0

        favorable_points = fiber_points + protein_points + fruit_veg_points

        # 최종 Nutri-Score 계산
        total_score = unfavorable_points - favorable_points
        return total_score
    except Exception as e:
        print(f"Error in calculate_nutri_score: {e}, nutrition: {nutrition}")
        return None

# Nutri-Score 등급 계산
def get_nutri_score_grade(score):
    if score <= -1:
        return "A"
    elif score <= 2:
        return "B"
    elif score <= 10:
        return "C"
    elif score <= 18:
        return "D"
    else:
        return "E"

# 영양 데이터 정리 함수
def clean_nutrition_data(title, dfh):
    try:
        matched_row = find_most_similar_row(title, dfh)
        nutrition = {
            'nutrition_facts.Amount per serving': extract_numeric_value(matched_row.get('nutrition_facts.Amount per serving', 0)),
            'nutrition_facts.Total Fat': extract_numeric_value(matched_row.get('nutrition_facts.Total Fat', 0)),
            'nutrition_facts.Sodium': extract_numeric_value(matched_row.get('nutrition_facts.Sodium', 0)),
            'nutrition_facts.Total Carbohydrate': extract_numeric_value(matched_row.get('nutrition_facts.Total Carbohydrate', 0)),
            'nutrition_facts.Protein': extract_numeric_value(matched_row.get('nutrition_facts.Protein', 0)),
        }
        return nutrition
    except Exception as e:
        print(f"Error in clean_nutrition_data: {e}, title: {title}")
        return None

# 단위 제거 및 숫자 변환 함수
def extract_numeric_value(value):
    try:
        if isinstance(value, str):
            match = re.search(r"(\d+(\.\d+)?)", value)
            if match:
                return float(match.group(1))
        elif isinstance(value, (int, float)):
            return float(value)
    except Exception as e:
        print(f"Error in extract_numeric_value: {e}, value: {value}")
    return 0.0

# 각 식사의 Nutri-Score 계산 함수
def calculate_meal_nutri_score(meal_data, dfh):
    meal_scores = {}
    total_nutrition = {
        'nutrition_facts.Energy (kJ)': 0,
        'nutrition_facts.Total Fat': 0,
        'nutrition_facts.Sodium': 0,
        'nutrition_facts.Total Carbohydrate': 0,
        'nutrition_facts.Protein': 0
    }

    for meal, title in meal_data.items():
        matched_row = find_most_similar_row(title, dfh)
        nutrition = {
            'nutrition_facts.Energy (kJ)': extract_numeric_value(matched_row.get('nutrition_facts.Energy (kJ)', 0)),
            'nutrition_facts.Total Fat': extract_numeric_value(matched_row.get('nutrition_facts.Total Fat', 0)),
            'nutrition_facts.Sodium': extract_numeric_value(matched_row.get('nutrition_facts.Sodium', 0)),
            'nutrition_facts.Total Carbohydrate': extract_numeric_value(matched_row.get('nutrition_facts.Total Carbohydrate', 0)),
            'nutrition_facts.Protein': extract_numeric_value(matched_row.get('nutrition_facts.Protein', 0))
        }

        score = calculate_nutri_score(nutrition)
        meal_scores[meal] = {
            'score': score,
            'grade': get_nutri_score_grade(score)
        }

        for key in total_nutrition:
            total_nutrition[key] += nutrition[key]

    daily_score = calculate_nutri_score(total_nutrition)
    daily_grade = get_nutri_score_grade(daily_score)

    return meal_scores, {'score': daily_score, 'grade': daily_grade}

# 전체 Nutri-Score 계산 및 결과 저장 함수
def calculate_scores(df, dfh):
    results = []

    for idx, row in df.iterrows():
        try:
            if not is_valid_meal_structure(row.get('output', '')) or not is_valid_meal_structure(row.get('model_output', '')):
                print(f"Skipping row {idx}: Invalid meal structure")
                continue

            output_data = json.loads(row['output'])
            model_output_data = json.loads(row['model_output'])

            output_scores, output_daily_score = calculate_meal_nutri_score(output_data, dfh)
            model_scores, model_daily_score = calculate_meal_nutri_score(model_output_data, dfh)

            results.append({
                'row_index': idx,
                'output_scores': output_scores,
                'output_daily_score': output_daily_score,
                'model_scores': model_scores,
                'model_daily_score': model_daily_score
            })
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            results.append({
                'row_index': idx,
                'output_scores': None,
                'output_daily_score': None,
                'model_scores': None,
                'model_daily_score': None
            })

    return results

# 실행 및 결과 출력
results = calculate_scores(df, dfh)
for result in results:
    print(result)


Skipping row 434: Invalid meal structure
Skipping row 439: Invalid meal structure
Skipping row 444: Invalid meal structure
Skipping row 449: Invalid meal structure
Skipping row 454: Invalid meal structure
Skipping row 459: Invalid meal structure
Skipping row 464: Invalid meal structure
Skipping row 469: Invalid meal structure
Skipping row 474: Invalid meal structure
Skipping row 479: Invalid meal structure
{'row_index': 430, 'output_scores': {'Breakfast': {'score': 8.777777777777779, 'grade': 'C'}, 'Lunch': {'score': 12.555555555555557, 'grade': 'D'}, 'Dinner': {'score': 9.444444444444445, 'grade': 'C'}}, 'output_daily_score': {'score': 15.0, 'grade': 'D'}, 'model_scores': {'Breakfast': {'score': -0.7222222222222223, 'grade': 'B'}, 'Lunch': {'score': 12.555555555555557, 'grade': 'D'}, 'Dinner': {'score': 8.777777777777779, 'grade': 'C'}}, 'model_daily_score': {'score': 15.0, 'grade': 'D'}}
{'row_index': 431, 'output_scores': {'Breakfast': {'score': 8.777777777777779, 'grade': 'C'}, 'Lu

In [42]:
tmp = pd.DataFrame(results)

In [43]:
tmp.head(2)

Unnamed: 0,row_index,output_scores,output_daily_score,model_scores,model_daily_score
0,430,"{'Breakfast': {'score': 8.777777777777779, 'gr...","{'score': 15.0, 'grade': 'D'}","{'Breakfast': {'score': -0.7222222222222223, '...","{'score': 15.0, 'grade': 'D'}"
1,431,"{'Breakfast': {'score': 8.777777777777779, 'gr...","{'score': 15.0, 'grade': 'D'}","{'Breakfast': {'score': 9.444444444444445, 'gr...","{'score': 15.0, 'grade': 'D'}"


In [44]:
import pandas as pd
from statistics import mode

# 'score' 평균 계산
output_score_mean = tmp['output_daily_score'].apply(lambda x: x['score']).mean()
model_score_mean = tmp['model_daily_score'].apply(lambda x: x['score']).mean()

# 'grade' 최빈값 계산
output_grade_mode = tmp['output_daily_score'].apply(lambda x: x['grade']).mode()[0]
model_grade_mode = tmp['model_daily_score'].apply(lambda x: x['grade']).mode()[0]

# 결과 출력
print(f"Output Daily Score 평균: {output_score_mean}")
print(f"Output Daily Grade 최빈값: {output_grade_mode}")
print(f"Model Daily Score 평균: {model_score_mean}")
print(f"Model Daily Grade 최빈값: {model_grade_mode}")


Output Daily Score 평균: 14.888888888888891
Output Daily Grade 최빈값: D
Model Daily Score 평균: 14.916666666666666
Model Daily Grade 최빈값: D
