In [1]:
import pandas as pd
import glob
import os

# Folder holding one JSON file of recipes per category.
recipes_folder = "/data/jaesung/llm_for_diabetes/src/data/Recipes"

# glob returns paths in arbitrary (filesystem) order; sort them so the merged
# row order -- and therefore any later keep-first de-duplication -- is
# reproducible from run to run.
json_files = sorted(glob.glob(os.path.join(recipes_folder, "*.json")))
if not json_files:
    # Fail with a clear message instead of the opaque
    # "No objects to concatenate" error that pd.concat([]) would raise.
    raise FileNotFoundError(f"No .json recipe files found in {recipes_folder}")

dfs = []
for file in json_files:
    category_df = pd.read_json(file)  # parse the JSON file into a DataFrame
    # Use the file name without its extension as the category label.
    # splitext only removes the trailing ".json", unlike str.replace, which
    # would also strip ".json" occurring in the middle of a name.
    category_df["category"] = os.path.splitext(os.path.basename(file))[0]
    dfs.append(category_df)

# Stack all per-category frames into one table with a fresh 0..n-1 index.
merged_df = pd.concat(dfs, ignore_index=True)

In [2]:
# Preview the first rows of the merged recipe table (displayed as the cell output).
merged_df.head()

Unnamed: 0,title,description,prep_time,cook_time,servings,steps,tags,nutrition_facts,ingredients,category
0,Chicken Apple Crunch Salad,This savory and sweet chicken apple crunch sal...,5 min,1 hr,5,[Cube cooked chicken. Dice apple and celery. C...,"[CKD Non-Dialysis, CKD Dialysis, Kidney-Friend...","{'Amount per serving': '230', 'Total Fat': '10...","[{'label': 'cooked chicken', 'us_measure': '2 ...",kidney-friendly
1,Broccoli and Apple Salad,This kidney-friendly recipe is a kid favorite....,10 min,1 hr,8,[Add the remaining ingredients and coat with t...,"[CKD Non-Dialysis, CKD Dialysis, Kidney-Friend...","{'Amount per serving': '130', 'Total Fat': '9g...","[{'label': 'Plain Nonfat Greek yogurt', 'us_me...",kidney-friendly
2,Chicken Nuggets with Honey Mustard Dipping Sauce,"This recipe is not only delicious, but it’s al...",10 min,15 min,12,"[Stir mustard, mayonnaise, honey, and Worceste...","[CKD Non-Dialysis, CKD Dialysis, Kidney-Friend...","{'Amount per serving': '160', 'Total Fat': '9g...","[{'label': 'corn flakes', 'us_measure': '3 cup...",kidney-friendly
3,Colorful Pasta Salad,Who wants to eat an ordinary pasta salad? This...,10 min,5 min,6,"[Chop bell peppers, red onion, and cucumbers. ...","[CKD Non-Dialysis, CKD Dialysis, Kidney-Friend...","{'Amount per serving': '140', 'Total Fat': '6g...","[{'label': 'Bow-tie pasta', 'us_measure': '4 o...",kidney-friendly
4,Green Pesto Pasta,Say goodbye to plain pasta and add a touch of ...,5 min,15 min,4,[Bring water to a boil and cook pasta as direc...,"[CKD Non-Dialysis, CKD Dialysis, Kidney-Friend...","{'Amount per serving': '290', 'Total Fat': '16...","[{'label': 'whole grain spaghetti noodles', 'u...",kidney-friendly


In [3]:
# Keep only the first row seen for each recipe title; later rows that repeat
# a title (e.g. the same recipe listed under another category) are dropped.
merged_df = merged_df.drop_duplicates(subset="title", keep="first")

In [4]:
# Take an independent (deep) copy of the de-duplicated table under the name
# that the script-generation cell iterates over.
df = merged_df.copy(deep=True)

In [6]:
import openai
import pandas as pd
import json
import os
from dotenv import load_dotenv
from tqdm import tqdm

# JSONL file that receives one generated record per recipe.
output_file = "/data/jaesung/llm_for_diabetes/src/trial3/diabetes_food_hub_script.jsonl"

# Load variables from a .env file into the process environment; this has to
# run before OPENAI_API_KEY is read on the next line.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Client used by make_script; it is constructed without explicit arguments.
client = openai.OpenAI()

def make_script(title, description, serving, steps, tags, nutrition_facts, ingredients, category, max_retries=3):
    """Generate an encyclopedic, diabetes-focused paragraph about one recipe.

    The recipe fields are interpolated into a fixed prompt (via their string
    representation) and sent to the "gpt-4o" chat model through the
    module-level ``client``.

    Args:
        title: Recipe title.
        description: Short recipe description.
        serving: Serving-size text.
        steps: Preparation steps.
        tags: Dietary tags attached to the recipe.
        nutrition_facts: Nutrition facts of the recipe.
        ingredients: Ingredient entries of the recipe.
        category: Category label of the recipe.
        max_retries: Maximum number of API attempts (values below 1 are
            treated as 1). Failed attempts are retried after a short
            exponential back-off.

    Returns:
        The generated paragraph with surrounding whitespace stripped, or
        ``None`` if every attempt failed. Errors are printed, not raised, so
        a long batch run is not aborted by a single failing request.
    """
    import time  # local import: only needed for the retry back-off

    prompt = f"""
    You are a professional nutritionist and food scientist specializing in diabetes-friendly diets. Your task is to generate a **detailed and highly accurate** encyclopedic-style paragraph (similar to a Wikipedia article) that provides precise information about the given dish. The focus should be on **nutrition, health benefits, and its suitability for people with diabetes**.

    **Recipe Information:**
    - **Title**: {title}
    - **Category**: {category}
    - **Description**: {description}
    - **Serving Size**: {serving}
    - **Steps**: {steps}
    - **Tags**: {tags}
    - **Nutrition Facts**: {nutrition_facts}
    - **Ingredients**: {ingredients}

    **Instructions:**
    - Use the **nutrition_facts** data to explain the macronutrient balance (proteins, fats, carbohydrates, fiber) and its impact on blood sugar levels.
    - Use the **ingredients** list to describe the role of each key ingredient, particularly how it affects glycemic index (GI) and diabetes management.
    - Incorporate relevant **tags** (e.g., "low-carb", "high-protein") to justify why this dish fits into specific dietary categories.
    - Provide **scientifically accurate** information, supported by common nutritional knowledge.
    - If applicable, suggest ingredient modifications that could make the dish even more diabetes-friendly.
    - Explain **how this meal influences blood sugar control**, insulin response, and overall metabolic health.
    - Maintain a **neutral, encyclopedic tone**, similar to Wikipedia.

    **Output Format:**
    - A single, well-structured paragraph with **clear and logical flow**.
    - Use **formal and factual language** with precise nutritional details.
    - Ensure the content is medically accurate and informative.
    """

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}]
            )
            content = response.choices[0].message.content
            if content is None:
                # Make the "no text returned" case explicit instead of relying
                # on an AttributeError from None.strip().
                raise ValueError("model returned no text content")
            return content.strip()
        except Exception as e:
            print(f"Error during GPT response: {e}")
            if attempt < attempts:
                # Back off (2s, 4s, ... capped at 30s) before retrying, so
                # transient failures such as rate limits can clear.
                time.sleep(min(2 ** attempt, 30))

    # Every attempt failed: signal it explicitly so callers can skip the row.
    return None

# Titles that already have a generated script in the output file. The file is
# opened in append mode below, so without this check a re-run (or a resume
# after an interruption) would duplicate records and repeat paid API calls.
done_titles = set()
if os.path.exists(output_file):
    with open(output_file, encoding="utf-8") as existing:
        for line in existing:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue  # e.g. a line truncated by an interrupted write
            # Records whose script is missing/null are not counted as done,
            # so those recipes get another attempt (the stale line stays in
            # the file; filter on a non-null "script" when reading it back).
            if isinstance(record, dict) and record.get("title") and record.get("script"):
                done_titles.add(record["title"])

with open(output_file, "a", encoding="utf-8") as f:
    # total= lets tqdm show a percentage and an ETA; iterrows() has no length.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        title = row['title'].strip()
        if title in done_titles:
            continue

        description = row['description'].strip()
        serving = f"{row['servings']} serving".strip()
        steps = row['steps']
        tags = row['tags']
        nutrition_facts = row['nutrition_facts']
        ingredients = row['ingredients']
        category = row['category'].strip()

        script = make_script(title, description, serving, steps, tags, nutrition_facts, ingredients, category)
        if script is None:
            # Generation failed; write nothing so a later run retries this
            # recipe instead of leaving a record with a null script.
            print(f"Skipping '{title}': no script was generated")
            continue

        data = {
            "title": title,
            "description": description,
            "serving": serving,
            "steps": steps,
            "tags": tags,
            "nutrition_facts": nutrition_facts,
            "ingredients": ingredients,
            "category": category,
            "script": script,
        }

        json_str = json.dumps(data, ensure_ascii=False)
        f.write(json_str + "\n")
        # Flush after every record so an interruption during this
        # multi-hour run loses at most the record being written.
        f.flush()
        done_titles.add(title)


1336it [3:35:04,  9.66s/it]
