In [1]:
import requests
import pandas as pd
from tqdm.auto import tqdm

In [2]:
from dotenv import load_dotenv
# Run before the OpenAI client is created in the next cell.
# NOTE(review): presumably this loads the API credentials from a local .env
# file into the environment — confirm the file exists and is not committed.
load_dotenv()

True

In [3]:
from openai import OpenAI
# Shared client used by generate_questions(). It is constructed with no
# explicit arguments, so its configuration (e.g. the API key) presumably comes
# from the environment — TODO confirm.
openai_client = OpenAI()

In [4]:
# Prompt used to generate ground-truth questions for a single recipe record.
#
# The placeholders are filled with str.format(**doc) in generate_questions(),
# so every record must provide the keys: recipe_name, recipe_description,
# ratings, ready-in, ingredients and directions. `ready-in` is not a valid
# Python identifier, so it can only be supplied through ** unpacking of a dict.
#
# The prompt ends by showing the expected reply shape (a five-element list of
# question strings); the caller parses that reply with json.loads.
#
# NOTE(review): the bullet below looks like a requirement that was taken out of
# the prompt — confirm whether it should be restored or deleted:
# - Include at least ONE question asking how to cook/prepare/make the recipe (answerable from the directions)

prompt_template = """
You are a home cook looking at a recipe and want to understand the specific details provided in this recipe. Generate 5 questions that can be directly answered using ONLY the information given in the recipe below.

Strict Requirements:
- Each question MUST be answerable using only the provided recipe name, description, ratings, cooking time, ingredients list, or directions
- Do NOT ask about cooking techniques, substitutions, tips, or general cooking advice NOT EXPLICITLY mentioned in the recipe
- Always include the recipe name in each question to make the question relevant to the recipe
- Focus only on factual information present in the recipe data
- Use minimal words from the original recipe text to ensure diverse phrasing
- Use natural, conversational language that a home cook would actually ask - avoid referencing database fields like "directions" or "ingredients list"
- Do NOT reference specific numbers, database fields, or internal data structure details that users wouldn't know (like "619 ratings", "directions", "ingredients list", etc.)


Recipe Data:
- Name: {recipe_name}
- Description: {recipe_description}  
- Rating: {ratings}
- Cook Time: {ready-in}
- Ingredients: {ingredients}
- Directions: {directions}

Generate exactly 5 questions that can be answered directly from the above information:
["question1", "question2", "question3", "question4", "question5"]
""".strip()

In [5]:
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Replies that are not wrapped in a fence are returned unchanged.
    """
    stripped = text.strip()
    fenced = stripped.startswith("```") and stripped.endswith("```")
    if len(stripped) < 6 or not fenced:
        return text

    inner = stripped[3:-3]
    # The opening fence may carry a language tag (e.g. "json") on its own line;
    # drop that line so only the payload remains.
    first_line, newline, rest = inner.partition("\n")
    if newline and (not first_line.strip() or first_line.strip().isalnum()):
        inner = rest
    return inner.strip()


def generate_questions(doc):
    """Ask the chat model for questions about one recipe record.

    Args:
        doc: Mapping whose keys match the placeholders in ``prompt_template``.

    Returns:
        The model's reply as a string, expected to be a JSON array of
        questions. A surrounding Markdown code fence is removed so the result
        can be passed straight to ``json.loads``.

    Raises:
        KeyError: If ``doc`` lacks a key required by ``prompt_template``.
        ValueError: If the model returns no text content.
    """
    prompt = prompt_template.format(**doc)

    response = openai_client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    if json_response is None:
        # Fail here with a clear message instead of letting the caller hit an
        # opaque TypeError inside json.loads(None).
        raise ValueError("The model returned no text content for this recipe")
    return _strip_code_fence(json_response)

In [6]:
import json

# Load every recipe as a plain dict so it can be fed to the prompt template.
df_recipes = pd.read_csv('../data/recipes.csv')
documents = df_recipes.to_dict(orient='records')

# Flat list of (recipe_id, question) pairs, one entry per generated question.
ground_truth_questions = []
for doc in tqdm(documents):
    # One model call per recipe; the reply is parsed as JSON.
    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    ground_truth_questions.extend(
        [(doc['recipe_id'], question) for question in questions]
    )

  0%|          | 0/88 [00:00<?, ?it/s]

In [7]:
# Spot-check the first (recipe_id, question) pair collected by the loop above.
ground_truth_questions[0]

(0, "What's the total cook time for the Ground Beef Gyros?")

In [8]:
# Tabulate the (recipe_id, question) pairs: one row per question, with the
# recipe id stored under the column name 'id'.
df_ground_truth_questions = pd.DataFrame(
    data=ground_truth_questions,
    columns=['id', 'question'],
)

In [9]:
# Persist the ground-truth set; index=False keeps the row index out of the file.
df_ground_truth_questions.to_csv(
    path_or_buf='../data/ground-truth-retrieval.csv',
    index=False,
)

In [10]:
# Preview the start of the exported CSV using the shell's `head` command.
!head ../data/ground-truth-retrieval.csv

id,question
0,What's the total cook time for the Ground Beef Gyros?
0,What kind of meat is recommended in the Ground Beef Gyros recipe?
0,How is the Ground Beef Gyros served according to the directions?
0,What rating did the Ground Beef Gyros recipe receive?
0,Do the Ground Beef Gyros include a sauce with cucumbers?
1,How long does the Lasagna recipe take from start to finish?
1,What's the main protein in this Lasagna recipe?
1,What kind of cheese does the Lasagna recipe use for layering?
1,Are the lasagna noodles pre-cooked before adding to the Lasagna recipe?
