In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/baby_recipes_cleaned.csv', sep = ';')
documents = df.to_dict(orient='records')

In [4]:
documents[0]

{'id': 1,
 'dish_name': 'Sweet Potato Purée',
 'baby_age': '6-8 months',
 'iron_rich': 'No',
 'allergen': 'Dairy',
 'ingredients': 'sweet potato, water or breastmilk/formula',
 'cooking_time': 15,
 'recipe': 'Wash and prep ingredients: sweet potato, water or breastmilk/formula. Steam or gently simmer vegetables/fruit until fork-tender. Blend with a splash of cooking water or breastmilk/formula until smooth. Cool before serving. Store in the fridge up to 2 days or freeze portions up to 1 month.',
 'texture': 'Purée',
 'meal_type': 'Breakfast',
 'calories': 160,
 'preparation_difficulty': 'Easy'}

In [5]:
prompt_template = """
You emulate a user who is searching for a recipe for a baby. 
Formulate 5 questions this user might ask based on a baby food recipe. The record
should contain the answer to the questions, and the questions should be complete and not too short.
Use as fewer words as possible from the record. 

The record:


'dish_name': '{dish_name}',
'baby_age' : {baby_age},
'iron_rich': {iron_rich},
'allergen': {allergen},
'ingredients': '{ingredients}',
'cooking_time': {cooking_time},
'texture': '{texture}',
'meal_type': '{meal_type}',
'calories': {calories},
'preparation_difficulty': '{preparation_difficulty}',
'recipe': '{recipe}'


Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [6]:
print(prompt_template.format(**documents[0]))

You emulate a user who is searching for a recipe for a baby. 
Formulate 5 questions this user might ask based on a baby food recipe. The record
should contain the answer to the questions, and the questions should be complete and not too short.
Use as fewer words as possible from the record. 

The record:


'dish_name': 'Sweet Potato Purée',
'baby_age' : 6-8 months,
'iron_rich': No,
'allergen': Dairy,
'ingredients': 'sweet potato, water or breastmilk/formula',
'cooking_time': 15,
'texture': 'Purée',
'meal_type': 'Breakfast',
'calories': 160,
'preparation_difficulty': 'Easy',
'recipe': 'Wash and prep ingredients: sweet potato, water or breastmilk/formula. Steam or gently simmer vegetables/fruit until fork-tender. Blend with a splash of cooking water or breastmilk/formula until smooth. Cool before serving. Store in the fridge up to 2 days or freeze portions up to 1 month.'


Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [7]:
from dotenv import load_dotenv
import os
from groq import Groq

In [8]:
load_dotenv()

groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

In [9]:
import re

In [10]:
def llm(prompt):
    response = groq_client.chat.completions.create(
        model='qwen/qwen3-32b',
        messages=[{"role": "user", "content": prompt}]
    )
    final_answer = response.choices[0].message.content

    return re.sub(r"<think>.*?</think>\s*", "", final_answer, flags=re.DOTALL).strip()

In [11]:
prompt = prompt_template.format(**documents[0])

In [12]:
questions = llm(prompt)
print(questions)

["Is Sweet Potato Purée suitable for a 6-month-old baby just starting solids?","Does the Sweet Potato Purée recipe include any allergens like dairy?","How can I prepare Sweet Potato Purée without breastmilk/formula?","What is the recommended consistency for 6-8 month old infants in this recipe?","How long does it take to cook Sweet Potato Purée for baby?"]


In [13]:
import json

In [14]:
json.loads(questions)

['Is Sweet Potato Purée suitable for a 6-month-old baby just starting solids?',
 'Does the Sweet Potato Purée recipe include any allergens like dairy?',
 'How can I prepare Sweet Potato Purée without breastmilk/formula?',
 'What is the recommended consistency for 6-8 month old infants in this recipe?',
 'How long does it take to cook Sweet Potato Purée for baby?']

In [15]:
from tqdm.auto import tqdm

In [16]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = groq_client.chat.completions.create(
        model='qwen/qwen3-32b',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content

    return re.sub(r"<think>.*?</think>\s*", "", json_response, flags=re.DOTALL).strip()

In [17]:
results = {}

In [18]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/500 [00:00<?, ?it/s]

In [22]:
results[1]

'["Does this recipe contain any allergens?", "Is this dish considered iron-rich for babies?", "Can this purée be served warm, or must it be cooled first?", "How long can this purée be stored in the freezer?", "Is this recipe suitable for a baby younger than 6 months?"]'

In [23]:
import pandas as pd
import ast

final_results = []

for doc_id, questions in results.items():
    if isinstance(questions, str):
        try:
            questions = json.loads(questions.strip())
        except json.JSONDecodeError:
            questions = [questions] 

    for q in questions:
        final_results.append((doc_id, q))

df_results = pd.DataFrame(final_results, columns=['id', 'question'])
print(df_results.head())


   id                                           question
0   1            Does this recipe contain any allergens?
1   1      Is this dish considered iron-rich for babies?
2   1  Can this purée be served warm, or must it be c...
3   1  How long can this purée be stored in the freezer?
4   1  Is this recipe suitable for a baby younger tha...


In [26]:
df_results.to_csv('../data/ground-truth-data.csv', index=False)

# Retrival evaluation

In [8]:
ground_truth = pd.read_csv('../data/ground-truth-data.csv')
ground_truth.head()

NameError: name 'pd' is not defined

In [27]:
ground_truth = df_results.to_dict(orient='records')

In [1]:
from qdrant_client import QdrantClient, models

In [2]:
qd_client = QdrantClient("http://localhost:6333")

In [3]:
model = 'BAAI/bge-small-en'
EMBEDDING_DIMENSIONALITY = 384
collection_name = "recipe_database"

In [4]:
#qd_client.delete_collection(collection_name=collection_name)

In [5]:
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [6]:
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="id",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [7]:
points = []

for i, doc in enumerate(documents):
    text = ' '.join(str(doc.get(field, "")) for field in df.columns if field != "id")

    vector = models.Document(text=text, model=model)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

NameError: name 'documents' is not defined

In [None]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
def vector_search(question):
    print('vector_search is used')
    
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model
        ),
        limit=2,
        with_payload=True
    )
    
    results = []
    
    for point in query_points.points:
        results.append(point.payload)
    
    return results

In [None]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['id']
    results = search(query=q['id'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)