In [1]:
import pandas as pd

In [5]:
import os
from dotenv import load_dotenv

# load_dotenv() comes from python-dotenv; it is expected to populate the
# process environment from a local .env file before the key is looked up.
load_dotenv()

# None when the variable is not set in the environment.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [6]:
from openai import OpenAI

# Shared API client used by every model call in this notebook.
# NOTE(review): constructed with no arguments — presumably it picks up
# OPENAI_API_KEY from the environment loaded above; confirm.
client = OpenAI()

In [81]:
# Read the activity dataset, then turn every row into a {column: value} dict
# so each record can be fed straight into str.format(**record).
df = pd.read_csv('../data/data.csv')
documents = df.to_dict('records')

In [82]:
# Prompt used to generate ground-truth questions for one activity record.
# The {placeholders} are filled with str.format(**record), so every record
# must provide activity_name, activity_type, materials_needed, time_required,
# age_group, difficulty_level and instructions. The doubled braces in the
# JSON example are escapes that render as literal { } after formatting.
# NOTE(review): "Use as fewer words as possible" probably should read
# "Use as few words as possible" — left untouched here because changing the
# prompt text changes what the model is asked.
prompt_template = """
You emulate a user of our activity planner application.
Formulate 5 questions this user might ask based on a provided activity.
Make the questions specific to this activity.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 


THE RECORD:
 
activity_name:{activity_name}
activity_type:{activity_type}
materials_needed:{materials_needed}
time_required:{time_required}
age_group:{age_group}
difficulty_level:{difficulty_level} 
instructions:{instructions} 

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [83]:
# Smoke test: render the template for the first record only. Keys in the
# record that the template does not reference are ignored by str.format.
prompt = prompt_template.format(**documents[0])

In [84]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API.

    Args:
        prompt: Text sent as the only message, with role "user".
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini',
            the model this notebook was originally hard-wired to, so
            existing one-argument calls behave exactly as before.

    Returns:
        The `content` of the first choice in the API response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [85]:
# Raw text reply for the sample prompt; it is only a JSON string if the model
# followed the formatting instruction in the template.
questions = llm(prompt)

In [86]:
import json

In [87]:
# Check that the sample reply parses as JSON; the parsed value is displayed
# as the cell output and not stored.
json.loads(questions)

{'questions': ['What materials do I need for the family picnic activity?',
  'How much time should I allocate for the family picnic?',
  'Is this family picnic suitable for toddlers?',
  'What level of difficulty should I expect when organizing the picnic?',
  'What steps should I follow to set up the picnic?']}

In [88]:
def generate_questions(doc):
    """Ask the model for questions about a single activity record.

    Args:
        doc: Mapping that provides every placeholder referenced by
            `prompt_template`. Additional keys (such as 'id') are ignored
            by str.format.

    Returns:
        The model's raw reply text. It is expected to be a JSON document
        of the form {"questions": [...]}, but it is neither parsed nor
        validated here.
    """
    prompt = prompt_template.format(**doc)

    # Delegate to the shared `llm` helper instead of repeating the same
    # chat-completions call, so the model name lives in one place.
    return llm(prompt)

In [89]:
from tqdm.auto import tqdm

In [90]:
# Maps document id -> list of generated questions; filled by the loop in the
# next cell. Re-running THIS cell discards everything accumulated so far.
results = {}

In [91]:
# Generate questions for every document, keyed by document id.
# Documents already present in `results` are skipped, so re-running this cell
# after an interruption only requests the ones that are still missing.
failed_ids = []

for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    try:
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']
    except (json.JSONDecodeError, KeyError, TypeError) as exc:
        # The reply is free text and may not be the requested JSON object
        # (invalid JSON, missing "questions" key, or no content at all).
        # Record the failure and keep going instead of aborting the whole
        # run; the document stays out of `results`, so a re-run retries it.
        failed_ids.append(doc_id)
        print(f'doc {doc_id}: could not parse model output ({exc!r})')

if failed_ids:
    print(f'{len(failed_ids)} document(s) failed; re-run this cell to retry: {failed_ids}')

  0%|          | 0/241 [00:00<?, ?it/s]

In [92]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) tuple per
# question, preserving the insertion order of `results`.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in results.items()
    for question in doc_questions
]

In [93]:
# Spot-check the first (doc_id, question) pair.
final_results[0]

(0, 'What materials do I need for the Family Picnic?')

In [94]:
# One row per generated question; 'id' is the id of the source document.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [95]:
# Persist the ground-truth set; index=False keeps the DataFrame's row index
# out of the file so it contains only the id and question columns.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [96]:
# Shell escape: show the first lines of the file that was just written.
!head ../data/ground-truth-retrieval.csv

id,question
0,What materials do I need for the Family Picnic?
0,How long should we allocate for the Family Picnic activity?
0,What age group is this Family Picnic suitable for?
0,Can you provide a simple instruction for organizing the Family Picnic?
0,What is the difficulty level of setting up the Family Picnic?
1,What materials do I need for the DIY Craft Project?
1,How long will the DIY Craft Project take to complete?
1,What age group is the DIY Craft Project suitable for?
1,What is the difficulty level of the DIY Craft Project?
