In [1]:
import pandas as pd

In [2]:
from openai import OpenAI
import os
from dotenv import load_dotenv
# Pull settings from a local .env file (if one exists) into the process environment.
load_dotenv()

# Read the key from the environment instead of hard-coding it in the notebook.
api_key = os.getenv("OPENAI_API_KEY")

# Shared OpenAI client used by the completion helpers defined in later cells.
client = OpenAI(api_key=api_key)

In [3]:
# Load the source table and convert it to a list with one dict per row,
# keyed by column name.
df = pd.read_csv('../data/data.csv')
documents = df.to_dict(orient='records')

In [4]:
# Prompt used to generate ground-truth questions for a single exercise record.
# Placeholders in single braces are filled by str.format(); the doubled braces
# around the JSON example render as literal "{" and "}" in the final prompt.
prompt_template = """
You emulate a user of our fitness assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:

exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [5]:
# Render the template for the first record as a sample prompt. str.format()
# ignores keyword arguments that have no matching placeholder in the template.
prompt = prompt_template.format(**documents[0])

In [6]:
def llm(prompt, model='gpt-5-nano'):
    """Send a single-turn prompt to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message of the conversation.
        model: Name of the chat model to query. Defaults to ``'gpt-5-nano'``.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [7]:
# Raw model reply for the sample prompt; the next cell parses it as JSON.
questions = llm(prompt)

In [8]:
import json

In [9]:
# Check that the reply parses as JSON and display the generated questions.
json.loads(questions)

{'questions': ['Where should your hands be placed at the start of a push-up?',
  'What is the starting position for a push-up?',
  'How far should the chest descend during the movement?',
  'What action completes each repetition after lowering the chest?',
  'Which muscle groups are activated by push-ups?']}

In [10]:
def generate_questions(doc, model='gpt-4o-mini'):
    """Ask the model for questions about one record and return its raw reply.

    Args:
        doc: Mapping that provides every field referenced by
            ``prompt_template``; keys without a placeholder are ignored by
            ``str.format``.
        model: Name of the chat model to query. Defaults to ``'gpt-4o-mini'``.

    Returns:
        The ``content`` of the first choice in the API response, as an
        unparsed string that the caller is expected to decode as JSON.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the API constrains the reply to a JSON object, so the
        # caller's json.loads() is not tripped by code fences or stray prose.
        # The API requires the prompt itself to mention JSON, which it does.
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [11]:
from tqdm.auto import tqdm

In [12]:
# Maps document id -> list of generated questions. The generation loop skips
# ids already present here, so re-running this cell discards that progress.
results = {}

In [None]:
# Document id -> description of the error for replies that could not be used.
# Rebuilt on every run of this cell.
failures = {}

for doc in tqdm(documents):
    doc_id = doc['id']
    # Skip ids that already have questions so the cell can be re-run to resume.
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    try:
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']
    except (json.JSONDecodeError, KeyError, TypeError) as err:
        # A single unusable reply (invalid JSON, missing "questions" key, or
        # empty content) should not abort the whole run. The id stays out of
        # `results`, so re-running this cell retries it.
        failures[doc_id] = repr(err)

if failures:
    print(f"{len(failures)} document(s) failed; re-run this cell to retry them:")
    for failed_id, reason in failures.items():
        print(f"  {failed_id!r}: {reason}")

In [None]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) tuple per
# question, keeping the insertion order of `results`.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in results.items()
    for question in doc_questions
]

In [None]:
# Spot-check the first (id, question) pair.
final_results[0]

In [None]:
# One row per (document id, question) pair.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [None]:
# Write the pairs to CSV; index=False leaves the DataFrame index out of the file.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)