In [27]:
import pandas as pd
import os
from dotenv import load_dotenv
import json
from tqdm.auto import tqdm

In [10]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# None when the variable is not set; kept as a module-level name for later cells.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [11]:
from openai import OpenAI

client = OpenAI()

In [13]:
# Load the Q&A dataset and convert it to a list of per-row dicts.
# The first record shown below has the keys: id, question_type, question, answer.
df = pd.read_csv("../data/qa_data.csv")
docs = df.to_dict(orient='records')

In [16]:
docs[0]

{'id': 681,
 'question_type': 'research',
 'question': 'what research (or clinical trials) is being done for Striatonigral Degeneration ?',
 'answer': 'The NINDS supports and conducts research on disorders of the brain and nervous system such as striatonigral degeneration. This research focuses on finding ways to prevent and treat these disorders.'}

In [17]:
# Prompt used to generate evaluation questions for one FAQ record.
# Placeholders {question_type}, {question} and {answer} are filled with str.format;
# the doubled braces in the last line render as a literal JSON example.
# The wording refers to the emulated *user* throughout (previously "this student",
# which had no antecedent) and asks for "as few words as possible" from the record.
prompt_template = """
You emulate a user of our medical conversational ai application.
Formulate 5 questions this user might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as few words as possible from the record.

The record:

question_type: {question_type}
question: {question}
answer: {answer}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

## Generating questions for a single answer (to test)

In [18]:
prompt = prompt_template.format(**docs[0])

In [19]:
def llm(prompt, model='gpt-4o-mini'):
    """Send one user message to the chat-completions API and return the reply text.

    Args:
        prompt: Text sent as the only message, with role "user".
        model: Name of the chat model to call. Defaults to 'gpt-4o-mini',
            the value that was previously hard-coded, so existing calls
            behave as before.

    Returns:
        The `content` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [20]:
questions = llm(prompt)

In [23]:
json.loads(questions)

{'questions': ['What kind of studies are currently being conducted for Striatonigral Degeneration?',
  'Can you tell me about the clinical trials related to Striatonigral Degeneration?',
  'Which organization is responsible for research on Striatonigral Degeneration?',
  'What are the goals of the research on brain disorders like Striatonigral Degeneration?',
  'Is there ongoing research aimed at treating Striatonigral Degeneration?']}

## Generating questions for all answers

In [29]:
def generate_questions(doc):
    """Ask the model for evaluation questions about one FAQ record.

    Args:
        doc: Mapping whose keys fill the placeholders of `prompt_template`
            (question_type, question, answer); extra keys are ignored by
            `str.format`.

    Returns:
        The raw reply text of the first choice. The caller parses it with
        `json.loads`, so JSON mode is requested from the API to keep the
        reply parseable.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the prompt already instructs the model to answer in JSON,
        # which the API requires when this option is set.
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [31]:
results = {}

In [32]:
# `results` is created in its own cell, so re-running this cell resumes:
# documents that already have questions are skipped, everything else is retried.
failed_ids = []

for doc in tqdm(docs):
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    try:
        results[doc_id] = json.loads(questions_raw)['questions']
    except (json.JSONDecodeError, KeyError, TypeError):
        # Invalid JSON, a reply without a "questions" key, or an empty reply:
        # skip this record instead of aborting the whole run. It is not added
        # to `results`, so the next run of this cell tries it again.
        failed_ids.append(doc_id)

if failed_ids:
    print(f"{len(failed_ids)} record(s) had unusable replies; re-run this cell to retry: {failed_ids}")

  0%|          | 0/210 [00:00<?, ?it/s]

In [33]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) tuple per question,
# keeping the dict's iteration order and each list's order.
final_results = [
    (record_id, generated)
    for record_id, generated_list in results.items()
    for generated in generated_list
]

In [34]:
final_results[0]

(681,
 'What kind of studies are being conducted regarding Striatonigral Degeneration?')

In [35]:
# One row per generated question: the source record's id and the question text.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [36]:
# Persist the ground-truth pairs without the DataFrame index column.
# NOTE(review): the filename says "gpt-4-mini" but the questions were generated
# with the 'gpt-4o-mini' model; rename only together with every reader of this file.
df_results.to_csv('../data/ground-truth-retrieval-gpt-4-mini.csv', index=False)

In [37]:
!head ../data/ground-truth-retrieval-gpt-4-mini.csv

id,question
681,What kind of studies are being conducted regarding Striatonigral Degeneration?
681,Are there any ongoing clinical trials for Striatonigral Degeneration?
681,Which organization is involved in research related to Striatonigral Degeneration?
681,What is the goal of the research being done on Striatonigral Degeneration?
681,How does the NINDS contribute to research on Striatonigral Degeneration?
388,What types of studies are currently being conducted regarding ADHD?
388,Which organizations are involved in ADHD research and clinical trials?
388,How does the NIH contribute to the research on developmental disorders like ADHD?
388,What are the goals of the research programs related to ADHD?
