In [1]:
import pandas as pd
from tqdm.auto import tqdm
import json

In [2]:
# Load the dataset in which every document already carries a generated id.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open("./../dataset/medical_qa_documents_with_id.json", "r", encoding="utf-8") as f:
    docs_raw = json.load(f)

In [3]:
# Peek at the first top-level record only: count its Q&A documents and list
# the keys of the record itself (not of an individual Q&A document).
first_record = docs_raw[0]
documents = first_record["documents"]

print("Total documents loaded:", len(documents))
print("Keys in document:", first_record.keys())

Total documents loaded: 14979
Keys in document: dict_keys(['document_info', 'documents'])


In [4]:
# Flatten the Q&A documents of every top-level record into a single list.
documents = [
    doc
    for docs_info in docs_raw
    for doc in docs_info['documents']
]

In [5]:
# Display the first flattened document to check which fields it carries.
documents[0]

{'answer': 'LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents. Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation.',
 'question': 'Who is at risk for Lymphocytic Choriomeningitis (LCM)?',
 'qtype': 'susceptibility',
 'id': 'f72c0d85'}

### Generating questions for answers
For each answer, 4 relevant questions are generated. Because of the limits of the Groq API free tier, only 50 documents are selected at random. The returned questions (4 x 50 = 200) will be saved to a CSV file for further evaluation.

In [6]:
import os
from getpass import getpass

# Never hardcode the API key in the notebook: it leaks as soon as the file is
# shared or committed. Reuse the key already exported in the environment and
# only prompt for it (without echoing) when it is missing.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

In [7]:
from groq import Groq

# Create the Groq API client used by the question-generation step.
# No credentials are passed explicitly; presumably the client picks up
# GROQ_API_KEY from the environment — confirm against the Groq SDK docs.
client = Groq()

In [8]:
# Prompt sent to the model for every sampled document. The placeholders
# {answer}, {question} and {qtype} are filled from the document's fields, and
# the model is asked to reply with a bare JSON list of 4 questions so the
# response can be parsed with json.loads.
prompt_template = """
You are a helpful and specialized medical assistant emulating a patient.
Your task is to formulate 4 generic, clear, diverse, and natural questions that a patient might ask based on a medical FAQ record.
The answer to each question must be medical, and the question should be meaningful, complete and not too short.
If possible, use as few words as possible from the record.


The Medical FAQ Record:

Answer: {answer}
Question: {question}
Qtype: {qtype}

Output only a valid JSON list (no explanations, no code blocks):

["question1", "question2", "question3", "question4"]
""".strip()


In [10]:
import random

# Number of documents used for ground-truth generation, and the seed that
# makes the selection repeatable across kernel restarts.
SAMPLE_SIZE = 50
SAMPLE_SEED = 42

# A dedicated generator keeps the sample reproducible without touching the
# global random state. min() avoids a ValueError when fewer than SAMPLE_SIZE
# documents are available.
sample_rng = random.Random(SAMPLE_SEED)
doc_sample = sample_rng.sample(documents, min(SAMPLE_SIZE, len(documents)))

In [9]:
def generate_questions(doc):
    """Request patient-style questions for one document from the chat model.

    The fields of ``doc`` are substituted into ``prompt_template`` and sent as
    a single user message through the module-level ``client``.

    Returns the text content of the first choice exactly as the model
    produced it; the text is neither parsed nor validated here.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}

    completion = client.chat.completions.create(
        model='openai/gpt-oss-20b',
        messages=[user_message],
    )

    return completion.choices[0].message.content

In [11]:
# Raw model responses keyed by document id. Kept in its own cell so the
# generation loop can be re-run and skip ids that already have a response.
results = {}

In [53]:
import time

# Maximum number of calls made for one document before giving up on it.
MAX_ATTEMPTS = 3

for doc in tqdm(doc_sample):
    doc_id = doc['id']
    if doc_id in results:
        # Already generated in an earlier run of this cell; skip it so no
        # API quota is spent twice.
        continue

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            questions = generate_questions(doc)
            # Check the response is parseable JSON now, so malformed output
            # is retried here instead of crashing the parsing step later.
            # The raw string is still what gets stored.
            json.loads(questions)
            results[doc_id] = questions
            break  # success, exit retry loop
        except Exception as e:
            print(f"Error for doc_id={doc_id}: {e} (attempt {attempt}/{MAX_ATTEMPTS})")
            if attempt < MAX_ATTEMPTS:
                # Small randomized delay before retrying; pointless after
                # the final attempt.
                time.sleep(random.uniform(3, 6))
    else:
        # Runs only when no attempt reached `break`.
        print(f"Skipped doc_id={doc_id} after {MAX_ATTEMPTS} failed attempts")


  0%|          | 0/50 [00:00<?, ?it/s]

Error for doc_id=749a127c: Connection error. (attempt 1/3)


In [60]:
# Spot-check one raw response. The sample is drawn at random, so this id may
# not exist in another run; fall back to the first available response (or
# None when nothing was generated) instead of raising KeyError.
preview_id = 'e3e8adf5' if 'e3e8adf5' in results else next(iter(results), None)
results.get(preview_id)

'["What symptoms do babies with Bartter syndrome usually show?", "How does Bartter syndrome affect growth and development in children?", "Can Bartter syndrome cause kidney complications like nephrocalcinosis or bone problems?", "Are there any specific eye or hearing issues associated with Bartter syndrome?"]'

In [61]:
# Parsed question lists keyed by document id.
# NOTE: the misspelled name is kept because a later cell reads it.
parsed_resulst = {}

for doc_id, json_questions in results.items():
    try:
        questions = json.loads(json_questions)
    except (json.JSONDecodeError, TypeError) as e:
        # One malformed response should not abort the whole parsing step.
        print(f"Skipping doc_id={doc_id}: response is not valid JSON ({e})")
        continue

    if not isinstance(questions, list):
        # The prompt asks for a JSON list; anything else cannot be expanded
        # into one row per question.
        print(f"Skipping doc_id={doc_id}: expected a JSON list, got {type(questions).__name__}")
        continue

    parsed_resulst[doc_id] = questions

In [62]:
# Lookup table from document id to the sampled document itself.
doc_index = {}
for sampled_doc in doc_sample:
    doc_index[sampled_doc['id']] = sampled_doc

In [63]:
# One (question, qtype, document id) row per generated question.
final_results = []

for doc_id, doc_questions in parsed_resulst.items():
    doc_qtype = doc_index[doc_id]['qtype']
    final_results.extend((text, doc_qtype, doc_id) for text in doc_questions)

In [64]:
# Build the ground-truth table; 'document' holds the id of the source document.
ground_truth_columns = ['question', 'qtype', 'document']
df = pd.DataFrame(final_results, columns=ground_truth_columns)

In [65]:
# Persist the ground-truth table as CSV; index=False keeps the row index out
# of the file.
ground_truth_path = './../dataset/search_ground-truth-data.csv'
df.to_csv(ground_truth_path, index=False)

In [66]:
# Preview the first lines of the saved CSV. This shells out to `head`, so it
# needs an environment where that command is available.
!head ./../dataset/search_ground-truth-data.csv

question,qtype,document
When does hair loss typically start in hypotrichosis simplex?,information,eb86628a
How is hypotrichosis simplex inherited?,information,eb86628a
Is a particular gene mutation linked to hypotrichosis simplex?,information,eb86628a
What are the two primary forms of hypotrichosis simplex?,information,eb86628a
What are the typical symptoms that might indicate I have obesity hypoventilation syndrome?,information,0f4682be
"How is obesity hypoventilation syndrome diagnosed, and what tests are involved?",information,0f4682be
"What treatment options are available for obesity hypoventilation syndrome, and how do they help improve breathing?",information,0f4682be
"Can obesity hypoventilation syndrome lead to other health complications, and what steps can I take to prevent them?",information,0f4682be
What role does the ITGB2 gene play in leukocyte adhesion deficiency type 1?,genetic changes,96c8e59a
