In [None]:
# Cell 1: Imports and Azure OpenAI Client Initialization
# >>>>>> pip install openai==1.82.0 openai-agents==0.0.16 <<<<< #


import os
import glob
import random
import asyncio

from agents import Agent                                    # Used to define the worker and supervisor agents
from agents import trace                                    # Tracing helper (not referenced in the visible cells)
from agents import Runner                                   # Executes an agent on an input via Runner.run
from agents import set_default_openai_client                # Registers async_client as the Agents SDK default client
from openai import AsyncAzureOpenAI                         # Client class instantiated below as async_client
from openai.types.chat import ChatCompletion                # For type hinting (not referenced in the visible cells)
from openai.types.chat import ChatCompletionMessageParam    # For message structure reference (not referenced in the visible cells)

# Imported here (still inside the imports/setup cell) because it is only needed
# for the tracing switch below.
from agents import set_tracing_disabled

# --- Azure OpenAI Configuration ---
# Every setting is read from the environment. The endpoint, key and deployment
# fall back to placeholder strings that must be replaced before any request can
# succeed; only the API version has a usable default.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "YOUR_AZURE_OPENAI_ENDPOINT_HERE")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", "YOUR_AZURE_OPENAI_API_KEY_HERE")
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "YOUR_AZURE_DEPLOYMENT_NAME_HERE")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

# Flag forgotten configuration right away instead of letting a placeholder
# reach the service and fail later with an unrelated-looking connection error.
_unconfigured_settings = [
    setting_name
    for setting_name, setting_value in (
        ("AZURE_OPENAI_ENDPOINT", AZURE_OPENAI_ENDPOINT),
        ("AZURE_OPENAI_API_KEY", AZURE_OPENAI_API_KEY),
        ("AZURE_OPENAI_DEPLOYMENT_NAME", AZURE_OPENAI_DEPLOYMENT_NAME),
    )
    if setting_value.startswith("YOUR_") and setting_value.endswith("_HERE")
]
if _unconfigured_settings:
    print(
        "WARNING: placeholder value still in use for: "
        + ", ".join(_unconfigured_settings)
        + ". Set these environment variables before running the generation cells."
    )

# --- Azure OpenAI Async Client Initialization ---
async_client = AsyncAzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)

# The Agents SDK exports traces (which can include prompts and model output) to
# OpenAI's tracing backend by default, authenticating with the default client's
# key. With an Azure key that export cannot succeed, and the prompts here are
# built from patient-record examples, so tracing is switched off and the Azure
# key is not handed to the trace exporter.
set_tracing_disabled(True)

# Set the default client for the Agents SDK
set_default_openai_client(async_client, use_for_tracing=False)

print("Imports complete and Azure OpenAI Async Client initialized.")

Imports complete and Azure OpenAI Async Client initialized.


In [None]:
# --- Cell 2: Configuration, Paths, and Global Variables ---

# --- Directory Paths ---
# Both base locations can be overridden with an environment variable of the
# same name, so the notebook does not have to be edited per machine; the
# fallbacks are the original local paths.
PSEUDO_MD_DIRECTORY_PATH = os.getenv(
    "PSEUDO_MD_DIRECTORY_PATH", r'D:\Dataset\Lagerugpijn\pseudonymized-epds'
)
SYNTHETIC_OUTPUT_BASE_DIR = os.getenv(
    "SYNTHETIC_OUTPUT_BASE_DIR", r'D:\Dataset\Lagerugpijn\synthetic-epds'
)
# Generated records are written to a "synthetic_epds" sub-folder of the base dir.
SYNTHETIC_OUTPUT_DIR = os.path.join(SYNTHETIC_OUTPUT_BASE_DIR, "synthetic_epds")

# --- Generation Settings ---
NUM_SYNTHETIC_RECORDS_TO_GENERATE = 20  # Number of synthetic records requested per run
# Total number of generation attempts per record (the first try included);
# an attempt fails when the output does not end with "FINISH" or an error occurs.
MAX_GENERATION_RETRIES_PER_RECORD = 5

# --- Global Variables for Example Tracking ---
# Updated by load_pseudonymized_examples() on every call: the base name of the
# example file that was picked (None when nothing could be loaded) and the
# directory that was searched.
g_selected_example_file_name = None
g_selected_example_directory_path = None

# --- Prompts ---
# NOTE: the two *_TEMPLATE strings below are filled in with str.format(), so any
# literal curly brace added to them later must be doubled ("{{" / "}}").

# System instructions for the worker agent (written in Dutch). They tell the
# model to produce only the patient record and to always end with 'FINISH',
# which is the marker the generation loop checks for.
WORKER_SYSTEM_MESSAGE = (
    "Je bent een fysiotherapeut die realistische synthetische Nederlandse patiëntdossiers (EHR) genereert op basis van geanonimiseerde intake-informatie en expertbegeleiding. "
    "Je past het International Classification of Functioning (ICF) kader toe en volgt de KNGF klinische richtlijn voor lage rugpijn. "
    "Produceer uitsluitend het gevraagde patiëntdossier. Zorg ervoor dat je output **altijd** eindigt met 'FINISH'."
)

# System instructions for the supervisor agent. In this notebook the supervisor
# is only invoked after a failed worker attempt, to write the retry message.
SUPERVISOR_SYSTEM_MESSAGE = (
    "You are a meticulous supervisor overseeing a worker agent. The worker's task is to generate a complete Dutch physiotherapeutic EHR record for low back pain, "
    "following ICF and KNGF guidelines. Your primary responsibility is to validate the worker's output. "
    "Ensure each generated record is complete, adheres to all instructions provided to the worker, and critically, ends with the exact marker 'FINISH'. "
    "If the output is unsatisfactory (e.g., missing 'FINISH' or incomplete), you will signal a retry by providing a new, corrected instruction set for the worker. " # Adjusted for clarity
    "The process is complete for a record only when you confirm it meets all criteria and has the 'FINISH' marker."
)

# User prompt sent to the supervisor after a failed worker attempt.
# Placeholders: {initial_worker_task} (the full original worker prompt),
# {worker_output} (what the worker returned) and {error_detail} (why the
# output was rejected).
SUPERVISOR_RETRY_PROMPT_TEMPLATE = """\
A worker agent was tasked with generating a physiotherapeutic EHR record.
The full original task given to the worker was:
--- Full Original Worker Task Start ---
{initial_worker_task}
--- Full Original Worker Task End ---

The worker's most recent output was:
--- Worker Output Start ---
{worker_output}
--- Worker Output End ---

This output is unsatisfactory because: {error_detail}.

Your role as supervisor is to generate a **new, complete instruction message that will be sent directly to the worker agent** to retry the task.
This new instruction message for the worker should:
1.  Acknowledge the previous attempt's failure.
2.  Clearly restate that the worker must follow all requirements from the original task.
3.  Specifically emphasize that the worker's entire output **MUST end with the exact marker 'FINISH'**.
4.  Encourage the worker to produce a complete and correct record based on the original detailed instructions.

Generate **only** the new instruction message for the worker agent.
Do not add any conversational fluff, preamble, or explanation intended for the orchestrator.
The message should be directly addressed to the worker. For example: "Your previous attempt was not successful because... Please retry and ensure..."
"""

# User prompt for the worker agent (written in Dutch): lists the required
# sections of the record and embeds one pseudonymized example.
# Placeholder: {example_markdown_content} (the example block, or the
# "no example loaded" notice supplied by main()).
WORKER_USER_PROMPT_TEMPLATE = """Genereer EEN compleet en realistisch synthetisch fysiotherapeutisch patiëntdossier in het Nederlands, uitsluitend voor een patiënt met **acute, subacute of chronische lage rugpijn**. Genereer **geen** dossiers voor andere klachten.

Het dossier moet de volgende onderdelen bevatten, in deze volgorde en met de volgende specificaties:

1.  **Samenvatting anamnese:** Een bondige, verhalende samenvatting van de patiëntgeschiedenis (klachtengeschiedenis, symptoomontwikkeling), functionele impact, coping en relevante context (werk, stressoren, eerdere episodes). Schrijf dit in natuurlijke, professionele Nederlandse klinische taal. Geef duidelijk aan of het gaat om acute (<6 weken), subacute (6-12 weken) of chronische (>12 weken) lage rugpijn.
2.  **ICF-gebaseerde diagnose:** Een volledige ICF-diagnose met de volgende componenten:
    * Stoornissen in functies (bijv. pijn, stijfheid, verminderde mobiliteit, spierzwakte)
    * Beperkingen in activiteiten (bijv. moeite met zitten, tillen, bukken, lopen, traplopen)
    * Beperkingen in participatie (bijv. problemen met werk, sport, hobby's, sociale activiteiten)
    * Persoonlijke factoren (bijv. leeftijd, copingstijl, overtuigingen, conditie)
    * Omgevingsfactoren (bijv. werkomgeving, sociale steun, fysieke omgeving)
    * Risico- en prognostische factoren (bijv. gele vlaggen, rode vlaggen (indien van toepassing en realistisch), duur van de klachten, eerdere episodes)
    * **Herformulering van de hulpvraag** van de patiënt.
3.  **Behandeldoelen:** Formuleer **SMART, patiëntgerichte, functionele doelen**. Beschrijf specifiek **wat de patiënt weer wil kunnen doen**. Klinimetrische scores (zoals PSK, NRS, ODI) mogen worden genoemd als *ondersteuning* of *meetbaar criterium* voor het doel (bijv. "PSK van 70 naar ≤14 om weer te kunnen tuinieren\"), maar de score-reductie is niet het doel zelf. Geef aan *wanneer* het doel bereikt moet zijn.
4.  **Behandelplan:** Beschrijf de voorgestelde interventies (bijv. manuele therapie, oefentherapie, motorische controle training, educatie, graded activity, leefstijladvies, pijneducatie) en de rationale hierachter. Baseer dit plan op de KNGF richtlijn lage rugpijn en de gestelde doelen.
5.  **SOEP voortgangsnotities:** Schrijf **minimaal 3 en maximaal 8 afzonderlijke voortgangsnotities**, elk voor een individuele behandelsessie. Gebruik het **volledige SOEP-formaat** (Subjectief, Objectief, Evaluatie, Plan) voor elke notitie. Toon progressie, eventuele stagnatie, terugval, aanpassing van het plan en klinische besluitvorming over de sessies heen. Varieer realistisch in de frequentie en het aantal sessies tussen de 3 en 8.
6.  **Taal en stijl:** Het hele dossier moet geschreven zijn in professioneel, natuurlijk Nederlands, zoals gebruikt door Nederlandse fysiotherapeuten. Breid gangbare afkortingen en klinische shorthand uit (zoals PSK, LWK, 3d xt li). Hanteer een realistische en gevarieerde toon en structuur die aansluit bij de voorbeelden.

Hieronder staan voorbeelden van gepseudonimiseerde patiëntdossiers. Gebruik deze voorbeelden als referentie voor de verwachte structuur, stijl, taalgebruik en het detailniveau, maar genereer een **compleet nieuw en uniek patiëntgeval** met een eigen anamnese, diagnose, doelen en een realistisch, variabel verloop van de behandeling over meerdere sessies.

{example_markdown_content}

Genereer nu **uitsluitend** het nieuwe patiëntdossier hieronder, beginnend met de anamnese samenvatting en eindigend met 'FINISH'. Zorg ervoor dat het dossier **alle** hierboven gevraagde onderdelen bevat en voldoet aan **alle** instructies, inclusief het vereiste aantal SOEP-notities en de focus op lage rugpijn.
"""

print("Configuration, paths, and global variables set.")

# --- Cell 3: Helper Functions ---

def load_pseudonymized_examples(directory_path: str) -> str:
    """
    Pick one ``pseudo_*.md`` file at random from ``directory_path`` and return
    its stripped text wrapped in BEGIN/EINDE separator lines.

    Side effects: records the searched directory in
    ``g_selected_example_directory_path`` and the chosen file's base name in
    ``g_selected_example_file_name`` (reset to ``None`` on any failure).

    Returns:
        The wrapped example text, or ``""`` when the directory holds no
        matching file or the chosen file cannot be read.
    """
    global g_selected_example_file_name, g_selected_example_directory_path
    g_selected_example_directory_path = directory_path

    candidates = glob.glob(os.path.join(directory_path, "pseudo_*.md"))
    if len(candidates) == 0:
        g_selected_example_file_name = None
        return ""

    chosen_path = random.choice(candidates)
    chosen_name = os.path.basename(chosen_path)
    g_selected_example_file_name = chosen_name

    try:
        with open(chosen_path, 'r', encoding='utf-8') as handle:
            example_text = handle.read().strip()
    except Exception as e:
        # Best effort: report the problem and fall back to "no example".
        print(f"Error reading selected example file {chosen_path}: {e}")
        g_selected_example_file_name = None
        return ""

    opening_line = f"\n--- BEGIN VOORBEELD DOSSIER: {chosen_name} ---\n"
    closing_line = "\n--- EINDE VOORBEELD DOSSIER ---\n"
    return opening_line + example_text + closing_line

def save_synthetic_record(synthetic_content: str, output_dir: str, record_number: int):
    """
    Write one synthetic record to ``synthetic_patient_NNN.md`` inside ``output_dir``.

    The directory is created when missing. ``record_number`` is zero-padded to
    three digits in the file name. Write failures are reported on stdout and
    not re-raised; the function always returns ``None``.
    """
    os.makedirs(output_dir, exist_ok=True)
    file_name = "synthetic_patient_{:03d}.md".format(record_number)
    target_path = os.path.join(output_dir, file_name)
    try:
        with open(target_path, 'w', encoding='utf-8') as sink:
            sink.write(synthetic_content)
        print(f"  Saved synthetic record {record_number} to: {target_path}")
    except Exception as e:
        print(f"Error writing synthetic record file {target_path}: {e}")

print("Helper functions defined.")

# --- Cell 4: Agent Definitions and Generation Logic ---

# Two agents backed by the same Azure deployment: the worker writes the
# patient record, the supervisor is given SUPERVISOR_SYSTEM_MESSAGE as its
# instructions.
worker_agent = Agent(
    model=AZURE_OPENAI_DEPLOYMENT_NAME,
    instructions=WORKER_SYSTEM_MESSAGE,
    name="WorkerAgent",
)

supervisor_agent = Agent(
    model=AZURE_OPENAI_DEPLOYMENT_NAME,
    instructions=SUPERVISOR_SYSTEM_MESSAGE,
    name="SupervisorAgent",
)

async def generate_single_record_with_agents_async(
    example_markdown_content: str,
    record_number: int
) -> str | None:
    """
    Generates a single synthetic record using the worker_agent.
    If the worker's output is unsatisfactory, the supervisor_agent is invoked
    to generate new instructions for the worker to retry.
    Retries if the worker's output doesn't end with "FINISH" or if an error occurs.
    """
    initial_worker_task = WORKER_USER_PROMPT_TEMPLATE.format(
        example_markdown_content=example_markdown_content
    )
    current_task_for_worker = initial_worker_task

    for attempt in range(MAX_GENERATION_RETRIES_PER_RECORD):
        print(f"  Record {record_number}, Attempt {attempt + 1}/{MAX_GENERATION_RETRIES_PER_RECORD}: Worker agent is generating...")

        try:
            run_result = await Runner.run(
                worker_agent,
                current_task_for_worker
            )

            generated_content = None
            if hasattr(run_result, "final_output") and isinstance(run_result.final_output, str):
                generated_content = run_result.final_output
            elif isinstance(run_result, str): # Simpler check if result is just a string
                generated_content = run_result
            elif hasattr(run_result, "content") and isinstance(run_result.content, str): # Check for .content attribute
                 generated_content = run_result.content
            else:
                print(f"  Record {record_number}, Attempt {attempt + 1}: Worker returned unexpected response format from Runner.run().")

            if generated_content and generated_content.strip().endswith("FINISH"):
                print(f"  Record {record_number}, Attempt {attempt + 1}: Worker output successful and 'FINISH' marker confirmed.")
                return generated_content
            else:
                error_detail = "did not end with 'FINISH'" if generated_content else "was empty or invalid"
                worker_output_for_supervisor = generated_content if generated_content else "No content was generated by the worker."
                print(f"  Record {record_number}, Attempt {attempt + 1}: Worker output {error_detail}.")

                if attempt < MAX_GENERATION_RETRIES_PER_RECORD - 1:
                    print(f"  Record {record_number}, Attempt {attempt + 1}: Asking supervisor for retry instructions...")
                    
                    supervisor_retry_task_input = SUPERVISOR_RETRY_PROMPT_TEMPLATE.format(
                        initial_worker_task=initial_worker_task,
                        worker_output=worker_output_for_supervisor,
                        error_detail=error_detail
                    )
                    
                    supervisor_response_run = await Runner.run(supervisor_agent, supervisor_retry_task_input)
                    
                    new_worker_instructions = None
                    if hasattr(supervisor_response_run, "final_output") and isinstance(supervisor_response_run.final_output, str) and supervisor_response_run.final_output.strip():
                        new_worker_instructions = supervisor_response_run.final_output
                    elif isinstance(supervisor_response_run, str) and supervisor_response_run.strip():
                        new_worker_instructions = supervisor_response_run
                    elif hasattr(supervisor_response_run, "content") and isinstance(supervisor_response_run.content, str) and supervisor_response_run.content.strip():
                         new_worker_instructions = supervisor_response_run.content

                    if new_worker_instructions:
                        current_task_for_worker = new_worker_instructions
                        print(f"  Record {record_number}, Attempt {attempt + 1}: Supervisor provided new instructions for the worker.")
                    else:
                        print(f"  Record {record_number}, Attempt {attempt + 1}: Supervisor failed to provide valid retry instructions or response was empty. Using fallback.")
                        current_task_for_worker = (
                            f"Your previous response for this patient record {error_detail}. "
                            f"Please regenerate the complete record, ensuring it adheres to all instructions from the original task, "
                            f"especially that it ends with 'FINISH'.\n\n"
                            f"Original task (please re-attempt this carefully):\n{initial_worker_task}"
                        )
                    await asyncio.sleep(0.5) 
                else:
                    print(f"  Record {record_number}: Max retries reached. Failed to generate a valid record after worker/supervisor attempts.")
                    return None

        except Exception as e:
            print(f"  Record {record_number}, Attempt {attempt + 1}: An error occurred during agent execution: {e}")
            if attempt < MAX_GENERATION_RETRIES_PER_RECORD - 1:
                print(f"  Retrying record {record_number} due to error...")
                current_task_for_worker = (
                    f"A system error occurred in the previous attempt: {e}. Please try to fulfill the original task again. "
                    f"Ensure your entire response ends with 'FINISH'.\n\n"
                    f"Original task:\n{initial_worker_task}"
                )
                await asyncio.sleep(1) 
            else:
                print(f"  Record {record_number}: Max retries reached after error. Failed to generate.")
                return None
    return None

print("Agent definitions and generation logic defined.")

# --- Cell 5: Main asynchronous orchestration logic ---

async def main():
    """
    Run the full generation loop.

    Steps:
      1. Make sure the example directory exists; if it holds no
         ``pseudo_*.md`` file, write a small dummy example so generation can
         proceed.
      2. For each of NUM_SYNTHETIC_RECORDS_TO_GENERATE records, load a random
         example, generate a record with the agents and save it when
         generation succeeded.
      3. Print how many records were generated.

    Records are processed one after another, with a one-second pause between
    them. Returns None.
    """
    print("\n--- Starting Synthetic Data Generation (Agent-Based Async) ---")

    if not async_client:
        print("Azure OpenAI Client not initialized. Please check your API key and configuration.")
        return

    os.makedirs(PSEUDO_MD_DIRECTORY_PATH, exist_ok=True)
    example_pattern = os.path.join(PSEUDO_MD_DIRECTORY_PATH, "pseudo_*.md")

    # Create a dummy example only when the directory has no pseudo_*.md file.
    if not glob.glob(example_pattern):
        dummy_file_path = os.path.join(PSEUDO_MD_DIRECTORY_PATH, "pseudo_example_1.md")
        print(f"No existing pseudo_*.md files found in '{PSEUDO_MD_DIRECTORY_PATH}'. Creating a dummy example.")
        with open(dummy_file_path, "w", encoding="utf-8") as f:
            f.write("Dit is een voorbeeld van een gepseudonimiseerd patiëntdossier voor lage rugpijn.\n"
                    "Het omvat een samenvatting van de anamnese, ICF-diagnose, behandeldoelen, "
                    "behandelplan en SOEP-notities, eindigend met 'FINISH'.\n\nFINISH")
        print(f"Dummy file created: '{os.path.basename(dummy_file_path)}'.")

    if not os.path.isdir(PSEUDO_MD_DIRECTORY_PATH) or not glob.glob(example_pattern):
        print(f"Error: Pseudonymized examples directory '{PSEUDO_MD_DIRECTORY_PATH}' not found or contains no 'pseudo_*.md' files after check.")
        print("Please ensure the directory exists and is populated with example markdown files.")
        return

    print(f"\nGenerating {NUM_SYNTHETIC_RECORDS_TO_GENERATE} synthetic records in '{SYNTHETIC_OUTPUT_DIR}'.")
    os.makedirs(SYNTHETIC_OUTPUT_DIR, exist_ok=True)

    generated_count = 0
    for record_index in range(1, NUM_SYNTHETIC_RECORDS_TO_GENERATE + 1):
        print(f"\n--- Preparing for Synthetic Record {record_index} of {NUM_SYNTHETIC_RECORDS_TO_GENERATE} ---")

        example_content = load_pseudonymized_examples(PSEUDO_MD_DIRECTORY_PATH)

        # load_pseudonymized_examples() leaves the chosen file name in this
        # module-level variable, or None when nothing could be loaded.
        if g_selected_example_file_name:
            print(f"  ---> Using example file: '{g_selected_example_file_name}'.")
        else:
            print(f"  ---> No specific example file loaded from '{PSEUDO_MD_DIRECTORY_PATH}' for this record. "
                  "Generation will proceed without a direct example in the prompt.")
            example_content = "\n--- GEEN SPECIFIEK VOORBEELD GELADEN ---\n"

        synthetic_record_content = await generate_single_record_with_agents_async(
            example_markdown_content=example_content,
            record_number=record_index
        )

        if synthetic_record_content:
            generated_count += 1
            save_synthetic_record(synthetic_record_content, SYNTHETIC_OUTPUT_DIR, record_index)
        else:
            print(f"Skipping save for synthetic record {record_index} due to generation failure or max retries reached.")

        # Short pause between records; not needed after the last one.
        if record_index < NUM_SYNTHETIC_RECORDS_TO_GENERATE:
            await asyncio.sleep(1)

    print(f"\nGenerated {generated_count} of {NUM_SYNTHETIC_RECORDS_TO_GENERATE} requested records.")
    print("\n--- Synthetic data generation complete. ---")

print("Main asynchronous orchestration logic defined.")

# --- Cell 6: Execution ---
# Runs the full generation pipeline defined in main().
# Top-level `await` works here because Jupyter/IPython provides an
# asyncio-compatible environment; in a plain Python script (3.8+), use
# asyncio.run(main()) instead.
await main()

Configuration, paths, and global variables set.
Helper functions defined.
Agent definitions and generation logic defined.
Main asynchronous orchestration logic defined.

--- Starting Synthetic Data Generation (Agent-Based Async) ---

Generating 20 synthetic records in 'D:\\Dataset\\Lagerugpijn\synthetic_epds'.

--- Preparing for Synthetic Record 1 of 20 ---
  ---> Using example file: 'pseudo_EPDAfdruk_897_62117.md'.
  Record 1, Attempt 1/5: Worker agent is generating...
  Record 1, Attempt 1: Worker output successful and 'FINISH' marker confirmed.
  Saved synthetic record 1 to: D:\\Dataset\\Lagerugpijn\synthetic_epds\synthetic_patient_001.md

--- Preparing for Synthetic Record 2 of 20 ---
  ---> Using example file: 'pseudo_EPDAfdruk_897_61665.md'.
  Record 2, Attempt 1/5: Worker agent is generating...
