In [9]:
import yaml
import re

# 1. Load the YAML config (list of dicts)
with open("../src/data/question_queries.yaml", encoding="utf-8") as f:
    templates = yaml.safe_load(f)

# 2. Extract all SPARQL & question templates
sparql_templates = [tpl["query_template"] for tpl in templates]
question_templates = [tpl["question_template"] for tpl in templates]


# 3. Define multi‐template helpers
def make_sparql_queries(threshold: float) -> list[str]:
    return [
        re.sub(r"\*Threshold\*", str(threshold), tmpl)
        for tmpl in sparql_templates
    ]


def make_natural_questions(threshold: float) -> list[str]:
    return [
        tmpl.replace("*Threshold*", str(threshold))
        for tmpl in question_templates
    ]


thr = 7.5
print("---- SPARQL ----")
for q in make_sparql_queries(thr):
    print(q)
print("\n-- Questions --")
for q in make_natural_questions(thr):
    print(q)


---- SPARQL ----
PREFIX sphn: <https://www.biomedit.ch/rdf/sphn-schema/sphn/>
PREFIX icd: <https://www.biomedit.ch/rdf/sphn-schema/sphn/icd#>

SELECT ?patient WHERE {
  ?patient sphn:hasDiagnosis ?diag .
  ?diag sphn:hasCode ?code .
  ?code sphn:hasCodeValue icd:§Diagnosis§ .
}

PREFIX sphn:  <https://www.biomedit.ch/rdf/sphn-schema/sphn/>
SELECT ?patient WHERE {
  ?event sphn:hasSubjectPseudoIdentifier ?patient .
  ?event sphn:hasLabResult ?res .
  ?res sphn:hasQuantityValue ?val .
  FILTER(?val > §Threshold§)
}

SELECT ?patient WHERE {
  ?presc sphn:hasSubjectPseudoIdentifier ?patient .
  ?presc sphn:hasDrug ?drug .
  ?drug sphn:hasCode ?code .
  ?code sphn:hasValue §Drug§
}

PREFIX sphn: <https://www.biomedit.ch/rdf/sphn-schema/sphn/>
PREFIX icd: <https://www.biomedit.ch/rdf/sphn-schema/sphn/icd#>

SELECT (COUNT(*) AS ?numPatients) WHERE {
  {
    SELECT DISTINCT ?patient WHERE {
      ?patient sphn:hasDiagnosis ?diag .
      ?diag sphn:hasCode ?code .
      ?code sphn:hasCodeValue 

In [10]:
openai_key=""

In [11]:
from langchain.chat_models     import ChatOpenAI
from langchain_core.messages   import SystemMessage, HumanMessage
from langchain_community.callbacks import get_openai_callback

from src.pipeline.llm import llm_pipeline

# toggle between OpenAI or your pipeline
use_openai = True

# 1) your one-and-only system prompt
system_prompt = """
You are a paraphrasing assistant. Each time you receive an input question template, you must output exactly one paraphrased question that:
1. Keeps the same meaning and intent as the original.
2. Remains a valid question (ends with a question mark).
3. Preserves all placeholder tokens (e.g. §Threshold§, §Diagnosis§) exactly—do not rename, alter, or remove them, but you may move them within the sentence.
4. Uses different wording and sentence structure from the original.

Respond only with the paraphrased question—do not include any commentary or extra text.
"""

# 2) OpenAI client
openai_model = ChatOpenAI(model="gpt-4o-mini", api_key=openai_key)

def ask_openai(prompt: str, seen: set[str]) -> str:
    # build a “please avoid these” preamble if we have any seen paraphrases
    if seen:
        avoid_list = "\n".join(f"- {s}" for s in seen)
        user_content = (
            f"The following paraphrases have already been generated:\n"
            f"{avoid_list}\n\n"
            f"Please give me a new paraphrase of the question below, different from any of the above.\n\n"
            f"Original: {prompt}"
        )
    else:
        user_content = prompt

    msgs = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=user_content),
    ]

    with get_openai_callback() as cb:
        resp = openai_model(msgs).content
        print(f"Cost (USD): ${cb.total_cost:.6f}")

    # optional: strip “The result is:” if it sneaks in
    prefix = "The result is:"
    if resp.startswith(prefix):
        resp = resp[len(prefix):].strip()
    return resp

# 3) your existing pipeline client
pipeline_model = llm_pipeline.ModelPipeline(
    "meta-llama/Llama-3.2-3B-Instruct",
    max_length=256,
    temperature=0.7,
)

# 4) the paraphrasing loop
n_alts = 15
expanded_templates = []

for question_template, sparql_template in zip(question_templates, sparql_templates):
    print(f"== Iteration for template: {question_template!r} ==")
    # expanded_templates.append({
    #     "question_template": question_template,
    #     "sparql_template": sparql_template
    # })

    seen, attempts = set(), 0
    while len(seen) < n_alts and attempts < n_alts * 5:
        attempts += 1

        if use_openai:
            alt = ask_openai(question_template, seen).strip()
        else:
            if seen:
                avoid_list = "\n".join(f"- {s}" for s in seen)
                user_prompt = (
                    f"The following paraphrases have already been generated:\n"
                    f"{avoid_list}\n\n"
                    f"Please give me a new paraphrase of the question below, different from any of the above.\n\n"
                    f"Original: {question_template}"
                )
            else:
                user_prompt = question_template

            alt = pipeline_model.generate(
                prompt=user_prompt,
                system_prompt=system_prompt
            ).strip()

        if alt not in seen:
            seen.add(alt)
            expanded_templates.append({
                "question_template": alt,
                "sparql_template": sparql_template
            })

    if len(seen) < n_alts:
        print(f"⚠️ Only got {len(seen)} unique paraphrases after {attempts} tries.")


Using device: cuda
== Iteration for template: 'Which patients have been diagnosed with {Diagnosis}' ==
Cost (USD): $0.000028
Cost (USD): $0.000035
Cost (USD): $0.000036
Cost (USD): $0.000039
Cost (USD): $0.000040
Cost (USD): $0.000042
Cost (USD): $0.000043
Cost (USD): $0.000045
Cost (USD): $0.000047
Cost (USD): $0.000049
Cost (USD): $0.000049
Cost (USD): $0.000049
Cost (USD): $0.000048
Cost (USD): $0.000050
Cost (USD): $0.000051
Cost (USD): $0.000055
Cost (USD): $0.000056
Cost (USD): $0.000058
== Iteration for template: "Which patient's lab results have surpassed the threshold of {THRESHOLD}" ==
Cost (USD): $0.000029
Cost (USD): $0.000036
Cost (USD): $0.000038
Cost (USD): $0.000039
Cost (USD): $0.000041
Cost (USD): $0.000045
Cost (USD): $0.000045
Cost (USD): $0.000045
Cost (USD): $0.000047
Cost (USD): $0.000048
Cost (USD): $0.000051
Cost (USD): $0.000053
Cost (USD): $0.000055
Cost (USD): $0.000058
Cost (USD): $0.000061
Cost (USD): $0.000062
Cost (USD): $0.000065
== Iteration for templa

In [12]:
import pandas as pd

df = pd.DataFrame(expanded_templates)
df.to_csv("expanded_templates.csv", index=False, encoding="utf-8")

print("Wrote expanded_templates.csv")


Wrote expanded_templates.csv


In [13]:
df=pd.read_csv("expanded_templates.csv")
df.shape

(120, 2)