In [2]:
# Shared OpenAI SDK client used by later cells; built with no explicit arguments.
# NOTE(review): presumably the SDK resolves credentials from the environment — confirm.
from openai import OpenAI
import os
client = OpenAI()

In [3]:
# Re-read the API key from the environment and attach it to the shared client.
key = client.api_key = os.getenv("OPENAI_API_KEY")
# Never echo the secret itself: cell outputs are stored inside the notebook
# file and travel with it (commits, shares, exports). Report presence only.
print("OPENAI_API_KEY is set" if key else "OPENAI_API_KEY is NOT set")

In [3]:
import json
from pathlib import Path


# Drop every record that has no "cluster_id" key and rewrite the file in place.
json_path = Path("/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top10_dycot_cleaned.json")

with json_path.open("r", encoding="utf-8") as f:
    data = json.load(f)

cleaned = [d for d in data if "cluster_id" in d]

# Opening json_path itself with "w" would truncate the only copy of the data
# before anything is written, so a failure during the dump would lose it.
# Write to a sibling temp file first and swap it in once the dump succeeded.
tmp_path = json_path.with_name(json_path.name + ".tmp")
with tmp_path.open("w", encoding="utf-8") as f:
    json.dump(cleaned, f, ensure_ascii=False, indent=2)
tmp_path.replace(json_path)


In [17]:
import json
from pathlib import Path

# Input: cluster-annotated records. Output: the same records minus those whose
# "retrived_triples_ranked" entry is missing or empty.
src = Path("/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top10_dycot_cleaned.json")
dest = Path("/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top10_dycot_cleaned_removed_empty_triples.json")

# Load the whole list of record dicts.
records = json.loads(src.read_text(encoding="utf-8"))

# Keep a record only when its triples value is truthy (i.e. a non-empty list).
filtered = [sample for sample in records if sample.get("retrived_triples_ranked")]
skipped = {"no triples list": len(records) - len(filtered)}

# Persist the surviving records.
dest.write_text(json.dumps(filtered, ensure_ascii=False, indent=2), encoding="utf-8")

# Summarise what was dropped and where the result went.
dropped_count = skipped["no triples list"]
print(f"Skipped {dropped_count} records without triples — kept {len(filtered)}.  Clean file: {dest}")


Skipped 19 records without triples — kept 130.  Clean file: /home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top10_dycot_cleaned_removed_empty_triples.json


Step-by-Step Reasoning

In [None]:
#!/usr/bin/env python3
"""Build an inference‑time JSONL file for the NLQ‑to‑SPARQL model.

Revision: 2025‑07‑05 (cluster prompt matrix, v8)
• Supports **eight** cluster‑specific THINK prompts:
    – Cluster 0 → 9‑step anchor/target instructions.
    – Cluster 1 → 8‑step root‑constant/2‑hop instructions.
    – Cluster 2 → snowflake‑style anchor/type instructions.
    – Cluster 3 → boolean ASK‑triple instructions.
    – Cluster 4 → single‑triple subject/object swap instructions.
    – Cluster 5 → multi‑constraint intersection instructions.
    – Cluster 6 → tree‑shaped root/type/target instructions.
    – Cluster 7 → cycle‑pattern anchor/expansion instructions (added now).
  NOTE(review): these labels do not all match the worked examples embedded
  in the prompts — cluster 1's example has 9 steps, cluster 3's example
  builds a SELECT query, cluster 4's example is a two‑hop query. Confirm
  which side is out of date.
• Only clusters {0‑7} are exported.
• Each output line is ``{"messages": [...]}`` with three messages: the generic
  system prompt, the cluster‑specific THINK prompt (also a system message),
  and the user message.
• The *user* message is a JSON object with keys:
    "Question" and "Candidate triples list (numbered)" (up to TRIPLES_LIMIT
    triples read from the sample's "retrived_triples_ranked" field, numbered
    from 1).
"""
from __future__ import annotations

import json
from collections import Counter
from pathlib import Path
from typing import Any, Dict, List

# Maximum number of retrieved triples forwarded per sample (main() slices with [:TRIPLES_LIMIT]).
TRIPLES_LIMIT = 10
# Cluster ids that main() exports; samples with any other id are counted as skipped.
ALLOWED_CLUSTER_IDS = {0, 1, 2, 3, 4, 5, 6, 7}

# JSON file read by main(); iterated as a sequence of sample dicts.
INPUT_PATH = Path(
    "/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top10_dycot_cleaned.json"
)
# JSONL file written by main(): one {"messages": [...]} object per line.
OUTPUT_PATH = Path(
    "/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top_10_dycot.jsonl"
)

# ────────────────────────── PROMPTS ───────────────────────────
# First system message of every record (see build_messages): states the task
# and the required output format (JSON with key "sparql" after <Answer>).
SYSTEM_PROMPT_GENERIC = (
    "Given a specific question and up to ten potentially relevant triples, and the Step by step reasoning that you are given "
    "generate the corresponding SPARQL query for DBpedia. "
    "Return your answer after <Answer>, in JSON with key \"sparql\" and the query as its string value. Do not include any of your thought in the final output"
)

# Worked example for cluster 0, sent as the second system message
# (looked up through PROMPTS_BY_CLUSTER in build_messages). Layout: an example
# input JSON object, a 9-step THINK section, then the expected <Answer> JSON.
# NOTE(review): Steps 2-8 name the answer variable ?S while the candidate
# triples and the final <Answer> use ?uri — confirm the mismatch is intended.
PROMPT_CLUSTER_0 = (
    "{\n"
    "  \"Question\": \"In which U.S. state is Area 51 located?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Area_51 dbo:location ?uri\\n"
    "2. ?uri dbo:country res:United_States\\n"
    "3. res:Area_51 dbo:wikiPageWikiLink res:United_States_Department_of_State\\n"
    "4. res:List_of_United_States_Air_Force_installations dbo:wikiPageWikiLink res:Area_51\\n"
    "5. res:Area_51 dbo:wikiPageWikiLink res:United_States_Department_of_Defense\\n"
    "6. res:Area_51 dbo:wikiPageWikiLink res:Freedom_of_Information_Act_(United_States)\\n"
    "7. res:51st_state_(disambiguation) dbo:wikiPageWikiLink res:Area_51\\n"
    "8. res:Area_51 dbo:wikiPageWikiLink res:List_of_United_States_Air_Force_installations\\n"
    "9. res:Area_51 dbo:wikiPageWikiLink res:United_States_Department_of_Energy\\n"
    "10. res:Area_51 dbo:wikiPageWikiLink res:United_States_Air_Force\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “In which U.S. state is Area 51 located?”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Extract the ANCHOR triple\n"
    "        • The wording “U.S. state” implies a state **belonging to** the United States.\n"
    "        • Triple 2 matches that fact, so choose:\n"
    "                P₁ = dbo:country  C₁ = res:United_States\n"
    "        • Anchor template: ?S dbo:country res:United_States .\n"
    "\n"
    "Step 3 – Filter NOISE and decide the TARGET predicate P₂\n"
    "        • **Predicate test** – ontology predicates (dbo:location, dbo:country, …) encode real relations;\n"
    "          generic links like **dbo:wikiPageWikiLink** or topical tags **dcterms:subject** do not.\n"
    "        • Therefore **discard triples 3 – 10** (all use dbo:wikiPageWikiLink).\n"
    "        • Remaining informative triples:\n"
    "                1. res:Area_51 dbo:location ?S (target candidate)\n"
    "                2. ?S dbo:country res:United_States (anchor)\n"
    "        • The question asks where Area 51 *is located* ⇒ set **P₂ = dbo:location** (its object is the answer variable).\n"
    "\n"
    "Step 4 – Infer TYPE constraint (optional)\n"
    "        • The wh-phrase already specifies “U.S. state”; the country filter suffices, so no extra rdf:type needed.\n"
    "\n"
    "Step 5 – Choose DECORATOR\n"
    "        • Not a “How many …?” query → use **DISTINCT**.\n"
    "\n"
    "Step 6 – Assemble the triple block (star shape)\n"
    "        Core:\n"
    "```\n"
    "res:Area_51 dbo:location ?S .\n"
    "?S          dbo:country  res:United_States .\n"
    "```\n"
    "        Extras:\n"
    "        – No additional rdf:type, FILTER, UNION, or OPTIONAL needed because the country constraint already restricts ?S to U.S. states.\n"
    "\n"
    "Step 7 – Compose the SELECT line\n"
    "        • Pattern: SELECT {DISTINCT|COUNT(?uri)} ?uri\n"
    "        • We choose **DISTINCT** because the goal is to return the state resource(s) themselves, not a count; therefore COUNT() is unnecessary.\n"
    "        SELECT DISTINCT ?S\n"
    "\n"
    "Step 8 – Finish (wrap in WHERE)\n"
    "        • No LIMIT, OFFSET, or ORDER are required.\n"
    "```\n"
    "SELECT DISTINCT ?S WHERE {\n"
    "  res:Area_51 dbo:location ?S .\n"
    "  ?S          dbo:country  res:United_States .\n"
    "}\n"
    "```\n"
    "\n"
    "Step 9 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { res:Area_51 dbo:location ?uri . ?uri dbo:country res:United_States . }\"\n"
    "}\n"
    "### END THINK ###"
)

# Worked example for cluster 1, sent as the second system message
# (looked up through PROMPTS_BY_CLUSTER in build_messages). Layout: an example
# input JSON object, a 9-step THINK section, then the expected <Answer> JSON.
PROMPT_CLUSTER_1 = (
    "{\n"
    "  \"Question\": \"Who was the pope that founded the Vatican Television?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Vatican_Television_Center dbo:foundedBy ?uri\\n"
    "2. ?uri rdf:type dbo:Pope\\n"
    "3. res:Vatican_Media dbp:founder res:Pope_John_Paul_II\\n"
    "4. res:Vatican_Media dbo:foundedBy res:Pope_John_Paul_II\\n"
    "5. res:Vatican_Media dbo:wikiPageWikiLink res:Pope_John_Paul_II\\n"
    "6. res:Vatican_Media dbo:wikiPageWikiLink res:Pope_Benedict_XVI\\n"
    "7. res:Vatican_Media dbo:wikiPageWikiLink res:Pope\\n"
    "8. res:Vatican_Media dbo:wikiPageWikiLink res:Padre_Pio_TV\\n"
    "9. res:Vatican_Media dbo:wikiPageWikiLink res:Category:Television_channels_and_stations_established_in_1983\\n"
    "10. res:Vatican_Media dcterms:subject res:Category:Television_channels_and_stations_established_in_1983\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Who was the pope that founded the Vatican Television?”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Locate the ROOT constant (C₁) and first predicate (P₁)\n"
    "        • The explicit entity is **Vatican Television** → DBpedia resource `res:Vatican_Television_Center`.\n"
    "        • Triple 1 uses that constant with predicate `dbo:foundedBy`; therefore\n"
    "                C₁ = res:Vatican_Television_Center\n"
    "                P₁ = dbo:foundedBy\n"
    "        • Bind its object position to variable **?uri** (the prospective founder).\n"
    "\n"
    "Step 3 – Decide the TARGET predicate (P₂)\n"
    "        • The question demands the founder **and** specifies that the answer must be a pope.\n"
    "        • Triple 2 provides that check: `?uri rdf:type dbo:Pope` ⇒ choose **P₂ = rdf:type**, constant class **dbo:Pope**.\n"
    "        • Triples whose predicate is `dbo:wikiPageWikiLink` or `dcterms:subject` are ignored as *noise* because they do not encode factual relations useful for this query.\n"
    "\n"
    "Step 4 – Infer RESULT class filter (optional)\n"
    "        • Already satisfied by the `rdf:type dbo:Pope` triple; no extra filter needed.\n"
    "\n"
    "Step 5 – Pick DECORATORS\n"
    "        • Not a “How many …” question, so apply **DISTINCT** to prevent duplicates.\n"
    "\n"
    "Step 6 – Draft the WHERE block (two triples + optional type)\n"
    "        Core:\n"
    "```\n"
    "res:Vatican_Television_Center dbo:foundedBy ?uri .   # C₁  P₁  ?uri\n"
    "?uri rdf:type dbo:Pope .                             # ?uri P₂  dbo:Pope\n"
    "```\n"
    "        Extras:\n"
    "        – No OPTIONAL, UNION, or FILTER clauses are necessary because the `rdf:type` line already restricts ?uri to popes.\n"
    "\n"
    "Step 7 – Compose the SELECT line\n"
    "        • Pattern: `SELECT {DISTINCT|COUNT(?uri)} ?uri`\n"
    "        • We choose **`SELECT DISTINCT ?uri`** because the task is to return the unique pope URI(s); `COUNT()` is irrelevant.\n"
    "        SELECT DISTINCT ?uri\n"
    "\n"
    "Step 8 – Finish (wrap in WHERE)\n"
    "        • Per style guidelines, **no LIMIT, OFFSET, or ORDER** clauses are added.\n"
    "```\n"
    "SELECT DISTINCT ?uri WHERE {\n"
    "  res:Vatican_Television_Center dbo:foundedBy ?uri .\n"
    "  ?uri rdf:type dbo:Pope .\n"
    "}\n"
    "```\n"
    "\n"
    "Step 9 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { res:Vatican_Television_Center dbo:foundedBy ?uri . ?uri rdf:type dbo:Pope . }\"\n"
    "}\n"
    "### END THINK ###"
)

# Worked example for cluster 2, sent as the second system message
# (looked up through PROMPTS_BY_CLUSTER in build_messages). Layout: an example
# input JSON object, a 9-step THINK section, then the expected <Answer> JSON.
PROMPT_CLUSTER_2 = (
    "{\n"
    "  \"Question\": \"Give me a list of all lakes in Denmark.\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Romer_Lake dbo:wikiPageWikiLink res:Denmark\\n"
    "2. res:Sortedam_Lake dbo:location res:Denmark\\n"
    "3. res:Denmark dbo:wikiPageWikiLink res:List_of_rivers_of_Denmark\\n"
    "4. res:Arreskov_Lake dbo:location res:Denmark\\n"
    "5. res:List_of_brackish_bodies_of_water dbo:wikiPageWikiLink res:Denmark\\n"
    "6. res:List_of_floods dbo:wikiPageWikiLink res:Denmark\\n"
    "7. res:Geding_Lake dbo:location res:Denmark\\n"
    "8. res:Madum_Lake dbo:location res:Denmark\\n"
    "9. res:Kilen_(lake) dbo:location res:Denmark\\n"
    "10. res:List_of_floods_in_Europe dbo:wikiPageWikiLink res:Denmark\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and Read the INPUT\n"
    "        • Question: “Give me a list of all lakes in Denmark.”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Choose the ANCHOR triple (P₁, C₁)\n"
    "        • Constant explicitly named: **Denmark** ⇒  C₁ = res:Denmark.\n"
    "        • Triples 2, 4, 7–9 use predicate **dbo:location** to link various IRIs\n"
    "          to Denmark.  Pick that pattern:  `?uri dbo:location res:Denmark`.\n"
    "              P₁ = dbo:location  variable bound = ?uri\n"
    "\n"
    "Step 3 – Determine the TARGET predicate (P₂)\n"
    "        • The question asks for the lakes **themselves**, not a property of them.\n"
    "        • Therefore no separate target edge is required; we will return **?uri**\n"
    "          directly after verifying it is a lake.\n"
    "\n"
    "Step 4 – Retrieve the INTERMEDIATE type (Class T)\n"
    "        • Every lake resource has a triple  `?uri rdf:type dbo:Lake` in DBpedia\n"
    "          (even if not shown in our small candidate slice).  Use it as a type guard:\n"
    "              T = dbo:Lake\n"
    "\n"
    "Step 5 – Pick DECORATORS\n"
    "        • Not a “How many …” question → project `?uri` with **DISTINCT**.\n"
    "\n"
    "Step 6 – Assemble the WHERE block (snowflake shape)\n"
    "        {Constant ↔ ?uri link}\n"
    "        ?uri dbo:location res:Denmark .\n"
    "        {Type guard}\n"
    "        ?uri rdf:type   dbo:Lake   .\n"
    "        {Target edge}\n"
    "        – Omitted (Step 3 explained why).\n"
    "\n"
    "Step 7 – Compose the SELECT line\n"
    "        SELECT DISTINCT ?uri\n"
    "\n"
    "Step 8 – No extras\n"
    "        • Do **not** add OPTIONAL, UNION, FILTER, LIMIT, or ORDER.\n"
    "\n"
    "Step 9 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { ?uri dbo:location res:Denmark . ?uri rdf:type dbo:Lake . }\"\n"
    "}\n"
    "### END THINK ###"
)

# Worked example for cluster 3, sent as the second system message
# (looked up through PROMPTS_BY_CLUSTER in build_messages). Layout: an example
# input JSON object, a 6-step THINK section, then the expected <Answer> JSON.
# NOTE(review): this example lists only five candidate triples and builds a
# SELECT query (Step 5 says "not ASK"), while the module docstring labels
# cluster 3 as boolean ASK instructions — confirm the example belongs here.
PROMPT_CLUSTER_3 = (
    "{\n"
    "  \"Question\": \"Which rivers flow into a German lake?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Battle_of_the_Unstrut_River_(531) dbo:place res:Germany\\n"
    "2. res:Lake_Bant dbo:location res:Germany\\n"
    "3. res:Schlachtensee_(lake) dbo:location res:Germany\\n"
    "4. res:Battle_of_the_Lupia_River dbo:place res:Germany\\n"
    "5. res:Illmensee_(lake) dbo:location res:Germany\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Which rivers flow into a German lake?”\n"
    "        • Candidate triples (5) – see JSON above.\n"
    "\n"
    "Step 2 – Identify the two CONSTANTS (C₁ and C₂)\n"
    "        • The explicit constant in the question is **Germany** (C₂).  \n"
    "        • There is no specific named lake; instead, we must bind a variable **?x** that stands for “a lake in Germany”.  \n"
    "        • We therefore treat C₁ not as a constant but as the variable **?x** that will later be constrained to `dbo:Lake` and `dbo:country res:Germany`.\n"
    "\n"
    "Step 3 – Select the PREDICATE (P)\n"
    "        • The verbal phrase “flow into” maps to the ontology predicate `dbo:inflow` (river → lake).  \n"
    "        • This does not appear in the noisy candidate list; nonetheless, `dbo:inflow` is the correct predicate for the first hop.  \n"
    "        • The lake’s relationship to Germany is expressed with `dbo:country` (or `dbo:location` if `dbo:country` is absent).  \n"
    "        • Triples with predicates like `dbo:place` or `dbo:location` in the list are signal for this second hop; entries using battle–place links are ignored as *noise* because they do not involve lakes or the inflow relation.\n"
    "\n"
    "Step 4 – Form the core triples\n"
    "        • First hop (river → lake):      ?uri dbo:inflow ?x .  \n"
    "        • Second hop (lake → Germany):  ?x  dbo:country  res:Germany .  \n"
    "        • Add a type guard to ensure ?uri is a `dbo:River` and ?x a `dbo:Lake`.\n"
    "\n"
    "Step 5 – Wrap in a SELECT block (not ASK)\n"
    "        • The question demands *which* rivers, so we need a list.  \n"
    "        • Use **`SELECT DISTINCT ?uri`** to return each river once.\n"
    "\n"
    "Step 6 – Output inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { ?uri rdf:type dbo:River . ?x rdf:type dbo:Lake . ?uri dbo:inflow ?x . ?x dbo:country res:Germany . }\"\n"
    "}\n"
    "### END THINK ###"
)

# Worked example for cluster 4, sent as the second system message
# (looked up through PROMPTS_BY_CLUSTER in build_messages). Layout: an example
# input JSON object, an 8-step THINK section, then the expected <Answer> JSON.
# NOTE(review): the example builds two core triples plus two type guards,
# while the module docstring labels cluster 4 as single-triple — confirm.
PROMPT_CLUSTER_4 = (
    "{\n"
    "  \"Question\": \"Which software has been developed by organizations founded in California?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Software_Publishing_Corporation dbo:location res:California\\n"
    "2. res:X1_(software_company) dbo:location res:California\\n"
    "3. res:TIBCO_Software dbo:location res:California\\n"
    "4. res:Megatech_Software dbo:location res:California\\n"
    "5. res:Tacit_Software dbo:location res:California\\n"
    "6. res:BatchMaster_Software dbo:location res:California\\n"
    "7. res:BlueSky_Software__BlueSky_Software__1 dbo:location res:California\\n"
    "8. res:Offset_Software dbo:location res:California\\n"
    "9. res:Starfish_Software dbo:location res:California\\n"
    "10. res:FutureWave_Software dbo:location res:California\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Which software has been developed by organizations founded in California?”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Locate the CONSTANT mentioned in the question\n"
    "        • The only explicit entity is **California** → constant **C = res:California**.\n"
    "        • It appears as the *object* in all ten candidate triples.\n"
    "\n"
    "Step 3 – Select the PREDICATE P (property phrase)\n"
    "        • The English phrase is “organizations **founded in** California”.\n"
    "        • Correct ontology predicate: **dbo:foundationPlace** (company → place founded).\n"
    "        • None of the retrieved triples use dbo:foundationPlace; the `dbo:location` triples are *noise* for this intent because “location” ≠ “foundation place”.\n"
    "\n"
    "Step 4 – Decide VARIABLE placement\n"
    "        • The wording “… founded **in California**” puts California as *object*.\n"
    "        • Therefore:      **?company dbo:foundationPlace res:California**.\n"
    "\n"
    "        • A second hop is required to reach the requested software:\n"
    "              ?software dbo:developer ?company\n"
    "          (Here the property phrase is “developed by <company>”.)\n"
    "\n"
    "Step 5 – Pick DECORATOR\n"
    "        • Question does **not** start with “How many…”, so use **DISTINCT**.\n"
    "\n"
    "Step 6 – Compose the WHERE block\n"
    "        Core triples\n"
    "```\n"
    "?company  dbo:foundationPlace  res:California .   # constant-anchored triple\n"
    "?software dbo:developer       ?company .         # target triple\n"
    "```\n"
    "        Extras (why / why not)\n"
    "        – Add type guards for clarity but nothing else:\n"
    "              ?company  rdf:type dbo:Company  .\n"
    "              ?software rdf:type dbo:Software .\n"
    "          No OPTIONAL / UNION / FILTER / LIMIT / ORDER are needed.\n"
    "\n"
    "Step 7 – Compose the SELECT line\n"
    "        • Pattern: SELECT {DISTINCT|COUNT(?uri)} ?uri\n"
    "        • We need the list itself, not a count ⇒ `SELECT DISTINCT ?software`.\n"
    "\n"
    "Step 8 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?software WHERE { ?company dbo:foundationPlace res:California . ?company rdf:type dbo:Company . ?software dbo:developer ?company . ?software rdf:type dbo:Software . }\"\n"
    "}\n"
    "### END THINK ###"
)

# Worked example for cluster 5, sent as the second system message
# (looked up through PROMPTS_BY_CLUSTER in build_messages). Layout: an example
# input JSON object, a 10-step THINK section, then the expected <Answer> JSON.
PROMPT_CLUSTER_5 = (
    "{\n"
    "  \"Question\": \"Which people were born in Heraklion?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Borna_Kapusta dbo:wikiPageWikiLink res:Heraklion\\n"
    "2. res:Dimitrios_Papadopoulos_(footballer,_born_1950) dbo:wikiPageWikiLink res:Heraklion\\n"
    "3. res:Dimitrios_Papadopoulos_(footballer,_born_1950)__Dimitrios_Papadopoulos__1 dbo:birthPlace res:Heraklion\\n"
    "4. res:Nicholas_Kalliakis dbp:birthPlace res:Heraklion\\n"
    "5. res:Emmanuel_Skordilis dbp:birthPlace res:Heraklion\\n"
    "6. res:Nicholas_Kalliakis dbo:birthPlace res:Heraklion\\n"
    "7. res:Emmanuel_Skordilis dbo:birthPlace res:Heraklion\\n"
    "8. res:Nadia_Valavani dbp:birthPlace res:Heraklion\\n"
    "9. res:Michael_Koukoulakis dbp:birthPlace res:Heraklion\\n"
    "10. res:Michael_Katehakis dbo:birthPlace res:Heraklion\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Which people were born in Heraklion?”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Extract all CONSTANT–predicate pairs\n"
    "        • Fact 1: “born in Heraklion”\n"
    "              Constant   C₁ = res:Heraklion\n"
    "              Predicate  P₁ = dbo:birthPlace / dbp:birthPlace   (appears in triples 3–10)\n"
    "        • Fact 2: result must be “people” (class Person)\n"
    "              Constant   C₂ = dbo:Person\n"
    "              Predicate  P₂ = rdf:type                       (implicit but standard guard)\n"
    "\n"
    "Step 3 – Bind a single answer variable\n"
    "        • Use **?uri** for the entity that satisfies *both* facts.\n"
    "\n"
    "Step 4 – Determine triple orientation for each pair\n"
    "        • Triples 3–10 show *object* orientation for birth-place →\n"
    "              ?uri P₁ C₁   (person → Heraklion)\n"
    "        • rdf:type guard is always   ?uri rdf:type C₂.\n"
    "\n"
    "Step 5 – Add optional TYPE filter (already covered)\n"
    "        • Fact 2 itself provides the type guard; include it explicitly.\n"
    "\n"
    "Step 6 – Select DECORATOR\n"
    "        • Question does not ask “How many…”, so apply **DISTINCT**.\n"
    "\n"
    "Step 7 – Assemble the WHERE block (both pairs)\n"
    "```\n"
    "?uri rdf:type     dbo:Person   .   # Pair 2 (type guard)\n"
    "?uri dbo:birthPlace res:Heraklion .   # Pair 1 (birth-place)\n"
    "```\n"
    "        • No OPTIONAL, UNION, FILTER, LIMIT, or ORDER are added.\n"
    "\n"
    "Step 8 – Compose the SELECT line\n"
    "        SELECT DISTINCT ?uri\n"
    "\n"
    "Step 9 – No extras (already noted in Step 7).\n"
    "\n"
    "Step 10 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { ?uri rdf:type dbo:Person . ?uri dbo:birthPlace res:Heraklion . }\"\n"
    "}\n"
    "### END THINK ###"
)

# Worked example for cluster 6, sent as the second system message
# (looked up through PROMPTS_BY_CLUSTER in build_messages). Layout: an example
# input JSON object, a 9-step THINK section, then the expected <Answer> JSON.
PROMPT_CLUSTER_6 = (
    "{\n"
    "  \"Question\": \"Who developed Skype?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Skype dbo:developer res:Skype_Technologies\\n"
    "2. res:Skype dbo:wikiPageWikiLink res:Category:Estonian_inventions\\n"
    "3. res:Skype dcterms:subject res:Category:Estonian_inventions\\n"
    "4. res:William_Pugh_(game_designer) dbo:wikiPageWikiLink res:Skype\\n"
    "5. res:Michael_Seifert_(programmer) dbo:wikiPageWikiLink res:Skype\\n"
    "6. res:Jason_Johnson_(entrepreneur) dbo:wikiPageWikiLink res:Skype\\n"
    "7. res:Qi_Lu_(computer_scientist) dbo:wikiPageWikiLink res:Skype\\n"
    "8. res:Bryan_Johnson_(entrepreneur) dbo:wikiPageWikiLink res:Skype\\n"
    "9. res:List_of_Swedish_entrepreneurs dbo:wikiPageWikiLink res:Skype\\n"
    "10. res:List_of_Macintosh_software_published_by_Microsoft dbo:wikiPageWikiLink res:Skype\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Who developed Skype?”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Pick the ROOT triple (C₀ ↔ ?x)\n"
    "        • Constant named in question: **C₀ = res:Skype**.\n"
    "        • Triple 1 links Skype to a developer via **dbo:developer**.\n"
    "              Predicate  P₀ = dbo:developer\n"
    "              Variable   ?x = the developer entity\n"
    "        • Orientation in candidates is  *C₀ P₀ constant*; we generalise object → **?x**.\n"
    "              Pattern:   res:Skype dbo:developer ?x .\n"
    "\n"
    "Step 3 – Grab the INTERMEDIATE class (T)\n"
    "        • No candidate shows  ?x rdf:type T  → skip the type guard (none available).\n"
    "\n"
    "Step 4 – Choose the TARGET predicate (P₁)\n"
    "        • The requested fact *is* the developer itself, so set **?uri = ?x**.\n"
    "        • No additional edge is required; omit Step 4 triple.\n"
    "\n"
    "Step 5 – Select DECORATOR\n"
    "        • Question does *not* start with “How many…”, so use **DISTINCT**.\n"
    "\n"
    "Step 6 – Build the WHERE block (single edge)\n"
    "```\n"
    "res:Skype dbo:developer ?uri .   # root edge already yields the answer\n"
    "```\n"
    "        • The remaining wikiPageWikiLink / dcterms:subject triples are *noise*—they encode page-link metadata, not factual developer relations.\n"
    "\n"
    "Step 7 – Compose the SELECT line\n"
    "        SELECT DISTINCT ?uri\n"
    "\n"
    "Step 8 – No extras\n"
    "        • Do **not** add OPTIONAL, UNION, FILTER, LIMIT, or ORDER clauses.\n"
    "\n"
    "Step 9 – Output inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { res:Skype dbo:developer ?uri . }\"\n"
    "}\n"
    "### END THINK ###"
)

# Worked example for cluster 7, sent as the second system message
# (looked up through PROMPTS_BY_CLUSTER in build_messages). Layout: an example
# input JSON object, a 7-step THINK section, then the expected <Answer> JSON.
# The example input must stay valid JSON in the same shape build_messages
# produces for the real user message, so the triples string is closed with an
# escaped quote after entry 10, and Step 1 quotes the actual number of triples.
PROMPT_CLUSTER_7 = (
    "{\n"
    "  \"Question\": \"Give me all actors starring in movies directed by and starring William Shatner.\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:The_People_(film) dbo:starring res:William_Shatner\\n"
    "2. res:The_People_(film) dbp:starring res:William_Shatner\\n"
    "3. res:Showtime_(film) dbo:starring res:William_Shatner\\n"
    "4. res:The_Wild dbo:starring res:William_Shatner\\n"
    "5. res:The_Devil's_Rain_(film) dbo:starring res:William_Shatner\\n"
    "6. res:Shoot_or_Be_Shot dbo:starring res:William_Shatner\\n"
    "7. res:Free_Enterprise_(film) dbo:starring res:William_Shatner\\n"
    "8. res:Over_the_Hedge_(film) dbo:starring res:William_Shatner\\n"
    "9. res:Aliens_Ate_My_Homework_(2018_film) dbo:starring res:William_Shatner\\n"
    "10. res:Shoot_or_Be_Shot dbp:starring res:William_Shatner\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Give me all actors starring in movies **directed by and starring William Shatner**.”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Find the CONSTANT pair (P, C)\n"
    "        • The explicit entity is **William Shatner** → constant **C = res:William_Shatner**.\n"
    "        • We need a predicate that links each movie (?x) to that constant.  \n"
    "          – For “directed by”, the ontology predicate is **dbo:director**.  \n"
    "          – Although no candidate triple shows dbo:director, it is the correct factual edge; choose it for the anchor.\n"
    "        • Orientation: constant is **object** ⇒ pattern **?x dbo:director C**.\n"
    "        • Bind the movie to **?x**.\n"
    "\n"
    "Step 3 – Identify the “OTHER” edge (same subject ?x)\n"
    "        • The second condition *starring William Shatner* is expressed by **dbo:starring** (or its dbp variant).  \n"
    "        • Use it twice:  \n"
    "              a) To enforce the starring-Shatner condition ⇒ ?x dbo:starring C  .  \n"
    "              b) To reach the requested co-stars      ⇒ ?x dbo:starring ?uri .\n"
    "        • For (b) we treat **P′ = dbo:starring**, yielding the variable object **?uri** (the other actor).\n"
    "\n"
    "Step 4 – Pick DECORATOR\n"
    "        • Counting is **not** asked; list the actors → `SELECT DISTINCT ?uri`.\n"
    "\n"
    "Step 5 – Assemble WHERE block (extended cycle)\n"
    "        {anchor triple}\n"
    "        ?x dbo:director  res:William_Shatner   .\n"
    "        {starring constraint}\n"
    "        ?x dbo:starring res:William_Shatner   .\n"
    "        {expansion triple}\n"
    "        ?x dbo:starring ?uri                  .\n"
    "        • No rdf:type, OPTIONAL, FILTER, UNION, LIMIT, or ORDER.\n"
    "\n"
    "Step 6 – Compose SELECT line\n"
    "        SELECT DISTINCT ?uri\n"
    "\n"
    "Step 7 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { ?x dbo:director res:William_Shatner . ?x dbo:starring res:William_Shatner . ?x dbo:starring ?uri . }\"\n"
    "}\n"
    "### END THINK ###"
)

# Map cluster id -> cluster-specific THINK prompt string.
# build_messages indexes this dict directly, so every id in ALLOWED_CLUSTER_IDS
# needs an entry here (a missing id would raise KeyError there).
PROMPTS_BY_CLUSTER = {
    0: PROMPT_CLUSTER_0,
    1: PROMPT_CLUSTER_1,
    2: PROMPT_CLUSTER_2,
    3: PROMPT_CLUSTER_3,
    4: PROMPT_CLUSTER_4,
    5: PROMPT_CLUSTER_5,
    6: PROMPT_CLUSTER_6,
    7: PROMPT_CLUSTER_7,
}

# ───────────────────── Helper functions ──────────────────────

def lists_to_numbered_string(triples: List[Any]) -> str:
    """Render ``triples`` as numbered lines ("1. ...", "2. ...") joined by newlines.

    A list or tuple entry has its items stringified and joined with single
    spaces; any other entry is passed through ``str`` unchanged. Numbering
    starts at 1. An empty input yields an empty string.
    """
    numbered_lines = []
    for position, entry in enumerate(triples, start=1):
        if isinstance(entry, (list, tuple)):
            rendered = " ".join(str(part) for part in entry)
        else:
            rendered = str(entry)
        numbered_lines.append(f"{position}. {rendered}")
    return "\n".join(numbered_lines)

def build_messages(question: str, triples_str: str, cluster_id: int) -> List[Dict[str, str]]:
    """Assemble the three chat messages for one sample.

    Order of the returned list:
      1. system – the generic task prompt (``SYSTEM_PROMPT_GENERIC``);
      2. system – the THINK prompt registered for ``cluster_id``;
      3. user   – a JSON-encoded object holding the question and the
         numbered triples string.

    Raises:
        KeyError: if ``cluster_id`` has no entry in ``PROMPTS_BY_CLUSTER``.
    """
    # Resolve the cluster prompt first so an unknown id fails before any encoding work.
    cluster_prompt = PROMPTS_BY_CLUSTER[cluster_id]

    user_content = json.dumps(
        {
            "Question": question,
            "Candidate triples list (numbered)": triples_str,
        }
    )

    return [
        {"role": "system", "content": SYSTEM_PROMPT_GENERIC},
        {"role": "system", "content": cluster_prompt},
        {"role": "user", "content": user_content},
    ]


# ────────────────────────── Main logic ───────────────────────

def _extract_triples(sample: Dict[str, Any]) -> List[Any]:
    """Return at most TRIPLES_LIMIT triples from ``sample``; dict hits are unwrapped via their "triple" key."""
    # Accept both spellings of the field: the misspelled key used by the
    # current data files and the correctly spelled one.
    raw_triples = sample.get("retrived_triples_ranked") or sample.get("retrieved_triples_ranked")
    if not raw_triples:
        return []
    return [hit["triple"] if isinstance(hit, dict) else hit for hit in raw_triples[:TRIPLES_LIMIT]]


def main() -> None:
    """Convert the samples in INPUT_PATH into chat-format JSONL at OUTPUT_PATH.

    For every sample whose ``cluster_id`` parses as an int in
    ALLOWED_CLUSTER_IDS and whose question is non-empty, one
    ``{"messages": [...]}`` line is written. Samples failing either check are
    counted per reason and reported on stdout together with the per-cluster
    totals. Samples without triples are still exported, with an empty triples
    string.
    """
    with INPUT_PATH.open(encoding="utf-8") as f:
        dataset = json.load(f)

    jsonl_rows: List[Dict[str, Any]] = []
    cluster_counts: Counter[int] = Counter()
    skipped: Counter[str] = Counter()

    for sample in dataset:
        cluster_raw = sample.get("cluster_id")
        try:
            cluster_id = int(cluster_raw)
        except (TypeError, ValueError):
            skipped["invalid cluster_id"] += 1
            continue

        if cluster_id not in ALLOWED_CLUSTER_IDS:
            skipped["cluster not 0‑7"] += 1
            continue

        # `or ""` covers an explicit null question, which .get()'s default does
        # not; str() keeps .strip() safe for non-string values.
        question_text = str(sample.get("question") or "").strip()
        if not question_text:
            skipped["missing question"] += 1
            continue

        triples_str = lists_to_numbered_string(_extract_triples(sample))

        messages = build_messages(question_text, triples_str, cluster_id)
        jsonl_rows.append({"messages": messages})
        cluster_counts[cluster_id] += 1

    # Write output: one JSON object per line.
    with OUTPUT_PATH.open("w", encoding="utf-8") as f_out:
        for rec in jsonl_rows:
            f_out.write(json.dumps(rec) + "\n")

    # Console summary
    total_written = len(jsonl_rows)
    print(f"Wrote {total_written} inference records to {OUTPUT_PATH}")
    print("Cluster distribution (kept):")
    for cid in sorted(cluster_counts):
        print(f"  {cid}: {cluster_counts[cid]}")
    print("Skipped samples by reason:")
    for reason, count in skipped.items():
        print(f"  {reason}: {count}")


# Run the export when this code is executed directly (script or notebook cell)
# rather than imported as a module.
if __name__ == "__main__":
    main()

Wrote 149 inference records to /home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top_10_dycot.jsonl
Cluster distribution (kept):
  0: 67
  1: 12
  2: 69
  6: 1
Skipped samples by reason:


Chain of Thought

In [None]:
from __future__ import annotations

import json
from collections import Counter
from pathlib import Path
from typing import Any, Dict, List

# Maximum number of retrieved triples placed in each user message.
TRIPLES_LIMIT = 10
# Cluster ids accepted by main(); one THINK exemplar exists per id.
ALLOWED_CLUSTER_IDS = set(range(8))

# Directory holding both the input dataset and the generated JSONL file.
_QALD_RESULTS_DIR = Path("/home/m2khoda/dual_retriever/evaluations/dycot/qald_results")

INPUT_PATH = _QALD_RESULTS_DIR / "qald_test_solo_stage_top10_dycot_cleaned.json"
OUTPUT_PATH = _QALD_RESULTS_DIR / "qald_test_solo_stage_top_10_dycot_medoid.jsonl"

# ────────────────────────── PROMPTS ───────────────────────────
# Generic task instruction; build_messages sends it as the first system
# message of every request. The three adjacent literals are concatenated into
# one string.
# NOTE(review): this is runtime text seen by the model — confirm against the
# prompt used elsewhere in the pipeline before rewording it.
SYSTEM_PROMPT_GENERIC = (
    "Given a natural-language question and up to ten potentially relevant triples, and the Step by step reasoning that you are given "
    "generate the corresponding SPARQL query for DBpedia. "
    "Return your answer after <Answer>, in JSON with key \"sparql\" and the query as its string value. Do not include any of your thought in the final output"
)

PROMPT_CLUSTER_0 = (
    "{\n"
    "  \"Question\": \"In which U.S. state is Area 51 located?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Area_51 dbo:location ?uri\\n"
    "2. ?uri dbo:country res:United_States\\n"
    "3. res:Area_51 dbo:wikiPageWikiLink res:United_States_Department_of_State\\n"
    "4. res:List_of_United_States_Air_Force_installations dbo:wikiPageWikiLink res:Area_51\\n"
    "5. res:Area_51 dbo:wikiPageWikiLink res:United_States_Department_of_Defense\\n"
    "6. res:Area_51 dbo:wikiPageWikiLink res:Freedom_of_Information_Act_(United_States)\\n"
    "7. res:51st_state_(disambiguation) dbo:wikiPageWikiLink res:Area_51\\n"
    "8. res:Area_51 dbo:wikiPageWikiLink res:List_of_United_States_Air_Force_installations\\n"
    "9. res:Area_51 dbo:wikiPageWikiLink res:United_States_Department_of_Energy\\n"
    "10. res:Area_51 dbo:wikiPageWikiLink res:United_States_Air_Force\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “In which U.S. state is Area 51 located?”\n"
    "        • Candidate triples (up to 10) – see JSON above.\n"
    "\n"
    "Step 2 – Extract the ANCHOR triple\n"
    "        • The wording “U.S. state” implies a state **belonging to** the United States.\n"
    "        • Triple 2 matches that fact, so choose:\n"
    "                P₁ = dbo:country  C₁ = res:United_States\n"
    "        • Anchor template: ?S dbo:country res:United_States .\n"
    "\n"
    "Step 3 – Filter NOISE and decide the TARGET predicate P₂\n"
    "        • **Predicate test** – ontology predicates (dbo:location, dbo:country, …) encode real relations;\n"
    "          generic links like **dbo:wikiPageWikiLink** or topical tags **dcterms:subject** do not.\n"
    "        • Therefore **discard triples 3 – 10** (all use dbo:wikiPageWikiLink).\n"
    "        • Remaining informative triples:\n"
    "                1. res:Area_51 dbo:location ?S (target candidate)\n"
    "                2. ?S dbo:country res:United_States (anchor)\n"
    "        • The question asks where Area 51 *is located* ⇒ set **P₂ = dbo:location** (its object is the answer variable).\n"
    "\n"
    "Step 4 – Infer TYPE constraint (optional)\n"
    "        • The wh-phrase already specifies “U.S. state”; the country filter suffices, so no extra rdf:type needed.\n"
    "\n"
    "Step 5 – Choose DECORATOR\n"
    "        • Not a “How many …?” query → use **DISTINCT**.\n"
    "\n"
    "Step 6 – Assemble the triple block (star shape)\n"
    "        Core:\n"
    "```\n"
    "res:Area_51 dbo:location ?S .\n"
    "?S          dbo:country  res:United_States .\n"
    "```\n"
    "        Extras:\n"
    "        – No additional rdf:type, FILTER, UNION, or OPTIONAL needed because the country constraint already restricts ?S to U.S. states.\n"
    "\n"
    "Step 7 – Compose the SELECT line\n"
    "        • Pattern: SELECT {DISTINCT|COUNT(?uri)} ?uri\n"
    "        • We choose **DISTINCT** because the goal is to return the state resource(s) themselves, not a count; therefore COUNT() is unnecessary.\n"
    "        SELECT DISTINCT ?S\n"
    "\n"
    "Step 8 – Finish (wrap in WHERE)\n"
    "        • No LIMIT, OFFSET, or ORDER are required.\n"
    "```\n"
    "SELECT DISTINCT ?S WHERE {\n"
    "  res:Area_51 dbo:location ?S .\n"
    "  ?S          dbo:country  res:United_States .\n"
    "}\n"
    "```\n"
    "\n"
    "Step 9 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { res:Area_51 dbo:location ?uri . ?uri dbo:country res:United_States . }\"\n"
    "}\n"
    "### END THINK ###"
)

PROMPT_CLUSTER_1 = (
    "{\n"
    "  \"Question\": \"Who was the pope that founded the Vatican Television?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Vatican_Television_Center dbo:foundedBy ?uri\\n"
    "2. ?uri rdf:type dbo:Pope\\n"
    "3. res:Vatican_Media dbp:founder res:Pope_John_Paul_II\\n"
    "4. res:Vatican_Media dbo:foundedBy res:Pope_John_Paul_II\\n"
    "5. res:Vatican_Media dbo:wikiPageWikiLink res:Pope_John_Paul_II\\n"
    "6. res:Vatican_Media dbo:wikiPageWikiLink res:Pope_Benedict_XVI\\n"
    "7. res:Vatican_Media dbo:wikiPageWikiLink res:Pope\\n"
    "8. res:Vatican_Media dbo:wikiPageWikiLink res:Padre_Pio_TV\\n"
    "9. res:Vatican_Media dbo:wikiPageWikiLink res:Category:Television_channels_and_stations_established_in_1983\\n"
    "10. res:Vatican_Media dcterms:subject res:Category:Television_channels_and_stations_established_in_1983\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Who was the pope that founded the Vatican Television?”\n"
    "        • Candidate triples (up to 10) – see JSON above.\n"
    "\n"
    "Step 2 – Locate the ROOT constant (C₁) and first predicate (P₁)\n"
    "        • The explicit entity is **Vatican Television** → DBpedia resource `res:Vatican_Television_Center`.\n"
    "        • Triple 1 uses that constant with predicate `dbo:foundedBy`; therefore\n"
    "                C₁ = res:Vatican_Television_Center\n"
    "                P₁ = dbo:foundedBy\n"
    "        • Bind its object position to variable **?uri** (the prospective founder).\n"
    "\n"
    "Step 3 – Decide the TARGET predicate (P₂)\n"
    "        • The question demands the founder **and** specifies that the answer must be a pope.\n"
    "        • Triple 2 provides that check: `?uri rdf:type dbo:Pope` ⇒ choose **P₂ = rdf:type**, constant class **dbo:Pope**.\n"
    "        • Triples whose predicate is `dbo:wikiPageWikiLink` or `dcterms:subject` are ignored as *noise* because they do not encode factual relations useful for this query.\n"
    "\n"
    "Step 4 – Infer RESULT class filter (optional)\n"
    "        • Already satisfied by the `rdf:type dbo:Pope` triple; no extra filter needed.\n"
    "\n"
    "Step 5 – Pick DECORATORS\n"
    "        • Not a “How many …” question, so apply **DISTINCT** to prevent duplicates.\n"
    "\n"
    "Step 6 – Draft the WHERE block (two triples + optional type)\n"
    "        Core:\n"
    "```\n"
    "res:Vatican_Television_Center dbo:foundedBy ?uri .   # C₁  P₁  ?uri\n"
    "?uri rdf:type dbo:Pope .                             # ?uri P₂  dbo:Pope\n"
    "```\n"
    "        Extras:\n"
    "        – No OPTIONAL, UNION, or FILTER clauses are necessary because the `rdf:type` line already restricts ?uri to popes.\n"
    "\n"
    "Step 7 – Compose the SELECT line\n"
    "        • Pattern: `SELECT {DISTINCT|COUNT(?uri)} ?uri`\n"
    "        • We choose **`SELECT DISTINCT ?uri`** because the task is to return the unique pope URI(s); `COUNT()` is irrelevant.\n"
    "        SELECT DISTINCT ?uri\n"
    "\n"
    "Step 8 – Finish (wrap in WHERE)\n"
    "        • Per style guidelines, **no LIMIT, OFFSET, or ORDER** clauses are added.\n"
    "```\n"
    "SELECT DISTINCT ?uri WHERE {\n"
    "  res:Vatican_Television_Center dbo:foundedBy ?uri .\n"
    "  ?uri rdf:type dbo:Pope .\n"
    "}\n"
    "```\n"
    "\n"
    "Step 9 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { res:Vatican_Television_Center dbo:foundedBy ?uri . ?uri rdf:type dbo:Pope . }\"\n"
    "}\n"
    "### END THINK ###"
)

PROMPT_CLUSTER_2 = (
    "{\n"
    "  \"Question\": \"Does the new Battlestar Galactica series have more episodes than the old one?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Battlestar_Galactica_(1978_TV_series) dbo:numberOfEpisodes ?x\\n"
    "2. res:Battlestar_Galactica_(2004_TV_series) dbo:numberOfEpisodes ?y\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Does the new Battlestar Galactica series have more episodes than the old one?”\n"
    "        • Candidate triples (up to 10) – see JSON above.\n"
    "\n"
    "Step 2 – Choose the ANCHOR triple (P₁, C₁)\n"
    "        • The *old* series (1978) is explicitly mentioned. Triple 1 links that constant to the episode count\n"
    "          via predicate dbo:numberOfEpisodes.\n"
    "                C₁ = res:Battlestar_Galactica_(1978_TV_series)\n"
    "                P₁ = dbo:numberOfEpisodes\n"
    "        • Bind its object to **?x** (episodes of the old show).\n"
    "\n"
    "Step 3 – Determine the TARGET predicate (P₂)\n"
    "        • We must compare ?x with the episode count of the *new* (2004) series.\n"
    "        • Triple 2 provides that fact with the *same* predicate, so set\n"
    "                P₂ = dbo:numberOfEpisodes\n"
    "        • Bind its object to **?y** (episodes of the new show).\n"
    "\n"
    "Step 4 – Retrieve the INTERMEDIATE type (Class T)\n"
    "        • No rdf:type triple exists for the numeric literals ?x or ?y, so this step is skipped.\n"
    "\n"
    "Step 5 – Pick DECORATORS\n"
    "        • The question is yes/no; instead of SELECT we issue an **ASK** query and return a boolean.\n"
    "        • DISTINCT/COUNT are therefore unnecessary.\n"
    "\n"
    "Step 6 – Assemble the WHERE block (comparison pattern)\n"
    "```\n"
    "res:Battlestar_Galactica_(1978_TV_series) dbo:numberOfEpisodes ?x .   # anchor\n"
    "res:Battlestar_Galactica_(2004_TV_series) dbo:numberOfEpisodes ?y .  # target\n"
    "FILTER ( ?y > ?x )                                                   # comparison\n"
    "```\n"
    "        Extras:\n"
    "        – The FILTER clause is essential to test whether the new count (?y) exceeds the old count (?x).\n"
    "\n"
    "Step 7 – Compose the query form\n"
    "        • Pattern for boolean tasks: **ASK WHERE { … }**\n"
    "        ASK WHERE { triples + FILTER }\n"
    "\n"
    "Step 8 – Final touches\n"
    "        • No OPTIONAL, UNION, LIMIT, or ORDER clauses are added.\n"
    "\n"
    "Step 9 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"ASK WHERE { res:Battlestar_Galactica_(1978_TV_series) dbo:numberOfEpisodes ?x . res:Battlestar_Galactica_(2004_TV_series) dbo:numberOfEpisodes ?y . FILTER(?y > ?x) }\"\n"
    "}\n"
    "### END THINK ###"
)


PROMPT_CLUSTER_3 = (
    "{\n"
    "  \"Question\": \"Which rivers flow into a German lake?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Battle_of_the_Unstrut_River_(531) dbo:place res:Germany\\n"
    "2. res:Lake_Bant dbo:location res:Germany\\n"
    "3. res:Schlachtensee_(lake) dbo:location res:Germany\\n"
    "4. res:Battle_of_the_Lupia_River dbo:place res:Germany\\n"
    "5. res:Illmensee_(lake) dbo:location res:Germany\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Which rivers flow into a German lake?”\n"
    "        • Candidate triples (5) – see JSON above.\n"
    "\n"
    "Step 2 – Identify the two CONSTANTS (C₁ and C₂)\n"
    "        • The explicit constant in the question is **Germany** (C₂).  \n"
    "        • There is no specific named lake; instead, we must bind a variable **?x** that stands for “a lake in Germany”.  \n"
    "        • We therefore treat C₁ not as a constant but as the variable **?x** that will later be constrained to `dbo:Lake` and `dbo:country res:Germany`.\n"
    "\n"
    "Step 3 – Select the PREDICATE (P)\n"
    "        • The verbal phrase “flow into” maps to the ontology predicate `dbo:inflow` (river → lake).  \n"
    "        • This does not appear in the noisy candidate list; nonetheless, `dbo:inflow` is the correct predicate for the first hop.  \n"
    "        • The lake’s relationship to Germany is expressed with `dbo:country` (or `dbo:location` if `dbo:country` is absent).  \n"
    "        • Triples with predicates like `dbo:place` or `dbo:location` in the list are signal for this second hop; entries using battle–place links are ignored as *noise* because they do not involve lakes or the inflow relation.\n"
    "\n"
    "Step 4 – Form the core triples\n"
    "        • First hop (river → lake):      ?uri dbo:inflow ?x .  \n"
    "        • Second hop (lake → Germany):  ?x  dbo:country  res:Germany .  \n"
    "        • Add a type guard to ensure ?uri is a `dbo:River` and ?x a `dbo:Lake`.\n"
    "\n"
    "Step 5 – Wrap in a SELECT block (not ASK)\n"
    "        • The question demands *which* rivers, so we need a list.  \n"
    "        • Use **`SELECT DISTINCT ?uri`** to return each river once.\n"
    "\n"
    "Step 6 – Output inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { ?uri rdf:type dbo:River . ?x rdf:type dbo:Lake . ?uri dbo:inflow ?x . ?x dbo:country res:Germany . }\"\n"
    "}\n"
    "### END THINK ###"
)


PROMPT_CLUSTER_4 = (
    "{\n"
    "  \"Question\": \"Which software has been developed by organizations founded in California?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Software_Publishing_Corporation dbo:location res:California\\n"
    "2. res:X1_(software_company) dbo:location res:California\\n"
    "3. res:TIBCO_Software dbo:location res:California\\n"
    "4. res:Megatech_Software dbo:location res:California\\n"
    "5. res:Tacit_Software dbo:location res:California\\n"
    "6. res:BatchMaster_Software dbo:location res:California\\n"
    "7. res:BlueSky_Software__BlueSky_Software__1 dbo:location res:California\\n"
    "8. res:Offset_Software dbo:location res:California\\n"
    "9. res:Starfish_Software dbo:location res:California\\n"
    "10. res:FutureWave_Software dbo:location res:California\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Which software has been developed by organizations founded in California?”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Locate the CONSTANT mentioned in the question\n"
    "        • The only explicit entity is **California** → constant **C = res:California**.\n"
    "        • It appears as the *object* in all ten candidate triples.\n"
    "\n"
    "Step 3 – Select the PREDICATE P (property phrase)\n"
    "        • The English phrase is “organizations **founded in** California”.\n"
    "        • Correct ontology predicate: **dbo:foundationPlace** (company → place founded).\n"
    "        • None of the retrieved triples use dbo:foundationPlace; the `dbo:location` triples are *noise* for this intent because “location” ≠ “foundation place”.\n"
    "\n"
    "Step 4 – Decide VARIABLE placement\n"
    "        • The wording “… founded **in California**” puts California as *object*.\n"
    "        • Therefore:      **?company dbo:foundationPlace res:California**.\n"
    "\n"
    "        • A second hop is required to reach the requested software:\n"
    "              ?software dbo:developer ?company\n"
    "          (Here the property phrase is “developed by <company>”.)\n"
    "\n"
    "Step 5 – Pick DECORATOR\n"
    "        • Question does **not** start with “How many…”, so use **DISTINCT**.\n"
    "\n"
    "Step 6 – Compose the WHERE block\n"
    "        Core triples\n"
    "```\n"
    "?company  dbo:foundationPlace  res:California .   # constant-anchored triple\n"
    "?software dbo:developer       ?company .         # target triple\n"
    "```\n"
    "        Extras (why / why not)\n"
    "        – Add type guards for clarity but nothing else:\n"
    "              ?company  rdf:type dbo:Company  .\n"
    "              ?software rdf:type dbo:Software .\n"
    "          No OPTIONAL / UNION / FILTER / LIMIT / ORDER are needed.\n"
    "\n"
    "Step 7 – Compose the SELECT line\n"
    "        • Pattern: SELECT {DISTINCT|COUNT(?uri)} ?uri\n"
    "        • We need the list itself, not a count ⇒ `SELECT DISTINCT ?software`.\n"
    "\n"
    "Step 8 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?software WHERE { ?company dbo:foundationPlace res:California . ?company rdf:type dbo:Company . ?software dbo:developer ?company . ?software rdf:type dbo:Software . }\"\n"
    "}\n"
    "### END THINK ###"
)

PROMPT_CLUSTER_5 = (
    "{\n"
    "  \"Question\": \"Which people were born in Heraklion?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Borna_Kapusta dbo:wikiPageWikiLink res:Heraklion\\n"
    "2. res:Dimitrios_Papadopoulos_(footballer,_born_1950) dbo:wikiPageWikiLink res:Heraklion\\n"
    "3. res:Dimitrios_Papadopoulos_(footballer,_born_1950)__Dimitrios_Papadopoulos__1 dbo:birthPlace res:Heraklion\\n"
    "4. res:Nicholas_Kalliakis dbp:birthPlace res:Heraklion\\n"
    "5. res:Emmanuel_Skordilis dbp:birthPlace res:Heraklion\\n"
    "6. res:Nicholas_Kalliakis dbo:birthPlace res:Heraklion\\n"
    "7. res:Emmanuel_Skordilis dbo:birthPlace res:Heraklion\\n"
    "8. res:Nadia_Valavani dbp:birthPlace res:Heraklion\\n"
    "9. res:Michael_Koukoulakis dbp:birthPlace res:Heraklion\\n"
    "10. res:Michael_Katehakis dbo:birthPlace res:Heraklion\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Which people were born in Heraklion?”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Extract all CONSTANT–predicate pairs\n"
    "        • Fact 1: “born in Heraklion”\n"
    "              Constant   C₁ = res:Heraklion\n"
    "              Predicate  P₁ = dbo:birthPlace / dbp:birthPlace   (appears in triples 3–10)\n"
    "        • Fact 2: result must be “people” (class Person)\n"
    "              Constant   C₂ = dbo:Person\n"
    "              Predicate  P₂ = rdf:type                       (implicit but standard guard)\n"
    "\n"
    "Step 3 – Bind a single answer variable\n"
    "        • Use **?uri** for the entity that satisfies *both* facts.\n"
    "\n"
    "Step 4 – Determine triple orientation for each pair\n"
    "        • Triples 3–10 show *object* orientation for birth-place →\n"
    "              ?uri P₁ C₁   (person → Heraklion)\n"
    "        • rdf:type guard is always   ?uri rdf:type C₂.\n"
    "\n"
    "Step 5 – Add optional TYPE filter (already covered)\n"
    "        • Fact 2 itself provides the type guard; include it explicitly.\n"
    "\n"
    "Step 6 – Select DECORATOR\n"
    "        • Question does not ask “How many…”, so apply **DISTINCT**.\n"
    "\n"
    "Step 7 – Assemble the WHERE block (both pairs)\n"
    "```\n"
    "?uri rdf:type     dbo:Person   .   # Pair 2 (type guard)\n"
    "?uri dbo:birthPlace res:Heraklion .   # Pair 1 (birth-place)\n"
    "```\n"
    "        • No OPTIONAL, UNION, FILTER, LIMIT, or ORDER are added.\n"
    "\n"
    "Step 8 – Compose the SELECT line\n"
    "        SELECT DISTINCT ?uri\n"
    "\n"
    "Step 9 – No extras (already noted in Step 7).\n"
    "\n"
    "Step 10 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { ?uri rdf:type dbo:Person . ?uri dbo:birthPlace res:Heraklion . }\"\n"
    "}\n"
    "### END THINK ###"
)

PROMPT_CLUSTER_6 = (
    "{\n"
    "  \"Question\": \"Who developed Skype?\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:Skype dbo:developer res:Skype_Technologies\\n"
    "2. res:Skype dbo:wikiPageWikiLink res:Category:Estonian_inventions\\n"
    "3. res:Skype dcterms:subject res:Category:Estonian_inventions\\n"
    "4. res:William_Pugh_(game_designer) dbo:wikiPageWikiLink res:Skype\\n"
    "5. res:Michael_Seifert_(programmer) dbo:wikiPageWikiLink res:Skype\\n"
    "6. res:Jason_Johnson_(entrepreneur) dbo:wikiPageWikiLink res:Skype\\n"
    "7. res:Qi_Lu_(computer_scientist) dbo:wikiPageWikiLink res:Skype\\n"
    "8. res:Bryan_Johnson_(entrepreneur) dbo:wikiPageWikiLink res:Skype\\n"
    "9. res:List_of_Swedish_entrepreneurs dbo:wikiPageWikiLink res:Skype\\n"
    "10. res:List_of_Macintosh_software_published_by_Microsoft dbo:wikiPageWikiLink res:Skype\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Who developed Skype?”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Pick the ROOT triple (C₀ ↔ ?x)\n"
    "        • Constant named in question: **C₀ = res:Skype**.\n"
    "        • Triple 1 links Skype to a developer via **dbo:developer**.\n"
    "              Predicate  P₀ = dbo:developer\n"
    "              Variable   ?x = the developer entity\n"
    "        • Orientation in candidates is  *C₀ P₀ constant*; we generalise object → **?x**.\n"
    "              Pattern:   res:Skype dbo:developer ?x .\n"
    "\n"
    "Step 3 – Grab the INTERMEDIATE class (T)\n"
    "        • No candidate shows  ?x rdf:type T  → skip the type guard (none available).\n"
    "\n"
    "Step 4 – Choose the TARGET predicate (P₁)\n"
    "        • The requested fact *is* the developer itself, so set **?uri = ?x**.\n"
    "        • No additional edge is required; omit Step 4 triple.\n"
    "\n"
    "Step 5 – Select DECORATOR\n"
    "        • Question does *not* start with “How many…”, so use **DISTINCT**.\n"
    "\n"
    "Step 6 – Build the WHERE block (single edge)\n"
    "```\n"
    "res:Skype dbo:developer ?uri .   # root edge already yields the answer\n"
    "```\n"
    "        • The remaining wikiPageWikiLink / dcterms:subject triples are *noise*—they encode page-link metadata, not factual developer relations.\n"
    "\n"
    "Step 7 – Compose the SELECT line\n"
    "        SELECT DISTINCT ?uri\n"
    "\n"
    "Step 8 – No extras\n"
    "        • Do **not** add OPTIONAL, UNION, FILTER, LIMIT, or ORDER clauses.\n"
    "\n"
    "Step 9 – Output inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { res:Skype dbo:developer ?uri . }\"\n"
    "}\n"
    "### END THINK ###"
)

# Few-shot THINK exemplar for cluster 7: two constraints on the same film (?x)
# — directed by and starring the named person — plus an expansion edge from ?x
# to the answer variable ?uri.
# The leading {...} section is a JSON object in the same shape as the user
# message produced by build_messages, so it must stay parseable: the triples
# string is closed with \" after triple 10, and the triple count quoted in
# Step 1 matches the ten triples listed.
PROMPT_CLUSTER_7 = (
    "{\n"
    "  \"Question\": \"Give me all actors starring in movies directed by and starring William Shatner.\",\n"
    "  \"Candidate triples list (numbered)\": \"1. res:The_People_(film) dbo:starring res:William_Shatner\\n"
    "2. res:The_People_(film) dbp:starring res:William_Shatner\\n"
    "3. res:Showtime_(film) dbo:starring res:William_Shatner\\n"
    "4. res:The_Wild dbo:starring res:William_Shatner\\n"
    "5. res:The_Devil's_Rain_(film) dbo:starring res:William_Shatner\\n"
    "6. res:Shoot_or_Be_Shot dbo:starring res:William_Shatner\\n"
    "7. res:Free_Enterprise_(film) dbo:starring res:William_Shatner\\n"
    "8. res:Over_the_Hedge_(film) dbo:starring res:William_Shatner\\n"
    "9. res:Aliens_Ate_My_Homework_(2018_film) dbo:starring res:William_Shatner\\n"
    "10. res:Shoot_or_Be_Shot dbp:starring res:William_Shatner\"\n"
    "}\n"
    "\n"
    "### THINK (do not output) ###\n"
    "Step 1 – Understand the TASK and read the INPUT\n"
    "        • Question: “Give me all actors starring in movies **directed by and starring William Shatner**.”\n"
    "        • Candidate triples (10) – see JSON above.\n"
    "\n"
    "Step 2 – Find the CONSTANT pair (P, C)\n"
    "        • The explicit entity is **William Shatner** → constant **C = res:William_Shatner**.\n"
    "        • We need a predicate that links each movie (?x) to that constant.  \n"
    "          – For “directed by”, the ontology predicate is **dbo:director**.  \n"
    "          – Although no candidate triple shows dbo:director, it is the correct factual edge; choose it for the anchor.\n"
    "        • Orientation: constant is **object** ⇒ pattern **?x dbo:director C**.\n"
    "        • Bind the movie to **?x**.\n"
    "\n"
    "Step 3 – Identify the “OTHER” edge (same subject ?x)\n"
    "        • The second condition *starring William Shatner* is expressed by **dbo:starring** (or its dbp variant).  \n"
    "        • Use it twice:  \n"
    "              a) To enforce the starring-Shatner condition ⇒ ?x dbo:starring C  .  \n"
    "              b) To reach the requested co-stars      ⇒ ?x dbo:starring ?uri .\n"
    "        • For (b) we treat **P′ = dbo:starring**, yielding the variable object **?uri** (the other actor).\n"
    "\n"
    "Step 4 – Pick DECORATOR\n"
    "        • Counting is **not** asked; list the actors → `SELECT DISTINCT ?uri`.\n"
    "\n"
    "Step 5 – Assemble WHERE block (extended cycle)\n"
    "        {anchor triple}\n"
    "        ?x dbo:director  res:William_Shatner   .\n"
    "        {starring constraint}\n"
    "        ?x dbo:starring res:William_Shatner   .\n"
    "        {expansion triple}\n"
    "        ?x dbo:starring ?uri                  .\n"
    "        • No rdf:type, OPTIONAL, FILTER, UNION, LIMIT, or ORDER.\n"
    "\n"
    "Step 6 – Compose SELECT line\n"
    "        SELECT DISTINCT ?uri\n"
    "\n"
    "Step 7 – Emit inside the required JSON wrapper\n"
    "\n"
    "<Answer>\n"
    "{\n"
    "  \"sparql\": \"SELECT DISTINCT ?uri WHERE { ?x dbo:director res:William_Shatner . ?x dbo:starring res:William_Shatner . ?x dbo:starring ?uri . }\"\n"
    "}\n"
    "### END THINK ###"
)

# Lookup table: cluster id -> cluster-specific THINK exemplar.
# The tuple position of each prompt is its cluster id (0 through 7).
PROMPTS_BY_CLUSTER = dict(
    enumerate(
        (
            PROMPT_CLUSTER_0,
            PROMPT_CLUSTER_1,
            PROMPT_CLUSTER_2,
            PROMPT_CLUSTER_3,
            PROMPT_CLUSTER_4,
            PROMPT_CLUSTER_5,
            PROMPT_CLUSTER_6,
            PROMPT_CLUSTER_7,
        )
    )
)

# ───────────────────── Helper functions ──────────────────────

def lists_to_numbered_string(triples: List[Any]) -> str:
    """Render triples as numbered lines ("1. ...", "2. ...") joined by newlines.

    A list or tuple entry has its parts converted with str() and joined by
    single spaces; any other entry is converted with str() as a whole.
    An empty input yields the empty string.
    """
    numbered_lines = []
    for position, entry in enumerate(triples, start=1):
        if isinstance(entry, (list, tuple)):
            rendered = " ".join(str(part) for part in entry)
        else:
            rendered = str(entry)
        numbered_lines.append(f"{position}. {rendered}")
    return "\n".join(numbered_lines)

def build_messages(question: str, triples_str: str, cluster_id: int) -> List[Dict[str, str]]:
    """Assemble the three chat messages for one sample.

    Message order: the generic system prompt, the cluster-specific THINK
    exemplar (also with role "system"), then a user message whose content is
    the JSON-encoded question and numbered triples string.

    Raises:
        KeyError: if ``cluster_id`` has no entry in PROMPTS_BY_CLUSTER.
    """
    # Look the exemplar up first so an unknown cluster fails before any encoding.
    cluster_exemplar = PROMPTS_BY_CLUSTER[cluster_id]

    user_content = json.dumps(
        {
            "Question": question,
            "Candidate triples list (numbered)": triples_str,
        }
    )

    return [
        {"role": "system", "content": SYSTEM_PROMPT_GENERIC},
        {"role": "system", "content": cluster_exemplar},
        {"role": "user", "content": user_content},
    ]


# ────────────────────────── Main logic ───────────────────────

def _extract_triples(sample: Dict[str, Any]) -> List[Any]:
    """Return at most TRIPLES_LIMIT triples from ``sample``.

    The misspelled key ``retrived_triples_ranked`` is read first; the
    correctly spelled ``retrieved_triples_ranked`` is used as a fallback, so
    both spellings are accepted. Dict entries contribute their ``"triple"``
    value, any other entry is used unchanged. Returns ``[]`` when neither key
    holds a non-empty value.
    """
    raw_triples = sample.get("retrived_triples_ranked") or sample.get("retrieved_triples_ranked")
    if not raw_triples:
        return []
    return [
        hit["triple"] if isinstance(hit, dict) else hit
        for hit in raw_triples[:TRIPLES_LIMIT]
    ]


def main() -> None:
    """Build the inference JSONL file from the dataset at INPUT_PATH.

    Loads the JSON dataset, turns every usable sample into a
    ``{"messages": [...]}`` record via build_messages, writes one record per
    line to OUTPUT_PATH and prints a summary (records written, per-cluster
    counts, skip reasons).

    A sample is skipped when its ``cluster_id`` cannot be converted with
    int(), is not in ALLOWED_CLUSTER_IDS, or when its ``question`` is missing,
    not a string, or blank. Samples without triples are kept; their triples
    string is empty.
    """
    with INPUT_PATH.open(encoding="utf-8") as f:
        dataset = json.load(f)

    jsonl_rows: List[Dict[str, Any]] = []
    cluster_counts: Counter[int] = Counter()
    skipped: Counter[str] = Counter()

    for sample in dataset:
        try:
            cluster_id = int(sample.get("cluster_id"))
        except (TypeError, ValueError):
            skipped["invalid cluster_id"] += 1
            continue

        if cluster_id not in ALLOWED_CLUSTER_IDS:
            skipped["cluster not 0‑7"] += 1
            continue

        # A null or non-string question is counted as missing instead of
        # raising AttributeError on .strip().
        question_raw = sample.get("question")
        question_text = question_raw.strip() if isinstance(question_raw, str) else ""
        if not question_text:
            skipped["missing question"] += 1
            continue

        triples_str = lists_to_numbered_string(_extract_triples(sample))

        jsonl_rows.append({"messages": build_messages(question_text, triples_str, cluster_id)})
        cluster_counts[cluster_id] += 1

    # Write output: one JSON object per line.
    with OUTPUT_PATH.open("w", encoding="utf-8") as f_out:
        for rec in jsonl_rows:
            f_out.write(json.dumps(rec) + "\n")

    # Console summary
    total_written = len(jsonl_rows)
    print(f"Wrote {total_written} inference records to {OUTPUT_PATH}")
    print("Cluster distribution (kept):")
    for cid in sorted(cluster_counts):
        print(f"  {cid}: {cluster_counts[cid]}")
    print("Skipped samples by reason:")
    for reason, count in skipped.items():
        print(f"  {reason}: {count}")


if __name__ == "__main__":
    main()

Wrote 149 inference records to /home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top_10_dycot_medoid.jsonl
Cluster distribution (kept):
  0: 67
  1: 12
  2: 69
  6: 1
Skipped samples by reason:


In [4]:
import json

# Convert the per-sample {"messages": [...]} JSONL into OpenAI Batch API
# request rows (one POST /v1/chat/completions request per line).
SOURCE  = "/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top_10_dycot_medoid.jsonl"
TARGET  = "/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top_10_batch_input.jsonl"
MODEL   = "ft:gpt-3.5-turbo-0125:personal::Bk9BchWy"

# Counted explicitly so the final report also works for an empty source file
# (a loop variable would be undefined there).
rows_written = 0

with open(SOURCE, "r", encoding="utf-8") as fin, \
     open(TARGET,  "w", encoding="utf-8") as fout:
    for line in fin:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        messages = json.loads(line)["messages"]

        batch_row = {
            # Position of the record among the written rows; identical to the
            # source line index when the source has no blank lines.
            "custom_id": f"example_{rows_written}",
            "method":    "POST",
            "url":       "/v1/chat/completions",
            "body": {
                "model":       MODEL,
                "messages":    messages,
                "temperature": 0
            }
        }
        fout.write(json.dumps(batch_row) + "\n")
        rows_written += 1

print(f"Wrote {rows_written} lines to {TARGET}")


Wrote 149 lines to /home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top_10_batch_input.jsonl


In [5]:
from openai import OpenAI
import time
import json

# Upload the batch input file, run it through the OpenAI Batch API, poll until
# the batch reaches a terminal status, and save the raw output JSONL.
# Relies on `client` created in an earlier cell.
BATCH_INPUT_PATH = "/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top_10_batch_input.jsonl"
BATCH_OUTPUT_PATH = "/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_10_dycot_batch_output.jsonl"

# Statuses after which the batch no longer changes. Polling only for
# "failed"/"completed" would loop forever on an expired or cancelled batch.
TERMINAL_STATUSES = {"completed", "failed", "expired", "cancelled"}
POLL_INTERVAL_SECONDS = 60

# Context manager so the upload handle is closed once the request is done.
with open(BATCH_INPUT_PATH, "rb") as batch_input:
    upload = client.files.create(
        file=batch_input,
        purpose="batch"
    )
input_file_id = upload.id
print("Uploaded file:", input_file_id)

batch = client.batches.create(
    input_file_id     = input_file_id,
    endpoint          = "/v1/chat/completions",
    completion_window = "24h",
    metadata          = {"job": "QALD test inference"}
)
print("Batch ID:", batch.id)

while True:
    batch = client.batches.retrieve(batch.id)
    print("Status:", batch.status)
    if batch.status in TERMINAL_STATUSES:
        break
    time.sleep(POLL_INTERVAL_SECONDS)

if batch.status != "completed":
    print(f"Batch ended with status {batch.status!r}! Full batch object:")
    print(batch)

    raise SystemExit(1)

result_file_id = batch.output_file_id
if result_file_id is None:
    # NOTE(review): per the Batch API docs a completed batch can lack an output
    # file when no request succeeded; failures are then listed in the error file.
    print("Batch completed without an output file; error file id:", batch.error_file_id)
    print(batch)

    raise SystemExit(1)

result_response = client.files.content(result_file_id)

with open(BATCH_OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write(result_response.text)

# Report the path that was actually written.
print(f"Saved outputs to {BATCH_OUTPUT_PATH}")

Uploaded file: file-CdSzbmifvL4nSiqXLWwhh7
Batch ID: batch_687284c4857c81908cadd1aafbd98ed7
Status: validating
Status: in_progress
Status: in_progress
Status: in_progress
Status: finalizing
Status: finalizing
Status: completed
Saved outputs to qald_test_batch_output.jsonl


In [6]:
import json, re
from pathlib import Path

# Input: JSON list of gold records; each record gets a "pred_query" field added below.
GOLD_PATH   = Path("/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top10_dycot_cleaned.json")
# Input: JSONL batch output, one response object per line, keyed by "custom_id".
PRED_PATH   = Path("/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_10_dycot_batch_output.jsonl")
# Output: the gold records enriched with the predicted query.
OUTPUT_PATH = Path("/home/m2khoda/dual_retriever/evaluations/end_to_end_evalution/qald_results/qald_test_solo_stage_10_dycot_medoid_plus_gold.json")

# ───────── helper to pull "sparql" string from assistant content ──────────
# Matches an "<Answer>" tag followed by a brace-delimited span (greedy, across
# newlines) and captures that span.
# NOTE(review): extract_sparql in this cell does not reference this pattern —
# confirm whether another cell uses it before removing.
ANSWER_RE = re.compile(r'<Answer>\s*(\{.*\})', re.DOTALL)

def extract_sparql(content: str) -> str:
    """Return the value of the "sparql" key found in an assistant message.

    Three strategies are tried in order:

    1. Parse the whole message as JSON.
    2. Strip a leading "answer" wrapper and Markdown code fences, then parse
       the remainder as JSON.
    3. Scan the raw message for the first ``{ "sparql" ...`` object that
       decodes as JSON, ignoring any text before or after it.

    Args:
        content: Raw assistant message text. ``None`` (or any non-string) is
            tolerated and treated as "nothing to extract".

    Returns:
        The value stored under "sparql" in the first JSON object that could
        be decoded, or ``""`` when no such object is found.
    """
    # A message without text content cannot contain a query.
    if not isinstance(content, str):
        return ""

    # Strategy 2 input: drop a leading "<Answer>" / "answer:" style prefix and
    # any ``` / ```json fence markers at line starts or ends.
    cleaned = re.sub(r'^[<\s/]*answer[>\s:]*', '', content, flags=re.I).strip()
    cleaned = re.sub(r'^```(?:json)?|```$', '', cleaned, flags=re.I | re.M).strip()

    # Strategies 1 and 2: the whole text (raw, then unwrapped) is one JSON object.
    for candidate in (content, cleaned):
        try:
            obj = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(obj, dict) and "sparql" in obj:
            return obj["sparql"]

    # Strategy 3: decode a JSON object starting at each `{ "sparql"` anchor.
    # raw_decode understands string literals, so braces inside the SPARQL text
    # (even unbalanced ones inside quoted literals) cannot cut the object
    # short, and trailing prose after the object is simply ignored.
    decoder = json.JSONDecoder()
    for anchor in re.finditer(r'\{\s*"sparql"', content):
        try:
            obj, _end = decoder.raw_decode(content, anchor.start())
        except json.JSONDecodeError:
            # Malformed fragment – try the next candidate, if any.
            continue
        if isinstance(obj, dict) and "sparql" in obj:
            return obj["sparql"]

    # Nothing matched.
    return ""

# ───────── load gold ──────────────────────────────────────────────────────
with GOLD_PATH.open(encoding="utf-8") as f:
    gold_records = json.load(f)

# ───────── build lookup of predictions by custom_id ───────────────────────
# Each non-blank line of the batch output is one JSON object. A row that lacks
# the response → body → choices[0] → message → content chain (presumably a
# request that errored) or whose content is not text is recorded as "" rather
# than aborting the whole merge.
pred_lookup = {}
unusable_ids = []
with PRED_PATH.open(encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue                      # tolerate blank / trailing lines
        rec = json.loads(line)
        cid = rec["custom_id"]
        try:
            content = rec["response"]["body"]["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError):
            content = None
        if isinstance(content, str):
            pred_lookup[cid] = extract_sparql(content)
        else:
            pred_lookup[cid] = ""
            unusable_ids.append(cid)

if unusable_ids:
    print(f"Warning: {len(unusable_ids)} batch rows had no usable message content: {unusable_ids}")

# ───────── merge into gold ────────────────────────────────────────────────
# Gold record i is paired with the prediction whose custom_id is "example_i".
for idx, rec in enumerate(gold_records):
    cid = f"example_{idx}"
    rec["pred_query"] = pred_lookup.get(cid, "")   # "" if missing

# ───────── write result ───────────────────────────────────────────────────
with OUTPUT_PATH.open("w", encoding="utf-8") as f:
    json.dump(gold_records, f, ensure_ascii=False, indent=2)

print(f"Enriched file written → {OUTPUT_PATH}. Total records: {len(gold_records)}")

Enriched file written → /home/m2khoda/dual_retriever/evaluations/end_to_end_evalution/qald_results/qald_test_solo_stage_10_dycot_medoid_plus_gold.json. Total records: 149


LCQUAD Subsampled

In [None]:
import json
from pathlib import Path


# Rewrite the JSON file in place, keeping only the records that carry a
# "cluster_id" key.
# NOTE(review): this path points at the QALD results file although the cell
# sits under the "LCQUAD Subsampled" heading — confirm the intended file.
json_path = Path("/home/m2khoda/dual_retriever/evaluations/dycot/qald_results/qald_test_solo_stage_top10_dycot_cleaned.json")

data = json.loads(json_path.read_text(encoding="utf-8"))

cleaned = []
for d in data:
    if "cluster_id" in d:
        cleaned.append(d)

json_path.write_text(
    json.dumps(cleaned, ensure_ascii=False, indent=2),
    encoding="utf-8",
)
