In [1]:
from pipeline import pipelines
from pipeline.simple_llm_query_generator import SimpleLLMQueryGenerator
from pipeline.query_engine_component import QueryExecutorStep
from pipeline.context.context_constructor import context_constructor
import os
import traceback
from tqdm.autonotebook import tqdm as notebook_tqdm

def run_pipelines(natural_language_question: list, expected_sparql: list):
    sparql_llm_pipeline = None
    sparql_execution_pipeline = None
    try:
        query_generator = SimpleLLMQueryGenerator(model_name="meta-llama/LLama-3.2-1B-Instruct")
        query_executor = QueryExecutorStep(
            engine_name="milleniumDB",
            graph_path="rdf_100_sphn.nt",
            verbose=True,
            query_format="sparql",
            construct_graph=False
        )
        context_maker = context_constructor()
        sparql_llm_pipeline = pipelines.InitialPipeline([query_generator, query_executor])
        sparql_execution_pipeline = pipelines.InitialPipeline([query_executor])

        sparql_llm_pipeline.initialize()
        sparql_execution_pipeline.initialize()
        results_dict = {
            'sparql_without_context': [],
            'sparql_with_context': [],
            'expected_sparql_result': []
        }
        for question, sparql in notebook_tqdm(zip(natural_language_question, expected_sparql)):

            context =  context_maker.context(question)
            print("Context for the question:", context)
            result_llm_without_context = sparql_llm_pipeline.run(
                natural_language_question=natural_language_question,
                sparql_is_path=False
            )
            print("LLM Result without context:", result_llm_without_context['query'])
            result_llm_with_context = None
            # Here context needs to be filled in depending on the Natural Language Question
            result_llm_with_context = sparql_llm_pipeline.run(natural_language_question=natural_language_question,
                                                              context=context,
                                                              sparql_is_path=False)
            print("LLM Result with context:", result_llm_with_context['query'])
            expected_sparql_result = sparql_execution_pipeline.run({"query": sparql})
            results_dict['sparql_without_context'].append(result_llm_without_context['query'])
            results_dict['sparql_with_context'].append(result_llm_with_context['query'])
            results_dict['expected_sparql_result'].append(expected_sparql_result)

        return results_dict

    except Exception as ex:
        traceback.print_exc()

    finally:
        if sparql_llm_pipeline:
            try:
                sparql_llm_pipeline.close()
            except Exception as e:
                print(f"Error while closing sparql_llm_pipeline: {e}")
        if sparql_execution_pipeline:
            try:
                sparql_execution_pipeline.close()
            except Exception as e:
                print(f"Error while closing sparql_execution_pipeline: {e}")

    return None




  from .autonotebook import tqdm as notebook_tqdm


4.52.4


In [3]:
import pandas as pd
import numpy as np

updated_q_sparql_pairs = pd.read_csv("../sparql_queries/updated_qq_pairs/question_query_pairs_100.csv")
# Get 100 random samples
sampled_pairs = updated_q_sparql_pairs.sample(n=1, random_state=42).reset_index(drop=True)


In [4]:
sampled_pairs.size

3

In [4]:
# result = run_pipelines(
#     natural_language_question=sampled_pairs['Question'].tolist(),
#     expected_sparql=sampled_pairs['SPARQL Query'].tolist()
# )

In [5]:
from pipeline import pipelines

In [6]:
from pipeline import mdb_validated_generation
generate_execute_component = mdb_validated_generation.MDBValidatedGeneration(model_name = "meta-llama/Llama-3.2-1B-Instruct")
val_pipeline  = pipelines.InitialPipeline([generate_execute_component])
context_maker = context_constructor()

Using device: cpu
Checking available images...
['mdb:latest']
['docker-aseprite:latest']
['ghcr.io/avantlab/avantgraph:openaire-transport']
['ghcr.io/avantlab/avantgraph:openaire-energy']
['ghcr.io/avantlab/avantgraph:openaire-neuro']
['ghcr.io/avantlab/avantgraph:openaire-cancer']
['hello-world:latest']
['ghcr.io/avantlab/avantgraph:ckg']
['ghcr.io/avantlab/avantgraph:release-2024-01-31']
Using device: cpu


In [None]:
context_maker.get_context(question = "how many substances does drug 12035 contain?")

"Next follow template questions and queries related to the natural language question:\nQuestion: How many substances does {DRUG} contain?\nQuery: SELECT COUNT(DISTINCT substance) FROM substance WHERE drug = {{DRUG}}\n\nThe following class IRIs can be placed in place of the classes enclosed by brackets \\{\\} in the templates:\nClass IRI: ['https://www.biomedit.ch/rdf/sphn-schema/sphn/Substance', 'https://www.biomedit.ch/rdf/sphn-schema/sphn/Drug']\nClass IRI: []\n"

In [7]:
natural_language_question = ["How many substances does drug 12305 contain?"]
for question in notebook_tqdm((natural_language_question)):
    context =  context_maker.get_context(question = question)
    print(context)
    result_llm_with_context = None
    # Here context needs to be filled in depending on the Natural Language Question
    result_llm_with_context = val_pipeline.run(natural_language_question=natural_language_question,
                                                    context = context,
                                                      sparql_is_path=False)
    print("LLM Result with context:", result_llm_with_context['query'])



  0%|          | 0/1 [00:00<?, ?it/s]

Next follow template questions and queries related to the natural language question:
Question: How many substances does {DRUG} contain?
Query: SELECT COUNT(DISTINCT substance) FROM substance WHERE drug = {{DRUG}}

The following class IRIs can be placed in place of the classes enclosed by brackets \{\} in the templates:
Class IRI: ['https://www.biomedit.ch/rdf/sphn-schema/sphn/Substance', 'https://www.biomedit.ch/rdf/sphn-schema/sphn/Drug']
Class IRI: []

▶ Running step: MDBValidatedGeneration

🧠 Attempt 0: Executing query...

        SELECT COUNT(DISTINCT substance) FROM substance WHERE drug = __DRUG__
        

>>> Query being sent to MillenniumDB:
 
        SELECT COUNT(DISTINCT substance) FROM substance WHERE drug = __DRUG__
        


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Query fixing prompt: This SPARQL query returned an error:  extraneous input 'COUNT' expecting {DISTINCT, REDUCED, VAR1, VAR2, '(', '*'}. Fix the query and try again. Again, do not include any explanations or apologies in your responses.

✅ Fixed SPARQL Query:
 PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX obo: <http://purl.obolibrary.org/obo/>

SELECT?substance
WHERE {
   ?drug obo:Drug?substance.
    FILTER(relation?drug = obo:Drug)
}

🧠 Attempt 1: Executing query...
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX obo: <http://purl.obolibrary.org/obo/>

SELECT?substance
WHERE {
   ?drug obo:Drug?substance.
    FILTER(relation?drug = obo:Drug)
}

>>> Query being sent to MillenniumDB:
 PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX obo: <http://purl.obolibrary.org/obo/>

SELECT?substance
WHERE {
   ?drug obo:Drug?substance.
    FILTER(relation?drug = obo:Drug)
}


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 0/1 [00:33<?, ?it/s]


KeyboardInterrupt: 

In [None]:
from pipeline import mdb_validated_generation
from pipeline.context.context_constructor import context_constructor
def run_validated_pipeline(natural_language_question):
    generate_execute_component = mdb_validated_generation.MDBValidatedGeneration(model_name = "meta-llama/Llama-3.2-1B-Instruct")
    val_pipeline  = pipelines.InitialPipeline([generate_execute_component])
    context_maker = context_constructor(construct_vector_store=True, example_queries_file="/home/mathiasyap/Code/university/phkg/MAI_Project_PHKG/src/pipeline/context/qq_pairs.yaml")
    val_pipeline.initialize()
    try: 
        for question in notebook_tqdm((natural_language_question)):
            context =  context_maker.get_context(question)
            print(context)
            result_llm_with_context = None
            # Here context needs to be filled in depending on the Natural Language Question
            result_llm_with_context = val_pipeline.run(natural_language_question=natural_language_question,
                                                              sparql_is_path=False)
            print("LLM Result with context:", result_llm_with_context['query'])

        return result_llm_with_context

    except Exception as ex:
        traceback.print_exc()

    finally:
        try:
            val_pipeline.close()
        except Exception as e:
            print(f"Error while closing pipeline: {e}")

    return None

  from .autonotebook import tqdm as notebook_tqdm


4.52.4


In [3]:
data_try = run_validated_pipeline(
    natural_language_question="How many substances does drug 12305 contain?"
)

Using device: cpu
['mdb:latest']
['docker-aseprite:latest']
['ghcr.io/avantlab/avantgraph:openaire-transport']
['ghcr.io/avantlab/avantgraph:openaire-energy']
['ghcr.io/avantlab/avantgraph:openaire-neuro']
['ghcr.io/avantlab/avantgraph:openaire-cancer']
['hello-world:latest']
['ghcr.io/avantlab/avantgraph:ckg']
['ghcr.io/avantlab/avantgraph:release-2024-01-31']
Using device: cpu


NameError: name 'pipelines' is not defined

In [None]:
def initialize_query_engine():
    executor = QueryExecutorStep(
        engine_name="milleniumDB",
        graph_path="rdf_100_sphn.nt",
        verbose=True,
        query_format="sparql",
        construct_graph=False
    )
    return executor

In [None]:
engine = initialize_query_engine()

Checking available images...
['mdb:latest']
['docker-aseprite:latest']
['ghcr.io/avantlab/avantgraph:openaire-transport']
['ghcr.io/avantlab/avantgraph:openaire-energy']
['ghcr.io/avantlab/avantgraph:openaire-neuro']
['ghcr.io/avantlab/avantgraph:openaire-cancer']
['hello-world:latest']
['ghcr.io/avantlab/avantgraph:ckg']
['ghcr.io/avantlab/avantgraph:release-2024-01-31']


In [None]:
engine.query("Select ?1 .... {}", path = False)

>>> Query being sent to MillenniumDB:
 Select ?1 .... {}


ResultError: 

In [None]:
question = "How many patients were diagnosed with diagnosis 102358?"
expected_sparql = """
SELECT (COUNT(?patient) AS ?count)
WHERE {
  ?patient a <http://www.semanticweb.org/ontologies/2023/10/untitled-ontology-2#Patient> .
  ?patient <http://www.semanticweb.org/ontologies/2023/10/untitled-ontology-2#hasDiagnosis> <http://www.semanticweb.org/ontologies/2023/10/untitled-ontology-2#Diagnosis102358> .
}
"""
result_llm_without_context_list = []
result_llm_with_context_list = []
expected_sparql_result_list = []
print(f"Run pipeline for question: {question}")
result_llm_without_context, result_llm_with_context, expected_sparql_result = run_pipelines(
    natural_language_question=question,
    expected_sparql=expected_sparql)
if result_llm_without_context:
    result_llm_without_context_list.append(result_llm_without_context["result"])
else:
    result_llm_without_context_list.append(None)

if result_llm_with_context:
    result_llm_with_context_list.append(result_llm_with_context["result"])
else:
    result_llm_with_context_list.append(None)

if expected_sparql_result:
    expected_sparql_result_list.append(expected_sparql_result["result"])
else:
    expected_sparql_result_list.append(None)

Run pipeline for question: How many patients were diagnosed with diagnosis 102358?
Using device: cpu
Checking available images...
['mdb:latest']
['docker-aseprite:latest']
['ghcr.io/avantlab/avantgraph:openaire-transport']
['ghcr.io/avantlab/avantgraph:openaire-energy']
['ghcr.io/avantlab/avantgraph:openaire-neuro']
['ghcr.io/avantlab/avantgraph:openaire-cancer']
['hello-world:latest']
['ghcr.io/avantlab/avantgraph:ckg']
['ghcr.io/avantlab/avantgraph:release-2024-01-31']
▶ Running step: SimpleLLMQueryGenerator
Prompt: 
Task: Generate a SPARQL SELECT statement for querying a graph database.
For instance, to find all email addresses of John Doe, the following query in backticks would be suitable:
```
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?email
WHERE {
    ?person foaf:name "John Doe" .
    ?person foaf:mbox ?email .
}
```
Keep in mind that you might need several classes in order to provide the correct answer. 

Instructions:
Use only the node types and properties provided in 

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 30.48it/s]
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


📥 Raw LLM Output:
 ```sparql
PREFIX sphn: <https://www.biomedit.ch/rdf/sphn-schema/sphn/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT COUNT (?patient)
WHERE {
   ?patient sphn:hasSubjectPseudoIdentifier?identifier.
   ?identifier sphn:hasCodingSystemAndVersion?coding.
   ?coding sphn:hasCode?code.
   ?code sphn:hasActiveIngredient?activeIngredient.
   ?activeIngredient a sphn:Drug.
   ?patient sphn:hasDiagnosis?diagnosis.
   ?diagnosis sphn:id "102358".
}
```

✅ Final SPARQL Query:
 PREFIX sphn: <https://www.biomedit.ch/rdf/sphn-schema/sphn/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT COUNT (?patient)
WHERE {
   ?patient sphn:hasSubjectPseudoIdentifier?identifier.
   ?identifier sphn:hasCodingSystemAndVersion?coding.
   ?coding sphn:hasCode?code.
   ?code sphn:hasActiveIngredient?activeIngredient.
   ?activeIngredient a sphn:Drug.
   ?patient sphn:hasDiagnosis?diagnosis.
   ?diagnosis sphn:id "102358".
}
✅ Step 'SimpleLLMQueryGenerator' completed in 66.07 seconds

millenniumdb_driver.millenniumdb_error.MillenniumDBError: Query Exception: extraneous input 'COUNT' expecting {DISTINCT, REDUCED, VAR1, VAR2, '(', '*'}

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/tmp/ipykernel_314576/45117174.py", line 27, in run_pipelines
    result_llm_without_context = sparql_llm_pipeline.run(
        natural_language_question=natural_language_question,
        sparql_is_path=False
    )
  File "/home/mathiasyap/Code/university/phkg/MAI_Project_PHKG/src/pipeline/pipelines.py", line 24, in run
    data = step.run(data, **kwargs)
  File "/home/mathiasyap/Code/university/phkg/MAI_Project_PHKG/src/pipeline/query_engine_component.py", line 47, in run
    data["result"]=self.query(query, path=sparql_is_path)
                   ~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mathiasyap/Code/university/phkg/MAI_Project_PHKG/src/pipeline/query_engine_component.py", line 96, in query
    return self.m

Error while closing sparql_execution_pipeline: 404 Client Error for http+docker://localhost/v1.49/containers/b179469f6b5f239c6c3d2c91fb44b7651d71a5233e2e0afb814e41a5427f2a5f?v=True&link=False&force=True: Not Found ("No such container: b179469f6b5f239c6c3d2c91fb44b7651d71a5233e2e0afb814e41a5427f2a5f")


In [None]:
queries = sampled_pairs["SPARQL Query"].values
questions = sampled_pairs["Question"].values
result_llm_without_context_list = []
result_llm_with_context_list = []
expected_sparql_result_list = []
for expected_sparql, question in zip(queries, questions):
    print(f"Run pipeline for question: {question}")
    result_llm_without_context, result_llm_with_context, expected_sparql_result = run_pipelines(
        natural_language_question=question,
        expected_sparql=expected_sparql)
    if result_llm_without_context:
        result_llm_without_context_list.append(result_llm_without_context["result"])
    else:
        result_llm_without_context_list.append(None)

    if result_llm_with_context:
        result_llm_with_context_list.append(result_llm_with_context["result"])
    else:
        result_llm_with_context_list.append(None)

    if expected_sparql_result:
        expected_sparql_result_list.append(expected_sparql_result["result"])
    else:
        expected_sparql_result_list.append(None)

Run pipeline for question: What are the codes for the substances used in the drug prescription 17205470?
Using device: cpu
Checking available images...
['mdb:latest']
['docker-aseprite:latest']
['ghcr.io/avantlab/avantgraph:openaire-transport']
['ghcr.io/avantlab/avantgraph:openaire-energy']
['ghcr.io/avantlab/avantgraph:openaire-neuro']
['ghcr.io/avantlab/avantgraph:openaire-cancer']
['hello-world:latest']
['ghcr.io/avantlab/avantgraph:ckg']
['ghcr.io/avantlab/avantgraph:release-2024-01-31']
▶ Running step: SimpleLLMQueryGenerator
Prompt: 
Task: Generate a SPARQL SELECT statement for querying a graph database.
For instance, to find all email addresses of John Doe, the following query in backticks would be suitable:
```
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?email
WHERE {
    ?person foaf:name "John Doe" .
    ?person foaf:mbox ?email .
}
```
Keep in mind that you might need several classes in order to provide the correct answer. 

Instructions:
Use only the node types and p

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 83.85it/s]
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Error while closing sparql_execution_pipeline: 404 Client Error for http+docker://localhost/v1.49/containers/0d73a27ee79dd2db601a2b74980e1690e3b5183161704a6bb2f9bda1fd720b5c?v=True&link=False&force=True: Not Found ("No such container: 0d73a27ee79dd2db601a2b74980e1690e3b5183161704a6bb2f9bda1fd720b5c")


KeyboardInterrupt: 

In [None]:
tp_with_context = fp_with_context = fn_with_context = 0
tp_without_context = fp_without_context = fn_without_context = 0

for result_llm_without_context, result_llm_with_context, result_exact in zip(
        result_llm_without_context_list,
        result_llm_with_context_list,
        expected_sparql_result_list):

    # Skip only if the expected result is missing
    if not result_exact:
        continue

    values_exact = set(v[0] for v in result_exact.values())

    # Without context
    if result_llm_without_context:
        values_llm_without = set(v[0] for v in result_llm_without_context.values())
    else:
        values_llm_without = set()

    tp = len(values_llm_without & values_exact)
    fp = len(values_llm_without - values_exact)
    fn = len(values_exact - values_llm_without)
    tp_without_context += tp
    fp_without_context += fp
    fn_without_context += fn

    # With context
    if result_llm_with_context:
        values_llm_with = set(v[0] for v in result_llm_with_context.values())
    else:
        values_llm_with = set()

    tp = len(values_llm_with & values_exact)
    fp = len(values_llm_with - values_exact)
    fn = len(values_exact - values_llm_with)
    tp_with_context += tp
    fp_with_context += fp
    fn_with_context += fn

# Metric functions
def precision(tp, fp):
    return tp / (tp + fp) if (tp + fp) else 0

def recall(tp, fn):
    return tp / (tp + fn) if (tp + fn) else 0

def accuracy(tp, fp, fn):
    total = tp + fp + fn
    return tp / total if total else 0

# Output
print("\n== Without Context ==")
print(f"Precision: {precision(tp_without_context, fp_without_context):.2f}")
print(f"Recall:    {recall(tp_without_context, fn_without_context):.2f}")
print(f"Accuracy:  {accuracy(tp_without_context, fp_without_context, fn_without_context):.2f}")

print("\n== With Context ==")
print(f"Precision: {precision(tp_with_context, fp_with_context):.2f}")
print(f"Recall:    {recall(tp_with_context, fn_with_context):.2f}")
print(f"Accuracy:  {accuracy(tp_with_context, fp_with_context, fn_with_context):.2f}")



== Without Context ==
Precision: 0.00
Recall:    0.00
Accuracy:  0.00

== With Context ==
Precision: 0.00
Recall:    0.00
Accuracy:  0.00
