In [3]:
from collections import defaultdict
from itertools import combinations
from typing import Dict, List, Tuple
from OntologyDesc import OntologyDesc
from Enumerator import Enumerator
from OntologyChainEnumerator import OntologyChainEnumerator
from OntologyStarEnumerator import OntologyStarEnumerator 

Ontology = OntologyDesc()
star_enumerator = OntologyStarEnumerator(Ontology)
chain_enumerator = OntologyChainEnumerator(Ontology)

In [5]:
star_queries = star_enumerator.enumerate_all()
chain_queries = chain_enumerator.enumerate_all()

In [25]:
chain_queries[4]

[[('Substance', 'hasActiveIngredient⁻¹', 'Drug'),
  ('Drug', 'hasActiveIngredient', 'Substance'),
  ('Substance', 'hasActiveIngredient⁻¹', 'Drug'),
  ('Drug', 'hasActiveIngredient', 'Substance')],
 [('Substance', 'hasActiveIngredient⁻¹', 'Drug'),
  ('Drug', 'hasActiveIngredient', 'Substance'),
  ('Substance', 'hasActiveIngredient⁻¹', 'Drug'),
  ('Drug', 'hasCode', 'Code')],
 [('Substance', 'hasActiveIngredient⁻¹', 'Drug'),
  ('Drug', 'hasActiveIngredient', 'Substance'),
  ('Substance', 'hasActiveIngredient⁻¹', 'Drug'),
  ('Drug', 'hasDrug⁻¹', 'DrugPrescription')],
 [('Substance', 'hasActiveIngredient⁻¹', 'Drug'),
  ('Drug', 'hasCode', 'Code'),
  ('Code', 'hasCodingSystemAndVersion', 'CodingSystem'),
  ('CodingSystem', 'hasCodingSystemAndVersion⁻¹', 'Diagnosis')],
 [('Substance', 'hasActiveIngredient⁻¹', 'Drug'),
  ('Drug', 'hasCode', 'Code'),
  ('Code', 'hasCodingSystemAndVersion', 'CodingSystem'),
  ('CodingSystem', 'hasCodingSystemAndVersion⁻¹', 'MedicalProcedure')],
 [('Substance', 

In [47]:
import pandas as pd
def create_chain_queries_dataframe(chain_dict):
    """
    Convert chain queries dictionary to DataFrame
    Each row represents one complete chain query
    """
    rows = []
    
    for hop_count, queries in chain_dict.items():
        for query_idx, query_path in enumerate(queries):
            # Create a single row for the entire chain query
            # Extract start and end entities
            start_entity = query_path[0][0] if query_path else None
            end_entity = query_path[-1][2] if query_path else None
            
            # Create path representation
            path_str = ' -> '.join([f"{s}--{r}--> {t}" for s, r, t in query_path])
            
            # Create entities sequence
            entities_sequence = [query_path[0][0]] + [hop[2] for hop in query_path] if query_path else []
            
            # Create relations sequence
            relations_sequence = [hop[1] for hop in query_path] if query_path else []
            
            rows.append({
                'hop_count': hop_count,
                'start_entity': start_entity,
                'end_entity': end_entity,
                'path_representation': path_str,
                'entities_sequence': ','.join(entities_sequence),
                'relations_sequence': ','.join(relations_sequence),
                'query_tuples': str(query_path)  # Keep original format for reference
            })
    
    return pd.DataFrame(rows)

def create_star_queries_dataframe(star_dict):
    """
    Convert star queries dictionary to DataFrame
    Each row represents one complete star query with all arms
    """
    rows = []
    
    for center_entity, queries in star_dict.items():
        for query_idx, query_info in enumerate(queries):
            center = query_info['center']
            arms = query_info['arms']
            num_arms = query_info['num_arms']
            
            # Create one row for the entire star query
            # Extract relations and target entities
            relations = [relation for relation, target in arms]
            targets = [target for relation, target in arms]
            
            # Create various representations
            arms_tuples = str(arms)  # Keep original format
            arms_summary = ','.join([f"{center}--{r}--> {t}" for r, t in arms])
            relations_list = ','.join(relations)
            targets_list = ','.join(targets)
            
            rows.append({
                'center_entity': center,
                'num_arms': num_arms,
                'relations': relations_list,
                'target_entities': targets_list,
                'star_representation': arms_summary,
                'arms_tuples': arms_tuples  # Keep original format for reference
            })
    
    return pd.DataFrame(rows)

In [48]:
chain_df = create_chain_queries_dataframe(chain_queries)

In [49]:
chain_df

Unnamed: 0,hop_count,start_entity,end_entity,path_representation,entities_sequence,relations_sequence,query_tuples
0,1,Substance,Drug,Substance--hasActiveIngredient⁻¹--> Drug,"Substance,Drug",hasActiveIngredient⁻¹,"[('Substance', 'hasActiveIngredient⁻¹', 'Drug')]"
1,1,Diagnosis,Code,Diagnosis--hasCode--> Code,"Diagnosis,Code",hasCode,"[('Diagnosis', 'hasCode', 'Code')]"
2,1,Diagnosis,CodingSystem,Diagnosis--hasCodingSystemAndVersion--> Coding...,"Diagnosis,CodingSystem",hasCodingSystemAndVersion,"[('Diagnosis', 'hasCodingSystemAndVersion', 'C..."
3,1,Diagnosis,Patient,Diagnosis--hasSubjectPseudIdentifier--> Patient,"Diagnosis,Patient",hasSubjectPseudIdentifier,"[('Diagnosis', 'hasSubjectPseudIdentifier', 'P..."
4,1,CodingSystem,Diagnosis,CodingSystem--hasCodingSystemAndVersion⁻¹--> D...,"CodingSystem,Diagnosis",hasCodingSystemAndVersion⁻¹,"[('CodingSystem', 'hasCodingSystemAndVersion⁻¹..."
...,...,...,...,...,...,...,...
1307,4,DrugPrescription,DrugPrescription,DrugPrescription--hasSubjectPseudIdentifier-->...,"DrugPrescription,Patient,DrugPrescription,Drug...","hasSubjectPseudIdentifier,hasSubjectPseudIdent...","[('DrugPrescription', 'hasSubjectPseudIdentifi..."
1308,4,DrugPrescription,LabTestEvent,DrugPrescription--hasSubjectPseudIdentifier-->...,"DrugPrescription,Patient,DrugPrescription,Pati...","hasSubjectPseudIdentifier,hasSubjectPseudIdent...","[('DrugPrescription', 'hasSubjectPseudIdentifi..."
1309,4,DrugPrescription,MedicalProcedure,DrugPrescription--hasSubjectPseudIdentifier-->...,"DrugPrescription,Patient,DrugPrescription,Pati...","hasSubjectPseudIdentifier,hasSubjectPseudIdent...","[('DrugPrescription', 'hasSubjectPseudIdentifi..."
1310,4,DrugPrescription,Diagnosis,DrugPrescription--hasSubjectPseudIdentifier-->...,"DrugPrescription,Patient,DrugPrescription,Pati...","hasSubjectPseudIdentifier,hasSubjectPseudIdent...","[('DrugPrescription', 'hasSubjectPseudIdentifi..."


In [50]:
star_df = create_star_queries_dataframe(star_queries)

In [51]:
star_df

Unnamed: 0,center_entity,num_arms,relations,target_entities,star_representation,arms_tuples
0,Diagnosis,2,"hasCode,hasCodingSystemAndVersion","Code,CodingSystem","Diagnosis--hasCode--> Code,Diagnosis--hasCodin...","[('hasCode', 'Code'), ('hasCodingSystemAndVers..."
1,Diagnosis,2,"hasCode,hasSubjectPseudIdentifier","Code,Patient","Diagnosis--hasCode--> Code,Diagnosis--hasSubje...","[('hasCode', 'Code'), ('hasSubjectPseudIdentif..."
2,Diagnosis,2,"hasCodingSystemAndVersion,hasSubjectPseudIdent...","CodingSystem,Patient",Diagnosis--hasCodingSystemAndVersion--> Coding...,"[('hasCodingSystemAndVersion', 'CodingSystem')..."
3,Diagnosis,3,"hasCode,hasCodingSystemAndVersion,hasSubjectPs...","Code,CodingSystem,Patient","Diagnosis--hasCode--> Code,Diagnosis--hasCodin...","[('hasCode', 'Code'), ('hasCodingSystemAndVers..."
4,CodingSystem,2,"hasCodingSystemAndVersion⁻¹,hasCodingSystemAnd...","Diagnosis,MedicalProcedure",CodingSystem--hasCodingSystemAndVersion⁻¹--> D...,"[('hasCodingSystemAndVersion⁻¹', 'Diagnosis'),..."
5,CodingSystem,2,"hasCodingSystemAndVersion⁻¹,hasCodingSystemAnd...","Diagnosis,Code",CodingSystem--hasCodingSystemAndVersion⁻¹--> D...,"[('hasCodingSystemAndVersion⁻¹', 'Diagnosis'),..."
6,CodingSystem,2,"hasCodingSystemAndVersion⁻¹,hasCodingSystemAnd...","MedicalProcedure,Code",CodingSystem--hasCodingSystemAndVersion⁻¹--> M...,"[('hasCodingSystemAndVersion⁻¹', 'MedicalProce..."
7,CodingSystem,3,"hasCodingSystemAndVersion⁻¹,hasCodingSystemAnd...","Diagnosis,MedicalProcedure,Code",CodingSystem--hasCodingSystemAndVersion⁻¹--> D...,"[('hasCodingSystemAndVersion⁻¹', 'Diagnosis'),..."
8,Patient,2,"hasSubjectPseudIdentifier⁻¹,hasSubjectPseudIde...","LabTestEvent,MedicalProcedure",Patient--hasSubjectPseudIdentifier⁻¹--> LabTes...,"[('hasSubjectPseudIdentifier⁻¹', 'LabTestEvent..."
9,Patient,2,"hasSubjectPseudIdentifier⁻¹,hasSubjectPseudIde...","LabTestEvent,Diagnosis",Patient--hasSubjectPseudIdentifier⁻¹--> LabTes...,"[('hasSubjectPseudIdentifier⁻¹', 'LabTestEvent..."


In [56]:
def generate_sparql_from_star_query(arms_tuples_str, center_variable="?center"):
    """
    Generate SPARQL query from star query arms tuples
    Args:
        arms_tuples_str: String representation of arms tuples from DataFrame
        center_variable: Variable name for the center entity (default: "?center")
    """
    import ast
    
    # Parse the string representation back to list of tuples
    try:
        arms = ast.literal_eval(arms_tuples_str)
    except:
        return "Error: Could not parse arms tuples"
    
    # Start building SPARQL query
    sparql_lines = ["SELECT DISTINCT * WHERE {"]
    
    # Add triple patterns for each arm
    for i, (relation, target_entity) in enumerate(arms):
        target_var = f"?{target_entity.lower()}{i+1}"
        
        # Handle inverse relations (those ending with ⁻¹)
        if relation.endswith('⁻¹'):
            # Inverse relation: target relates to center
            clean_relation = relation[:-1]  # Remove ⁻¹
            sparql_lines.append(f"  {target_var} <{clean_relation}> {center_variable} .")
        else:
            # Regular relation: center relates to target
            sparql_lines.append(f"  {center_variable} <{relation}> {target_var} .")
    
    sparql_lines.append("}")
    
    return "\n".join(sparql_lines)


def generate_sparql_from_chain_query(query_tuples_str):
    """
    Generate SPARQL query from chain query tuples
    Args:
        query_tuples_str: String representation of query tuples from DataFrame
    """
    import ast
    
    # Parse the string representation back to list of tuples
    try:
        chain = ast.literal_eval(query_tuples_str)
    except:
        return "Error: Could not parse query tuples"
    
    if not chain:
        return "Error: Empty chain query"
    
    # Start building SPARQL query
    sparql_lines = ["SELECT DISTINCT * WHERE {"]
    
    # Keep track of variables used
    variables = {}
    var_counter = 0
    
    # Process each hop in the chain
    for i, (source_entity, relation, target_entity) in enumerate(chain):
        # Get or create variable for source
        if source_entity not in variables:
            variables[source_entity] = f"?{source_entity.lower()}{var_counter}"
            var_counter += 1
        source_var = variables[source_entity]
        
        # Get or create variable for target
        if target_entity not in variables:
            variables[target_entity] = f"?{target_entity.lower()}{var_counter}"
            var_counter += 1
        target_var = variables[target_entity]
        
        # Handle inverse relations (those ending with ⁻¹)
        if relation.endswith('⁻¹'):
            # Inverse relation: target relates to source
            clean_relation = relation[:-1]  # Remove ⁻¹
            sparql_lines.append(f"  {target_var} <{clean_relation}> {source_var} .")
        else:
            # Regular relation: source relates to target
            sparql_lines.append(f"  {source_var} <{relation}> {target_var} .")
    
    sparql_lines.append("}")
    
    return "\n".join(sparql_lines)

In [59]:
generate_sparql_from_chain_query(chain_df['query_tuples'][900])

'SELECT DISTINCT * WHERE {\n  ?labresult0 <hasCode> ?code1 .\n  ?code1 <hasCodingSystemAndVersion> ?codingsystem2 .\n  ?diagnosis3 <hasCodingSystemAndVersion⁻> ?codingsystem2 .\n  ?diagnosis3 <hasCodingSystemAndVersion> ?codingsystem2 .\n}'