In [63]:
# old queries
query1 = f"""
SELECT DISTINCT ?class ?property (SAMPLE(?value) AS ?sampleValue)
    WHERE {{
        ?class a <{class_uri}> .
        {{
            SELECT DISTINCT ?instance ?property ?value
            WHERE {{
                ?instance a ?class .
                ?instance ?property ?value .
            }}
            LIMIT 10000
        }}
    }}
    GROUP BY ?class ?property
"""

query2= f"""

SELECT ?property (SAMPLE(COALESCE(?type, STR(DATATYPE(?value)), "Untyped")) AS ?valueType) WHERE {{
  ?instance a <{class_uri}> ;
            ?property ?value .
  OPTIONAL {{
    FILTER (!isLiteral(?value))
    ?value a ?type .
  }}
}} GROUP BY ?property ?type
LIMIT 100000
    """


In [81]:
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Graph, URIRef, Namespace, Literal
import rdflib
from tqdm import tqdm
import re

# Initialize the SPARQL endpoint
sparql = SPARQLWrapper('https://enpkg.commons-lab.org/graphdb/repositories/ENPKG')

def get_classes():
    # Query to get distinct classes
    sparql.setQuery("""
    SELECT DISTINCT ?class WHERE {
        ?class a rdfs:Class .
    }
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return [result["class"]["value"] for result in results["results"]["bindings"]]

def get_properties_and_sample_values(class_uri):
    # Query to get properties and values of instances of a class, limited to 100 instances
    sparql.setQuery(f"""
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    SELECT ?property (SAMPLE(COALESCE(?type, STR(DATATYPE(?value)), "Untyped")) AS ?valueType) WHERE {{
      # Subquery to select instances with a limit
      {{
        SELECT ?instance WHERE {{
          ?instance a <{class_uri}> .
        }} LIMIT 1000
      }}
      ?instance ?property ?value .
      OPTIONAL {{
        ?value a ?type .
      }}
      FILTER (?property != rdf:type && ?property != rdfs:label && ?property != rdfs:comment && ?property != rdfs:Class && ?property != foaf:depiction)  
    }}
    GROUP BY ?property ?type
    LIMIT 300
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    # return [(result["property"]["value"], result["valueType"]["value"]) for result in results["results"]["bindings"]]
    
    # Filter out properties with alphanumeric (hexadecimal) sequences after an underscore
    filtered_results = []
    for result in results["results"]["bindings"]:
        property_uri = result["property"]["value"]
        # Regex to match properties that do NOT have a hexadecimal sequence after an underscore
        if not re.search(r'_([0-9a-fA-F]+)', property_uri):
            filtered_results.append((property_uri, result["valueType"]["value"]))

    return filtered_results

g = Graph()
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")

classes = get_classes()
for class_uri in tqdm(classes, desc="processing classes"):
    print('processing', class_uri)
    class_ref = URIRef(class_uri)
    properties_and_values = get_properties_and_sample_values(class_uri)
    for property_uri, sample_value in properties_and_values:
        property_ref = URIRef(property_uri)
        sample_value_ref = URIRef(sample_value)
        g.add((class_ref, property_ref, sample_value_ref))



# 15 seconds on macbook pro 2

processing classes:   2%|▏         | 1/48 [00:00<00:06,  6.74it/s]

processing http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral
processing http://www.w3.org/2001/XMLSchema#nonNegativeInteger


processing classes:   6%|▋         | 3/48 [00:00<00:06,  6.46it/s]

processing http://www.w3.org/2001/XMLSchema#string
processing https://enpkg.commons-lab.org/kg/RawMaterial


processing classes:   8%|▊         | 4/48 [00:00<00:09,  4.70it/s]

processing https://enpkg.commons-lab.org/kg/LFpair


processing classes:  12%|█▎        | 6/48 [00:01<00:09,  4.47it/s]

processing https://enpkg.commons-lab.org/kg/WDChemical
processing https://enpkg.commons-lab.org/kg/WDTaxon


processing classes:  15%|█▍        | 7/48 [00:01<00:08,  4.69it/s]

processing https://enpkg.commons-lab.org/kg/InChIkey2D


processing classes:  17%|█▋        | 8/48 [00:01<00:09,  4.08it/s]

processing https://enpkg.commons-lab.org/kg/InChIkey


processing classes:  19%|█▉        | 9/48 [00:02<00:10,  3.75it/s]

processing https://enpkg.commons-lab.org/kg/LabExtract


processing classes:  21%|██        | 10/48 [00:02<00:11,  3.38it/s]

processing https://enpkg.commons-lab.org/kg/LCMSAnalysisPos


processing classes:  23%|██▎       | 11/48 [00:02<00:10,  3.40it/s]

processing https://enpkg.commons-lab.org/kg/LCMSFeatureList


processing classes:  25%|██▌       | 12/48 [00:07<01:00,  1.68s/it]

processing https://enpkg.commons-lab.org/kg/SiriusCanopusAnnotation


processing classes:  27%|██▋       | 13/48 [00:07<00:44,  1.27s/it]

processing https://enpkg.commons-lab.org/kg/LCMSFeature


processing classes:  29%|██▉       | 14/48 [00:08<00:35,  1.04s/it]

processing https://enpkg.commons-lab.org/kg/Spec2VecLoss


processing classes:  33%|███▎      | 16/48 [00:08<00:19,  1.63it/s]

processing https://enpkg.commons-lab.org/kg/NPCClass
processing https://enpkg.commons-lab.org/kg/NPCSuperclass


processing classes:  35%|███▌      | 17/48 [00:08<00:14,  2.09it/s]

processing https://enpkg.commons-lab.org/kg/Spec2VecPeak


processing classes:  38%|███▊      | 18/48 [00:09<00:12,  2.48it/s]

processing https://enpkg.commons-lab.org/kg/SiriusStructureAnnotation


processing classes:  40%|███▉      | 19/48 [00:09<00:10,  2.71it/s]

processing https://enpkg.commons-lab.org/kg/Spec2VecDoc


processing classes:  42%|████▏     | 20/48 [00:10<00:18,  1.50it/s]

processing https://enpkg.commons-lab.org/kg/IsdbAnnotation


processing classes:  46%|████▌     | 22/48 [00:11<00:11,  2.30it/s]

processing https://enpkg.commons-lab.org/kg/NPCPathway
processing https://enpkg.commons-lab.org/kg/LCMSAnalysisNeg


processing classes:  48%|████▊     | 23/48 [00:11<00:09,  2.59it/s]

processing https://enpkg.commons-lab.org/module/L610ugml


processing classes:  50%|█████     | 24/48 [00:11<00:08,  2.94it/s]

processing https://enpkg.commons-lab.org/module/Ldono10ugml


processing classes:  52%|█████▏    | 25/48 [00:12<00:07,  3.19it/s]

processing https://enpkg.commons-lab.org/module/Ldono2ugml


processing classes:  54%|█████▍    | 26/48 [00:12<00:06,  3.39it/s]

processing https://enpkg.commons-lab.org/module/Tbrucei10ugml


processing classes:  56%|█████▋    | 27/48 [00:12<00:05,  3.56it/s]

processing https://enpkg.commons-lab.org/module/Tbrucei2ugml


processing classes:  58%|█████▊    | 28/48 [00:12<00:05,  3.70it/s]

processing https://enpkg.commons-lab.org/module/Tcruzi10ugml


processing classes:  62%|██████▎   | 30/48 [00:13<00:04,  4.33it/s]

processing https://enpkg.commons-lab.org/module/ChEMBLTarget
processing https://enpkg.commons-lab.org/kg/LabBlank


processing classes:  67%|██████▋   | 32/48 [00:13<00:03,  5.06it/s]

processing https://enpkg.commons-lab.org/kg/LabQc
processing https://enpkg.commons-lab.org/module/ChEMBLAssayResults


processing classes:  71%|███████   | 34/48 [00:14<00:03,  4.52it/s]

processing https://enpkg.commons-lab.org/module/ChEMBLAssay
processing https://enpkg.commons-lab.org/module/ChEMBLChemical


processing classes:  75%|███████▌  | 36/48 [00:14<00:02,  4.63it/s]

processing https://enpkg.commons-lab.org/module/ChEMBLDocument
processing https://enpkg.commons-lab.org/kg/LabObject


processing classes:  77%|███████▋  | 37/48 [00:14<00:02,  4.19it/s]

processing https://enpkg.commons-lab.org/kg/BioAssayResults


processing classes:  79%|███████▉  | 38/48 [00:15<00:02,  4.20it/s]

processing https://enpkg.commons-lab.org/kg/MS2Spectrum


processing classes:  81%|████████▏ | 39/48 [00:15<00:02,  3.02it/s]

processing https://enpkg.commons-lab.org/kg/LCMSAnalysis


processing classes:  83%|████████▎ | 40/48 [00:15<00:02,  3.19it/s]

processing https://enpkg.commons-lab.org/kg/Annotation


processing classes:  88%|████████▊ | 42/48 [00:16<00:01,  3.75it/s]

processing https://enpkg.commons-lab.org/kg/GNPSAnnotation
processing https://enpkg.commons-lab.org/kg/SpectralPair


processing classes:  90%|████████▉ | 43/48 [00:16<00:01,  3.60it/s]

processing https://enpkg.commons-lab.org/kg/ChemicalEntity


processing classes:  94%|█████████▍| 45/48 [00:17<00:00,  3.83it/s]

processing https://enpkg.commons-lab.org/kg/ChemicalTaxonomy
processing https://enpkg.commons-lab.org/kg/XRef


processing classes:  96%|█████████▌| 46/48 [00:17<00:00,  4.08it/s]

processing https://enpkg.commons-lab.org/kg/spec2vec


processing classes:  98%|█████████▊| 47/48 [00:17<00:00,  3.67it/s]

processing https://enpkg.commons-lab.org/module/SwissTPHBioAssay


processing classes: 100%|██████████| 48/48 [00:17<00:00,  2.68it/s]


In [82]:
print(g.serialize(format='turtle'))

@prefix ns1: <https://enpkg.commons-lab.org/module/> .
@prefix ns2: <https://enpkg.commons-lab.org/kg/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

ns2:Annotation ns2:has_InChIkey2D ns2:InChIkey2D ;
    ns2:has_ionization xsd:string ;
    ns2:has_sirius_score xsd:float ;
    ns2:has_spectral_score xsd:float ;
    ns2:has_taxo_score xsd:float ;
    ns2:has_zodiac_score xsd:float .

ns2:BioAssayResults ns1:inhibition_percentage xsd:float ;
    ns1:target_id ns2:XRef,
        ns1:ChEMBLTarget .

ns2:IsdbAnnotation ns2:has_InChIkey2D ns2:InChIkey2D ;
    ns2:has_spectral_score xsd:float ;
    ns2:has_taxo_score xsd:float .

ns2:LCMSFeatureList ns2:has_ionization xsd:string .

ns2:LFpair ns2:has_member ns2:LCMSFeature,
        ns2:MS2Spectrum ;
    ns2:has_mn_params <Untyped> .

ns2:LabBlank ns2:has_LCMS ns2:LCMSAnalysis,
        ns2:LCMSAnalysisNeg .

ns2:LabQc ns2:has_LCMS ns2:LCMSAnalysis,
        ns2:LCMSAnalysisNeg .

ns2:RawMaterial ns2:has_LCMS xsd:string,
        ns2:WDTax