In [16]:
from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import Graph, URIRef, Namespace, Literal
import rdflib
from tqdm import tqdm
import re
from typing import List, Tuple
import tiktoken


In [49]:
## utils

def token_counter(text: str) -> int:
    tokenizer = tiktoken.encoding_for_model(model_name='gpt-4')
    tokens = tokenizer.encode(text)
    return len(tokens)

In [63]:
# old queries
query1 = f"""
SELECT DISTINCT ?class ?property (SAMPLE(?value) AS ?sampleValue)
    WHERE {{
        ?class a <{class_uri}> .
        {{
            SELECT DISTINCT ?instance ?property ?value
            WHERE {{
                ?instance a ?class .
                ?instance ?property ?value .
            }}
            LIMIT 10000
        }}
    }}
    GROUP BY ?class ?property
"""

query2= f"""

SELECT ?property (SAMPLE(COALESCE(?type, STR(DATATYPE(?value)), "Untyped")) AS ?valueType) WHERE {{
  ?instance a <{class_uri}> ;
            ?property ?value .
  OPTIONAL {{
    FILTER (!isLiteral(?value))
    ?value a ?type .
  }}
}} GROUP BY ?property ?type
LIMIT 100000
    """


In [15]:


# Initialize the SPARQL endpoint
sparql = SPARQLWrapper('https://enpkg.commons-lab.org/graphdb/repositories/ENPKG')

def get_classes():
    # Query to get distinct classes
    sparql.setQuery("""
    SELECT DISTINCT ?class WHERE {
        ?class a rdfs:Class .
    }
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return [result["class"]["value"] for result in results["results"]["bindings"]]

def get_properties_and_sample_values(class_uri):
    # Query to get properties and values of instances of a class, limited to 100 instances
    sparql.setQuery(f"""
    SELECT ?property (SAMPLE(COALESCE(?type, STR(DATATYPE(?value)), "Untyped")) AS ?valueType) WHERE {{
      {{
        SELECT ?instance WHERE {{
          ?instance a <{class_uri}> .
        }} LIMIT 1000
      }}
      ?instance ?property ?value .
      OPTIONAL {{
        ?value a ?type .
      }}
    }}
    GROUP BY ?property ?type
    LIMIT 300
    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    # return [(result["property"]["value"], result["valueType"]["value"]) for result in results["results"]["bindings"]]
    excluded_uris = [
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
            "http://www.w3.org/2000/01/rdf-schema#label",
            "http://www.w3.org/2000/01/rdf-schema#comment",
            "http://www.w3.org/2000/01/rdf-schema#Class",
            "http://xmlns.com/foaf/0.1/depiction"
        ]
    # Filter out properties with alphanumeric (hexadecimal) sequences after an underscore
    filtered_results = []
    for result in results["results"]["bindings"]:
        property_uri = result["property"]["value"]

        # Regex to match properties that do NOT have a hexadecimal sequence after an underscore
        if not re.search(r'_([0-9a-fA-F]+)', property_uri) and property_uri not in excluded_uris:
            filtered_results.append((property_uri, result["valueType"]["value"]))

    return filtered_results

g = Graph()
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")

classes = get_classes()
for class_uri in tqdm(classes, desc="processing classes"):
    print('processing', class_uri)
    class_ref = URIRef(class_uri)
    properties_and_values = get_properties_and_sample_values(class_uri)
    for property_uri, sample_value in properties_and_values:
        property_ref = URIRef(property_uri)
        sample_value_ref = URIRef(sample_value)
        g.add((class_ref, property_ref, sample_value_ref))


# 9.7 seconds with LIMIT = 100 -> incomplete schemas
# 15 seconds with LIMIT = 1000 
# 42 seconds with LIMIT = 10000

processing classes:   2%|▏         | 1/48 [00:00<00:04,  9.88it/s]

processing http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral
processing http://www.w3.org/2001/XMLSchema#nonNegativeInteger


processing classes:   4%|▍         | 2/48 [00:00<00:08,  5.42it/s]

processing http://www.w3.org/2001/XMLSchema#string
processing https://enpkg.commons-lab.org/kg/RawMaterial


processing classes:   8%|▊         | 4/48 [00:00<00:06,  6.59it/s]

processing https://enpkg.commons-lab.org/kg/LFpair


processing classes:  12%|█▎        | 6/48 [00:01<00:07,  5.84it/s]

processing https://enpkg.commons-lab.org/kg/WDChemical
processing https://enpkg.commons-lab.org/kg/WDTaxon


processing classes:  17%|█▋        | 8/48 [00:01<00:06,  6.19it/s]

processing https://enpkg.commons-lab.org/kg/InChIkey2D
processing https://enpkg.commons-lab.org/kg/InChIkey


processing classes:  21%|██        | 10/48 [00:01<00:06,  5.71it/s]

processing https://enpkg.commons-lab.org/kg/LabExtract
processing https://enpkg.commons-lab.org/kg/LCMSAnalysisPos


processing classes:  23%|██▎       | 11/48 [00:01<00:06,  5.81it/s]

processing https://enpkg.commons-lab.org/kg/LCMSFeatureList


processing classes:  27%|██▋       | 13/48 [00:04<00:21,  1.61it/s]

processing https://enpkg.commons-lab.org/kg/SiriusCanopusAnnotation
processing https://enpkg.commons-lab.org/kg/LCMSFeature


processing classes:  31%|███▏      | 15/48 [00:04<00:13,  2.47it/s]

processing https://enpkg.commons-lab.org/kg/Spec2VecLoss
processing https://enpkg.commons-lab.org/kg/NPCClass


processing classes:  33%|███▎      | 16/48 [00:04<00:10,  3.15it/s]

processing https://enpkg.commons-lab.org/kg/NPCSuperclass


processing classes:  38%|███▊      | 18/48 [00:06<00:12,  2.36it/s]

processing https://enpkg.commons-lab.org/kg/Spec2VecPeak
processing https://enpkg.commons-lab.org/kg/SiriusStructureAnnotation


processing classes:  40%|███▉      | 19/48 [00:06<00:10,  2.86it/s]

processing https://enpkg.commons-lab.org/kg/Spec2VecDoc


processing classes:  44%|████▍     | 21/48 [00:07<00:10,  2.66it/s]

processing https://enpkg.commons-lab.org/kg/IsdbAnnotation
processing https://enpkg.commons-lab.org/kg/NPCPathway


processing classes:  48%|████▊     | 23/48 [00:07<00:06,  3.84it/s]

processing https://enpkg.commons-lab.org/kg/LCMSAnalysisNeg
processing https://enpkg.commons-lab.org/module/L610ugml


processing classes:  52%|█████▏    | 25/48 [00:07<00:04,  4.76it/s]

processing https://enpkg.commons-lab.org/module/Ldono10ugml
processing https://enpkg.commons-lab.org/module/Ldono2ugml


processing classes:  56%|█████▋    | 27/48 [00:08<00:07,  2.64it/s]

processing https://enpkg.commons-lab.org/module/Tbrucei10ugml
processing https://enpkg.commons-lab.org/module/Tbrucei2ugml


processing classes:  60%|██████    | 29/48 [00:09<00:06,  3.15it/s]

processing https://enpkg.commons-lab.org/module/Tcruzi10ugml
processing https://enpkg.commons-lab.org/module/ChEMBLTarget


processing classes:  65%|██████▍   | 31/48 [00:09<00:03,  4.70it/s]

processing https://enpkg.commons-lab.org/kg/LabBlank
processing https://enpkg.commons-lab.org/kg/LabQc
processing https://enpkg.commons-lab.org/module/ChEMBLAssayResults


processing classes:  71%|███████   | 34/48 [00:10<00:02,  5.85it/s]

processing https://enpkg.commons-lab.org/module/ChEMBLAssay
processing https://enpkg.commons-lab.org/module/ChEMBLChemical


processing classes:  75%|███████▌  | 36/48 [00:12<00:06,  1.96it/s]

processing https://enpkg.commons-lab.org/module/ChEMBLDocument
processing https://enpkg.commons-lab.org/kg/LabObject


processing classes:  77%|███████▋  | 37/48 [00:12<00:04,  2.38it/s]

processing https://enpkg.commons-lab.org/kg/BioAssayResults


processing classes:  79%|███████▉  | 38/48 [00:13<00:06,  1.60it/s]

processing https://enpkg.commons-lab.org/kg/MS2Spectrum


processing classes:  83%|████████▎ | 40/48 [00:14<00:03,  2.36it/s]

processing https://enpkg.commons-lab.org/kg/LCMSAnalysis
processing https://enpkg.commons-lab.org/kg/Annotation


processing classes:  85%|████████▌ | 41/48 [00:14<00:02,  2.79it/s]

processing https://enpkg.commons-lab.org/kg/GNPSAnnotation


processing classes:  90%|████████▉ | 43/48 [00:14<00:01,  3.58it/s]

processing https://enpkg.commons-lab.org/kg/SpectralPair
processing https://enpkg.commons-lab.org/kg/ChemicalEntity


processing classes:  94%|█████████▍| 45/48 [00:16<00:01,  2.40it/s]

processing https://enpkg.commons-lab.org/kg/ChemicalTaxonomy
processing https://enpkg.commons-lab.org/kg/XRef


processing classes:  98%|█████████▊| 47/48 [00:16<00:00,  3.50it/s]

processing https://enpkg.commons-lab.org/kg/spec2vec
processing https://enpkg.commons-lab.org/module/SwissTPHBioAssay


processing classes: 100%|██████████| 48/48 [00:16<00:00,  2.91it/s]


In [16]:
print(g.serialize(format='turtle'))

@prefix ns1: <https://enpkg.commons-lab.org/kg/> .
@prefix ns2: <https://enpkg.commons-lab.org/module/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

ns1:Annotation ns1:has_InChIkey2D ns1:InChIkey2D ;
    ns1:has_ionization xsd:string ;
    ns1:has_sirius_score xsd:float ;
    ns1:has_spectral_score xsd:float ;
    ns1:has_taxo_score xsd:float ;
    ns1:has_zodiac_score xsd:float .

ns1:BioAssayResults ns2:inhibition_percentage xsd:float ;
    ns2:target_id ns1:XRef,
        ns2:ChEMBLTarget .

ns1:IsdbAnnotation ns1:has_InChIkey2D ns1:InChIkey2D ;
    ns1:has_spectral_score xsd:float ;
    ns1:has_taxo_score xsd:float .

ns1:LCMSFeatureList ns1:has_ionization xsd:string .

ns1:LFpair ns1:has_member ns1:LCMSFeature,
        ns1:MS2Spectrum ;
    ns1:has_mn_params <Untyped> .

ns1:LabBlank ns1:has_LCMS ns1:LCMSAnalysis,
        ns1:LCMSAnalysisNeg .

ns1:LabQc ns1:has_LCMS ns1:LCMSAnalysis,
        ns1:LCMSAnalysisNeg .

ns1:RawMaterial ns1:has_LCMS xsd:string,
        ns1:WDTax

In [50]:

print(f"Number of tokens: {token_counter(g.serialize(format='turtle'))}")


Number of tokens: 1930


## RDFLIB

### init

In [1]:
import rdflib
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID as default
from rdflib.plugins.stores import sparqlstore

In [2]:
_store = sparqlstore.SPARQLStore()
_store.open('https://enpkg.commons-lab.org/graphdb/repositories/ENPKG')
graph = rdflib.Graph(_store, bind_namespaces="none")

### Get Classes, Com, Example

In [60]:
from rdflib.exceptions import ParserError
from rdflib.query import ResultRow

query = """
SELECT DISTINCT ?cls ?com (SAMPLE(?instance) AS ?example)
WHERE {
    ?cls a rdfs:Class . 
    OPTIONAL { ?cls rdfs:comment ?com }
    OPTIONAL { ?instance a ?cls }
}
GROUP BY ?cls ?com
"""
res = graph.query(query_object=query)
res = [r for r in res if isinstance(r, ResultRow)]

In [7]:
res

[(rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral'),
  None,
  None),
 (rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#nonNegativeInteger'),
  None,
  None),
 (rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'), None, None),
 (rdflib.term.URIRef('https://enpkg.commons-lab.org/kg/RawMaterial'),
  rdflib.term.Literal('A raw laboratory biological material, i.e. before extraction'),
  rdflib.term.URIRef('https://enpkg.commons-lab.org/kg/walth_ind_aerparts')),
 (rdflib.term.URIRef('https://enpkg.commons-lab.org/kg/LFpair'),
  rdflib.term.Literal('A pair of 2 LCMSFeature'),
  rdflib.term.URIRef('https://enpkg.commons-lab.org/kg/lcms_feature_pair_mzspec:MSV000088521:SC_AP_Wi_DCM_features_ms2_pos.mgf:scan:9_mzspec:MSV000088521:SC_AP_Wi_DCM_features_ms2_pos.mgf:scan:59')),
 (rdflib.term.URIRef('https://enpkg.commons-lab.org/kg/WDChemical'),
  rdflib.term.Literal('Cross-reference to a chemical entity in Wikidata'),
  rdflib.term.URIRef('http://www.wikid

In [9]:
def _get_local_name(iri: str) -> str:
    if "#" in iri:
        local_name = iri.split("#")[-1]
    elif "/" in iri:
        local_name = iri.split("/")[-1]
    else:
        raise ValueError(f"Unexpected IRI '{iri}', contains neither '#' nor '/'.")
    return local_name

def _res_to_str(res: rdflib.query.ResultRow, var: str) -> str:
    return (
        "<"
        + str(res[var])
        + "> ("
        + _get_local_name(res[var])
        + ", "
        + str(res["com"])
        + ", "
        + str(res["example"])
        + ")"
    )
    
", ".join([_res_to_str(r, "cls") for r in res])

'<http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral> (XMLLiteral, None, None), <http://www.w3.org/2001/XMLSchema#nonNegativeInteger> (nonNegativeInteger, None, None), <http://www.w3.org/2001/XMLSchema#string> (string, None, None), <https://enpkg.commons-lab.org/kg/RawMaterial> (RawMaterial, A raw laboratory biological material, i.e. before extraction, https://enpkg.commons-lab.org/kg/walth_ind_aerparts), <https://enpkg.commons-lab.org/kg/LFpair> (LFpair, A pair of 2 LCMSFeature, https://enpkg.commons-lab.org/kg/lcms_feature_pair_mzspec:MSV000088521:SC_AP_Wi_DCM_features_ms2_pos.mgf:scan:9_mzspec:MSV000088521:SC_AP_Wi_DCM_features_ms2_pos.mgf:scan:59), <https://enpkg.commons-lab.org/kg/WDChemical> (WDChemical, Cross-reference to a chemical entity in Wikidata, http://www.wikidata.org/entity/Q94518604), <https://enpkg.commons-lab.org/kg/WDTaxon> (WDTaxon, Cross-reference to a taxon in Wikidata, http://www.wikidata.org/entity/Q10878263), <https://enpkg.commons-lab.org/kg/InChIkey2D> (I

#### how to pass ResultRow value to the prop/value query

In [13]:
query2 = """
    SELECT ?property (SAMPLE(COALESCE(?type, STR(DATATYPE(?value)), "Untyped")) AS ?valueType) WHERE {{
      {{
        SELECT ?instance WHERE {{
          ?instance a <{class_uri}> .
        }} LIMIT 1000
      }}
      ?instance ?property ?value .
      OPTIONAL {{
        ?value a ?type .
      }}
    }}
    GROUP BY ?property ?type
    LIMIT 300
    """
    
query2 = query2.format(class_uri=res[7]["cls"])
print(query2)


    SELECT ?property (SAMPLE(COALESCE(?type, STR(DATATYPE(?value)), "Untyped")) AS ?valueType) WHERE {
      {
        SELECT ?instance WHERE {
          ?instance a <https://enpkg.commons-lab.org/kg/InChIkey2D> .
        } LIMIT 1000
      }
      ?instance ?property ?value .
      OPTIONAL {
        ?value a ?type .
      }
    }
    GROUP BY ?property ?type
    LIMIT 300
    


## function to automate query2 

In [15]:


def get_prop_and_val_types(class_uri: str) -> List[Tuple[str, str, str]]:
    query = """
    SELECT ?property (SAMPLE(COALESCE(?type, STR(DATATYPE(?value)), "Untyped")) AS ?valueType) WHERE {{
      {{
        SELECT ?instance WHERE {{
          ?instance a <{class_uri}> .
        }} LIMIT 1000
      }}
      ?instance ?property ?value .
      OPTIONAL {{
        ?value a ?type .
      }}
    }}
    GROUP BY ?property ?type
    LIMIT 300
    """
    query = query.format(class_uri=class_uri)
    res = graph.query(query_object=query)
    res = [r for r in res if isinstance(r, ResultRow)]
    res = [(str(r["property"]), str(r["valueType"])) for r in res]
    excluded_uris = [
        "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
        "http://www.w3.org/2000/01/rdf-schema#label",
        "http://www.w3.org/2000/01/rdf-schema#comment",
        "http://www.w3.org/2000/01/rdf-schema#Class",
        "http://xmlns.com/foaf/0.1/depiction"
    ]
    # Filter out properties with alphanumeric (hexadecimal) sequences after an underscore
    filtered_results = []
    for property_uri, value_type in res:
        # Regex to match properties that do NOT have a hexadecimal sequence after an underscore
        if not re.search(r'_([0-9a-fA-F]+)', property_uri) and property_uri not in excluded_uris:
            filtered_results.append((property_uri, value_type))
            
    return filtered_results
  
for r in tqdm(res):
    print(r["cls"])
    print(get_prop_and_val_types(r["cls"]))

  2%|▏         | 1/48 [00:00<00:07,  6.28it/s]

http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral
[]
http://www.w3.org/2001/XMLSchema#nonNegativeInteger


  6%|▋         | 3/48 [00:00<00:06,  6.55it/s]

[]
http://www.w3.org/2001/XMLSchema#string
[]
https://enpkg.commons-lab.org/kg/RawMaterial


  8%|▊         | 4/48 [00:00<00:08,  5.17it/s]

[('https://enpkg.commons-lab.org/kg/has_lab_process', 'https://enpkg.commons-lab.org/kg/LabExtract'), ('https://enpkg.commons-lab.org/kg/has_lab_process', 'https://enpkg.commons-lab.org/kg/LabObject'), ('https://enpkg.commons-lab.org/kg/submitted_taxon', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/WDTaxon'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/XRef'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_wd_id', 'https://enpkg.commons-lab.org/kg/WDTaxon'), ('https://enpkg.commons-lab.org/kg/has_wd_id', 'https://enpkg.commons-lab.org/kg/XRef'), ('https://enpkg.commons-lab.org/kg/has_unresolved_taxon', 'Untyped'), ('https://enpkg.commons-lab.org/module/has_organe', 'Untyped'), ('https://enpkg.commons-lab.org/module/has_subsystem', 'Untyped'), ('https://enpkg.commons-lab.org/module/has_tissue'

 12%|█▎        | 6/48 [00:01<00:08,  4.93it/s]

[('https://enpkg.commons-lab.org/kg/has_mn_params', 'Untyped'), ('https://enpkg.commons-lab.org/kg/has_member', 'https://enpkg.commons-lab.org/kg/LCMSFeature'), ('https://enpkg.commons-lab.org/kg/has_member', 'https://enpkg.commons-lab.org/kg/MS2Spectrum')]
https://enpkg.commons-lab.org/kg/WDChemical
[]
https://enpkg.commons-lab.org/kg/WDTaxon


 15%|█▍        | 7/48 [00:01<00:08,  4.99it/s]

[]
https://enpkg.commons-lab.org/kg/InChIkey2D


 17%|█▋        | 8/48 [00:01<00:08,  4.60it/s]

[('https://enpkg.commons-lab.org/kg/has_npc_pathway', 'https://enpkg.commons-lab.org/kg/NPCPathway'), ('https://enpkg.commons-lab.org/kg/has_npc_pathway', 'https://enpkg.commons-lab.org/kg/ChemicalTaxonomy'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/NPCClass'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/NPCSuperclass'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/NPCPathway'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/ChemicalTaxonomy'), ('https://enpkg.commons-lab.org/kg/has_smiles', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/is_InChIkey2D_of', 'https://enpkg.commons-lab.org/kg/InChIkey'), ('https://enpkg.commons-lab.org/kg/is_InChIkey2D_of', 'https://enpkg.commons-lab.org/kg/ChemicalEntity'), ('https://enpkg.commons-lab.org/kg/has_npc_pathway', 'https://enpkg.co

 19%|█▉        | 9/48 [00:01<00:08,  4.36it/s]

[('https://enpkg.commons-lab.org/kg/has_wd_id', 'https://enpkg.commons-lab.org/kg/WDChemical'), ('https://enpkg.commons-lab.org/kg/has_wd_id', 'https://enpkg.commons-lab.org/kg/XRef'), ('https://enpkg.commons-lab.org/kg/has_npc_pathway', 'https://enpkg.commons-lab.org/kg/NPCPathway'), ('https://enpkg.commons-lab.org/kg/has_npc_pathway', 'https://enpkg.commons-lab.org/kg/ChemicalTaxonomy'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/NPCClass'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/NPCSuperclass'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/NPCPathway'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/ChemicalTaxonomy'), ('https://enpkg.commons-lab.org/kg/has_smiles', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_npc_pathway', 'https://enpkg.commons-lab.org/kg/NPCCl

 21%|██        | 10/48 [00:02<00:09,  4.02it/s]

[('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/LCMSAnalysisPos'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/LCMSAnalysis'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/LCMSAnalysisNeg')]
https://enpkg.commons-lab.org/kg/LCMSAnalysisPos


 23%|██▎       | 11/48 [00:02<00:09,  4.03it/s]

[('https://enpkg.commons-lab.org/kg/has_gnpslcms_link', 'Untyped'), ('https://enpkg.commons-lab.org/kg/has_massive_license', 'Untyped')]
https://enpkg.commons-lab.org/kg/LCMSFeatureList


 25%|██▌       | 12/48 [00:04<00:34,  1.05it/s]

[('https://enpkg.commons-lab.org/kg/has_ionization', 'http://www.w3.org/2001/XMLSchema#string')]
https://enpkg.commons-lab.org/kg/SiriusCanopusAnnotation


 27%|██▋       | 13/48 [00:05<00:25,  1.35it/s]

[]
https://enpkg.commons-lab.org/kg/LCMSFeature


 29%|██▉       | 14/48 [00:05<00:21,  1.60it/s]

[('https://enpkg.commons-lab.org/kg/has_ionization', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_parent_mass', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/kg/has_raw_spectrum', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_retention_time', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/kg/has_row_id', 'http://www.w3.org/2001/XMLSchema#decimal'), ('https://enpkg.commons-lab.org/kg/has_usi', 'http://www.w3.org/2001/XMLSchema#string')]
https://enpkg.commons-lab.org/kg/Spec2VecLoss


 31%|███▏      | 15/48 [00:05<00:16,  2.00it/s]

[('https://enpkg.commons-lab.org/kg/has_value', 'http://www.w3.org/2001/XMLSchema#float')]
https://enpkg.commons-lab.org/kg/NPCClass
[]


 35%|███▌      | 17/48 [00:06<00:10,  2.95it/s]

https://enpkg.commons-lab.org/kg/NPCSuperclass
[]
https://enpkg.commons-lab.org/kg/Spec2VecPeak


 38%|███▊      | 18/48 [00:06<00:09,  3.32it/s]

[('https://enpkg.commons-lab.org/kg/has_value', 'http://www.w3.org/2001/XMLSchema#float')]
https://enpkg.commons-lab.org/kg/SiriusStructureAnnotation


 40%|███▉      | 19/48 [00:06<00:08,  3.52it/s]

[('https://enpkg.commons-lab.org/kg/has_ionization', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_InChIkey2D', 'https://enpkg.commons-lab.org/kg/InChIkey2D'), ('https://enpkg.commons-lab.org/kg/has_sirius_score', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/kg/has_zodiac_score', 'http://www.w3.org/2001/XMLSchema#float')]
https://enpkg.commons-lab.org/kg/Spec2VecDoc


 42%|████▏     | 20/48 [00:07<00:12,  2.28it/s]

[('https://enpkg.commons-lab.org/kg/has_spec2vec_loss', 'https://enpkg.commons-lab.org/kg/Spec2VecLoss'), ('https://enpkg.commons-lab.org/kg/has_spec2vec_loss', 'https://enpkg.commons-lab.org/kg/spec2vec'), ('https://enpkg.commons-lab.org/kg/has_spec2vec_peak', 'https://enpkg.commons-lab.org/kg/Spec2VecPeak'), ('https://enpkg.commons-lab.org/kg/has_spec2vec_peak', 'https://enpkg.commons-lab.org/kg/spec2vec')]
https://enpkg.commons-lab.org/kg/IsdbAnnotation


 46%|████▌     | 22/48 [00:07<00:08,  3.17it/s]

[('https://enpkg.commons-lab.org/kg/has_InChIkey2D', 'https://enpkg.commons-lab.org/kg/InChIkey2D'), ('https://enpkg.commons-lab.org/kg/has_spectral_score', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/kg/has_taxo_score', 'http://www.w3.org/2001/XMLSchema#float')]
https://enpkg.commons-lab.org/kg/NPCPathway
[]
https://enpkg.commons-lab.org/kg/LCMSAnalysisNeg


 48%|████▊     | 23/48 [00:08<00:07,  3.42it/s]

[('https://enpkg.commons-lab.org/kg/has_gnpslcms_link', 'Untyped'), ('https://enpkg.commons-lab.org/kg/has_massive_license', 'Untyped')]
https://enpkg.commons-lab.org/module/L610ugml


 50%|█████     | 24/48 [00:08<00:06,  3.65it/s]

[('https://enpkg.commons-lab.org/module/inhibition_percentage', 'http://www.w3.org/2001/XMLSchema#float')]
https://enpkg.commons-lab.org/module/Ldono10ugml


 52%|█████▏    | 25/48 [00:08<00:06,  3.80it/s]

[('https://enpkg.commons-lab.org/module/inhibition_percentage', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/module/ChEMBLTarget'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/kg/XRef')]
https://enpkg.commons-lab.org/module/Ldono2ugml


 54%|█████▍    | 26/48 [00:08<00:05,  3.98it/s]

[('https://enpkg.commons-lab.org/module/inhibition_percentage', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/module/ChEMBLTarget'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/kg/XRef')]
https://enpkg.commons-lab.org/module/Tbrucei10ugml


 56%|█████▋    | 27/48 [00:08<00:05,  4.11it/s]

[('https://enpkg.commons-lab.org/module/inhibition_percentage', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/module/ChEMBLTarget'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/kg/XRef')]
https://enpkg.commons-lab.org/module/Tbrucei2ugml


 58%|█████▊    | 28/48 [00:09<00:04,  4.17it/s]

[('https://enpkg.commons-lab.org/module/inhibition_percentage', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/module/ChEMBLTarget'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/kg/XRef')]
https://enpkg.commons-lab.org/module/Tcruzi10ugml


 62%|██████▎   | 30/48 [00:09<00:03,  4.62it/s]

[('https://enpkg.commons-lab.org/module/inhibition_percentage', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/module/ChEMBLTarget'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/kg/XRef')]
https://enpkg.commons-lab.org/module/ChEMBLTarget
[('https://enpkg.commons-lab.org/module/target_name', 'http://www.w3.org/2001/XMLSchema#string')]
https://enpkg.commons-lab.org/kg/LabBlank


 67%|██████▋   | 32/48 [00:09<00:03,  5.17it/s]

[('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/LCMSAnalysisNeg'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/LCMSAnalysis')]
https://enpkg.commons-lab.org/kg/LabQc
[('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/LCMSAnalysisNeg'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/LCMSAnalysis')]
https://enpkg.commons-lab.org/module/ChEMBLAssayResults


 71%|███████   | 34/48 [00:10<00:02,  4.75it/s]

[('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/module/ChEMBLTarget'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/kg/XRef'), ('https://enpkg.commons-lab.org/module/activity_relation', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/module/activity_type', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/module/activity_unit', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/module/activity_value', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/module/assay_id', 'https://enpkg.commons-lab.org/module/ChEMBLAssay'), ('https://enpkg.commons-lab.org/module/assay_id', 'https://enpkg.commons-lab.org/kg/XRef'), ('https://enpkg.commons-lab.org/module/target_name', 'http://www.w3.org/2001/XMLSchema#string')]
https://enpkg.commons-lab.org/module/ChEMBLAssay
[]
https://enpkg.commons-lab.org/module/ChEMBLChemical


 75%|███████▌  | 36/48 [00:10<00:02,  4.91it/s]

[]
https://enpkg.commons-lab.org/module/ChEMBLDocument
[('https://enpkg.commons-lab.org/module/journal_name', 'http://www.w3.org/2001/XMLSchema#string')]
https://enpkg.commons-lab.org/kg/LabObject


 77%|███████▋  | 37/48 [00:11<00:02,  4.61it/s]

[('https://enpkg.commons-lab.org/kg/has_lab_process', 'https://enpkg.commons-lab.org/kg/LabExtract'), ('https://enpkg.commons-lab.org/kg/has_lab_process', 'https://enpkg.commons-lab.org/kg/LabObject'), ('https://enpkg.commons-lab.org/kg/submitted_taxon', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/WDTaxon'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/XRef'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_wd_id', 'https://enpkg.commons-lab.org/kg/WDTaxon'), ('https://enpkg.commons-lab.org/kg/has_wd_id', 'https://enpkg.commons-lab.org/kg/XRef'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/LCMSAnalysisPos'), ('https://enpkg.commons-lab.org/kg/has_LCMS', 'https://enpkg.commons-lab.org/kg/LCMSAnalysis'), ('https://enpkg.commons-lab.org/kg/has_unresolved_taxon'

 79%|███████▉  | 38/48 [00:11<00:02,  4.60it/s]

[('https://enpkg.commons-lab.org/module/inhibition_percentage', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/module/ChEMBLTarget'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/kg/XRef')]
https://enpkg.commons-lab.org/kg/MS2Spectrum


 81%|████████▏ | 39/48 [00:11<00:02,  3.94it/s]

[('https://enpkg.commons-lab.org/kg/has_ionization', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_parent_mass', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/kg/has_raw_spectrum', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_retention_time', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/kg/has_row_id', 'http://www.w3.org/2001/XMLSchema#decimal'), ('https://enpkg.commons-lab.org/kg/has_usi', 'http://www.w3.org/2001/XMLSchema#string')]
https://enpkg.commons-lab.org/kg/LCMSAnalysis


 83%|████████▎ | 40/48 [00:11<00:02,  3.98it/s]

[('https://enpkg.commons-lab.org/kg/has_gnpslcms_link', 'Untyped'), ('https://enpkg.commons-lab.org/kg/has_massive_license', 'Untyped')]
https://enpkg.commons-lab.org/kg/Annotation


 88%|████████▊ | 42/48 [00:12<00:01,  4.56it/s]

[('https://enpkg.commons-lab.org/kg/has_InChIkey2D', 'https://enpkg.commons-lab.org/kg/InChIkey2D'), ('https://enpkg.commons-lab.org/kg/has_spectral_score', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/kg/has_taxo_score', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/kg/has_ionization', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_sirius_score', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/kg/has_zodiac_score', 'http://www.w3.org/2001/XMLSchema#float')]
https://enpkg.commons-lab.org/kg/GNPSAnnotation
[]
https://enpkg.commons-lab.org/kg/SpectralPair


 90%|████████▉ | 43/48 [00:12<00:01,  4.45it/s]

[('https://enpkg.commons-lab.org/kg/has_mn_params', 'Untyped'), ('https://enpkg.commons-lab.org/kg/has_member', 'https://enpkg.commons-lab.org/kg/LCMSFeature'), ('https://enpkg.commons-lab.org/kg/has_member', 'https://enpkg.commons-lab.org/kg/MS2Spectrum')]
https://enpkg.commons-lab.org/kg/ChemicalEntity


 92%|█████████▏| 44/48 [00:12<00:00,  4.36it/s]

[('https://enpkg.commons-lab.org/kg/has_wd_id', 'https://enpkg.commons-lab.org/kg/WDChemical'), ('https://enpkg.commons-lab.org/kg/has_wd_id', 'https://enpkg.commons-lab.org/kg/XRef'), ('https://enpkg.commons-lab.org/kg/has_npc_pathway', 'https://enpkg.commons-lab.org/kg/NPCPathway'), ('https://enpkg.commons-lab.org/kg/has_npc_pathway', 'https://enpkg.commons-lab.org/kg/ChemicalTaxonomy'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/NPCClass'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/NPCSuperclass'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/NPCPathway'), ('https://enpkg.commons-lab.org/kg/has_npc_superclass', 'https://enpkg.commons-lab.org/kg/ChemicalTaxonomy'), ('https://enpkg.commons-lab.org/kg/has_smiles', 'http://www.w3.org/2001/XMLSchema#string'), ('https://enpkg.commons-lab.org/kg/has_npc_pathway', 'https://enpkg.commons-lab.org/kg/NPCCl

 94%|█████████▍| 45/48 [00:13<00:00,  4.14it/s]

[]
https://enpkg.commons-lab.org/kg/XRef


 96%|█████████▌| 46/48 [00:13<00:00,  3.69it/s]

[]
https://enpkg.commons-lab.org/kg/spec2vec


 98%|█████████▊| 47/48 [00:13<00:00,  3.70it/s]

[('https://enpkg.commons-lab.org/kg/has_spec2vec_loss', 'https://enpkg.commons-lab.org/kg/Spec2VecLoss'), ('https://enpkg.commons-lab.org/kg/has_spec2vec_loss', 'https://enpkg.commons-lab.org/kg/spec2vec'), ('https://enpkg.commons-lab.org/kg/has_spec2vec_peak', 'https://enpkg.commons-lab.org/kg/Spec2VecPeak'), ('https://enpkg.commons-lab.org/kg/has_spec2vec_peak', 'https://enpkg.commons-lab.org/kg/spec2vec'), ('https://enpkg.commons-lab.org/kg/has_value', 'http://www.w3.org/2001/XMLSchema#float')]
https://enpkg.commons-lab.org/module/SwissTPHBioAssay


100%|██████████| 48/48 [00:13<00:00,  3.47it/s]

[('https://enpkg.commons-lab.org/module/inhibition_percentage', 'http://www.w3.org/2001/XMLSchema#float'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/module/ChEMBLTarget'), ('https://enpkg.commons-lab.org/module/target_id', 'https://enpkg.commons-lab.org/kg/XRef')]





## constructing ttl schemas

In [81]:
g = Graph()

for r in res:
    class_ref = URIRef(str(r["cls"]))
    properties_and_values = get_prop_and_val_types(str(r["cls"]))
    for property_uri, sample_value in properties_and_values:
        property_ref = URIRef(property_uri)
        sample_value_ref = URIRef(sample_value)
        g.add((class_ref, property_ref, sample_value_ref))

In [21]:
print(g.serialize(format='turtle'))

@prefix ns1: <https://enpkg.commons-lab.org/kg/> .
@prefix ns2: <https://enpkg.commons-lab.org/module/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

ns1:Annotation ns1:has_InChIkey2D ns1:InChIkey2D ;
    ns1:has_ionization xsd:string ;
    ns1:has_sirius_score xsd:float ;
    ns1:has_spectral_score xsd:float ;
    ns1:has_taxo_score xsd:float ;
    ns1:has_zodiac_score xsd:float .

ns1:BioAssayResults ns2:inhibition_percentage xsd:float ;
    ns2:target_id ns1:XRef,
        ns2:ChEMBLTarget .

ns1:IsdbAnnotation ns1:has_InChIkey2D ns1:InChIkey2D ;
    ns1:has_spectral_score xsd:float ;
    ns1:has_taxo_score xsd:float .

ns1:LCMSFeatureList ns1:has_ionization xsd:string .

ns1:LFpair ns1:has_member ns1:LCMSFeature,
        ns1:MS2Spectrum ;
    ns1:has_mn_params <Untyped> .

ns1:LabBlank ns1:has_LCMS ns1:LCMSAnalysis,
        ns1:LCMSAnalysisNeg .

ns1:LabQc ns1:has_LCMS ns1:LCMSAnalysis,
        ns1:LCMSAnalysisNeg .

ns1:RawMaterial ns1:has_LCMS xsd:string,
        ns1:WDTax

### merging Classes and turtle schemas

In [43]:
schemas = f"In the following, each IRI is followed by the local name and optionally its description and optionally an example. \n" + \
f"The RDF graph supports the following node types:\n" + \
f'{", ".join([_res_to_str(r, "cls") for r in res])}\n'+ \
f"The RDF graph supports the following relationships:\n" + \
f"{g.serialize(format='turtle')}"

In [39]:
print('final schema \n', schemas)

final schema 
 In the following, each IRI is followed by the local name and optionally its description and optionally an example. 
The RDF graph supports the following node types:
<http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral> (XMLLiteral, None, None), <http://www.w3.org/2001/XMLSchema#nonNegativeInteger> (nonNegativeInteger, None, None), <http://www.w3.org/2001/XMLSchema#string> (string, None, None), <https://enpkg.commons-lab.org/kg/RawMaterial> (RawMaterial, A raw laboratory biological material, i.e. before extraction, https://enpkg.commons-lab.org/kg/walth_ind_aerparts), <https://enpkg.commons-lab.org/kg/LFpair> (LFpair, A pair of 2 LCMSFeature, https://enpkg.commons-lab.org/kg/lcms_feature_pair_mzspec:MSV000088521:SC_AP_Wi_DCM_features_ms2_pos.mgf:scan:9_mzspec:MSV000088521:SC_AP_Wi_DCM_features_ms2_pos.mgf:scan:59), <https://enpkg.commons-lab.org/kg/WDChemical> (WDChemical, Cross-reference to a chemical entity in Wikidata, http://www.wikidata.org/entity/Q94518604), <http

In [48]:

print(f"Number of tokens: {token_counter(schemas)}")


Number of tokens: 4684


## reduce number of tokens

In [77]:
def _get_local_name(iri: str) -> str:
    for sep in ['#', '/']:
        prefix, found, local_name = iri.rpartition(sep)
        if found:
            return local_name
    raise ValueError(f"Unexpected IRI '{iri}', contains neither '#' nor '/'.")

def _format_text(text, namespaces):
    if text is None:
        return text
    for key, url in namespaces:
        text = text.replace(url, f"{key}:")
    return text

def _res_to_str(res, var, namespaces) -> str:
    formatted_var = _format_text(str(res[var]), namespaces)
    local_name = _get_local_name(res[var])
    formatted_example = _format_text(str(res["example"]), namespaces)
    return f"<{formatted_var}> ({local_name}, {res['com']}, {formatted_example})"

# Assuming 'res' is a list of rdflib.query.ResultRow objects and 'g' has a namespaces() method:
namespaces = list(g.namespaces())  # Assuming 'g' is defined and has a namespaces method
", ".join([_res_to_str(r, "cls", namespaces) for r in res])


'<rdf:XMLLiteral> (XMLLiteral, None, None), <xsd:nonNegativeInteger> (nonNegativeInteger, None, None), <xsd:string> (string, None, None), <ns1:RawMaterial> (RawMaterial, A raw laboratory biological material, i.e. before extraction, ns1:V112049GP-01), <ns1:LFpair> (LFpair, A pair of 2 LCMSFeature, ns1:lcms_feature_pair_mzspec:MSV000088521:SC_AP_Wi_DCM_features_ms2_pos.mgf:scan:9_mzspec:MSV000088521:SC_AP_Wi_DCM_features_ms2_pos.mgf:scan:59), <ns1:WDChemical> (WDChemical, Cross-reference to a chemical entity in Wikidata, http://www.wikidata.org/entity/Q834660), <ns1:WDTaxon> (WDTaxon, Cross-reference to a taxon in Wikidata, http://www.wikidata.org/entity/Q15552476), <ns1:InChIkey2D> (InChIkey2D, The first 14 characters of an InChIKey, often returned by MS-based annotation tools, ns1:ZZZQPOQWMWIOMQ), <ns1:InChIkey> (InChIkey, A chemical structure represented by its InChIKey, ns1:ZZZQPOQWMWIOMQ-UHFFFAOYSA-N), <ns1:LabExtract> (LabExtract, A natural extract obtained from the processing of a

In [78]:
schemas = f"In the following, each IRI is followed by the local name and optionally its description and optionally an example. \n" + \
f"The RDF graph supports the following node types:\n" + \
f'{", ".join([_res_to_str(r, "cls", namespaces) for r in res])}\n'+ \
f"The RDF graph supports the following relationships:\n" + \
f"{g.serialize(format='turtle')}"

print(schemas)


In the following, each IRI is followed by the local name and optionally its description and optionally an example. 
The RDF graph supports the following node types:
<rdf:XMLLiteral> (XMLLiteral, None, None), <xsd:nonNegativeInteger> (nonNegativeInteger, None, None), <xsd:string> (string, None, None), <ns1:RawMaterial> (RawMaterial, A raw laboratory biological material, i.e. before extraction, ns1:V112049GP-01), <ns1:LFpair> (LFpair, A pair of 2 LCMSFeature, ns1:lcms_feature_pair_mzspec:MSV000088521:SC_AP_Wi_DCM_features_ms2_pos.mgf:scan:9_mzspec:MSV000088521:SC_AP_Wi_DCM_features_ms2_pos.mgf:scan:59), <ns1:WDChemical> (WDChemical, Cross-reference to a chemical entity in Wikidata, http://www.wikidata.org/entity/Q834660), <ns1:WDTaxon> (WDTaxon, Cross-reference to a taxon in Wikidata, http://www.wikidata.org/entity/Q15552476), <ns1:InChIkey2D> (InChIkey2D, The first 14 characters of an InChIKey, often returned by MS-based annotation tools, ns1:ZZZQPOQWMWIOMQ), <ns1:InChIkey> (InChIkey, A

In [80]:
print(namespaces)

[('brick', rdflib.term.URIRef('https://brickschema.org/schema/Brick#')), ('csvw', rdflib.term.URIRef('http://www.w3.org/ns/csvw#')), ('dc', rdflib.term.URIRef('http://purl.org/dc/elements/1.1/')), ('dcat', rdflib.term.URIRef('http://www.w3.org/ns/dcat#')), ('dcmitype', rdflib.term.URIRef('http://purl.org/dc/dcmitype/')), ('dcterms', rdflib.term.URIRef('http://purl.org/dc/terms/')), ('dcam', rdflib.term.URIRef('http://purl.org/dc/dcam/')), ('doap', rdflib.term.URIRef('http://usefulinc.com/ns/doap#')), ('foaf', rdflib.term.URIRef('http://xmlns.com/foaf/0.1/')), ('geo', rdflib.term.URIRef('http://www.opengis.net/ont/geosparql#')), ('odrl', rdflib.term.URIRef('http://www.w3.org/ns/odrl/2/')), ('org', rdflib.term.URIRef('http://www.w3.org/ns/org#')), ('prof', rdflib.term.URIRef('http://www.w3.org/ns/dx/prof/')), ('prov', rdflib.term.URIRef('http://www.w3.org/ns/prov#')), ('qb', rdflib.term.URIRef('http://purl.org/linked-data/cube#')), ('schema', rdflib.term.URIRef('https://schema.org/')), (

In [79]:
print(f"Number of tokens: {token_counter(schemas)}")

Number of tokens: 4093
