# Constructing Schema Notebook

### Getting all predicates
ENPKG is too large to query over all triples to get all unique predicates. Uses a subsampling method to get all predicates.

### Constructing schema
Constructs a schema.ttl file for the schema (all triples representing with predicates link with classes together)

In [2]:
# general python libs
import json
import os
from SPARQLWrapper import SPARQLWrapper, JSON, TURTLE
import glob
import rdflib
from rdflib import Graph
import tiktoken
import time


In [3]:
# This is the remote ENPKG SPARQL endpoint
endpoint_url = 'https://enpkg.commons-lab.org/graphdb/repositories/ENPKG'
directory = './schema'

if not os.path.exists(directory):
    os.mkdir(directory)

In [4]:
# HELPER FUNCTIONS

def run_sparql(query, 
               url=endpoint_url):
    sparql = SPARQLWrapper(url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.setTimeout(600)
    results = sparql.query().convert()
    results = results['results']['bindings']
    return results

def run_sparql_construct(query, filename, url=endpoint_url):
    sparql = SPARQLWrapper(url)
    sparql.setQuery(query)
    sparql.setReturnFormat(TURTLE)
    sparql.setTimeout(600)
    results = sparql.queryAndConvert()
    graph = rdflib.Graph()
    graph.parse(data=results, format='turtle')
    graph.serialize(destination=filename, format='turtle')
    return results
    

# Get all predicates
## Step 1: get all classes 

In [5]:
query = """  
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>
select distinct ?o where { 
	?s a ?o .
} 
"""

In [7]:
classes = [x['o']['value'] for x in run_sparql(query)]

In [9]:
print('number of classes: ', len(classes))
    
classes

number of classes:  55


['http://www.w3.org/1999/02/22-rdf-syntax-ns#Property',
 'http://www.w3.org/2002/07/owl#TransitiveProperty',
 'http://www.w3.org/2002/07/owl#SymmetricProperty',
 'http://www.w3.org/1999/02/22-rdf-syntax-ns#List',
 'http://www.w3.org/2000/01/rdf-schema#Class',
 'http://www.w3.org/2000/01/rdf-schema#Datatype',
 'http://www.w3.org/2000/01/rdf-schema#ContainerMembershipProperty',
 'https://enpkg.commons-lab.org/kg/RawMaterial',
 'https://enpkg.commons-lab.org/kg/LabObject',
 'http://rdfs.org/ns/void#Dataset',
 'https://enpkg.commons-lab.org/kg/LabExtract',
 'https://enpkg.commons-lab.org/kg/WDTaxon',
 'https://enpkg.commons-lab.org/kg/XRef',
 'https://enpkg.commons-lab.org/kg/LFpair',
 'https://enpkg.commons-lab.org/kg/SpectralPair',
 'https://enpkg.commons-lab.org/kg/LCMSFeature',
 'https://enpkg.commons-lab.org/kg/MS2Spectrum',
 'https://enpkg.commons-lab.org/kg/WDChemical',
 'https://enpkg.commons-lab.org/kg/InChIkey2D',
 'https://enpkg.commons-lab.org/kg/NPCClass',
 'https://enpkg.comm

## Step 2: get objects of each class and subsample

In [10]:
query2 = """  
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>
select ?s where {{ 
	?s a <{cls}> .
}} ORDER BY RAND() LIMIT 100
"""

In [12]:
random_instances = {}
for c in classes:
    inst_output =  run_sparql(query2.format(cls = c))
    random_instances[c] = [x['s']['value'] for x in inst_output]

In [13]:
random_instances

{'http://www.w3.org/1999/02/22-rdf-syntax-ns#Property': ['http://rdfs.org/ns/void#properties',
  'https://enpkg.commons-lab.org/kg/has_fbmn_ci',
  'https://enpkg.commons-lab.org/kg/has_sirius_annotation_67cd1a41589bee5e1e4ccee651ddf01b',
  'http://proton.semanticweb.org/protonsys#transitiveOver',
  'http://example.org/has_sha256',
  'https://enpkg.commons-lab.org/kg/has_isdb_annotation',
  'https://enpkg.commons-lab.org/kg/has_relative_feature_area',
  'https://enpkg.commons-lab.org/kg/has_member',
  'https://enpkg.commons-lab.org/kg/has_LCMS',
  'https://enpkg.commons-lab.org/kg/has_canopus_npc_class',
  'http://www.w3.org/1999/02/22-rdf-syntax-ns#object',
  'https://enpkg.commons-lab.org/kg/fast_search_gnpslibrary_no_analog',
  'https://enpkg.commons-lab.org/module/target_id',
  'https://enpkg.commons-lab.org/kg/has_content',
  'https://enpkg.commons-lab.org/kg/has_LCMS_606e5e35b5df8d788a66e08a8f03f494',
  'https://enpkg.commons-lab.org/kg/has_cosmic_score',
  'https://enpkg.commons-

## Step 3: get all predicates for each instance

In [14]:
query3 = """  
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>
select distinct ?p where {{ 
	{{<{inst}> ?p ?x .}}
    UNION
    {{?y ?p <{inst}> .}}
}} 
"""

In [15]:
all_preds = set()
class_preds = {}
for c in random_instances:
    class_preds[c] = set()
    for enum, i in enumerate(set(random_instances[c])):
        pred_output =  run_sparql(query3.format(inst = i))
        set_pred_output = set(x['p']['value'] for x in pred_output)
        class_preds[c] = class_preds[c].union(set_pred_output)
        all_preds = all_preds.union(set_pred_output)
    print(c, len(class_preds[c]), len(all_preds))

http://www.w3.org/1999/02/22-rdf-syntax-ns#Property 10 10
http://www.w3.org/2002/07/owl#TransitiveProperty 6 11
http://www.w3.org/2002/07/owl#SymmetricProperty 4 11
http://www.w3.org/1999/02/22-rdf-syntax-ns#List 1 11
http://www.w3.org/2000/01/rdf-schema#Class 6 12
http://www.w3.org/2000/01/rdf-schema#Datatype 2 12
http://www.w3.org/2000/01/rdf-schema#ContainerMembershipProperty 1 12
https://enpkg.commons-lab.org/kg/RawMaterial 11 22
https://enpkg.commons-lab.org/kg/LabObject 15 26
http://rdfs.org/ns/void#Dataset 8 33
https://enpkg.commons-lab.org/kg/LabExtract 7 33
https://enpkg.commons-lab.org/kg/WDTaxon 4 34
https://enpkg.commons-lab.org/kg/XRef 11 43
https://enpkg.commons-lab.org/kg/LFpair 7 49
https://enpkg.commons-lab.org/kg/SpectralPair 7 49
https://enpkg.commons-lab.org/kg/LCMSFeature 23 66
https://enpkg.commons-lab.org/kg/MS2Spectrum 23 66
https://enpkg.commons-lab.org/kg/WDChemical 2 66
https://enpkg.commons-lab.org/kg/InChIkey2D 2 67
https://enpkg.commons-lab.org/kg/NPCClass

In [16]:
class_preds = {key:list(value) for key,value in class_preds.items()}

In [17]:
# Write json file with all predicates sorted by class
with open(directory+'/predicates.json', 'w') as file:
    # Write the dictionary to the file as JSON
    json.dump(class_preds, file)
 

In [18]:
# Write txt file of all distinct predicates
with open(directory+'/predicates.txt', 'w') as file:
    # Iterate over each element in the list
    for element in all_preds:
        # Write the element to the file
        file.write(str(element) + '\n')


# Get subject predicate object triples for each predicate

### Load predicates

In [20]:
filename = directory + '/predicates.txt'
with open(filename, 'r') as file:
    lines = file.readlines()
    
preds = [line.strip() for line in lines]

In [22]:
query_triples = """  
CONSTRUCT {{?s_class <{pred}> ?o_class}} WHERE {{
SELECT DISTINCT ?s_class ?o_class WHERE {{
  ?s <{pred}> ?o .
  OPTIONAL {{ ?s a ?s_class . }}
  OPTIONAL {{ ?o a ?o_type . }}
  BIND(IF(bound(?o_type), ?o_type, IF(isLiteral(?o), datatype(?o),"")) AS ?o_class)
    }}
}}
"""

os.mkdir(directory+'/schema_ttl/')
filename_template = directory + """/schema_ttl/{pred}.ttl"""

def remove_prefix(uri, prefix='https://enpkg.commons-lab.org/'):
    return uri.split('/')[-1]

In [26]:
# Construct schema for each predicate 

for i, pred in enumerate(preds):
    pred_only = remove_prefix(pred)
    print('Starting', pred_only)

    start_time = time.time()  # Start time of the current iteration
    results = run_sparql_construct(query_triples.format(pred=pred), filename_template.format(pred=pred_only))

    end_time = time.time()  # End time of the current iteration
    duration = end_time - start_time  # Duration of the current iteration

    print('Finished', pred_only, 'in', duration, 'seconds')

    if i % 10 == 0:
        print('Cycle number:', i)

Starting has_member_2
Finished has_member_2 in 77.21234893798828 seconds
Starting has_usi
Finished has_usi in 28.674247980117798 seconds
Cycle number: 50
Starting creator
Finished creator in 0.01198720932006836 seconds
Starting preferredNamespaceUri
Finished preferredNamespaceUri in 0.013447046279907227 seconds
Starting activity_value
Finished activity_value in 1.0114738941192627 seconds
Starting has_consistency_score
Finished has_consistency_score in 3.777876853942871 seconds
Starting has_broad_organe
Finished has_broad_organe in 0.02123284339904785 seconds
Starting has_npc_class
Finished has_npc_class in 2.302672863006592 seconds
Starting has_canopus_annotation
Finished has_canopus_annotation in 8.891426086425781 seconds
Starting has_parent_mass
Finished has_parent_mass in 10.720993995666504 seconds
Starting rdf-schema#subClassOf
Finished rdf-schema#subClassOf in 0.007171154022216797 seconds
Starting assay_id
Finished assay_id in 0.5656979084014893 seconds
Cycle number: 60
Starting h

### Fix prefix conflicts and merge ttl files

In [27]:
prefix_map = {'http://schema.org/':'schema',
              'https://enpkg.commons-lab.org/module/':'enpkg_module',
              'http://purl.org/pav/':'pav',
              'http://example.org/':'example',
              'https://enpkg.commons-lab.org/kg/':'enpkg'}


# Iterate over each Turtle file
for filename in glob.glob('./schema_ttl/*.ttl'):
    g = Graph()
    g.parse(filename, format='turtle')

    # Update prefix definitions
    for namespace, prefix in prefix_map.items():
        g.bind(prefix, namespace)

    # Save the graph
    g.serialize(destination=filename, format='turtle')

g = Graph()

# Load all ttl files in the folder
for filename in glob.glob('./schema_ttl/*.ttl'):
    g.parse(filename, format='turtle')

# Save the merged graph
g.serialize(destination=directory+'/merged.ttl', format='turtle')



<Graph identifier=Nd87c33f08ff245c1bdeefff8b7140c97 (<class 'rdflib.graph.Graph'>)>

In [28]:
# Load the TTL file
ttl_file = 'schema_ttl/merged.ttl'
with open(ttl_file, 'r') as file:
    content = file.read()

# Initialize Tiktoken with the desired encoding model
encoding = tiktoken.encoding_for_model("gpt-4")

# Count the number of tokens in the TTL file
token_count = len(encoding.encode(content))

print(f"The TTL file '{ttl_file}' contains {token_count} tokens.")


The TTL file 'schema_local_ttl/merged.ttl' contains 4384 tokens.
