# Testing the classes of application

## testing RDFGraphCustom

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON, TURTLE, RDF, N3
from RdfGraphCustom import RdfGraph
from rdflib import Graph, Namespace
from rdflib.plugins.stores import sparqlstore
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID as default
import pickle
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_openai import ChatOpenAI, OpenAI
import os


In [2]:
def get_query(endpoint_url, query):
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(N3)
    sparql.setTimeout(600)
    results = sparql.queryAndConvert()
    return results

endpoint_url = 'https://enpkg.commons-lab.org/graphdb/repositories/ENPKG'

In [None]:
g = Graph()


nspaces = {
    "dcterms" : Namespace("http://purl.org/dc/terms/"),
    "enpkg" : Namespace("https://enpkg.commons-lab.org/kg/"),
    "enpkg_module" : Namespace("https://enpkg.commons-lab.org/module/"),
    "ns1" : Namespace("http://proton.semanticweb.org/protonsys#")

}

store = sparqlstore.SPARQLStore()
store.open(endpoint_url)
g = Graph(store, identifier=default, bind_namespaces="rdflib")

for prefix, namespace in nspaces.items():
    g.bind(prefix, namespace)


query = """
SELECT DISTINCT ?rel ?com
    WHERE { 
        ?subj ?rel ?obj .
        OPTIONAL { ?rel rdfs:comment ?com } 
    }
"""

# result = g.query(query)


## SERIALIZE the ENPKG graph to Turtle format 
## this task takes a lot of time and memory, aborted
# turtle_data = g.serialize(destination='enpkg.ttl', format='turtle').decode("utf-8")
# g.serialize(destination='enpkg.ttl', format='turtle')


### serializing : toy example

In [9]:
from rdflib import Graph

# Create a Graph
g = Graph()

# Parse in an RDF file hosted on the Internet
g.parse("http://www.w3.org/People/Berners-Lee/card")

# Loop through each triple in the graph (subj, pred, obj)
for subj, pred, obj in g:
    # Check if there is at least one triple in the Graph
    if (subj, pred, obj) not in g:
       raise Exception("It better be!")

# Print the number of "triples" in the Graph
print(f"Graph g has {len(g)} statements.")
# Prints: Graph g has 86 statements.

# Print out the entire Graph in the RDF Turtle format
print(g.serialize(format="turtle"))


Graph g has 86 statements.
@prefix : <http://xmlns.com/foaf/0.1/> .
@prefix Be: <https://www.w3.org/People/Berners-Lee/> .
@prefix Pub: <https://timbl.com/timbl/Public/> .
@prefix blog: <http://dig.csail.mit.edu/breadcrumbs/blog/> .
@prefix card: <https://www.w3.org/People/Berners-Lee/card#> .
@prefix cc: <http://creativecommons.org/ns#> .
@prefix cert: <http://www.w3.org/ns/auth/cert#> .
@prefix con: <http://www.w3.org/2000/10/swap/pim/contact#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix doap: <http://usefulinc.com/ns/doap#> .
@prefix geo1: <http://www.w3.org/2003/01/geo/wgs84_pos#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
@prefix s: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema1: <http://schema.org/> .
@prefix sioc: <http://rdfs.org/sioc/ns#> .
@prefix solid: <http://www.w3.org/ns/solid/terms#> .
@prefix space: <http://www.w3.org/ns/pim/space#> .
@prefix vcard: <http://www.w3.org/2006/vcard/ns#> .
@prefix w3c: 

In [31]:
endpoint_url = 'https://enpkg.commons-lab.org/graphdb/repositories/ENPKG'

prefixes = {
    "owl": """PREFIX owl: <http://www.w3.org/2002/07/owl#>\n""",
    "rdf": """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n""",
    "rdfs": """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n""",
    "xsd": """PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n""",
    "dcterms" : """PREFIX dcterms: <http://purl.org/dc/terms/>\n""",
    "enpkg" : """PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>\n""",
    "enpkg_module" : """PREFIX enpkg_module: <https://enpkg.commons-lab.org/module/>\n""",
    "ns1" : """PREFIX ns1: <http://proton.semanticweb.org/protonsys#>\n""",
}

# query classes
cls_query_rdf = ''.join([pref for pref in prefixes.values()]) + (
    """SELECT DISTINCT ?cls ?com\n"""
    """WHERE { \n"""
    """    ?instance a ?cls . \n"""
    """    OPTIONAL { ?cls rdfs:comment ?com } \n"""
    """}"""
)
print(cls_query_rdf)



# relation query
rel_query_rdf = ''.join([pref for pref in prefixes.values()]) + (
    """SELECT DISTINCT ?rel ?com\n"""
    """WHERE { \n"""
    """    ?subj ?rel ?obj . \n"""
    """    OPTIONAL { ?rel rdfs:comment ?com } \n"""
    """}"""
)
print(rel_query_rdf)


PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>
PREFIX enpkg_module: <https://enpkg.commons-lab.org/module/>
PREFIX ns1: <http://proton.semanticweb.org/protonsys#>
SELECT DISTINCT ?cls ?com
WHERE { 
    ?instance a ?cls . 
    OPTIONAL { ?cls rdfs:comment ?com } 
}
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX enpkg: <https://enpkg.commons-lab.org/kg/>
PREFIX enpkg_module: <https://enpkg.commons-lab.org/module/>
PREFIX ns1: <http://proton.semanticweb.org/protonsys#>
SELECT DISTINCT ?rel ?com
WHERE { 
    ?subj ?rel ?obj . 
    OPTIONAL

## finding wikidata ID of a taxon

In [10]:
query = """
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX pr: <http://www.wikidata.org/prop/reference/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT *
WHERE
{
  ?wikidata wdt:P225 "Tabernaemontana coffeoides" . #https://www.wikidata.org/wiki/Property:P225
}
"""

endpoint_url = 'https://query.wikidata.org/sparql'

In [11]:
get_query(endpoint_url, query)

{'head': {'vars': ['wikidata']},
 'results': {'bindings': [{'wikidata': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q15376858'}}]}}

## ENPKG Schemas from RDFGraphCustom

### RDF 

In [4]:

with open('/Users/benjamin/Developments/kgbot/kgai/drafts/app/graphs/graph.pkl', 'rb') as input_file:
    graph = pickle.load(input_file)

print(graph.get_schema)

In the following, each IRI is followed by the local name and optionally its description in parentheses. 
The RDF graph supports the following node types:
<http://www.w3.org/1999/02/22-rdf-syntax-ns#Property> (Property, None), <http://www.w3.org/2002/07/owl#TransitiveProperty> (TransitiveProperty, None), <http://www.w3.org/2002/07/owl#SymmetricProperty> (SymmetricProperty, None), <http://www.w3.org/1999/02/22-rdf-syntax-ns#List> (List, None), <http://www.w3.org/2000/01/rdf-schema#Class> (Class, None), <http://www.w3.org/2000/01/rdf-schema#Datatype> (Datatype, None), <http://www.w3.org/2000/01/rdf-schema#ContainerMembershipProperty> (ContainerMembershipProperty, None), <https://enpkg.commons-lab.org/kg/RawMaterial> (RawMaterial, A raw laboratory biological material, i.e. before extraction), <https://enpkg.commons-lab.org/kg/LabObject> (LabObject, An object that correspond to a physical laboratory object), <http://rdfs.org/ns/void#Dataset> (Dataset, None), <https://enpkg.commons-lab.org/k

### RDFS

In [5]:

with open('/Users/benjamin/Developments/kgbot/kgai/drafts/app/graphs/graph_rdfs.pkl', 'rb') as input_file:
    graph = pickle.load(input_file)

print(graph.get_schema)

In the following, each IRI is followed by the local name and optionally its description in parentheses. 
The RDF graph supports the following node types:
<http://www.w3.org/1999/02/22-rdf-syntax-ns#Property> (Property, None), <http://www.w3.org/2002/07/owl#TransitiveProperty> (TransitiveProperty, None), <http://www.w3.org/2002/07/owl#SymmetricProperty> (SymmetricProperty, None), <http://www.w3.org/1999/02/22-rdf-syntax-ns#List> (List, None), <http://www.w3.org/2000/01/rdf-schema#Class> (Class, None), <http://www.w3.org/2000/01/rdf-schema#Datatype> (Datatype, None), <http://www.w3.org/2000/01/rdf-schema#ContainerMembershipProperty> (ContainerMembershipProperty, None), <https://enpkg.commons-lab.org/kg/RawMaterial> (RawMaterial, A raw laboratory biological material, i.e. before extraction), <https://enpkg.commons-lab.org/kg/LabObject> (LabObject, An object that correspond to a physical laboratory object), <http://rdfs.org/ns/void#Dataset> (Dataset, None), <https://enpkg.commons-lab.org/k

### OWL

In [6]:

with open('/Users/benjamin/Developments/kgbot/kgai/drafts/app/graphs/graph_owl.pkl', 'rb') as input_file:
    graph = pickle.load(input_file)

print(graph.get_schema)

In the following, each IRI is followed by the local name and optionally its description in parentheses. 
The OWL graph supports the following node types:
<http://www.w3.org/1999/02/22-rdf-syntax-ns#Property> (Property, None), <http://www.w3.org/2002/07/owl#TransitiveProperty> (TransitiveProperty, None), <http://www.w3.org/2002/07/owl#SymmetricProperty> (SymmetricProperty, None), <http://www.w3.org/1999/02/22-rdf-syntax-ns#List> (List, None), <http://www.w3.org/2000/01/rdf-schema#Class> (Class, None), <http://www.w3.org/2000/01/rdf-schema#Datatype> (Datatype, None), <http://www.w3.org/2000/01/rdf-schema#ContainerMembershipProperty> (ContainerMembershipProperty, None), <https://enpkg.commons-lab.org/kg/RawMaterial> (RawMaterial, A raw laboratory biological material, i.e. before extraction), <https://enpkg.commons-lab.org/kg/LabObject> (LabObject, An object that correspond to a physical laboratory object), <http://rdfs.org/ns/void#Dataset> (Dataset, None), <https://enpkg.commons-lab.org/k

## replacing url with its PREFIX for RDFGraphCustom schemas string

In [1]:

NAMESPACES = {
    "brick": "https://brickschema.org/schema/Brick#",
    "csvw": "http://www.w3.org/ns/csvw#",
    "dc": "http://purl.org/dc/elements/1.1/",
    "dcat": "http://www.w3.org/ns/dcat#",
    "dcmitype": "http://purl.org/dc/dcmitype/",
    "dcterms": "http://purl.org/dc/terms/",
    "dcam": "http://purl.org/dc/dcam/",
    "doap": "http://usefulinc.com/ns/doap#",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "geo": "http://www.opengis.net/ont/geosparql#",
    "odrl": "http://www.w3.org/ns/odrl/2/",
    "org": "http://www.w3.org/ns/org#",
    "prof": "http://www.w3.org/ns/dx/prof/",
    "prov": "http://www.w3.org/ns/prov#",
    "qb": "http://purl.org/linked-data/cube#",
    "schema": "https://schema.org/",
    "sh": "http://www.w3.org/ns/shacl#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "sosa": "http://www.w3.org/ns/sosa/",
    "ssn": "http://www.w3.org/ns/ssn/",
    "time": "http://www.w3.org/2006/time#",
    "vann": "http://purl.org/vocab/vann/",
    "void": "http://rdfs.org/ns/void#",
    "wgs": "https://www.w3.org/2003/01/geo/wgs84_pos#",
    "owl": "http://www.w3.org/2002/07/owl#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "enpkg": "https://enpkg.commons-lab.org/kg/",
    "enpkg_module": "https://enpkg.commons-lab.org/module/",
    "ns1": "http://proton.semanticweb.org/protonsys#"
}


string_to_replace = """

In the following, each IRI is followed by the local name and optionally its description in parentheses. 
The RDF graph supports the following node types:
<http://www.w3.org/1999/02/22-rdf-syntax-ns#Property> (Property, None), 
<http://www.w3.org/2002/07/owl#TransitiveProperty> (TransitiveProperty, None), 
<http://www.w3.org/2002/07/owl#SymmetricProperty> (SymmetricProperty, None),
 <http://www.w3.org/1999/02/22-rdf-syntax-ns#List> (List, None),
 <http://www.w3.org/2000/01/rdf-schema#Class> (Class, None),
 <http://www.w3.org/2000/01/rdf-schema#Datatype> (Datatype, None),
 <http://www.w3.org/2000/01/rdf-schema#ContainerMembershipProperty> (ContainerMembershipProperty, None),
 <https://enpkg.commons-lab.org/kg/RawMaterial> (RawMaterial, A raw laboratory biological material, i.e. before extraction),
 <https://enpkg.commons-lab.org/kg/LabObject> (LabObject, An object that correspond to a physical laboratory object),
 <http://rdfs.org/ns/void#Dataset> (Dataset, None),
 <https://enpkg.commons-lab.org/kg/LabExtract> (LabExtract, A natural extract obtained from the processing of a RawMaterial),
 <https://enpkg.commons-lab.org/kg/WDTaxon> (WDTaxon, Cross-reference to a taxon in Wikidata),
 <https://enpkg.commons-lab.org/kg/XRef> (XRef, Any cross-reference),
 <https://enpkg.commons-lab.org/kg/LFpair> (LFpair, A pair of 2 LCMSFeature),
 <https://enpkg.commons-lab.org/kg/SpectralPair> (SpectralPair, A pair of 2 MS2Spectra),
 <https://enpkg.commons-lab.org/kg/LCMSFeature> (LCMSFeature, An LCMS feature from a processed LCMS analysis),
 <https://enpkg.commons-lab.org/kg/MS2Spectrum> (MS2Spectrum, A fragmentation mass spectrometry (or MS2) spectrum),
 <https://enpkg.commons-lab.org/kg/WDChemical> (WDChemical, Cross-reference to a chemical entity in Wikidata),
 <https://enpkg.commons-lab.org/kg/InChIkey2D> (InChIkey2D, The first 14 characters of an InChIKey, often returned by MS-based annotation tools),
 <https://enpkg.commons-lab.org/kg/NPCClass> (NPCClass, A NPClassifier (NPC) chemical class),
 <https://enpkg.commons-lab.org/kg/NPCSuperclass> (NPCSuperclass, A NPClassifier (NPC) chemical superclass),
 <https://enpkg.commons-lab.org/kg/NPCPathway> (NPCPathway, A NPClassifier (NPC) chemical pathway),
 <https://enpkg.commons-lab.org/kg/ChemicalTaxonomy> (ChemicalTaxonomy, None),
 <https://enpkg.commons-lab.org/kg/InChIkey> (InChIkey, A chemical structure represented by its InChIKey),
 <https://enpkg.commons-lab.org/kg/ChemicalEntity> (ChemicalEntity, None),
 <https://enpkg.commons-lab.org/kg/LCMSAnalysisPos> (LCMSAnalysisPos, An LCMS analysis in positive ionization mode (pos or neg)),
 <https://enpkg.commons-lab.org/kg/LCMSAnalysis> (LCMSAnalysis, An LCMS analysis in a given ionization mode (pos or neg)),
 <https://enpkg.commons-lab.org/kg/LCMSFeatureList> (LCMSFeatureList, A list of LCMS features obtained from the processing of a given LCMS analysis),
 <https://enpkg.commons-lab.org/kg/SiriusCanopusAnnotation> (SiriusCanopusAnnotation, A spectrum chemical class annotation by SIRIUS-CANOPUS),
 <https://enpkg.commons-lab.org/kg/Annotation> (Annotation, A spectral annotation),
 <https://enpkg.commons-lab.org/kg/Spec2VecDoc> (Spec2VecDoc, An ensemble of Spec2VecPeak and Spec2VecLoss objects that characterizes an MS2Spectrum),
 <https://enpkg.commons-lab.org/kg/spec2vec> (spec2vec, A spec2vec-related object),
 <https://enpkg.commons-lab.org/kg/IsdbAnnotation> (IsdbAnnotation, A spectrum structural annotation by comparison with an in-silico spectral DB, coupled to chemical and taxonomical reweighting),
 <https://enpkg.commons-lab.org/kg/SiriusStructureAnnotation> (SiriusStructureAnnotation, A spectrum structural annotation by SIRIUS),
 <https://enpkg.commons-lab.org/kg/Spec2VecLoss> (Spec2VecLoss, A Spec2VecLoss that partly characterizes an MS2Spectrum),
 <https://enpkg.commons-lab.org/kg/Spec2VecPeak> (Spec2VecPeak, A Spec2VecPeak that partly characterizes an MS2 spectrum),
 <https://enpkg.commons-lab.org/kg/LCMSAnalysisNeg> (LCMSAnalysisNeg, An LCMS analysis in negative ionization mode (pos or neg)),
 <https://enpkg.commons-lab.org/module/L610ugml> (L610ugml, A screening result at 10ug/mL from a phenotypic assay against L6 cells),
 <https://enpkg.commons-lab.org/kg/BioAssayResults> (BioAssayResults, An object to store bioactivity results),
 <https://enpkg.commons-lab.org/module/SwissTPHBioAssay> (SwissTPHBioAssay, A bioasay result from Swiss Tropical and Public Health Institute (sTPH)),
 <https://enpkg.commons-lab.org/module/Ldono10ugml> (Ldono10ugml, A screening result at 10ug/mL from a phenotypic assay against L.donovani),
 <https://enpkg.commons-lab.org/module/Ldono2ugml> (Ldono2ugml, A screening result at 2ug/mL from a phenotypic assay against L.donovani),
 <https://enpkg.commons-lab.org/module/Tbrucei10ugml> (Tbrucei10ugml, A screening result at 10ug/mL from a phenotypic assay against T.brucei rhodesiense),
 <https://enpkg.commons-lab.org/module/Tbrucei2ugml> (Tbrucei2ugml, A screening result at 2ug/mL from a phenotypic assay against T.brucei rhodesiense),
 <https://enpkg.commons-lab.org/module/Tcruzi10ugml> (Tcruzi10ugml, A screening result at 10ug/mL from a phenotypic assay against T.cruzi),
 <https://enpkg.commons-lab.org/module/ChEMBLTarget> (ChEMBLTarget, A ChEMBL target),
 <https://enpkg.commons-lab.org/kg/LabBlank> (LabBlank, A blank sample),
 <https://enpkg.commons-lab.org/kg/LabQc> (LabQc, A quality control (QC) sample),
 <https://enpkg.commons-lab.org/module/ChEMBLChemical> (ChEMBLChemical, A ChEMBL chemical),
 <https://enpkg.commons-lab.org/module/ChEMBLAssayResults> (ChEMBLAssayResults, A ChEMBL assay result),
 <https://enpkg.commons-lab.org/module/ChEMBLAssay> (ChEMBLAssay, A ChEMBL assay),
 <https://enpkg.commons-lab.org/module/ChEMBLDocument> (ChEMBLDocument, A ChEMBL document),
 <http://xmlns.com/foaf/0.1/Person> (Person, None),
 <http://www.w3.org/2002/07/owl#Ontology> (Ontology, None),
 <http://purl.org/dc/dcmitype/Dataset> (Dataset, None)







The RDF graph supports the following relationships:
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> (type, None),
 <http://www.w3.org/2000/01/rdf-schema#subPropertyOf> (subPropertyOf, None),
 <http://www.w3.org/2000/01/rdf-schema#subClassOf> (subClassOf, None),
 <http://www.w3.org/2000/01/rdf-schema#domain> (domain, None),
 <http://www.w3.org/2000/01/rdf-schema#range> (range, None),
 <http://proton.semanticweb.org/protonsys#transitiveOver> (transitiveOver, None),
 <http://www.w3.org/2002/07/owl#inverseOf> (inverseOf, None),
 <http://www.w3.org/2000/01/rdf-schema#comment> (comment, None),
 <http://www.w3.org/2000/01/rdf-schema#label> (label, None),
 <https://enpkg.commons-lab.org/kg/has_lab_process> (has_lab_process, None),
 <https://enpkg.commons-lab.org/kg/has_wd_id_c18527bea8b2606a55457d607b24df69> (has_wd_id_c18527bea8b2606a55457d607b24df69, None),
 <https://enpkg.commons-lab.org/kg/submitted_taxon> (submitted_taxon, None),
 <https://enpkg.commons-lab.org/kg/has_LCMS_a3d5d65516c4504ea94e2d82b8926826> (has_LCMS_a3d5d65516c4504ea94e2d82b8926826, None),
 <https://enpkg.commons-lab.org/kg/has_LCMS> (has_LCMS, None),
 <https://enpkg.commons-lab.org/kg/has_content> (has_content, None),
 <https://enpkg.commons-lab.org/kg/has_isdb_annotation_f4fec9f496001612d60a75b5e1a43991> (has_isdb_annotation_f4fec9f496001612d60a75b5e1a43991, None),
 <https://enpkg.commons-lab.org/kg/has_isdb_annotation> (has_isdb_annotation, None),
 <https://enpkg.commons-lab.org/kg/has_lcms_feature_list_a137fd4a263d3587d35f61a526932c09> (has_lcms_feature_list_a137fd4a263d3587d35f61a526932c09, None),
 <https://enpkg.commons-lab.org/kg/has_lcms_feature_list> (has_lcms_feature_list, None),
 <https://enpkg.commons-lab.org/kg/has_sirius_annotation_67cd1a41589bee5e1e4ccee651ddf01b> (has_sirius_annotation_67cd1a41589bee5e1e4ccee651ddf01b, None),
 <https://enpkg.commons-lab.org/kg/has_sirius_annotation> (has_sirius_annotation, None),
 <https://enpkg.commons-lab.org/kg/has_wd_id> (has_wd_id, None),
 <https://enpkg.commons-lab.org/kg/has_cosine> (has_cosine, None),
 <https://enpkg.commons-lab.org/kg/has_mass_difference> (has_mass_difference, None),
 <https://enpkg.commons-lab.org/kg/has_member_1> (has_member_1, The first member of a spectral pair),
 <https://enpkg.commons-lab.org/kg/has_member_2> (has_member_2, The second member of a spectral pair),
 <https://enpkg.commons-lab.org/kg/has_mn_params> (has_mn_params, None),
 <https://enpkg.commons-lab.org/kg/has_npc_class> (has_npc_class, None),
 <https://enpkg.commons-lab.org/kg/has_npc_pathway> (has_npc_pathway, None),
 <https://enpkg.commons-lab.org/kg/has_npc_superclass> (has_npc_superclass, None),
 <https://enpkg.commons-lab.org/kg/has_smiles> (has_smiles, None),
 <https://enpkg.commons-lab.org/kg/is_InChIkey2D_of> (is_InChIkey2D_of, None),
 <http://xmlns.com/foaf/0.1/depiction> (depiction, None),
 <https://enpkg.commons-lab.org/kg/has_gnpslcms_link> (has_gnpslcms_link, None),
 <https://enpkg.commons-lab.org/kg/has_massive_doi> (has_massive_doi, None),
 <https://enpkg.commons-lab.org/kg/has_massive_license> (has_massive_license, None),
 <https://enpkg.commons-lab.org/kg/has_ionization> (has_ionization, None),
 <https://enpkg.commons-lab.org/kg/has_lcms_feature> (has_lcms_feature, None),
 <https://enpkg.commons-lab.org/kg/has_canopus_npc_class> (has_canopus_npc_class, None),
 <https://enpkg.commons-lab.org/kg/has_canopus_npc_class_prob> (has_canopus_npc_class_prob, None),
 <https://enpkg.commons-lab.org/kg/has_canopus_npc_pathway> (has_canopus_npc_pathway, None),
 <https://enpkg.commons-lab.org/kg/has_canopus_npc_pathway_prob> (has_canopus_npc_pathway_prob, None),
 <https://enpkg.commons-lab.org/kg/has_canopus_npc_superclass> (has_canopus_npc_superclass, None),
 <https://enpkg.commons-lab.org/kg/has_canopus_npc_superclass_prob> (has_canopus_npc_superclass_prob, None),
 <https://enpkg.commons-lab.org/kg/fast_search_gnpsdata_index_analog> (fast_search_gnpsdata_index_analog, None),
 <https://enpkg.commons-lab.org/kg/fast_search_gnpsdata_index_no_analog> (fast_search_gnpsdata_index_no_analog, None),
 <https://enpkg.commons-lab.org/kg/fast_search_gnpslibrary_analog> (fast_search_gnpslibrary_analog, None),
 <https://enpkg.commons-lab.org/kg/fast_search_gnpslibrary_no_analog> (fast_search_gnpslibrary_no_analog, None),
 <https://enpkg.commons-lab.org/kg/gnps_dashboard_view> (gnps_dashboard_view, None),
 <https://enpkg.commons-lab.org/kg/has_feature_area> (has_feature_area, None),
 <https://enpkg.commons-lab.org/kg/has_parent_mass> (has_parent_mass, None),
 <https://enpkg.commons-lab.org/kg/has_raw_spectrum> (has_raw_spectrum, None),
 <https://enpkg.commons-lab.org/kg/has_relative_feature_area> (has_relative_feature_area, None),
 <https://enpkg.commons-lab.org/kg/has_retention_time> (has_retention_time, None),
 <https://enpkg.commons-lab.org/kg/has_row_id> (has_row_id, None),
 <https://enpkg.commons-lab.org/kg/has_spec2vec_doc> (has_spec2vec_doc, None),
 <https://enpkg.commons-lab.org/kg/has_usi> (has_usi, None),
 <https://enpkg.commons-lab.org/kg/has_canopus_annotation> (has_canopus_annotation, None),
 <https://enpkg.commons-lab.org/kg/has_value> (has_value, None),
 <https://enpkg.commons-lab.org/kg/has_InChIkey2D> (has_InChIkey2D, None),
 <https://enpkg.commons-lab.org/kg/has_cosmic_score> (has_cosmic_score, None),
 <https://enpkg.commons-lab.org/kg/has_sirius_adduct> (has_sirius_adduct, None),
 <https://enpkg.commons-lab.org/kg/has_sirius_score> (has_sirius_score, None),
 <https://enpkg.commons-lab.org/kg/has_zodiac_score> (has_zodiac_score, None),
 <https://enpkg.commons-lab.org/kg/has_spec2vec_loss> (has_spec2vec_loss, None),
 <https://enpkg.commons-lab.org/kg/has_spec2vec_peak> (has_spec2vec_peak, None),
 <https://enpkg.commons-lab.org/kg/has_adduct> (has_adduct, None),
 <https://enpkg.commons-lab.org/kg/has_consistency_score> (has_consistency_score, None),
 <https://enpkg.commons-lab.org/kg/has_final_score> (has_final_score, None),
 <https://enpkg.commons-lab.org/kg/has_spectral_score> (has_spectral_score, None),
 <https://enpkg.commons-lab.org/kg/has_taxo_score> (has_taxo_score, None),
 <https://enpkg.commons-lab.org/kg/has_fbmn_ci> (has_fbmn_ci, None),
 <https://enpkg.commons-lab.org/kg/has_sirius_annotation_134c8d71fc127e80bf11783c63ca2a6a> (has_sirius_annotation_134c8d71fc127e80bf11783c63ca2a6a, None),
 <https://enpkg.commons-lab.org/kg/has_unresolved_taxon> (has_unresolved_taxon, None),
 <https://enpkg.commons-lab.org/kg/has_sirius_annotation_aef4b4a1a4d837ba0cd2dc861c1c7edf> (has_sirius_annotation_aef4b4a1a4d837ba0cd2dc861c1c7edf, None),
 <http://rdfs.org/ns/void#triples> (triples, None),
 <http://rdfs.org/ns/void#distinctSubjects> (distinctSubjects, None),
 <http://rdfs.org/ns/void#properties> (properties, None),
 <http://rdfs.org/ns/void#distinctObjects> (distinctObjects, None),
 <http://purl.org/dc/terms/modified> (modified, None),
 <http://example.org/has_sha256> (has_sha256, None),
 <https://enpkg.commons-lab.org/kg/has_wd_id_58188f7fc5732f02a13169efb6cbd254> (has_wd_id_58188f7fc5732f02a13169efb6cbd254, None),
 <https://enpkg.commons-lab.org/module/has_broad_organe> (has_broad_organe, None),
 <https://enpkg.commons-lab.org/module/has_organe> (has_organe, None),
 <https://enpkg.commons-lab.org/module/has_subsystem> (has_subsystem, None),
 <https://enpkg.commons-lab.org/module/has_tissue> (has_tissue, None),
 <https://enpkg.commons-lab.org/kg/has_LCMS_115a7a437b9e53c614dfed0c7c2ae3eb> (has_LCMS_115a7a437b9e53c614dfed0c7c2ae3eb, None),
 <https://enpkg.commons-lab.org/kg/has_LCMS_41c2dacf4cb75801e3c2fd01ab8d599d> (has_LCMS_41c2dacf4cb75801e3c2fd01ab8d599d, None),
 <https://enpkg.commons-lab.org/kg/has_canopus_annotation_00c339c3b183cb7fbf466b4d334dbef2> (has_canopus_annotation_00c339c3b183cb7fbf466b4d334dbef2, None),
 <https://enpkg.commons-lab.org/kg/has_isdb_annotation_95a24ed68ee3b548d93d96b99ba630c4> (has_isdb_annotation_95a24ed68ee3b548d93d96b99ba630c4, None),
 <https://enpkg.commons-lab.org/kg/has_lcms_feature_list_a6a5420d414df1000ab74a2b82275839> (has_lcms_feature_list_a6a5420d414df1000ab74a2b82275839, None),
 <https://enpkg.commons-lab.org/kg/has_lcms_feature_list_d5f38c47bc9e90a297d4c26ee02d05b5> (has_lcms_feature_list_d5f38c47bc9e90a297d4c26ee02d05b5, None),
 <https://enpkg.commons-lab.org/kg/has_sirius_annotation_00c339c3b183cb7fbf466b4d334dbef2> (has_sirius_annotation_00c339c3b183cb7fbf466b4d334dbef2, None),
 <https://enpkg.commons-lab.org/kg/has_sirius_annotation_12308a51007e92ba83c7fa84441211ac> (has_sirius_annotation_12308a51007e92ba83c7fa84441211ac, None),
 <https://enpkg.commons-lab.org/module/has_bioassay_results> (has_bioassay_results, None),
 <https://enpkg.commons-lab.org/module/inhibition_percentage> (inhibition_percentage, None),
 <https://enpkg.commons-lab.org/module/target_id> (target_id, None),
 <https://enpkg.commons-lab.org/kg/has_isdb_annotation_fd78fc9d48e5f20e49885f4a87565b9a> (has_isdb_annotation_fd78fc9d48e5f20e49885f4a87565b9a, None),
 <https://enpkg.commons-lab.org/kg/has_isdb_annotation_048a68561ed8cd358e7f7e1bbf540e64> (has_isdb_annotation_048a68561ed8cd358e7f7e1bbf540e64, None),
 <https://enpkg.commons-lab.org/kg/has_isdb_annotation_8a8c6af3837c0b5e86a3b445528db176> (has_isdb_annotation_8a8c6af3837c0b5e86a3b445528db176, None),
 <https://enpkg.commons-lab.org/kg/has_sirius_annotation_56099f7e491091d5bb8a8e403b554202> (has_sirius_annotation_56099f7e491091d5bb8a8e403b554202, None),
 <https://enpkg.commons-lab.org/kg/has_wd_id_1a78e7ba3a4b0616999e360a0f8477e1> (has_wd_id_1a78e7ba3a4b0616999e360a0f8477e1, None),
 <https://enpkg.commons-lab.org/kg/has_LCMS_606e5e35b5df8d788a66e08a8f03f494> (has_LCMS_606e5e35b5df8d788a66e08a8f03f494, None),
 <https://enpkg.commons-lab.org/module/has_chembl_id> (has_chembl_id, None),
 <https://enpkg.commons-lab.org/module/activity_relation> (activity_relation, None),
 <https://enpkg.commons-lab.org/module/activity_type> (activity_type, None),
 <https://enpkg.commons-lab.org/module/activity_unit> (activity_unit, None),
 <https://enpkg.commons-lab.org/module/activity_value> (activity_value, None),
 <https://enpkg.commons-lab.org/module/assay_id> (assay_id, None),
 <https://enpkg.commons-lab.org/module/stated_in_document> (stated_in_document, None),
 <https://enpkg.commons-lab.org/module/target_name> (target_name, None),
 <https://enpkg.commons-lab.org/module/has_chembl_activity> (has_chembl_activity, None),
 <https://enpkg.commons-lab.org/module/journal_name> (journal_name, None),
 <http://schema.org/name> (name, None),
 <http://schema.org/email> (email, None),
 <http://schema.org/affiliation> (affiliation, None),
 <http://www.w3.org/2002/07/owl#versionIRI> (versionIRI, None),
 <http://www.w3.org/2002/07/owl#versionInfo> (versionInfo, None),
 <http://purl.org/dc/terms/description> (description, None),
 <http://purl.org/dc/terms/title> (title, None),
 <http://purl.org/dc/terms/license> (license, None),
 <http://purl.org/dc/terms/creator> (creator, None),
 <http://purl.org/dc/terms/contributor> (contributor, None),
 <http://purl.org/vocab/vann/preferredNamespacePrefix> (preferredNamespacePrefix, None),
 <http://purl.org/vocab/vann/preferredNamespaceUri> (preferredNamespaceUri, None),
 <http://xmlns.com/foaf/0.1/logo> (logo, None), <https://enpkg.commons-lab.org/kg/has_member> (has_member, LCMSFeature), <http://purl.org/dc/terms/accrualPeriodicity> (accrualPeriodicity, None), <http://purl.org/pav/hasCurrentVersion> (hasCurrentVersion, None), <http://purl.org/dc/terms/isVersionOf> (isVersionOf, None), <http://purl.org/pav/version> (version, None), <http://purl.org/dc/terms/issued> (issued, None), <http://purl.org/dc/terms/hasPart> (hasPart, None)

"""

for key, url in NAMESPACES.items():
    string_to_replace = string_to_replace.replace(f"<{url}", f"<{key}:")

print(string_to_replace)



In the following, each IRI is followed by the local name and optionally its description in parentheses. 
The RDF graph supports the following node types:
<rdf:Property> (Property, None), 
<owl:TransitiveProperty> (TransitiveProperty, None), 
<owl:SymmetricProperty> (SymmetricProperty, None),
 <rdf:List> (List, None),
 <rdfs:Class> (Class, None),
 <rdfs:Datatype> (Datatype, None),
 <rdfs:ContainerMembershipProperty> (ContainerMembershipProperty, None),
 <enpkg:RawMaterial> (RawMaterial, A raw laboratory biological material, i.e. before extraction),
 <enpkg:LabObject> (LabObject, An object that correspond to a physical laboratory object),
 <void:Dataset> (Dataset, None),
 <enpkg:LabExtract> (LabExtract, A natural extract obtained from the processing of a RawMaterial),
 <enpkg:WDTaxon> (WDTaxon, Cross-reference to a taxon in Wikidata),
 <enpkg:XRef> (XRef, Any cross-reference),
 <enpkg:LFpair> (LFpair, A pair of 2 LCMSFeature),
 <enpkg:SpectralPair> (SpectralPair, A pair of 2 MS2Spectra

## From SMILE to InCHLKey

In [4]:
import requests

def smiles_to_inchikey(smiles):
    """
    Convert a SMILES string to InChIKey notation using the GNPS API.

    :param smiles: A string containing the SMILES representation of a molecule.
    :return: A string containing the InChIKey notation of the molecule.
    """
    # url = "https://structure.gnps2.org/inchikey"
    url = "https://npclassifier.gnps2.org/classify"
    
    params = {'smiles': smiles}
    response = requests.get(url, params=params)

    if response.status_code == 200:
        return response.text
    else:
        # Handle errors (e.g., invalid SMILES string or server issue)
        response.raise_for_status()

# Example usage
smiles_string = "CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45"  # Example SMILES string for benzoic acid
inchikey = smiles_to_inchikey(smiles_string)
print("InChIKey:", inchikey)


InChIKey: {"class_results": ["Aspidosperma type"], "superclass_results": ["Tryptophan alkaloids"], "pathway_results": ["Alkaloids"], "isglycoside": false}


In [12]:

llm = ChatOpenAI(temperature=0, model="gpt-4")
tools = [
    Tool(
        name = "SMILE_CONVERTER",
        func=smiles_to_inchikey,
        description="The function takes a SMILES string as input and returns the InChIKey notation of the molecule.",
    ),
]

mrkl = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)

In [14]:
mrkl.run("what is the InChIKey of CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `SMILE_CONVERTER` with `CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45`


[0m[36;1m[1;3mYAAIPCQYJYPITK-UHFFFAOYSA-N[0m[32;1m[1;3mThe InChIKey of the molecule with the SMILES notation "CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45" is "YAAIPCQYJYPITK-UHFFFAOYSA-N".[0m

[1m> Finished chain.[0m


'The InChIKey of the molecule with the SMILES notation "CCC12CCCN3C1C4(CC3)C(CC2)NC5=CC=CC=C45" is "YAAIPCQYJYPITK-UHFFFAOYSA-N".'

## Taxon resolver

In [11]:
from taxon_resolver import TaxonResolver
import importlib
importlib.reload(TaxonResolver)

taxon_res = TaxonResolver()
llm = ChatOpenAI(temperature=0, model="gpt-4")
tools = [
    Tool(
        name = "TAXON_RESOLVER",
        func=taxon_res.query_wikidata,
        description="The function takes a taxon string as input and returns the wikidata ID.",
    ),
]

mrkl = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)

In [12]:
mrkl.run("what is the wikidata ID of Tabernaemontana coffeoides")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `TAXON_RESOLVER` with `Tabernaemontana coffeoides`


[0m[36;1m[1;3m['http://www.wikidata.org/entity/Q15376858'][0m[32;1m[1;3mThe Wikidata ID of Tabernaemontana coffeoides is [Q15376858](http://www.wikidata.org/entity/Q15376858).[0m

[1m> Finished chain.[0m


'The Wikidata ID of Tabernaemontana coffeoides is [Q15376858](http://www.wikidata.org/entity/Q15376858).'

## Chemical resolver

In [26]:
import chemical_resolver
import importlib
importlib.reload(chemical_resolver)

chem_res = chemical_resolver.ChemicalResolver()
llm = ChatOpenAI(temperature=0, model="gpt-4")
tools = [
    Tool(
        name = "CHEMICAL_RESOLVER",
        func=chem_res.query_wikidata,
        description="The function takes a chemical name string as input and returns the wikidata ID.",
    ),
]

mrkl = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)


In [27]:
mrkl.run("what is the wikidata ID of Nor Scopolamine")
# mrkl.run("what is the wikidata ID of Hydroxyhyoscyamine")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `CHEMICAL_RESOLVER` with `Nor Scopolamine`


[0m{'head': {'vars': ['compound', 'label']}, 'results': {'bindings': [{'compound': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q122766935'}, 'label': {'xml:lang': 'en', 'type': 'literal', 'value': 'hexahydrocannabiphorol'}}, {'compound': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q123106511'}, 'label': {'xml:lang': 'en', 'type': 'literal', 'value': '1-Bromo-1-heptyne'}}, {'compound': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q123115106'}, 'label': {'xml:lang': 'en', 'type': 'literal', 'value': 'nitronium hexafluoroarsenate'}}, {'compound': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q123134400'}, 'label': {'xml:lang': 'en', 'type': 'literal', 'value': 'tetraarsenic tetrasulfide'}}, {'compound': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q123201043'}, 'label': {'xml:lang': 'en', 'type': 'literal', 'valu

'I\'m sorry, but I couldn\'t find a Wikidata ID for "Nor Scopolamine". It\'s possible that this chemical is not listed in the Wikidata database or it may be listed under a different name.'

In [3]:
from urllib.request import urlopen
from urllib.parse import quote


#Chemical name to Standard InChIKey
def CIRconvert(ids):
    try:
        url = 'http://cactus.nci.nih.gov/chemical/structure/' + quote(ids) + '/stdinchikey'
        ans = urlopen(url).read().decode('utf8')
        return ans
    except:
        return 'Did not work'

identifiers  = ['Terpenoids', 'Hyoscyamine']

for ids in identifiers :
    print(ids, CIRconvert(ids))

Terpenoids Did not work
Hyoscyamine InChIKey=RKUNBYITZUJHSG-LGGPCSOHSA-N


In [40]:
loader = CSVLoader(file_path="../data/npc_all.csv", 
                   csv_args={
    'delimiter': ',',
    'fieldnames': ['NPCClass', 'NPCPathway', 'NPCSuperClass']
}
                   )
data = loader.load()
print(data)

[Document(page_content='NPCClass: npc\nNPCPathway: pathway\nNPCSuperClass: super', metadata={'source': '../data/npc_all.csv', 'row': 0}), Document(page_content='NPCClass: https://enpkg.commons-lab.org/kg/npc_unknown\nNPCPathway: https://enpkg.commons-lab.org/kg/npc_Alkaloids\nNPCSuperClass: https://enpkg.commons-lab.org/kg/npc_unknown', metadata={'source': '../data/npc_all.csv', 'row': 1}), Document(page_content='NPCClass: https://enpkg.commons-lab.org/kg/npc_unknown\nNPCPathway: https://enpkg.commons-lab.org/kg/npc_unknown\nNPCSuperClass: https://enpkg.commons-lab.org/kg/npc_unknown', metadata={'source': '../data/npc_all.csv', 'row': 2}), Document(page_content='NPCClass: https://enpkg.commons-lab.org/kg/npc_Tropane_alkaloids\nNPCPathway: https://enpkg.commons-lab.org/kg/npc_Alkaloids\nNPCSuperClass: https://enpkg.commons-lab.org/kg/npc_Ornithine_alkaloids', metadata={'source': '../data/npc_all.csv', 'row': 3}), Document(page_content='NPCClass: https://enpkg.commons-lab.org/kg/npc_unkn

In [41]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(texts, embeddings)

In [49]:
retriever = db.as_retriever(
    # search_kwargs={"k": 10}
    )

In [50]:
from langchain.tools.retriever import create_retriever_tool

tool = create_retriever_tool(
    retriever,
    "search_npc",
    "Searches and returns NPC classes, pathways and superclasses related to a given query.",
)
tools = [tool]

In [20]:
from langchain_openai import ChatOpenAI
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain import hub

prompt = hub.pull("hwchase17/openai-tools-agent")
prompt.messages

llm = ChatOpenAI(temperature=0)


agent = create_openai_tools_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools)

In [21]:
agent_executor.invoke({'input': 'terpenoids'})

{'input': 'terpenoids',
 'output': 'Terpenoids are a class of natural products that are derived from terpenes. Terpenes are a large and diverse group of organic compounds that are produced by a wide variety of plants and some animals. Terpenoids have a wide range of biological activities and are often used in the pharmaceutical, cosmetic, and food industries.\n\nHere are some NPC classes, pathways, and superclasses related to terpenoids:\n\n1. NPCClass: [Tetraketide meroterpenoids](https://enpkg.commons-lab.org/kg/npc_Tetraketide_meroterpenoids)\n   - NPCPathway: [Terpenoids](https://enpkg.commons-lab.org/kg/npc_Terpenoids)\n   - NPCSuperClass: [Sesquiterpenoids](https://enpkg.commons-lab.org/kg/npc_Sesquiterpenoids)\n\n2. NPCClass: [Acyclic monoterpenoids](https://enpkg.commons-lab.org/kg/npc_Acyclic_monoterpenoids)\n   - NPCPathway: [Terpenoids](https://enpkg.commons-lab.org/kg/npc_Terpenoids)\n   - NPCSuperClass: [Diterpenoids](https://enpkg.commons-lab.org/kg/npc_Diterpenoids)\n\n3

In [51]:
res = retriever.get_relevant_documents("meroterpenoids")
res

[Document(page_content='NPCClass: https://enpkg.commons-lab.org/kg/npc_Tetraketide_meroterpenoids\nNPCPathway: https://enpkg.commons-lab.org/kg/npc_Terpenoids\nNPCSuperClass: https://enpkg.commons-lab.org/kg/npc_Meroterpenoids', metadata={'source': '../data/npc_all.csv', 'row': 674}),
 Document(page_content='NPCClass: https://enpkg.commons-lab.org/kg/npc_Tetraketide_meroterpenoids\nNPCPathway: https://enpkg.commons-lab.org/kg/npc_Terpenoids\nNPCSuperClass: https://enpkg.commons-lab.org/kg/npc_Sesquiterpenoids', metadata={'source': '../data/npc_all.csv', 'row': 1369}),
 Document(page_content='NPCClass: https://enpkg.commons-lab.org/kg/npc_Tetraketide_meroterpenoids\nNPCPathway: https://enpkg.commons-lab.org/kg/npc_Polyketides\nNPCSuperClass: https://enpkg.commons-lab.org/kg/npc_Meroterpenoids', metadata={'source': '../data/npc_all.csv', 'row': 504}),
 Document(page_content='NPCClass: https://enpkg.commons-lab.org/kg/npc_Tetraketide_meroterpenoids\nNPCPathway: https://enpkg.commons-lab.o

In [54]:
from langchain.chains.llm import LLMChain
from langchain_core.prompts.prompt import PromptTemplate

qa_prompt = """
Task: find the best URI for a given chemical name.
Instructions: choose the best URI for {chemical_name} among the results below. the best URI is the one that is the most specific to the chemical name. If none of the results are relevant, choose "none of the above".
The URI key represent its class, among: 'NPCClass', 'NPCPathway', 'NPCSuperClass'.
Return "{chemical_name} : URI, class".
{results}
"""


qa_prompt = PromptTemplate(
    template=qa_prompt,
    input_variables=["chemical_name", "results"]
)

temperature = 0.3
model_id = "gpt-4" 

# https://api.python.langchain.com/en/latest/chat_models/langchain_community.chat_models.openai.ChatOpenAI.html?highlight=chatopenai#
model = ChatOpenAI(temperature=temperature, 
                    model=model_id, # default is 'gpt-3.5-turbo'
                    max_retries=3,
                    verbose=True,
                    model_kwargs={
                        "top_p": 0.95,
                        }
                    )

qa_chain = LLMChain(llm=model, prompt=qa_prompt)


qa_chain.run({"chemical_name": "tetraketide meroterpenoids", "results": res})

'"tetraketide meroterpenoids : https://enpkg.commons-lab.org/kg/npc_Tetraketide_meroterpenoids, NPCClass"'

In [6]:
from chemical_resolver import ChemicalResolver
# import importlib
# importlib.reload(ChemicalResolver)

temperature = 0.3
model_id = "gpt-4" 

# https://api.python.langchain.com/en/latest/chat_models/langchain_community.chat_models.openai.ChatOpenAI.html?highlight=chatopenai#
model = ChatOpenAI(temperature=temperature, 
                    model=model_id, # default is 'gpt-3.5-turbo'
                    max_retries=3,
                    verbose=True,
                    model_kwargs={
                        "top_p": 0.95,
                        }
                    )


chem_res = ChemicalResolver.from_llm(llm=model, verbose=True)

res1 = chem_res.invoke("tetraketide meroterpenoids")

res2 = chem_res.invoke("Hyoscyamine")

print(res1)
print(res2)




[1m> Entering new ChemicalResolver chain...[0m

[1m> Finished chain.[0m


[1m> Entering new ChemicalResolver chain...[0m

[1m> Finished chain.[0m
{'query': 'tetraketide meroterpenoids', 'result': '"tetraketide meroterpenoids : https://enpkg.commons-lab.org/kg/npc_Tetraketide_meroterpenoids, NPCClass"'}
{'query': 'Hyoscyamine', 'result': 'Hyoscyamine: InChIKey=RKUNBYITZUJHSG-LGGPCSOHSA-N'}


## TARGET RESOLVER

In [27]:
import requests
import xml.etree.ElementTree as ET
from urllib.parse import quote

def target_name_to_target_id(target_name):
    """
    Convert a target_name string to ChEMBLTarget notation using the CHEMBL API.

    :param target_name: A string containing the target_name representation.
    :return: A string containing the ChEMBLTarget notation.
    """
    url = 'https://www.ebi.ac.uk/chembl/api/data/target'
    params = {'pref_name__contains': quote(target_name)}

    try:
        response = requests.get(url, params=params)

        if response.status_code == 200:
            # Parse the XML response
            root = ET.fromstring(response.content)

            # Assuming that the target_chembl_id is directly under the root
            # Adjust the path according to the actual structure of the XML
            target_chembl_id = root.find('.//target_chembl_id').text

            return {target_name + ' IRI': 'https://www.ebi.ac.uk/chembl/target_report_card/'+target_chembl_id}

        else:
            # Handle errors (e.g., invalid target name or server issue)
            response.raise_for_status()

    except requests.RequestException as e:
        return f'An error occurred: {str(e)}'


# Example usage
target_name = "Leishmania donovani" 
target_id = target_name_to_target_id(target_name)
print("target_id:", target_id)

target_id: {'Leishmania donovani IRI': 'https://www.ebi.ac.uk/chembl/target_report_card/CHEMBL367'}
