In [1]:
import pandas as pd
from tqdm import tqdm
from rdflib import Graph,Literal,URIRef
from tera.DataIntegration import LogMapMapping

# Toggle used throughout the notebook to switch between the organic-only
# input/output files and the full ones.
ONLY_ORGANIC = True

# Pick the effect-data CSV that matches the toggle above.
if ONLY_ORGANIC:
    effect_data_path = 'only_organic_effect_data.csv'
else:
    effect_data_path = 'effect_data.csv'
effect_data = pd.read_csv(effect_data_path)

# Distinct values of the 'species' and 'chemical' columns.
species, chemicals = (set(effect_data[column].values) for column in ('species', 'chemical'))

# Shown as the cell output: (number of species, number of chemicals).
len(species),len(chemicals)

(1090, 431)

In [2]:
from collections import defaultdict
# Accumulates the `.mappings` dict of every alignment file read below.
mappings = {}
for part in range(11):
    # One AML output and one LogMap output exist per part.
    alignment_files = (
        f'om_outputs/aml_output/{part}.rdf',
        f'om_outputs/logmap_outputs/{part}/logmap2_mappings.rdf',
    )

    # Load both alignments of this part before merging anything.
    loaded = []
    for path in alignment_files:
        alignment = LogMapMapping(path, strip=False, threshold=0.5, unique=True)
        alignment.load()
        loaded.append(alignment)

    # Entries already present in `mappings` win over newly loaded ones,
    # because the existing dict is unpacked last.
    for alignment in loaded:
        mappings = {**alignment.mappings,**mappings}

In [3]:
# Keys whose value holds more than one entry are discarded further down.
to_del = {k for k in mappings if len(mappings[k]) > 1}

# Every remaining key is collapsed to the first element of its value.
for k in mappings:
    if k not in to_del:
        mappings[k] = mappings[k].pop(0)

# Flip the direction: the collapsed value becomes the key and the original
# key becomes the value. Discarded keys are left out.
mappings = {target: source for source, target in mappings.items() if source not in to_del}

In [4]:
# Read the ncbi.nt file from the TERA output into an rdflib graph.
# Graph.parse is used instead of Graph.load: load was a deprecated alias that
# simply forwarded to parse and is no longer available in rdflib 6+.
ncbi_graph = Graph()
ncbi_graph.parse('../TERA_OUTPUT/ncbi.nt',format='nt')

def remove_literals(graph):
    """Remove every triple whose object is a ``Literal`` from ``graph``.

    The graph is modified in place and also returned, so the call can be
    used either as a statement or in an assignment.

    Parameters
    ----------
    graph : iterable of (s, p, o) triples that supports ``remove((s, p, o))``

    Returns
    -------
    The same ``graph`` object, now without literal-valued triples.
    """
    # Collect the matching triples first. Removing from the graph while
    # iterating over it mutates the collection being traversed, which is
    # only safe if the backing store happens to iterate over a copy.
    literal_triples = [(s, p, o) for s, p, o in graph if isinstance(o, Literal)]
    for triple in literal_triples:
        graph.remove(triple)

    return graph
            
def replace_entities(graph,mappings):
    """Rename entities in ``graph`` according to ``mappings``.

    For every key ``k`` in ``mappings``, each triple that has ``URIRef(k)``
    as subject or as object is replaced by the same triple with
    ``URIRef(mappings[k])`` in that position. Predicates are never renamed.

    The graph is modified in place and also returned.

    Parameters
    ----------
    graph : rdflib-style graph supporting ``triples``, ``remove`` and ``add``
    mappings : mapping from old identifier to new identifier

    Returns
    -------
    The same ``graph`` object after renaming.
    """
    for k in mappings:
        old = URIRef(k)
        new = URIRef(mappings[k])

        # Snapshot the matches before editing: `graph.triples` is a lazy
        # generator over the graph that is being changed by remove/add.
        for s,p,o in list(graph.triples((old,None,None))):
            graph.remove((s,p,o))
            graph.add((new,p,o))

        # Taken after the subject pass on purpose, so that a self-referencing
        # triple (old, p, old) ends up as (new, p, new).
        for s,p,o in list(graph.triples((None,None,old))):
            graph.remove((s,p,o))
            graph.add((s,p,new))

    return graph

# Drop literal-valued triples first, then rename entities using `mappings`.
# Both helpers return the graph they were given, so the calls can be nested.
ncbi_graph = replace_entities(remove_literals(ncbi_graph), mappings)


In [5]:
def get_all_connections(graph,entities,back_tracking=0):
    """Extract the sub-graph reachable from ``entities``.

    Starting at ``entities``, every outgoing triple is copied into a new
    graph and its object is explored in turn, until no unexplored node is
    left. If ``back_tracking`` is at least 1, the subjects that point at any
    explored node (and were not explored themselves) are used as the start
    set of a further round with ``back_tracking - 1``, and the result of that
    round is merged in.

    Parameters
    ----------
    graph : rdflib-style graph supporting ``triples`` and ``subjects``
    entities : iterable of start nodes
    back_tracking : number of backward rounds; a negative value yields an
        empty graph

    Returns
    -------
    A new ``Graph`` holding the collected triples. An empty ``entities``
    yields an empty graph.
    """
    out = Graph()
    if back_tracking < 0:
        return out

    to_explore = set(entities)
    explored = set()

    while to_explore:
        e = to_explore.pop()
        explored.add(e)

        # Only the objects of the triples found in this step can be new, so
        # there is no need to rescan every object collected so far.
        for s,p,o in graph.triples((e,None,None)):
            out.add((s,p,o))
            if o not in explored:
                to_explore.add(o)

    # With back_tracking < 1 the next round would return an empty graph
    # straight away, so the (potentially large) parent lookup is skipped.
    if back_tracking >= 1:
        # set().union(...) tolerates an empty `explored`, where
        # set.union(*[]) would raise a TypeError.
        parents = set().union(*(graph.subjects(object=e) for e in explored)) - explored
        out += get_all_connections(graph,parents,back_tracking-1)

    return out


In [6]:
%%time
# Start from the species of the effect data and keep only what is reachable from them.
species_uris = {URIRef(s) for s in species}
new_graph = get_all_connections(ncbi_graph, species_uris, back_tracking=0)

# The output folder follows the ONLY_ORGANIC toggle.
if ONLY_ORGANIC:
    taxonomy_out = 'only_organic_reduced_kgs/reduced_taxonomy.nt'
else:
    taxonomy_out = 'reduced_kgs/reduced_taxonomy.nt'
new_graph.serialize(taxonomy_out, format='nt')

CPU times: user 29.3 s, sys: 144 ms, total: 29.5 s
Wall time: 29.5 s


In [7]:
!pip3 install sparqlwrapper



In [8]:

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint that the queries in this notebook are sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# Selects, for every item that has both properties, the value of wdt:P830 as
# ?from and the value of wdt:P685 as ?to. The cell below uses ?from as an
# eol.org page id and ?to as an NCBI taxon id.
query = """select ?from ?to where {
  [] wdt:P830 ?from ;
      wdt:P685 ?to .    
}
"""


def get_results(endpoint_url, query):
    """Send ``query`` to the SPARQL endpoint and return the converted JSON response.

    Parameters
    ----------
    endpoint_url : URL of the SPARQL endpoint
    query : SPARQL query text

    Returns
    -------
    Whatever ``convert()`` yields for a JSON-format response.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    major, minor = sys.version_info[0], sys.version_info[1]
    client = SPARQLWrapper(endpoint_url, agent="WDQS-example Python/%s.%s" % (major, minor))
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


results = get_results(endpoint_url, query)

# EOL page URL -> entry of `mappings` stored under the matching NCBI taxon URL.
eol_mappings = {}

for binding in results["results"]["bindings"]:
    eol = 'https://eol.org/pages/' + binding['from']['value']
    ncbi = 'https://www.ncbi.nlm.nih.gov/taxonomy/taxon/' + binding['to']['value']
    # Taxa without an entry in `mappings` are skipped.
    if ncbi in mappings:
        eol_mappings[eol] = mappings[ncbi]
    

In [9]:
# Read the traits.nt file from the TERA output into an rdflib graph.
# Graph.parse is used instead of Graph.load: load was a deprecated alias that
# simply forwarded to parse and is no longer available in rdflib 6+.
traits_graph = Graph()
traits_graph.parse('../TERA_OUTPUT/traits.nt',format='nt')
# Drop literal-valued triples, then rename entities using `eol_mappings`.
traits_graph = remove_literals(traits_graph)
traits_graph = replace_entities(traits_graph,eol_mappings)

In [10]:
%%time
# Start from the species of the effect data and keep only what is reachable from them.
species_uris = {URIRef(s) for s in species}
new_graph = get_all_connections(traits_graph, species_uris, back_tracking=0)

# The output folder follows the ONLY_ORGANIC toggle.
if ONLY_ORGANIC:
    traits_out = 'only_organic_reduced_kgs/reduced_traits.nt'
else:
    traits_out = 'reduced_kgs/reduced_traits.nt'
new_graph.serialize(traits_out, format='nt')

CPU times: user 38.6 s, sys: 91.9 ms, total: 38.7 s
Wall time: 38.7 s


In [11]:

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint that the queries in this notebook are sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# Selects, for every item that has both properties, the value of wdt:P231
# with all '-' characters removed as ?from, and the value of wdt:P683 as ?to.
# Further down in this cell ?from is appended to the ecotox CAS prefix and
# ?to to the CHEBI_ prefix.
query = """select ?from ?to where {
   ?chem wdt:P231 ?tmp ;
         wdt:P683 ?to .
   bind( replace(?tmp,'-','') as ?from )
}"""


# NOTE(review): this duplicates the get_results defined in an earlier cell of
# this notebook; consider keeping a single definition.
def get_results(endpoint_url, query):
    """Send ``query`` to the SPARQL endpoint and return the converted JSON response.

    Parameters
    ----------
    endpoint_url : URL of the SPARQL endpoint
    query : SPARQL query text

    Returns
    -------
    Whatever ``convert()`` yields for a JSON-format response.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    version = (sys.version_info[0], sys.version_info[1])
    client = SPARQLWrapper(endpoint_url, agent="WDQS-example Python/%s.%s" % version)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    return client.query().convert()


results = get_results(endpoint_url, query)

# First pass: CAS URL -> ChEBI URL. A CAS URL that occurs in several rows
# keeps the ChEBI URL of its last row.
cas_to_chebi = {
    'https://cfpub.epa.gov/ecotox/cas/' + row['from']['value']: 'http://purl.obolibrary.org/obo/CHEBI_' + row['to']['value']
    for row in results["results"]["bindings"]
}

# Second pass: flip the direction so the ChEBI URL becomes the key.
cas_chebi_mapping = {chebi: cas for cas, chebi in cas_to_chebi.items()}

In [12]:
%%time
for filename in ['../chebi/chebi.ttl']:

    # The file extension is used as the rdflib format name for reading and writing.
    fm = filename.split('.')[-1]
    fn = 'reduced_' + filename.split('/')[-1]

    graph = Graph()
    # Graph.parse is used instead of Graph.load: load was a deprecated alias
    # that forwarded to parse and is no longer available in rdflib 6+.
    graph.parse(filename,format=fm)
    graph = remove_literals(graph)
    graph = replace_entities(graph,cas_chebi_mapping)

    # Keep only what is reachable from the chemicals of the effect data.
    new_graph = get_all_connections(graph,set(map(URIRef,chemicals)),back_tracking=0)
    new_graph.serialize(f'only_organic_reduced_kgs/{fn}' if ONLY_ORGANIC else f'reduced_kgs/{fn}',format=fm)
    

CPU times: user 7min 7s, sys: 1.63 s, total: 7min 9s
Wall time: 7min 11s


In [13]:

# ?from is the wdt:P231 value with '-' removed, ?to is the wdt:P486 value.
query = """select ?from ?to where {
   ?chem wdt:P231 ?tmp ;
         wdt:P486 ?to .
   bind( replace(?tmp,'-','') as ?from )
}
"""

results = get_results(endpoint_url, query)

# First pass: CAS URL -> MeSH URL. A CAS URL that occurs in several rows
# keeps the MeSH URL of its last row.
cas_to_mesh = {
    'https://cfpub.epa.gov/ecotox/cas/' + row['from']['value']: 'http://id.nlm.nih.gov/mesh/' + row['to']['value']
    for row in results["results"]["bindings"]
}

# Second pass: flip the direction so the MeSH URL becomes the key.
cas_mesh_mapping = {mesh: cas for cas, mesh in cas_to_mesh.items()}

In [14]:
%%time
for filename in ['../mesh/mesh.nt']:

    # The file extension is used as the rdflib format name for reading and writing.
    fm = filename.split('.')[-1]
    fn = 'reduced_' + filename.split('/')[-1]

    graph = Graph()
    # Graph.parse is used instead of Graph.load: load was a deprecated alias
    # that forwarded to parse and is no longer available in rdflib 6+.
    graph.parse(filename,format=fm)
    graph = remove_literals(graph)
    graph = replace_entities(graph,cas_mesh_mapping)

    # Keep only what is reachable from the chemicals of the effect data.
    new_graph = get_all_connections(graph,set(map(URIRef,chemicals)),back_tracking=0)
    new_graph.serialize(f'only_organic_reduced_kgs/{fn}' if ONLY_ORGANIC else f'reduced_kgs/{fn}',format=fm)
    

CPU times: user 1h 23min 51s, sys: 6.27 s, total: 1h 23min 57s
Wall time: 1h 24min 1s


In [15]:

# ?from is the wdt:P231 value with '-' removed, ?to is the wdt:P592 value.
query = """select ?from ?to where {
   ?chem wdt:P231 ?tmp ;
         wdt:P592 ?to .
   bind( replace(?tmp,'-','') as ?from )
}
"""

results = get_results(endpoint_url, query)

# First pass: CAS URL -> ChEMBL molecule URL. A CAS URL that occurs in
# several rows keeps the ChEMBL URL of its last row.
cas_to_chembl = {
    'https://cfpub.epa.gov/ecotox/cas/' + row['from']['value']: 'http://rdf.ebi.ac.uk/resource/chembl/molecule/' + row['to']['value']
    for row in results["results"]["bindings"]
}

# Second pass: flip the direction so the ChEMBL URL becomes the key.
cas_chembl_mapping = {chembl: cas for cas, chembl in cas_to_chembl.items()}

In [16]:
# Same content as `mappings`, but with the ncbi.nlm.nih.gov taxon prefix of
# each key swapped for the identifiers.org taxonomy prefix.
species_mappings = {}
for k, i in mappings.items():
    renamed_key = k.replace('https://www.ncbi.nlm.nih.gov/taxonomy/taxon/','http://identifiers.org/taxonomy/')
    species_mappings[renamed_key] = i

In [None]:
%%time
# The start set is the same for every file, so it is built once up front:
# all chemicals and all species of the effect data.
seed_entities = set(map(URIRef,chemicals))|set(map(URIRef,species))

for filename in ['../chembl/chembl_26.0_bindingsite.ttl',
                 '../chembl/chembl_26.0_biocmpt.ttl',
                 '../chembl/chembl_26.0_cellline.ttl',
                 '../chembl/chembl_26.0_complextarget_targetcmpt_ls.ttl',
                 '../chembl/chembl_26.0_grouptarget_targetcmpt_ls.ttl',
                 '../chembl/chembl_26.0_indication.ttl',
                 '../chembl/chembl_26.0_journal.ttl',
                 '../chembl/chembl_26.0_moa.ttl',
                 '../chembl/chembl_26.0_molecule_chebi_ls.ttl',
                 '../chembl/chembl_26.0_molhierarchy.ttl',
                 '../chembl/chembl_26.0_protclass.ttl',
                 '../chembl/chembl_26.0_singletarget_targetcmpt_ls.ttl',
                 '../chembl/chembl_26.0_source.ttl',
                 '../chembl/chembl_26.0_target.ttl',
                 '../chembl/chembl_26.0_targetcmpt.ttl',
                 '../chembl/chembl_26.0_targetcmpt_uniprot_ls.ttl',
                 '../chembl/chembl_26.0_targetrel.ttl',
                 '../chembl/chembl_26.0_unichem.ttl']:

    # The file extension is used as the rdflib format name for reading and writing.
    fm = filename.split('.')[-1]
    fn = 'reduced_' + filename.split('/')[-1]

    graph = Graph()
    # Graph.parse is used instead of Graph.load: load was a deprecated alias
    # that forwarded to parse and is no longer available in rdflib 6+.
    graph.parse(filename,format=fm)

    # Drop literal-valued triples, then rename chemicals and species.
    graph = remove_literals(graph)
    graph = replace_entities(graph,cas_chembl_mapping)
    graph = replace_entities(graph,species_mappings)

    new_graph = get_all_connections(graph,seed_entities,back_tracking=0)
    new_graph.serialize(f'only_organic_reduced_kgs/{fn}' if ONLY_ORGANIC else f'reduced_kgs/{fn}',format=fm)