In [69]:
import numpy as np
import pandas as pd
from rdflib import Graph, Literal, URIRef, Namespace
from rdflib.plugins.sparql import prepareQuery
from tqdm import tqdm

In [70]:
%%time
effects = Graph()
effects.load('../TERA_OUTPUT/effects.nt',format='nt')

CPU times: user 5min 7s, sys: 2.2 s, total: 5min 9s
Wall time: 5min 9s


In [71]:
namespace = 'https://cfpub.epa.gov/ecotox/'

In [72]:
from rdflib.namespace import RDF

q = """
    select ?species ?chem ?conc ?unit where {
        ?test ns:hasResult [ns:endpoint ?end ;
                            ns:effect effect:MOR ;
                            ns:concentration [
                                rdf:value ?conc ;
                                unit:units ?unit 
                            ]] .
        ?test ns:species ?species .
        ?test ns:chemical ?chem .
    filter (?unit in (unit:MicrogramPerLitre, unit:MilligramPerLitre))
    filter (?end in (endpoint:LC50, endpoint:LD50, endpoint:EC50) )
    }
    """

q = prepareQuery(q,
                 initNs = { 'ns': Namespace(namespace), 
                           'rdf':RDF,
                           'unit':Namespace('http://qudt.org/vocab/unit#'),
                           'endpoint':Namespace(namespace+'endpoint/'),
                            'effect':Namespace(namespace+'effect/')})


In [73]:
qres = effects.query(q)

In [74]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """select ?cas ?mw where {
  ?c wdt:P231 ?castmp ;
     wdt:P2067 ?mw .
  bind(replace(?castmp,'-','') as ?cas)
}"""

def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

results = get_results(endpoint_url, query)

mw = {}
for result in results["results"]["bindings"]:
    mw[namespace +'cas/'+ result['cas']['value']] = float(result['mw']['value'])

In [75]:
query = """select ?cas where {
          ?chemical wdt:P527 ?part .
          ?chemical wdt:P231 ?castmp .
          bind(replace(?castmp,'-','') as ?cas)
          filter (?part in (wd:Q623, wd:Q556, wd:Q629, wd:Q627, wd:Q650, wd:Q688, wd:Q879, wd:Q1103, wd:Q682, wd:Q674, wd:Q670, wd:Q871, wd:Q925, wd:Q1096) )
        }"""

applicability_domain = set()
results = get_results(endpoint_url, query)

for result in results["results"]["bindings"]:
    applicability_domain.add(namespace +'cas/'+ result['cas']['value'])

In [76]:
data = []
j = 0
for i,row in enumerate(qres):
    a,c,b,u = row
    try:
        f = 1e6 if 'Micro' in str(u) else 1e3
        data.append((str(a),str(c),float(b)/f/mw[str(c)]))
    except:
        j += 1
len(data),j

(83847, 9602)

In [77]:
data = pd.DataFrame(data=dict(zip(['species','chemical','conc (mol/L)'],zip(*data))))
data = data[data['chemical'].isin(applicability_domain)]

idx = (data.groupby(['species','chemical']).count() >= 3).index.tolist()
data = data.set_index(['species','chemical'])
data = data.loc[data.index.isin(idx)].reset_index()
data

Unnamed: 0,species,chemical,conc (mol/L)
0,https://cfpub.epa.gov/ecotox/taxon/2,https://cfpub.epa.gov/ecotox/cas/7758987,1.825265e-05
1,https://cfpub.epa.gov/ecotox/taxon/7,https://cfpub.epa.gov/ecotox/cas/1910425,4.295979e-05
2,https://cfpub.epa.gov/ecotox/taxon/212,https://cfpub.epa.gov/ecotox/cas/7778509,4.765584e-05
3,https://cfpub.epa.gov/ecotox/taxon/1460,https://cfpub.epa.gov/ecotox/cas/2921882,2.865937e-08
4,https://cfpub.epa.gov/ecotox/taxon/230,https://cfpub.epa.gov/ecotox/cas/121755,1.117090e-01
...,...,...,...
39777,https://cfpub.epa.gov/ecotox/taxon/19218,https://cfpub.epa.gov/ecotox/cas/10108642,2.175793e-05
39778,https://cfpub.epa.gov/ecotox/taxon/5,https://cfpub.epa.gov/ecotox/cas/7681825,5.203677e-04
39779,https://cfpub.epa.gov/ecotox/taxon/109,https://cfpub.epa.gov/ecotox/cas/74839,9.580379e-06
39780,https://cfpub.epa.gov/ecotox/taxon/16176,https://cfpub.epa.gov/ecotox/cas/52645531,6.947311e-07


In [78]:
tmp = data.groupby(['species','chemical']).median()
effect_data = list(zip(*list(zip(*tmp.index.values)),-np.log10(tmp.values.ravel())))
pd.DataFrame(data=dict(zip(['species','chemical','conc (mol/L)'],zip(*effect_data)))).to_csv('effect_data.csv')

In [79]:
data.groupby(['species']).count().sort_values(ascending=False,by='chemical')

Unnamed: 0_level_0,chemical,conc (mol/L)
species,Unnamed: 1_level_1,Unnamed: 2_level_1
https://cfpub.epa.gov/ecotox/taxon/4,2907,2907
https://cfpub.epa.gov/ecotox/taxon/1,2863,2863
https://cfpub.epa.gov/ecotox/taxon/5,1987,1987
https://cfpub.epa.gov/ecotox/taxon/2,1214,1214
https://cfpub.epa.gov/ecotox/taxon/2371,707,707
...,...,...
https://cfpub.epa.gov/ecotox/taxon/28845,1,1
https://cfpub.epa.gov/ecotox/taxon/28846,1,1
https://cfpub.epa.gov/ecotox/taxon/28849,1,1
https://cfpub.epa.gov/ecotox/taxon/28850,1,1


In [80]:
len(applicability_domain.intersection(set(data['chemical'])))

608