In [20]:
import numpy as np
import pandas as pd
from rdflib import Graph, Literal, URIRef, Namespace
from rdflib.plugins.sparql import prepareQuery
from tqdm import tqdm

In [21]:
ONLY_ORGANIC = True

In [22]:
%%time
effects = Graph()
effects.load('../TERA_OUTPUT/effects.nt',format='nt')

CPU times: user 5min 15s, sys: 2.12 s, total: 5min 17s
Wall time: 5min 17s


In [23]:
namespace = 'https://cfpub.epa.gov/ecotox/'

In [24]:
from rdflib.namespace import RDF

q = """
    select ?species ?chem ?conc ?unit where {
        ?test ns:hasResult [ns:endpoint ?end ;
                            ns:effect effect:MOR ;
                            ns:concentration [
                                rdf:value ?conc ;
                                unit:units ?unit 
                            ]] .
        ?test ns:species ?species .
        ?test ns:chemical ?chem .
    filter (?unit in (unit:MicrogramPerLitre, unit:MilligramPerLitre))
    filter (?end in (endpoint:LC50, endpoint:LD50, endpoint:EC50) )
    }
    """

q = prepareQuery(q,
                 initNs = { 'ns': Namespace(namespace), 
                           'rdf':RDF,
                           'unit':Namespace('http://qudt.org/vocab/unit#'),
                           'endpoint':Namespace(namespace+'endpoint/'),
                            'effect':Namespace(namespace+'effect/')})


In [25]:
qres = effects.query(q)

In [26]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """select ?cas ?mw where {
  ?c wdt:P231 ?castmp ;
     wdt:P2067 ?mw .
  bind(replace(?castmp,'-','') as ?cas)
}"""

def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

results = get_results(endpoint_url, query)

mw = {}
for result in results["results"]["bindings"]:
    mw[namespace +'cas/'+ result['cas']['value']] = float(result['mw']['value'])

In [27]:
query = """select ?cas where {
          ?chemical wdt:P527 ?part .
          ?chemical wdt:P231 ?castmp .
          bind(replace(?castmp,'-','') as ?cas)
          filter (?part in (wd:Q623, wd:Q556, wd:Q629, wd:Q627, wd:Q650, wd:Q688, wd:Q879, wd:Q1103, wd:Q682, wd:Q674, wd:Q670, wd:Q871, wd:Q925, wd:Q1096) )
        }"""

only_organic_query = """select ?cas where {
          ?chemical wdt:P527 ?part .
          ?chemical wdt:P231 ?castmp .
          bind(replace(?castmp,'-','') as ?cas)
          filter (?part in (wd:Q623) )
        }"""

applicability_domain = set()
results = get_results(endpoint_url, only_organic_query if ONLY_ORGANIC else query)

for result in results["results"]["bindings"]:
    applicability_domain.add(namespace +'cas/'+ result['cas']['value'])

In [28]:
data = []
j = 0
for i,row in enumerate(qres):
    a,c,b,u = row
    try:
        f = 1e6 if 'Micro' in str(u) else 1e3
        data.append((str(a),str(c),float(b)/f/mw[str(c)]))
    except:
        j += 1
len(data),j

(83674, 9775)

In [29]:
data = pd.DataFrame(data=dict(zip(['species','chemical','conc (mol/L)'],zip(*data))))
data = data[data['chemical'].isin(applicability_domain)]

idx = (data.groupby(['species','chemical']).count() >= 3).index.tolist()
data = data.set_index(['species','chemical'])
data = data.loc[data.index.isin(idx)].reset_index()
data.head()

Unnamed: 0,species,chemical,conc (mol/L)
0,https://cfpub.epa.gov/ecotox/taxon/101,https://cfpub.epa.gov/ecotox/cas/107028,9.102882e-07
1,https://cfpub.epa.gov/ecotox/taxon/5473,https://cfpub.epa.gov/ecotox/cas/62737,0.0005910542
2,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/91203,4.747663e-05
3,https://cfpub.epa.gov/ecotox/taxon/29050,https://cfpub.epa.gov/ecotox/cas/2921882,4.493216e-05
4,https://cfpub.epa.gov/ecotox/taxon/1049,https://cfpub.epa.gov/ecotox/cas/2921882,2.808618e-07


In [30]:
tmp = data.groupby(['species','chemical']).median()
effect_data = list(zip(*list(zip(*tmp.index.values)),-np.log10(tmp.values.ravel())))
df = pd.DataFrame(data=dict(zip(['species','chemical','conc (mol/L)'],zip(*effect_data))))
df.to_csv('only_organic_effect_data.csv' if ONLY_ORGANIC else + 'effect_data.csv')

In [31]:
df

Unnamed: 0,species,chemical,conc (mol/L)
0,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100414,3.398977
1,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100425,3.512146
2,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100447,4.237130
3,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100516,2.147166
4,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100527,3.610505
...,...,...,...
4426,https://cfpub.epa.gov/ecotox/taxon/9943,https://cfpub.epa.gov/ecotox/cas/110918,3.062257
4427,https://cfpub.epa.gov/ecotox/taxon/997,https://cfpub.epa.gov/ecotox/cas/148243,0.582372
4428,https://cfpub.epa.gov/ecotox/taxon/997,https://cfpub.epa.gov/ecotox/cas/50293,6.004827
4429,https://cfpub.epa.gov/ecotox/taxon/997,https://cfpub.epa.gov/ecotox/cas/52645531,6.312399
