In [159]:
import pandas as pd
import numpy as np
# Read the two prediction tables (baseline and embedding-based runs) in one go.
predictions, predictions_embedding = (
    pd.read_csv(csv_path)
    for csv_path in ('predictions.csv', 'predictions_embedding.csv')
)
# Preview the baseline table; last expression so the notebook renders it.
predictions.head()

Unnamed: 0.1,Unnamed: 0,species,chemical,prediction
0,0,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/10025919,3.409423
1,1,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/10028156,4.745417
2,2,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100414,3.118691
3,3,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100425,3.371569
4,4,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/10043013,3.91503


In [191]:
# Load the measured effect data that the predictions are later joined against
# (the join in this notebook uses the 'species' and 'chemical' columns).
# NOTE(review): 'conc (mol/L)' is used downstream as 10**(-value), i.e. it is
# presumably a negative log10 molar concentration -- confirm against the
# script that produced this CSV.
effect_data = pd.read_csv('effect_data_extra.csv')
# Preview the first rows.
effect_data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,species,chemical,conc (mol/L),species_divisions,species_others,subClassOf,smiles,smiles_clusters
0,0,0,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/10025919,3.051629,https://cfpub.epa.gov/ecotox/group/Fish,https://cfpub.epa.gov/ecotox/group/StandardTes...,,Cl[Sb](Cl)Cl,4
1,1,1,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/10028156,5.681105,https://cfpub.epa.gov/ecotox/group/Fish,https://cfpub.epa.gov/ecotox/group/StandardTes...,,[O-][O+]=O,4
2,2,2,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100414,3.398977,https://cfpub.epa.gov/ecotox/group/Fish,https://cfpub.epa.gov/ecotox/group/StandardTes...,http://purl.obolibrary.org/obo/CHEBI_33832,CCC1=CC=CC=C1,2
3,3,3,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100425,3.512146,https://cfpub.epa.gov/ecotox/group/Fish,https://cfpub.epa.gov/ecotox/group/StandardTes...,"http://purl.obolibrary.org/obo/CHEBI_134179,ht...",C=CC1=CC=CC=C1,2
4,4,4,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/10043013,3.127255,https://cfpub.epa.gov/ecotox/group/Fish,https://cfpub.epa.gov/ecotox/group/StandardTes...,,O=S1(=O)O[Al]2OS(=O)(=O)O[Al](O1)OS(=O)(=O)O2,4


In [161]:
# Attach the measured effect data to each prediction row. A left join keeps
# every prediction even when no matching (species, chemical) measurement exists.
# NOTE: this cell rebinds its own inputs, so running it twice joins the effect
# columns a second time.
predictions = predictions.merge(effect_data, how='left', on=['species', 'chemical'])
predictions_embedding = predictions_embedding.merge(effect_data, how='left', on=['species', 'chemical'])

In [162]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
# Base URI of the ECOTOX identifiers used for species and chemicals.
# NOTE(review): not referenced by the code visible in this notebook; the CAS
# prefix is spelled out literally where the lookup table is built.
namespace = 'https://cfpub.epa.gov/ecotox/'

# Public Wikidata SPARQL endpoint that the query below is sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# Fetch (?cas, ?mw) pairs; hyphens are stripped from the CAS value so it can be
# appended to the ECOTOX '/cas/' URI prefix.
# NOTE(review): P231 / P2067 are presumably Wikidata's "CAS Registry Number" and
# "mass" properties -- confirm, and confirm the unit of ?mw.
query = """select ?cas ?mw where {
  ?c wdt:P231 ?castmp ;
     wdt:P2067 ?mw .
  bind(replace(?castmp,'-','') as ?cas)
}"""

def get_results(endpoint_url, query):
    """Send a SPARQL query to an endpoint and return the converted JSON response.

    Parameters
    ----------
    endpoint_url : str
        URL of the SPARQL endpoint.
    query : str
        SPARQL query text.

    Returns
    -------
    Whatever ``SPARQLWrapper(...).query().convert()`` yields for the JSON
    return format.
    """
    major, minor = sys.version_info[0], sys.version_info[1]
    # TODO adjust user agent; see https://w.wiki/CX6
    client = SPARQLWrapper(endpoint_url, agent="WDQS-example Python/%s.%s" % (major, minor))
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

# Run the query against Wikidata (requires network access).
results = get_results(endpoint_url, query)

# Lookup table: ECOTOX CAS URI -> ?mw value as float. When a CAS number occurs
# in several bindings, the last one wins.
mw = {
    'https://cfpub.epa.gov/ecotox/cas/' + binding['cas']['value']: float(binding['mw']['value'])
    for binding in results["results"]["bindings"]
}


In [163]:
# Look up the ?mw value of every chemical in the Wikidata table. Plain dict
# indexing is used on purpose: a chemical missing from `mw` raises KeyError
# instead of silently producing NaN.
predictions['molecular_weight'] = predictions['chemical'].map(mw.__getitem__)
predictions_embedding['molecular_weight'] = predictions_embedding['chemical'].map(mw.__getitem__)

In [164]:
def f(x, col):
    """Return ``1e3 * 10**(-x[col]) * x['molecular_weight']``.

    Callers in this notebook store the result in columns named '... conc (mg/L)'.
    NOTE(review): that presumably means ``x[col]`` is a negative log10
    concentration in mol/L and 'molecular_weight' is in g/mol -- confirm.

    Parameters
    ----------
    x : mapping or DataFrame
        Must provide ``x[col]`` and ``x['molecular_weight']``.
    col : str
        Key/column holding the value to convert.
    """
    molar = 10 ** (-x[col])
    return 1e3 * molar * x['molecular_weight']

# Convert the model output and the measured value with the same helper `f`,
# one frame at a time. Within each frame the predicted column is still added
# before the true one.
predictions['predicted conc (mg/L)'] = f(predictions, col='prediction')
predictions['true conc (mg/L)'] = f(predictions, col='conc (mol/L)')

predictions_embedding['predicted conc (mg/L)'] = f(predictions_embedding, col='prediction')
predictions_embedding['true conc (mg/L)'] = f(predictions_embedding, col='conc (mol/L)')

In [165]:
def hazard_function(c):
    """Map a concentration (mg/L) to a hazard category label.

    Returns the string 'NaN' when `c` is NaN, otherwise the first category
    whose inclusive upper bound is not exceeded:

    * c <= 1    -> 'Category 1' (very toxic)
    * c <= 10   -> 'Category 2' (toxic)
    * c <= 100  -> 'Category 3' (harmful)
    * otherwise -> 'Category 4' (maybe harmful)
    """
    if np.isnan(c):
        return 'NaN'
    # (inclusive upper bound in mg/L, label), checked in ascending order.
    upper_bounds = ((1, 'Category 1'), (10, 'Category 2'), (100, 'Category 3'))
    for limit, label in upper_bounds:
        if c <= limit:
            return label
    return 'Category 4'

# Label every predicted and measured concentration with its hazard category.
# Grouped per frame; within each frame 'predicted hazard' is still added
# before 'true hazard'.
predictions['predicted hazard'] = [hazard_function(conc) for conc in predictions['predicted conc (mg/L)'].values]
predictions['true hazard'] = [hazard_function(conc) for conc in predictions['true conc (mg/L)'].values]

predictions_embedding['predicted hazard'] = [hazard_function(conc) for conc in predictions_embedding['predicted conc (mg/L)'].values]
predictions_embedding['true hazard'] = [hazard_function(conc) for conc in predictions_embedding['true conc (mg/L)'].values]


In [166]:
# Non-null count of every column per predicted hazard category. Columns with
# smaller counts than the rest contain missing values for some rows.
predictions.groupby('predicted hazard').count()

Unnamed: 0_level_0,Unnamed: 0_x,species,chemical,prediction,Unnamed: 0_y,Unnamed: 0.1,conc (mol/L),species_divisions,species_others,subClassOf,smiles,smiles_clusters,molecular_weight,predicted conc (mg/L),true conc (mg/L),true hazard
predicted hazard,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Category 1,1246,1246,1246,1246,1246,1246,1246,1246,428,478,1246,1246,1246,1246,1246,1246
Category 2,2039,2039,2039,2039,2039,2039,2039,2039,914,601,2039,2039,2039,2039,2039,2039
Category 3,1800,1800,1800,1800,1800,1800,1800,1800,873,715,1800,1800,1800,1800,1800,1800
Category 4,2036,2036,2036,2036,2036,2036,2036,2036,768,775,2036,2036,2036,2036,2036,2036


In [167]:
from rdflib import Graph, URIRef
import numpy as np
import glob 
# Merge every reduced knowledge-graph file into a single rdflib graph.
graph = Graph()
for filename in glob.glob('reduced_kgs/reduced_*'):
    # The serialization format is taken from the file extension.
    # Graph.parse is used because Graph.load is deprecated (it was a thin
    # wrapper around parse) and is not available in current rdflib releases.
    graph.parse(filename, format=filename.split('.')[-1])

# Sorted vocabularies give every entity / relation a stable integer id.
entities = sorted(set(graph.subjects()) | set(graph.objects()))
relations = sorted(set(graph.predicates()))

entity_mappings = {e: i for i, e in enumerate(entities)}
relation_mappings = {e: i for i, e in enumerate(relations)}

# Integer-encoded (subject, predicate, object) rows, one per triple in the graph.
triples = np.asarray([
    (entity_mappings[s], relation_mappings[p], entity_mappings[o])
    for s, p, o in graph
])

In [168]:
import sys  
# Put the notebook's working directory first on the import path so the local
# `embedding_model` module imported below is found.
sys.path.insert(0, './')
from embedding_model import ComplEx

In [169]:
# Rebuild the ComplEx model over the same entity / relation vocabularies and
# restore its weights from 'model.tf'.
# NOTE(review): the vocabularies presumably have to match the ones used at
# training time for the restored weights to line up -- confirm.
embedding_model = ComplEx(entities,relations)
embedding_model.load_weights('model.tf')
# First weight tensor of the 'entity_embedding' layer as a NumPy array. Later
# cells index its rows with entity_mappings[...], i.e. they assume row i
# belongs to entities[i].
entity_matrix = embedding_model.get_layer('entity_embedding').weights[0].numpy()



In [170]:
# Distinct species / chemical URIs that occur in the predictions.
# The iteration order of these sets fixes the row order of the distance
# matrices and of the inverse_*_mappings built from them, so the sets must not
# be rebuilt or modified in between.
# NOTE(review): `chemicals` is rebound to an rdflib Graph in a later cell;
# re-running the cells that iterate this set after that point would iterate
# the graph instead.
species = set(predictions.species)
chemicals = set(predictions.chemical)

In [171]:
%%time
import tqdm.notebook as tq

# One row per species (in set iteration order): Euclidean (ord=2) distance from
# that species' embedding to every row of entity_matrix, divided by the
# embedding width len(entity_matrix[0]).
# Raises KeyError for a species URI that is not an entity of the graph.
distance_matrix_species = np.asarray([np.linalg.norm(entity_matrix[entity_mappings[URIRef(s)]]-entity_matrix,ord=2,axis=-1) for s in tq.tqdm(species)]) / len(entity_matrix[0])
# Same computation for the chemicals; `chemicals` must still be the set of URIs here.
distance_matrix_chemicals = np.asarray([np.linalg.norm(entity_matrix[entity_mappings[URIRef(c)]]-entity_matrix,ord=2,axis=-1) for c in tq.tqdm(chemicals)]) / len(entity_matrix[0])

HBox(children=(FloatProgress(value=0.0, max=1449.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=520.0), HTML(value='')))


CPU times: user 24.3 s, sys: 19.3 s, total: 43.6 s
Wall time: 43.6 s


In [172]:
# Sanity check: (number of species, number of rows in entity_matrix).
distance_matrix_species.shape

(1449, 59953)

In [173]:
# URI -> row index into the corresponding distance matrix. Enumerates the sets
# in the same iteration order that was used to build the matrix rows.
inverse_species_mappings = {species_uri: row for row, species_uri in enumerate(species)}
inverse_chemical_mappings = {chemical_uri: row for row, chemical_uri in enumerate(chemicals)}

In [174]:
# Load the ECOTOX taxonomy and chemical graphs (N-Triples) used for name lookups.
# Graph.parse is used because Graph.load is deprecated (it was a thin wrapper
# around parse) and is not available in current rdflib releases.
taxonomy = Graph()
taxonomy.parse('../TERA_OUTPUT/ecotox_taxonomy.nt', format='nt')
# NOTE(review): this rebinds `chemicals`, which earlier held the set of
# chemical URIs; cells that iterate that set must run before this one.
chemicals = Graph()
chemicals.parse('../TERA_OUTPUT/ecotox_chemical.nt', format='nt')

In [224]:
from rdflib.namespace import RDFS
def taxon_name(uri):
    """Return the first latinName recorded for `uri` in `taxonomy`.

    Best-effort lookup: if no name is found, or the lookup fails with an
    ordinary exception, `uri` is returned unchanged. KeyboardInterrupt and
    SystemExit are not swallowed.
    """
    try:
        names = taxonomy.objects(subject=URIRef(uri),
                                 predicate=URIRef('https://cfpub.epa.gov/ecotox/latinName'))
        # next() raises StopIteration when there is no name; handled below.
        return str(next(iter(names)))
    except Exception:
        return uri
    
def chemical_name(uri):
    """Return the last path segment of the first rdfs:subClassOf object of `uri`.

    The lookup is made in the `chemicals` graph. Best-effort: if there is no
    such object, or the lookup fails with an ordinary exception, `uri` is
    returned unchanged. KeyboardInterrupt and SystemExit are not swallowed.
    """
    try:
        parents = chemicals.objects(subject=URIRef(uri), predicate=RDFS.subClassOf)
        # next() raises StopIteration when there is no parent; handled below.
        return str(next(iter(parents))).split('/')[-1]
    except Exception:
        return uri
    

In [225]:
def _hazard_rank(label):
    """Return N for a 'Category N' label, or NaN for the 'NaN' placeholder label."""
    # hazard_function labels missing concentrations with the string 'NaN';
    # int('NaN') would raise ValueError, so map it to a float NaN instead.
    return np.nan if label == 'NaN' else int(label.split()[-1])

# Number of hazard categories between truth and prediction (0 = same category).
# Rows where either label is 'NaN' get a NaN error instead of crashing the cell.
predictions_embedding['categorical error'] = [
    abs(_hazard_rank(true) - _hazard_rank(pred))
    for true, pred in zip(predictions_embedding['true hazard'].values,
                          predictions_embedding['predicted hazard'].values)
]
# Absolute difference between the measured value and the prediction, in the
# units of 'conc (mol/L)' / 'prediction' (per row, despite the "mae" in the name).
predictions_embedding['log-mae (mol/L)'] = abs(predictions_embedding['conc (mol/L)'] - predictions_embedding['prediction'])

In [226]:
# Order rows by ascending categorical error (best predictions first).
predictions_embedding = predictions_embedding.sort_values(by='categorical error')

In [228]:
# For the first rows whose true hazard is not 'Category 4', print the n nearest
# entities (in embedding space) of the row's species and of its chemical.
n = 5
for i, (_, row) in enumerate(predictions_embedding[predictions_embedding['true hazard'] != 'Category 4'].iterrows()):

    true_haz = row['true hazard']
    pred_haz = row['predicted hazard']

    # Indices of the n closest entities. Position 0 of the argsort is skipped;
    # presumably it is the query entity itself at distance 0.
    exp_species = np.argsort(distance_matrix_species[inverse_species_mappings[row['species']]])[1:n+1]
    exp_chemical = np.argsort(distance_matrix_chemicals[inverse_chemical_mappings[row['chemical']]])[1:n+1]

    print(f'True hazard: {true_haz}, predicted: {pred_haz}. log-mae (mol/L):',row['log-mae (mol/L)'])

    tn = taxon_name(row['species'])
    cn = chemical_name(row['chemical'])

    # The distance-matrix columns follow entity_matrix row order, which is
    # indexed via entity_mappings, so entities[j] is the entity at column j.
    # This replaces `inverse_entity_mappings`, which no cell visible in this
    # notebook defines (NameError on a fresh kernel). str() keeps unresolved
    # names printing as plain URI strings.
    print(f'{tn} close to',[taxon_name(str(entities[j])) for j in exp_species])
    print(f'{cn} in group',[chemical_name(str(entities[j])) for j in exp_chemical])
    print('')

    # Stops after seven rows (i = 0..6).
    if i > 5: break

True hazard: Category 3, predicted: Category 3. log-mae (mol/L): 0.030371335695738022
Euplotes sp. close to ['Hydrophilus sp.', 'Pseudocandona sp.', 'Penaeidae', 'Strongylocentrotus droebachiensis', 'Aplocheilus panchax']
Chromium in group ['Mercury', 'https://cfpub.epa.gov/ecotox/cas/9012764', 'https://cfpub.epa.gov/ecotox/cas/1319773', 'http://id.nlm.nih.gov/mesh/D001547', 'http://id.nlm.nih.gov/mesh/D005035']

True hazard: Category 2, predicted: Category 2. log-mae (mol/L): 0.2880995056986704
Mytilopsis sallei close to ['Arthropoda', 'Micropterus sp.', 'Mercenaria mercenaria', 'Moina macrocopa', 'Cerastoderma edule']
Mercury in group ['http://id.nlm.nih.gov/mesh/D009588', 'http://id.nlm.nih.gov/mesh/D001547', 'Chromium', 'https://cfpub.epa.gov/ecotox/cas/1319773', 'https://cfpub.epa.gov/ecotox/cas/25322683']

True hazard: Category 1, predicted: Category 1. log-mae (mol/L): 0.6494298202966942
Ophiogomphus sp. close to ['Maxillopoda', 'Fundulus diaphanus', 'Streptocephalus proboscideu