In [154]:
from rdflib import Graph, URIRef
import numpy as np
import glob 
graph = Graph()
for filename in glob.glob('reduced_kgs/reduced_*'):
    graph.load(filename,format=filename.split('.')[-1])

In [155]:
len(graph)


218327

In [156]:
TRAINING = False

In [157]:
entities = set(graph.subjects()) | set(graph.objects())
relations = set(graph.predicates())
len(entities), len(relations)

(58176, 41)

In [158]:
entity_mappings = {e:i for i,e in enumerate(entities)}
relation_mappings = {e:i for i,e in enumerate(relations)}
triples = np.asarray(list(map(lambda x: (entity_mappings[x[0]],
                                         relation_mappings[x[1]],
                                         entity_mappings[x[2]]),graph)))

In [159]:
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv2D, Flatten
import tensorflow as tf

def TransE(dim=200,bias=1,lamb=1):
    
    inp = Input((3,))
    inp_label = Input(())
    
    s,p,o = tf.unstack(inp,axis=-1)
    
    entity_embedding = Embedding(len(entities),dim,name='entity_embedding')
    relation_embedding = Embedding(len(relations),dim,name='relation_embedding')
    
    h,r,t = entity_embedding(s),relation_embedding(p),entity_embedding(o)
    
    score = bias - tf.norm(h+r-t, ord=2, axis=-1)
    
    loss = lamb - inp_label * score
    loss = tf.where(loss>0,loss,0) + \
    1e-3 * tf.norm(entity_embedding.weights[0],ord=2)**2
    
    model = Model(inputs=[inp,inp_label],outputs=score)
    model.add_loss(loss)
    model.compile(optimizer='adam',loss=None)
    
    return model

def DistMult(dim=200):
    inp = Input((3,))
    inp_label = Input(())
    
    s,p,o = tf.unstack(inp,axis=-1)
    
    entity_embedding = Embedding(len(entities),dim,name='entity_embedding')
    relation_embedding = Embedding(len(relations),dim,name='relation_embedding')
    
    h,r,t = entity_embedding(s),relation_embedding(p),entity_embedding(o)
    
    score = tf.keras.layers.Activation('tanh')(tf.reduce_sum(h*r*t,axis=-1))
    
    model = Model(inputs=[inp,inp_label],outputs=score)
    
    loss = lambda true,pred: tf.reduce_sum(tf.math.log(1+tf.math.exp(-true*pred))) + \
    1e-3 * tf.norm(entity_embedding.weights[0],ord=2)**2
    
    model.compile(optimizer='adam',loss=loss)
    
    return model

def ComplEx(dim=200):
    inp = Input((3,))
    inp_label = Input(())
    
    s,p,o = tf.unstack(inp,axis=-1)
    
    entity_embedding = Embedding(len(entities),dim,name='entity_embedding')
    relation_embedding = Embedding(len(relations),dim,name='relation_embedding')
    
    h,r,t = entity_embedding(s),relation_embedding(p),entity_embedding(o)
    
    h_real,h_img = tf.split(h,2,axis=-1)
    r_real,r_img = tf.split(r,2,axis=-1)
    t_real,t_img = tf.split(t,2,axis=-1)
    
    score = tf.reduce_sum(r_real*h_real*t_real,axis=-1) + \
    tf.reduce_sum(r_real*h_img*t_img,axis=-1) + \
    tf.reduce_sum(r_img*h_real*t_img,axis=-1) - \
    tf.reduce_sum(r_img*h_img*t_real,axis=-1)
        
    model = Model(inputs=[inp,inp_label],outputs=score)
    
    loss = lambda true,pred: tf.reduce_sum(tf.math.log(1+tf.math.exp(-true*pred))) + \
    1e-3 * tf.norm(entity_embedding.weights[0],ord=2)**2
    
    model.compile(optimizer='adam',loss=loss)
    
    return model


def ConvE():
    inp = Input((3,))
    inp_label = Input(())
    
    s,p,o = tf.unstack(inp,axis=-1)
    
    entity_embedding = Embedding(len(entities),200,name='entity_embedding')
    relation_embedding = Embedding(len(relations),200,name='relation_embedding')
    
    h,r,t = entity_embedding(s),relation_embedding(p),entity_embedding(o)
    
    h = tf.reshape(h,(-1,20,10))
    r = tf.reshape(r,(-1,20,10))
    
    x = Concatenate(axis=-1)([h,r])
    
    x = Conv2D(16,(5,5),activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Conv2D(16,(3,3),activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Flatten()(x)
    
    x = Dense(1,activation='sigmoid')
    
    model = Model(inputs=[inp,inp_label],outputs=x)
    
    loss = lambda true,pred: tf.keras.losses.binary_crossentropy(true,pred) + \
    1e-3 * tf.norm(entity_embedding.weights[0],ord=2)**2
    
    model.compile(optimizer='adam',loss=loss)
    
    return model


In [160]:
def create_negative(postive,n=2):
    negative = np.repeat(postive,n,axis=0)
    negative[:,0] = np.random.randint(0,len(entities),size=len(negative))
    negative[:,2] = np.random.randint(0,len(entities),size=len(negative))
    return negative

In [161]:
%timeit create_negative(triples,n=10)

39.4 ms ± 450 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [162]:

import tqdm.notebook as tq

embedding_model = ComplEx()

best_loss = float('inf')
losses = []
patience=10

if TRAINING:

    for i in tq.tqdm(range(100)):

        negative = create_negative(triples,n=32)

        X = np.concatenate([triples,negative],axis=0)
        y = np.concatenate([np.ones(len(triples)),-1*np.ones(len(negative))],axis=0)

        hist = embedding_model.fit((X,y),y,
                         batch_size=8192,
                         shuffle=True,
                         verbose=0)

        l = hist.history['loss'][-1]
        losses.append(l)
        if l < best_loss:
            best_loss = l
            c = 0
        else:
            c += 1

        if c > patience: break
    
    embedding_model.save_weights('model.tf')
            
else:
    embedding_model.load_weights('model.tf')
    



In [163]:
%matplotlib inline
import matplotlib.pyplot as plt
if TRAINING:
    plt.plot(losses)

In [164]:
import pandas as pd

effect_data = pd.read_csv('effect_data.csv')

In [165]:
fps = {}

import sys
from SPARQLWrapper import SPARQLWrapper, JSON
from pubchempy import Compound

endpoint_url = "https://query.wikidata.org/sparql"

query = """select ?cas ?pc where {
  ?c wdt:P231 ?tmp ;
     wdt:P662 ?pc .
  bind(replace(?tmp,'-','') as ?cas)
}"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

try: 
    fps = pd.read_pickle('fingerprints.pkl')

except FileNotFoundError:
    
    results = get_results(endpoint_url, query)
    for result in tq.tqdm(results["results"]["bindings"]):
        chem_id = 'https://cfpub.epa.gov/ecotox/cas/'+result['cas']['value']
        if chem_id in set(effect_data.chemical.values): 
            vioxx = Compound.from_cid(int(result['pc']['value']))
            fps[chem_id] = vioxx.fingerprint
    pd.to_pickle(fps,'fingerprints.pkl')


In [166]:
def to_bin(he): 
    scale = 16 ## equals to hexadecimal
    num_of_bits = 900
    return bin(int(he, scale))[2:].zfill(num_of_bits)

effect_data['fp'] = [to_bin(fps[c]) if c in fps else to_bin('0') for c in effect_data['chemical'].values]

In [167]:
effect_data = effect_data[effect_data['fp']!=to_bin('0')]
effect_data.shape

(8441, 5)

In [168]:
idx = (effect_data.groupby('species').count() > 9)['fp']
idx = [i for i,v in zip(idx.index,idx.values) if v]

In [169]:
#effect_data = effect_data.where(effect_data['species'].isin(idx)).dropna().reset_index()
effect_data

Unnamed: 0.1,Unnamed: 0,species,chemical,conc (mol/L),fp
0,0,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/10025919,3.051629,0011011100010000000000000000000000000000000000...
1,1,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/10028156,5.681105,0011011100010000000000000000001100000000000000...
2,2,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100414,3.398977,0011011100011100000001110000000000000000000000...
3,3,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/100425,3.512146,0011011100011100000001110000000000000000000000...
4,4,https://cfpub.epa.gov/ecotox/taxon/1,https://cfpub.epa.gov/ecotox/cas/10043013,3.127255,0011011100010000000000000000001111000000000001...
...,...,...,...,...,...
8447,8447,https://cfpub.epa.gov/ecotox/taxon/995,https://cfpub.epa.gov/ecotox/cas/10108642,3.444898,0011011100010000000000000000000000000000000000...
8448,8448,https://cfpub.epa.gov/ecotox/taxon/997,https://cfpub.epa.gov/ecotox/cas/148243,0.582372,0011011100011000000001110010001000000000000000...
8449,8449,https://cfpub.epa.gov/ecotox/taxon/997,https://cfpub.epa.gov/ecotox/cas/50293,6.004827,0011011100011100000001110000000000000000000000...
8450,8450,https://cfpub.epa.gov/ecotox/taxon/997,https://cfpub.epa.gov/ecotox/cas/52645531,6.312399,0011011100011110000001111000001100000000000000...


In [170]:
data = effect_data[['fp','species','chemical','conc (mol/L)']].values

In [171]:
data = np.asarray([[fp,
                    entity_mappings[URIRef(s)],
                    entity_mappings[URIRef(c)],
                    conc] for fp,s,c,conc in data if URIRef(s) in entity_mappings and URIRef(c) in entity_mappings])

In [172]:
embedding_model.trainable=False
from tensorflow.keras.layers import Concatenate, BatchNormalization, Activation
def mlp(input_shape,use_embedding=False):
    
    inp_s = Input(())
    inp_c = Input(())
    inp = Input(input_shape)
    
    if use_embedding:
        s = embedding_model.get_layer('entity_embedding')(inp_s)
        c = embedding_model.get_layer('entity_embedding')(inp_c)
    else:
        el = Embedding(len(entities),200)
        s = el(inp_s)
        c = el(inp_c)
    
    fp = Dense(128)(inp)
    #fp = BatchNormalization()(fp)
    fp = Activation('relu')(fp)
    fp = Dropout(0.4)(fp)
    
    s = Dense(128)(s)
    #s = BatchNormalization()(s)
    s = Activation('relu')(s)
    s = Dropout(0.4)(s)
    
    c = Dense(128)(c)
    #c = BatchNormalization()(c)
    c = Activation('relu')(c)
    c = Dropout(0.4)(c)
    
    x = Concatenate(axis=-1)([s,c,fp])
    x = Dense(128)(x)
    #x = BatchNormalization()(x)
    x = Activation('relu')(x)
    
    x = Dropout(0.2)(x)
    x = Dense(1)(x)
    model = Model(inputs=[inp,inp_s,inp_c],outputs=x)
    model.compile(optimizer='adam',loss='mse')
    return model
   
    

In [173]:
from sklearn.model_selection import KFold, GroupKFold, GroupShuffleSplit
from sklearn.metrics import r2_score
import random


Xs,Xc = data[:,1].astype(int),data[:,2].astype(int)
y = data[:,3].astype('float32')
X = data[:,0]

X = np.asarray(list(map(lambda x: np.asarray([float(a) for a in x]), X))).astype('float32')

scores = []

groups = Xc

for seed in tq.tqdm(range(10)):
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)

    oof = np.zeros(y.shape)
    oof_embedding = np.zeros(y.shape)

    for i,(train,test) in tq.tqdm(enumerate(GroupKFold(5).split(X,y,groups=groups))
                                  ,total=5,desc='Folds',leave=False):

        model = mlp(X.shape[-1],use_embedding=False)
        model.fit((X[train],Xs[train],Xc[train]),y[train],
                  validation_data=((X[test],Xs[test],Xc[test]),y[test]),
                  batch_size=8192,epochs=1000,verbose=0,
                 callbacks=[tf.keras.callbacks.EarlyStopping('val_loss',patience=10,restore_best_weights=True)])

        oof[test] += model.predict((X[test],Xs[test],Xc[test])).ravel()

        model = mlp(X.shape[-1],use_embedding=True)
        model.fit((X[train],Xs[train],Xc[train]),y[train],
                  validation_data=((X[test],Xs[test],Xc[test]),y[test]),
                  batch_size=8192,epochs=1000,verbose=0,
                 callbacks=[tf.keras.callbacks.EarlyStopping('val_loss',patience=10,restore_best_weights=True)])

        oof_embedding[test] += model.predict((X[test],Xs[test],Xc[test])).ravel()    
    
    scores.append((r2_score(y,oof),r2_score(y,oof_embedding)))
    

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Folds', max=5.0, style=ProgressStyle(description_width='i…




In [174]:
scores = np.asarray(scores)
m,s = np.mean(scores,axis=0),np.std(scores,axis=0)
f'Out-of-fold, {m[0]} +- {s[0]}', f'Out-of-fold embedding, {m[1]} +- {s[1]}'

('Out-of-fold, 0.11799794378627744 +- 0.016752933877773695',
 'Out-of-fold embedding, 0.18855524973345555 +- 0.013843017892894377')

In [175]:
# Remove species or chemical groups. 
# Rare species. Get chemical groups, species groups. Fish -> amphibes. Insects -> crustsaens
# Per chemical, how many species. 
# What is 100% coverage. MxN matrix without holes.
# Endangered species https://cfpub.epa.gov/ecotox/group/U.S.ThreatenedandEndangeredSpecies
# Weekly update meeting, not fridays. 
# TODO write in Teams channel

In [176]:
len(effect_data['species'].unique()),len(effect_data['chemical'].unique())

(1750, 604)