In [1]:
from rdflib import Graph, URIRef
import numpy as np
import glob 
# Merge every reduced knowledge-graph file into one rdflib graph.
graph = Graph()
# sorted() keeps the load order reproducible; glob order depends on the filesystem.
for filename in sorted(glob.glob('reduced_kgs/reduced_*')):
    # The file extension is passed as the rdflib serialization format name.
    # Graph.parse() is used because Graph.load() is deprecated and no longer
    # exists in rdflib 6+; it accepts the same source/format arguments.
    graph.parse(filename,format=filename.split('.')[-1])

In [2]:
# Number of triples held in the merged graph.
len(graph)

218327

In [3]:
# Entities are all nodes that appear as a subject or as an object;
# relations are the distinct predicates.
entities = set(graph.subjects()) | set(graph.objects())
relations = set(graph.predicates())
# Display the entity and relation counts.
len(entities), len(relations)

(58176, 41)

In [4]:
# Assign each entity / relation a dense integer id, following the iteration
# order of the corresponding set.
entity_mappings = dict(zip(entities, range(len(entities))))
relation_mappings = dict(zip(relations, range(len(relations))))

# Encode every (subject, predicate, object) statement of the graph as one row
# of integer ids.
triples = np.asarray([
    (entity_mappings[subj], relation_mappings[pred], entity_mappings[obj])
    for subj, pred, obj in graph
])

In [5]:
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout
import tensorflow as tf

def TransE(dim=200,bias=1,lamb=1):
    """Build and compile a TransE-style scoring model.

    The model has two inputs: a batch of (subject, predicate, object) id
    triples and a per-triple label that is only used inside the loss.
    Its output is ``bias - ||h + r - t||_2``.

    Args:
        dim: Width of the entity and relation embeddings.
        bias: Constant from which the translation distance is subtracted.
        lamb: Margin used in the hinge loss.

    Returns:
        A compiled Keras ``Model``; the loss is attached with ``add_loss``,
        so ``compile`` is given ``loss=None``.
    """
    triple_in = Input((3,))
    label_in = Input(())

    # Columns of the triple input: subject id, predicate id, object id.
    subj_ids, pred_ids, obj_ids = tf.unstack(triple_in, axis=-1)

    entity_embedding = Embedding(len(entities), dim, name='entity_embedding')
    relation_embedding = Embedding(len(relations), dim, name='relation_embedding')

    head = entity_embedding(subj_ids)
    rel = relation_embedding(pred_ids)
    tail = entity_embedding(obj_ids)

    distance = tf.norm(head + rel - tail, ord=2, axis=-1)
    score = bias - distance

    # Hinge on the label-signed score, plus an L2 penalty on the entity table.
    margin = lamb - label_in * score
    hinge = tf.where(margin > 0, margin, 0)
    penalty = 1e-3 * tf.norm(entity_embedding.weights[0], ord=2)**2
    loss = hinge + penalty

    model = Model(inputs=[triple_in, label_in], outputs=score)
    model.add_loss(loss)
    model.compile(optimizer='adam', loss=None)

    return model

def DistMult(dim=200):
    """Build and compile a DistMult-style scoring model.

    The model has two inputs: a batch of (subject, predicate, object) id
    triples and a per-triple label input that the score does not depend on.
    Its output is ``tanh(sum(h * r * t))`` over the embedding axis.

    Args:
        dim: Width of the entity and relation embeddings.

    Returns:
        A Keras ``Model`` compiled with Adam and a logistic loss
        ``sum(log(1 + exp(-true * pred)))`` plus an L2 penalty on the
        entity embedding table.
    """
    triple_in = Input((3,))
    label_in = Input(())

    # Columns of the triple input: subject id, predicate id, object id.
    subj_ids, pred_ids, obj_ids = tf.unstack(triple_in, axis=-1)

    entity_embedding = Embedding(len(entities), dim, name='entity_embedding')
    relation_embedding = Embedding(len(relations), dim, name='relation_embedding')

    head = entity_embedding(subj_ids)
    rel = relation_embedding(pred_ids)
    tail = entity_embedding(obj_ids)

    # Trilinear product summed over the embedding axis, squashed by tanh.
    trilinear = tf.reduce_sum(head * rel * tail, axis=-1)
    score = tf.keras.layers.Activation('tanh')(trilinear)

    model = Model(inputs=[triple_in, label_in], outputs=score)

    # The penalty reads the entity table at call time through the closure.
    loss = lambda true, pred: (
        tf.reduce_sum(tf.math.log(1 + tf.math.exp(-true * pred)))
        + 1e-3 * tf.norm(entity_embedding.weights[0], ord=2)**2
    )

    model.compile(optimizer='adam', loss=loss)

    return model

def ComplEx(dim=200):
    """Build and compile a ComplEx-style scoring model.

    Each ``dim``-wide embedding is split into two halves that act as the real
    and imaginary parts. The model has two inputs: a batch of
    (subject, predicate, object) id triples and a per-triple label input that
    the score does not depend on. The output is the unbounded score

        sum(r_re*h_re*t_re) + sum(r_re*h_im*t_im)
        + sum(r_im*h_re*t_im) - sum(r_im*h_im*t_re)

    Args:
        dim: Total embedding width; must be even so it can be halved.

    Returns:
        A Keras ``Model`` compiled with Adam and a logistic loss plus an L2
        penalty on the entity embedding table.

    Raises:
        ValueError: If ``dim`` is odd.
    """
    if dim % 2:
        raise ValueError('ComplEx needs an even dim to split into real/imaginary halves, got %r' % (dim,))

    inp = Input((3,))
    inp_label = Input(())

    # Columns of the triple input: subject id, predicate id, object id.
    s,p,o = tf.unstack(inp,axis=-1)

    entity_embedding = Embedding(len(entities),dim,name='entity_embedding')
    relation_embedding = Embedding(len(relations),dim,name='relation_embedding')

    h,r,t = entity_embedding(s),relation_embedding(p),entity_embedding(o)

    # First half of each vector is the real part, second half the imaginary part.
    h_real,h_img = tf.split(h,2,axis=-1)
    r_real,r_img = tf.split(r,2,axis=-1)
    t_real,t_img = tf.split(t,2,axis=-1)

    score = tf.reduce_sum(r_real*h_real*t_real,axis=-1) + \
    tf.reduce_sum(r_real*h_img*t_img,axis=-1) + \
    tf.reduce_sum(r_img*h_real*t_img,axis=-1) - \
    tf.reduce_sum(r_img*h_img*t_real,axis=-1)

    model = Model(inputs=[inp,inp_label],outputs=score)

    # softplus(x) == log(1 + exp(x)) but does not overflow for large x; the
    # score here is unbounded, so the naive form can produce an inf loss.
    loss = lambda true,pred: tf.reduce_sum(tf.math.softplus(-true*pred)) + \
    1e-3 * tf.norm(entity_embedding.weights[0],ord=2)**2

    model.compile(optimizer='adam',loss=loss)

    return model



In [24]:
def create_negative(positive,n=2):
    """Return ``n`` corrupted copies of every row of ``positive``.

    Each row is repeated ``n`` times; columns 0 and 2 of every copy are then
    overwritten with ids drawn uniformly from ``[0, len(entities))``.
    Column 1 is kept as is and ``positive`` itself is not modified.
    """
    corrupted = np.repeat(positive, n, axis=0)
    n_rows = len(corrupted)
    n_entities = len(entities)
    # Column 0 is drawn before column 2, which fixes the RNG consumption order.
    for column in (0, 2):
        corrupted[:, column] = np.random.randint(0, n_entities, size=n_rows)
    return corrupted

In [25]:
%timeit create_negative(triples,n=10)

38.7 ms ± 824 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
from tqdm import tqdm

embedding_model = ComplEx()

best_loss = float('inf')
patience=10
# Consecutive epochs without improvement. It has to exist before the loop:
# a NaN loss in the first epoch fails the `<` comparison and would otherwise
# reach `+= 1` on an undefined name.
stale_epochs = 0

for i in tqdm(range(100)):

    # Draw a fresh set of corrupted triples for every epoch.
    negative = create_negative(triples,n=10)

    # Positives are labelled +1, negatives -1. The labels are fed both as a
    # model input and as the target.
    X = np.concatenate([triples,negative],axis=0)
    y = np.concatenate([np.ones(len(triples)),-1*np.ones(len(negative))],axis=0)

    hist = embedding_model.fit((X,y),y,
                     batch_size=8192,
                     shuffle=True,
                     verbose=1)

    # Manual early stopping on the training loss.
    epoch_loss = hist.history['loss'][-1]
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        stale_epochs = 0
    else:
        stale_epochs += 1

    if stale_epochs > patience: break
    

  0%|          | 0/100 [00:00<?, ?it/s]



  1%|          | 1/100 [00:28<46:47, 28.36s/it]



  2%|▏         | 2/100 [00:56<46:14, 28.31s/it]



  3%|▎         | 3/100 [01:24<45:47, 28.33s/it]



  4%|▍         | 4/100 [01:53<45:29, 28.43s/it]



  5%|▌         | 5/100 [02:22<45:14, 28.57s/it]



  6%|▌         | 6/100 [02:51<44:53, 28.65s/it]



  7%|▋         | 7/100 [03:20<44:32, 28.74s/it]



  8%|▊         | 8/100 [03:48<43:47, 28.56s/it]



  9%|▉         | 9/100 [04:16<43:06, 28.43s/it]



 10%|█         | 10/100 [04:44<42:23, 28.26s/it]



In [None]:
# Score all rows of the last training set (positives followed by negatives).
# predict() runs in inference mode, works through the data in batches and
# returns a NumPy array; calling the model directly yields a Tensor, which
# has no .ravel() method.
p = embedding_model.predict((X,y),batch_size=8192).ravel()

In [None]:
# Mean score of the true triples (the first len(triples) rows of X)
# versus the mean score of the corrupted triples that follow them.
np.mean(p[:len(triples)]),np.mean(p[len(triples):])

In [None]:
import pandas as pd
# Effect measurements; later cells read its 'chemical', 'species' and
# 'conc (mol/L)' columns.
effect_data = pd.read_csv('effect_data.csv')

In [None]:
# Maps a chemical URI to the fingerprint string fetched for it from PubChem.
fps = {}

import sys
from SPARQLWrapper import SPARQLWrapper, JSON
from pubchempy import Compound

# Wikidata SPARQL endpoint the query is sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# For every item that has both a P231 and a P662 statement, return the P231
# value with '-' removed (?cas) and the P662 value (?pc). The code that
# consumes the results appends ?cas to an ECOTOX CAS URI prefix and passes
# ?pc to Compound.from_cid as an integer compound id.
query = """select ?cas ?pc where {
  ?c wdt:P231 ?tmp ;
     wdt:P662 ?pc .
  bind(replace(?tmp,'-','') as ?cas)
}"""


def get_results(endpoint_url, query):
    """Run ``query`` against the SPARQL endpoint at ``endpoint_url``.

    Args:
        endpoint_url: URL of the SPARQL endpoint.
        query: SPARQL query text.

    Returns:
        The endpoint's response requested as JSON and converted by
        SPARQLWrapper.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    agent = "WDQS-example Python/%s.%s" % sys.version_info[:2]
    client = SPARQLWrapper(endpoint_url, agent=agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

results = get_results(endpoint_url, query)

# Chemicals present in the effect data. Built once up front so the membership
# test inside the loop is a constant-time lookup instead of constructing a new
# set for every SPARQL binding.
known_chemicals = set(effect_data.chemical.values)

for result in results["results"]["bindings"]:
    # The effect data identifies chemicals by an ECOTOX URI ending in the
    # dash-free CAS value returned by the query.
    chem_id = 'https://cfpub.epa.gov/ecotox/cas/'+result['cas']['value']
    if chem_id in known_chemicals:
        # One PubChem lookup per matching chemical.
        compound = Compound.from_cid(int(result['pc']['value']))
        fps[chem_id] = compound.fingerprint

In [None]:
def to_bin(he):
    """Convert a hexadecimal string into a string of binary digits.

    The result is left-padded with '0' to at least 900 characters; values
    that need more than 900 digits are returned at their full length.
    """
    hex_base = 16
    min_width = 900
    value = int(he, hex_base)
    digits = bin(value)[2:]  # strip the leading '0b'
    return digits.rjust(min_width, '0')

# Binary fingerprint for each row; chemicals with no fetched fingerprint fall
# back to the conversion of '0', i.e. the all-zero string.
effect_data['fp'] = [
    to_bin(fps.get(chemical, '0'))
    for chemical in effect_data['chemical'].values
]

In [None]:
# Drop the rows whose fingerprint equals to_bin('0'), the all-zero string.
effect_data = effect_data[effect_data['fp']!=to_bin('0')]
# Display the remaining (rows, columns).
effect_data.shape

In [None]:
# Keep only the columns used downstream, as a plain array with the column
# order: fingerprint, species, chemical, concentration.
data = effect_data[['fp','species','chemical','conc (mol/L)']].values

In [None]:
# Replace the species / chemical URIs by their entity ids and skip every row
# whose species or chemical has no entry in entity_mappings.
data = np.asarray([
    [
        fingerprint,
        entity_mappings[URIRef(species)],
        entity_mappings[URIRef(chemical)],
        concentration,
    ]
    for fingerprint, species, chemical, concentration in data
    if URIRef(species) in entity_mappings and URIRef(chemical) in entity_mappings
])

In [None]:
# Number of rows left after the entity-id mapping, and the column count.
data.shape

In [None]:
# Mark the trained embedding model as non-trainable; its 'entity_embedding'
# layer is reused by mlp(use_embedding=True).
embedding_model.trainable=False
from tensorflow.keras.layers import Concatenate
def mlp(input_shape,use_embedding=False):
    """Build a compiled regressor over a (species id, chemical id) pair.

    Args:
        input_shape: Accepted but not used by the network.
        use_embedding: If True, ids are looked up in the
            ``'entity_embedding'`` layer of ``embedding_model``; otherwise a
            new 200-wide ``Embedding`` over all entities is created.

    Returns:
        A Keras ``Model`` with two scalar id inputs and one output value,
        compiled with the Adam optimizer and MAE loss.
    """
    species_in = Input(())
    chemical_in = Input(())

    # A single lookup layer is shared by both inputs.
    if use_embedding:
        lookup = embedding_model.get_layer('entity_embedding')
    else:
        lookup = Embedding(len(entities), 200)

    species_vec = lookup(species_in)
    chemical_vec = lookup(chemical_in)

    hidden = Concatenate(axis=-1)([species_vec, chemical_vec])
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    prediction = Dense(1)(hidden)

    model = Model(inputs=[species_in, chemical_in], outputs=prediction)
    model.compile(optimizer='adam', loss='mae')
    return model
   

In [None]:

from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

# Species ids, chemical ids and the regression target taken from `data`.
Xs,Xc = data[:,1].astype(int),data[:,2].astype(int)
y = data[:,3].astype('float32')
X = data[:,0]

# Expand every fingerprint string into a float32 vector, one value per character.
X = np.asarray([np.asarray([float(bit) for bit in fp]) for fp in X]).astype('float32')

# Out-of-fold predictions for the two model variants.
oof = np.zeros(y.shape)
oof_embedding = np.zeros(y.shape)

for train,test in KFold(5).split(y):
    # Within each fold the freshly initialised variant is trained first,
    # then the variant that reuses the pretrained entity embeddings.
    for use_pretrained, fold_preds in ((False, oof), (True, oof_embedding)):
        model = mlp(X.shape[-1],use_embedding=use_pretrained)
        # NOTE(review): the held-out fold also serves as validation data for
        # early stopping, so the out-of-fold scores may be optimistic.
        stopper = tf.keras.callbacks.EarlyStopping('val_loss',patience=10,restore_best_weights=True)
        model.fit((Xs[train],Xc[train]),y[train],
                  validation_data=((Xs[test],Xc[test]),y[test]),
                  batch_size=8192,epochs=1000,
                  callbacks=[stopper])

        fold_preds[test] += model.predict((Xs[test],Xc[test])).ravel()
    

In [None]:
# R^2 of the out-of-fold predictions: model with a new embedding layer
# versus model reusing the pretrained entity embeddings.
r2_score(y,oof),r2_score(y,oof_embedding)