In [1]:
import pandas as pd


DISPLAY_ALL_TEXT = False

pd.set_option("display.max_colwidth", 0 if DISPLAY_ALL_TEXT else 50)

In [2]:
!pip install rdflib



In [3]:
from rdflib import Namespace
from rdflib import Graph, URIRef, Literal, RDF, ConjunctiveGraph

### Dataset contains text and disease and drug mentions which were extracted from the DailyMed drug lables

In [4]:

unlabeled_df  = pd.read_csv('../data/input/raw_cleaned.csv.zip',  compression='zip')
unlabeled_df = unlabeled_df[['DB_ID', 'DO_ID','disease', 'Label_ID', 'DrugName', 'Context','Section' ]]
len(unlabeled_df)

54040

In [5]:
unlabeled_df.head()

Unnamed: 0,DB_ID,DO_ID,disease,Label_ID,DrugName,Context,Section
0,DB00752,DOID_0050773,paraganglioma,9f3ad5aa-8e9d-4512-88a5-15cfd50293f9.xml,TRANYLCYPROMINE,concomitant use or use in rapid succession wit...,Contraindication
1,DB00752,DOID_225,syndrome,9f3ad5aa-8e9d-4512-88a5-15cfd50293f9.xml,TRANYLCYPROMINE,concomitant use or use in rapid succession wit...,Contraindication
2,DB00752,DOID_4481,hay fever,9f3ad5aa-8e9d-4512-88a5-15cfd50293f9.xml,TRANYLCYPROMINE,concomitant use or use in rapid succession wit...,Contraindication
3,DB00752,DOID_0050773,paraganglioma,4de045ff-f76c-48dd-9e5f-4a4f859eec8f.xml,TRANYLCYPROMINE,concomitant use or use in rapid succession wit...,Contraindication
4,DB00752,DOID_225,syndrome,4de045ff-f76c-48dd-9e5f-4a4f859eec8f.xml,TRANYLCYPROMINE,concomitant use or use in rapid succession wit...,Contraindication


In [6]:
unlabeled_df= unlabeled_df.rename(columns={'DrugName':'Drug_name'})
unlabeled_df.drop_duplicates(subset=['Context','DO_ID'], keep='first', inplace=True)
len(unlabeled_df)

39810

In [7]:
unlabeled_df.columns

Index(['DB_ID', 'DO_ID', 'disease', 'Label_ID', 'Drug_name', 'Context',
       'Section'],
      dtype='object')

In [8]:
unlabeled_df.DB_ID= 'http://bio2rdf.org/drugbank:'+unlabeled_df.DB_ID

In [9]:
unlabeled_df.DO_ID= 'http://bio2rdf.org/doid:'+unlabeled_df.DO_ID.str[5:]

### Dataset labelled by experts to be used as test data

In [10]:
experts = pd.read_csv('../data/input/expert_resolved_all.csv')
experts.sample()

Unnamed: 0,#,context,disease_name,drug_name,workers_answers,medical_expert1,medical_expert2,medical_expert3,do_id,drug_brand_name,drug_id,label_id,sheet,expert_consensus,inter_agree_experts,medical_expert4,Final \n(closest option)
124,41.0,Bumetanide tablets USP are indicated for the t...,NEPHROTIC SYNDROME,BUMETANIDE,indication_treatment\nindication_symptomatic_r...,Indication: Symptomatic Relief,Inconsistent,Indication: Treatment,DOID_1184,,DB00887,a94471b1-a588-44e7-b17b-b99f5c3ba31d.xml,indication_symptomatic_relief,Indication: Symptomatic Relief,0,,Indication: Symptomatic Relief


In [11]:
experts = experts.rename(columns={'drug_name':'Drug_name',
                        'disease_name':'disease', 'context':'Context',
                        'do_id':'DO_ID','drug_id':'DB_ID',
                        'label_id':'Label_ID', 'expert_consensus':'relation'})
#experts[['DB_ID', 'DO_ID','disease', 'Label_ID', 'Set_ID','relation', 'Drug_name', 'Context','Section']]


### Defininig a rule as a SPARQL query

If drug ?a treats disease ?f and disease ?f and ?b, then drug ?a can treat disease ?b.
```
Rule : ?a CtD ?f ^ ?f DrD ?b => ?a CtD ?b 
```

```
SELECT DISTINCT ?a ?b where 
{ 
    ?a <http://bio2rdf.org/hetionet:CtD> ?f .
    ?f <http://bio2rdf.org/hetionet:DrD> ?b .
    MINUS { ?a  <http://bio2rdf.org/hetionet:CtD>  ?b .
}

```

In [12]:
query='SELECT DISTINCT ?a ?b where { ?a <http://bio2rdf.org/hetionet:CtD> ?f .'
' \n ?f <http://bio2rdf.org/hetionet:DrD> ?b . '
'\nMINUS { ?a  <http://bio2rdf.org/hetionet:CtD>  ?b .}\n    }'

'\nMINUS { ?a  <http://bio2rdf.org/hetionet:CtD>  ?b .}\n    }'

In [13]:
query='SELECT DISTINCT ?a ?b where { ?a <http://bio2rdf.org/hetionet:CtD> ?f . \n ?f <http://bio2rdf.org/hetionet:DrD> ?b . \n    }'

In [14]:
g= Graph()

In [15]:
!gunzip ../data/rdf/hetionet.ttl.gz

In [16]:
## We used Hetionet KG: https://het.io/about/ 
g.parse('../data/rdf/hetionet.ttl',format="ttl")

<Graph identifier=N957b3f04ce15494c8dabeba44951c731 (<class 'rdflib.graph.Graph'>)>

In [17]:
query='SELECT DISTINCT ?a ?b where { ?a  <http://bio2rdf.org/hetionet:CtD>  ?b   }'

In [18]:
qres= g.query(query)

In [19]:
indications_het =set()
for row in qres:
    #print (row['a'], row['b'])
    indications_het.add((str(row['a']),str(row['b'])))
    #break
len(indications_het)

755

In [20]:
('http://bio2rdf.org/drugbank:DB00858', 'http://bio2rdf.org/doid:1612') in indications_het

False

### Define KG Rules as Labeling Functions in Snorkel
Snorkel is data labeling framework where multiple heuristic and programatic rules can be combined to assign a label to training data 

In [21]:
# Labels to be assigned 
ABSTAIN = -1
INDICATION = 1


In [22]:
#!pip install snorkel

In [23]:
from snorkel.labeling import labeling_function


@labeling_function()
def check_rule(x):
    drug= x.DB_ID
    disease = x.DO_ID
    for row in qres:
        #print (row['a'], row['b'], drug, disease)
        if str(row['a']) == drug and str(row['b']) == disease:
            print (row['a'], row['b'])
            return INDICATION
    else:
        return ABSTAIN

### Define heuristics as Labeling Functions in Snorkel
One common approach to define a rule to label text data is use of phrase, we define a list of phrases to look for in the text


In [24]:
treatment_phrases= [" indicated for the treatment of",
" indicated in the management of",
" indicated for the management of",
"for the management of",
"management of",
" indicated for the maintenance of remission", 
"or the treatment of",
"in the treatment of",
" indicated as",
" indicated in",
"be effective",
"active treatment of",
" indicated for",
"treatment of",
" indicated as an adjunct",
" indicated for use in the treatment of", 
" indicated for the intermittent treatment", 
" indicated to reduce the rate of",
" indicated for the rapid control",
" indicated for the control",
"reduce the risk of",
" indicated as adjunctive treatment",
"for the treatment of"]

In [25]:
@labeling_function()
def check_phrase(x):
    if str(x.Context) !='nan':
        text = x.Context.lower()
        #print ('text')
        for phr in treatment_phrases:
            if phr in text:
                return INDICATION
    return ABSTAIN        

In [26]:
from snorkel.labeling import PandasLFApplier

lfs = [check_rule, check_phrase]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=unlabeled_df.sample(100))

 35%|███▌      | 35/100 [00:00<00:00, 174.69it/s]

http://bio2rdf.org/drugbank:DB01234 http://bio2rdf.org/doid:3310
http://bio2rdf.org/drugbank:DB01234 http://bio2rdf.org/doid:2377
http://bio2rdf.org/drugbank:DB01234 http://bio2rdf.org/doid:2377
http://bio2rdf.org/drugbank:DB00381 http://bio2rdf.org/doid:10763


 91%|█████████ | 91/100 [00:00<00:00, 184.47it/s]

http://bio2rdf.org/drugbank:DB01076 http://bio2rdf.org/doid:10763
http://bio2rdf.org/drugbank:DB00571 http://bio2rdf.org/doid:10763


100%|██████████| 100/100 [00:00<00:00, 177.83it/s]


In [27]:
coverage_check_out, coverage_check = (L_train != ABSTAIN).mean(axis=0)
print(f"check_out coverage: {coverage_check_out * 100:.1f}%")
print(f"check coverage: {coverage_check * 100:.1f}%")

check_out coverage: 6.0%
check coverage: 69.0%


### Defining labeling function for each rule

In [29]:
# This table is the output of the rule mining tool AIME+ using Hetnet KG
df_rules = pd.read_csv('../data/input/rules-hetionet-edges-20-04-2020.tsv',sep='\t')
df_rules.head()

Unnamed: 0,Rule,Head Coverage,Std Confidence,PCA Confidence,Positive Examples,Body size,PCA Body size,Functional variable,Std. Lower Bound,PCA Lower Bound,PCA Conf estimation
0,?a CpD ?f ?f DrD ?b => ?a CpD ?b,0.235897,0.105747,0.105747,92,870,870,?a,0.0,0.0,0.0
1,?f CpD ?b ?a CrC ?f => ?a CpD ?b,0.338462,0.080097,0.27907,132,1648,473,?a,0.0,0.0,0.0
2,?e CpD ?b ?e CrC ?a => ?a CpD ?b,0.317949,0.067982,0.227106,124,1824,546,?a,0.0,0.0,0.0
3,?a DrD ?f ?b DrD ?f => ?a DrD ?b,0.644567,0.150215,0.150215,350,2330,2330,?a,0.0,0.0,0.0
4,?f DrD ?b ?a DrD ?f => ?a DrD ?b,0.631676,0.286789,0.286789,343,1196,1196,?a,0.0,0.0,0.0


### Extract the rules that are related to Compound and Disease (CtD relation)

In [30]:
rules = []
for rule in df_rules.Rule:
    #print(rule)
    body,head = rule.split(' =>')
    
    #print(head)
    relation = head.split('  ')[1].replace('>','')
    if relation == 'CtD':
        body = body.replace('CtD','<http://bio2rdf.org/hetionet:CtD>').\
        replace('CrC','<http://bio2rdf.org/hetionet:CrC>').\
        replace('DrD','<http://bio2rdf.org/hetionet:DrD>')
        
        head = head.replace('CtD','<http://bio2rdf.org/hetionet:CtD>').\
        replace('CrC','<http://bio2rdf.org/hetionet:CrC>').\
        replace('DrD','<http://bio2rdf.org/hetionet:DrD>')
        print (relation)
        print(body, head)
        rules.append({'head':head, 'body':body, 'relation':'CtD'})

CtD
?a  <http://bio2rdf.org/hetionet:CtD>  ?f  ?f  <http://bio2rdf.org/hetionet:DrD>  ?b    ?a  <http://bio2rdf.org/hetionet:CtD>  ?b
CtD
?a  <http://bio2rdf.org/hetionet:CtD>  ?f  ?b  <http://bio2rdf.org/hetionet:DrD>  ?f    ?a  <http://bio2rdf.org/hetionet:CtD>  ?b
CtD
?a  <http://bio2rdf.org/hetionet:CrC>  ?f  ?f  <http://bio2rdf.org/hetionet:CtD>  ?b    ?a  <http://bio2rdf.org/hetionet:CtD>  ?b
CtD
?e  <http://bio2rdf.org/hetionet:CrC>  ?a  ?e  <http://bio2rdf.org/hetionet:CtD>  ?b    ?a  <http://bio2rdf.org/hetionet:CtD>  ?b


In [31]:
rules

[{'head': ' ?a  <http://bio2rdf.org/hetionet:CtD>  ?b',
  'body': '?a  <http://bio2rdf.org/hetionet:CtD>  ?f  ?f  <http://bio2rdf.org/hetionet:DrD>  ?b  ',
  'relation': 'CtD'},
 {'head': ' ?a  <http://bio2rdf.org/hetionet:CtD>  ?b',
  'body': '?a  <http://bio2rdf.org/hetionet:CtD>  ?f  ?b  <http://bio2rdf.org/hetionet:DrD>  ?f  ',
  'relation': 'CtD'},
 {'head': ' ?a  <http://bio2rdf.org/hetionet:CtD>  ?b',
  'body': '?a  <http://bio2rdf.org/hetionet:CrC>  ?f  ?f  <http://bio2rdf.org/hetionet:CtD>  ?b  ',
  'relation': 'CtD'},
 {'head': ' ?a  <http://bio2rdf.org/hetionet:CtD>  ?b',
  'body': '?e  <http://bio2rdf.org/hetionet:CrC>  ?a  ?e  <http://bio2rdf.org/hetionet:CtD>  ?b  ',
  'relation': 'CtD'}]

In [32]:
! pip install gensim

Collecting gensim
  Downloading gensim-4.0.1-cp37-cp37m-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 12.1 MB/s eta 0:00:01
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 12.2 MB/s eta 0:00:01
[31mERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: '/workspace/anaconda3/envs/bio/lib/python3.7/site-packages/numpy-1.20.2.dist-info/METADATA'
[0m


### Embedding

In [37]:
# from gensim.models import KeyedVectors

In [38]:
word_vectors = KeyedVectors.load('../embeddings/hetionet/w2v_vectors.kv')

NameError: name 'KeyedVectors' is not defined

### Convert the rules to SPARQL queries

In [40]:
def construct_query_for_rule(rule):
    patterns = rule['body'].split('  ')
    n_pattern = len(patterns)
    triple_pattern = """SELECT DISTINCT ?a ?b where { 
      
    """
    k =3
    for i in range(int(n_pattern/k)):
        triple_pattern += patterns[i*k] + ' '+patterns[k*i+1] + ' '+patterns[k*i+2] +' . \n'
        #print (triple_pattern)

    triple_pattern+= "MINUS {"+rule['head']+" .}"
    triple_pattern+= """
    }"""
    return triple_pattern

In [34]:
sparql0 = construct_query_for_rule(rules[0])
sparql0
qres0 = g.query(sparql0)

In [41]:
sparql1 = construct_query_for_rule(rules[1])
qres1 = g.query(sparql1)
sparql1

'SELECT DISTINCT ?a ?b where { \n      \n    ?a <http://bio2rdf.org/hetionet:CtD> ?f . \n?b <http://bio2rdf.org/hetionet:DrD> ?f . \nMINUS { ?a  <http://bio2rdf.org/hetionet:CtD>  ?b .}\n    }'

In [43]:
sparql2 = construct_query_for_rule(rules[2])
qres2 = g.query(sparql2)
sparql2

'SELECT DISTINCT ?a ?b where { \n      \n    ?a <http://bio2rdf.org/hetionet:CrC> ?f . \n?f <http://bio2rdf.org/hetionet:CtD> ?b . \nMINUS { ?a  <http://bio2rdf.org/hetionet:CtD>  ?b .}\n    }'

In [45]:
sparql3 = construct_query_for_rule(rules[3])
qres3 = g.query(sparql3)
sparql3

'SELECT DISTINCT ?a ?b where { \n      \n    ?e <http://bio2rdf.org/hetionet:CrC> ?a . \n?e <http://bio2rdf.org/hetionet:CtD> ?b . \nMINUS { ?a  <http://bio2rdf.org/hetionet:CtD>  ?b .}\n    }'

### Use the SPARQL query results in the labeling fuctions 
If the SPARQL results defined for a rule contain a drug and disease pair, it means this rule supports the fact that the drug treats the target

In [46]:
@labeling_function()
def check_rule0(x):
    drug= x.DB_ID
    disease = x.DO_ID
    for row in qres0:
        #print (row['a'], row['b'], drug, disease)
        if str(row['a']) == drug and str(row['b']) == disease:
            return INDICATION
    return ABSTAIN


@labeling_function()
def check_rule1(x):
    drug= x.DB_ID
    disease = x.DO_ID
    for row in qres1:
        #print (row['a'], row['b'], drug, disease)
        if str(row['a']) == drug and str(row['b']) == disease:
            return INDICATION
    return ABSTAIN

@labeling_function()
def check_rule2(x):
    drug= x.DB_ID
    disease = x.DO_ID
    for row in qres2:
        #print (row['a'], row['b'], drug, disease)
        if str(row['a']) == drug and str(row['b']) == disease:
            return INDICATION
    return ABSTAIN

@labeling_function()
def check_rule3(x):
    drug= x.DB_ID
    disease = x.DO_ID
    for row in qres2:
        #print (row['a'], row['b'], drug, disease)
        if str(row['a']) == drug and str(row['b']) == disease:
            return INDICATION
    return ABSTAIN


@labeling_function()
def check_phrase(x):
    if str(x.Context) !='nan':
        text = x.Context.lower()
        #print ('text')
        for phr in treatment_phrases:
            if phr in text:
                return INDICATION
    return ABSTAIN        

In [47]:
# Use Embedding model in the labeling function  
! pip install csv

[31mERROR: Could not find a version that satisfies the requirement csv[0m
[31mERROR: No matching distribution found for csv[0m


In [52]:
import numpy as np
import csv

entity_emb = np.load('../embeddings/embed/hetionet_TransE_l2_entity.npy')
relation_emb = np.load('../embeddings/embed/hetionet_TransE_l2_relation.npy')

entity_idmap_file = '../embeddings/hetionet/entities.tsv'
relation_idmap_file = '../embeddings/hetionet/relations.tsv'

entity_map = {}
entity_id_map = {}
relation_map = {}
with open(entity_idmap_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['id','name'])
    for row_val in reader:
        entity_map[row_val['name']] = int(row_val['id'])
        entity_id_map[int(row_val['id'])] = row_val['name']
        
with open(relation_idmap_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['id','name'])
    for row_val in reader:
        relation_map[row_val['name']] = int(row_val['id'])    
entity_map

{'Anatomy::UBERON:0001295': 0,
 'Gene::6954': 1,
 'Anatomy::UBERON:0000473': 2,
 'Gene::127845': 3,
 'Anatomy::UBERON:0002185': 4,
 'Gene::11346': 5,
 'Gene::7424': 6,
 'Biological Process::GO:0045776': 7,
 'Gene::4297': 8,
 'Gene::8351': 9,
 'Compound::DB00635': 10,
 'Side Effect::C0042384': 11,
 'Gene::200879': 12,
 'Gene::10610': 13,
 'Anatomy::UBERON:0001296': 14,
 'Gene::55159': 15,
 'Anatomy::UBERON:0000998': 16,
 'Gene::1359': 17,
 'Gene::10472': 18,
 'Gene::2771': 19,
 'Gene::1244': 20,
 'Gene::5607': 21,
 'Gene::3988': 22,
 'Gene::9641': 23,
 'Gene::6134': 24,
 'Gene::7412': 25,
 'Gene::322': 26,
 'Biological Process::GO:0051054': 27,
 'Gene::64064': 28,
 'Molecular Function::GO:0008260': 29,
 'Compound::DB00667': 30,
 'Side Effect::C0032343': 31,
 'Gene::5252': 32,
 'Biological Process::GO:0031061': 33,
 'Gene::79649': 34,
 'Cellular Component::GO:0005815': 35,
 'Gene::2079': 36,
 'Gene::3108': 37,
 'Gene::51444': 38,
 'Biological Process::GO:0070647': 39,
 'Compound::DB08868

In [54]:
sim_matrix = np.zeros((entity_emb.shape[0],entity_emb.shape[0]))

In [55]:
# import collections
entity_emb.shape
sim_matrix = np.zeros((entity_emb.shape[0],entity_emb.shape[0]),dtype=float)

for i,emb1 in enumerate(entity_emb):
    for j,emb2 in enumerate(entity_emb):
        cos_sim = np.dot(emb1, emb2)/(np.linalg.norm(emb1)*np.linalg.norm(emb2))
        sim_matrix[i][j] = cos_sim

KeyboardInterrupt: 

In [None]:
# from numpy import dot
# from numpy.linalg import norm

def kg_most_similar(entity_name, topn=20):
#     entity_emb = entity_emb[entity_map[entity_name]]
    similarity = sim_matrix[entity_map[entity_name]]
    mostsimilar_idx = similarity.argsort()[-topn:][::-1]  
    mostsimilar_entities = []
    mostsimilar_entities = [entity_id_map[idx] for idx in mostsimilar_idx]
    return mostsimilar_entities

In [63]:
kg_most_similar('Gene::1136', topn=20)


0

In [None]:

def check_drug_sim(x):
    drug= x.DB_ID
    disease = x.DO_ID
    #print (drug, disease)
    # search all indications treating the same disease
    # if the drug that is similar to the one in the known indication,
    # return 'INDICATION' 
    for (dr, ds) in indications_het:
        if ds == disease and dr in word_vectors:
            similarDrugs = word_vectors.most_similar(dr,topn=20)
            for en,sim in similarDrugs:
                if en == drug:
                    return INDICATION
    return ABSTAIN 

@labeling_function()
def check_disease_sim(x):
    drug= x.DB_ID
    disease = x.DO_ID
    #print (drug, disease)
    for (dr, ds) in indications_het:
        if dr == drug and ds in word_vectors:
            similarDiseases = word_vectors.most_similar(ds,topn=20)
            for en,sim in similarDiseases:
                if en == disease:
                    return INDICATION
    return ABSTAIN 


In [43]:
@labeling_function()
def check_drug_sim(x):
    drug= x.DB_ID
    disease = x.DO_ID
    #print (drug, disease)
    # search all indications treating the same disease
    # if the drug that is similar to the one in the known indication,
    # return 'INDICATION' 
    for (dr, ds) in indications_het:
        if ds == disease and dr in word_vectors:
            similarDrugs = word_vectors.most_similar(dr,topn=20)
            for en,sim in similarDrugs:
                if en == drug:
                    return INDICATION
    return ABSTAIN 

@labeling_function()
def check_disease_sim(x):
    drug= x.DB_ID
    disease = x.DO_ID
    #print (drug, disease)
    for (dr, ds) in indications_het:
        if dr == drug and ds in word_vectors:
            similarDiseases = word_vectors.most_similar(ds,topn=20)
            for en,sim in similarDiseases:
                if en == disease:
                    return INDICATION
    return ABSTAIN 

In [189]:
from snorkel.labeling import PandasLFApplier

lfs = [check_rule0, check_rule1, check_rule2, check_rule3, check_drug_sim, check_disease_sim]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=unlabeled_df.sample(100))

  from pandas import Panel
100%|██████████| 100/100 [00:14<00:00,  6.95it/s]


In [190]:
coverage_check_rule0, coverage_check_rule1, coverage_check_rule2,coverage_check_rule3, coverage_check_drug_sim, coverage_check_disease_sim = (L_train != ABSTAIN).mean(axis=0)
print(f"check_out coverage: {coverage_check_rule0 * 100:.1f}%")
print(f"check coverage: {coverage_check_rule1 * 100:.1f}%")
print(f"check_out coverage: {coverage_check_rule2 * 100:.1f}%")
print(f"check coverage: {coverage_check_rule3 * 100:.1f}%")
print(f"check coverage: {coverage_check_drug_sim * 100:.1f}%")
print(f"check coverage: {coverage_check_disease_sim * 100:.1f}%")


check_out coverage: 1.0%
check coverage: 0.0%
check_out coverage: 0.0%
check coverage: 0.0%
check coverage: 6.0%
check coverage: 4.0%
