In [1]:
import pandas as pd


# Notebook-wide settings.
DISPLAY_ALL_TEXT = False  # set True to request untruncated text columns
data_folder = "../../../data"  # data root, relative to this notebook

# Cap displayed column width at 50 characters unless full text was requested.
# NOTE(review): 0 is presumably meant as "no limit"; pandas documents None for that - confirm.
pd.set_option(
    "display.max_colwidth",
    50 if not DISPLAY_ALL_TEXT else 0,
)

In [None]:
# RDF graph and namespace helpers used when building the evidence graph.
from rdflib import Graph, Namespace

In [11]:
df_op = pd.read_csv(f'{data_folder}/input/openpredict-omim-drug.csv')

In [12]:
df_op.head()

Unnamed: 0,drugid,omimid
0,DB01148,231200
1,DB01148,155100
2,DB01148,273800
3,DB00575,607554
4,DB00575,171300


In [13]:
# Give the id columns the snake_case names that the following cells refer to.
df_op = df_op.rename(columns=dict(drugid='drug_id', omimid='disease_id'))

In [14]:
# Cast disease ids to str: later cells compare and look them up as strings.
df_op['disease_id'] = df_op['disease_id'].astype(str)

In [6]:
# Keep disabled: later cells look drugs up in the embeddings by bare accession
# (e.g. 'DB00627') and add this prefix through the DRUGB namespace when building triples.
#df_op.drug_id= 'https://identifiers.org/drugbank:'+df_op.drug_id

In [7]:
# Keep disabled: later cells look diseases up in the embeddings by bare OMIM id
# (e.g. '606798') and add this prefix through the OMIM namespace when building triples.
#df_op.disease_id= 'https://identifiers.org/omim:'+df_op.disease_id

In [45]:
df_op.head(20)

Unnamed: 0,drug_id,disease_id
0,DB01148,231200
1,DB01148,155100
2,DB01148,273800
3,DB00575,607554
4,DB00575,171300
5,DB00575,102300
6,DB00575,137580
7,DB00575,147530
8,DB00575,601042
9,DB00575,608622


In [30]:
# Known drug-disease indications as a set of (drug_id, disease_id) string pairs.
# Built column-wise with zip instead of DataFrame.iterrows(), which materialises
# a Series for every row and is needlessly slow for a plain two-column projection.
# NOTE: despite its name this is a set, not a dict; the name is kept because
# later cells reference it.
indications_dict = set(
    zip(map(str, df_op['drug_id']), map(str, df_op['disease_id']))
)
len(indications_dict)

1933

### Embedding

### Drug features

In [16]:
drug_fp_df =pd.read_csv(f'{data_folder}/baseline_features/drugs-fingerprint-sim.csv')

In [17]:
drug_fp_df= drug_fp_df.pivot(index='Drug1', columns='Drug2', values=['TC'])

In [18]:
drug_fp_df= drug_fp_df.fillna(0.0)

### Disease features

In [58]:
disease_hp_df =pd.read_csv(f'{data_folder}/baseline_features/diseases-hpo-sim.csv')

In [59]:
disease_hp_df.head()

Unnamed: 0,Disease1,Disease2,HPO-SIM
0,157950,606798,0.373642
1,157950,115300,0.344222
2,157950,270960,0.355539
3,157950,606842,0.322567
4,157950,246400,0.343388


In [60]:
disease_hp_df= disease_hp_df.pivot(index='Disease1', columns='Disease2', values=['HPO-SIM'])

In [61]:
disease_hp_df= disease_hp_df.fillna(0.0)

In [62]:
### Convert similarity matrix to embeddings (gensim word2vec format)

In [21]:
def sim_mat2emb_file(sm_df, file_path):
    """Write a similarity matrix as a word2vec-style text embedding file.

    The first line is "<n_rows> <n_cols>"; every following line is a row
    label followed by that row's values, space-separated, with no column
    header.

    Args:
        sm_df: pandas DataFrame whose index holds the entity ids and whose
            cells hold the similarity scores.
        file_path: destination path. An existing file is overwritten.
    """
    n_rows, n_cols = sm_df.shape
    # Truncate instead of appending: re-running the cell in append mode would
    # stack a second header and matrix onto the old contents, leaving a file
    # that no longer matches the dimensions declared on its first line.
    with open(file_path, 'w') as file:
        file.write(f'{n_rows} {n_cols}\n')
        sm_df.to_csv(file, sep=' ', header=False)

In [22]:
sim_mat2emb_file(drug_fp_df,f'{data_folder}/embedding/drugs_fp_embed.txt')

In [63]:
sim_mat2emb_file(disease_hp_df,f'{data_folder}/embedding/disease_hp_embed.txt')

In [23]:
from gensim.models import KeyedVectors

In [24]:
drug_fp_vectors = KeyedVectors.load_word2vec_format(f'{data_folder}/embedding/drugs_fp_embed.txt', binary=False)


In [25]:
drug_fp_vectors.most_similar('DB00627', topn=5)

[('DB00586', 0.9244383573532104),
 ('DB00642', 0.9216973781585693),
 ('DB00622', 0.915402889251709),
 ('DB00631', 0.9129185676574707),
 ('DB00640', 0.9127779603004456)]

In [64]:
disease_hp_vectors = KeyedVectors.load_word2vec_format(f'{data_folder}/embedding/disease_hp_embed.txt', binary=False)


In [66]:
disease_hp_vectors.most_similar('606798',topn=5)

[('182830', 0.9539228081703186),
 ('604827', 0.9504092335700989),
 ('160300', 0.9473865032196045),
 ('606842', 0.9470750093460083),
 ('137800', 0.9442803859710693)]

### API to retrieve evidence graph for query drug-disease pair

In [27]:
from rdflib import Namespace
from rdflib import Graph, URIRef, Literal, RDF, ConjunctiveGraph

In [47]:
OMIM = Namespace("https://identifiers.org/omim:")
DRUGB = Namespace("https://identifiers.org/drugbank:")
BIOLINK = Namespace("https://w3id.org/biolink/vocab/")


In [55]:
# openpredict:most_similar('DB00570', 10)
def get_evidence_path_for_pairs(drug, disease, emb_vectors, topn=10):
    """Collect known indications that support a query drug-disease pair.

    Scans every (drug, disease) pair in the module-level `indications_dict`
    and keeps two kinds of evidence:

    * a drug known to treat `disease` whose `topn` nearest neighbours in
      `emb_vectors` include `drug`;
    * a disease known to be treated by `drug` whose `topn` nearest
      neighbours in `emb_vectors` include `disease`.

    The same `emb_vectors` is consulted for both branches, and ids missing
    from it are skipped, so which branch can fire depends on which ids the
    embeddings contain.

    Args:
        drug: query drug id, as stored in `indications_dict`.
        disease: query disease id, as stored in `indications_dict`.
        emb_vectors: embedding lookup supporting `key in emb_vectors` and
            `most_similar(key, topn=...)` returning (key, score) pairs,
            e.g. the gensim KeyedVectors loaded in this notebook.
        topn: number of nearest neighbours to inspect per known entity.

    Returns:
        An rdflib Graph holding a `treats` triple for each supporting
        indication plus a `similar_to` triple linking it to the query entity.
    """
    g = Graph()
    for (dr, ds) in indications_dict:
        # Drug side: `dr` treats the query disease and is close to the query drug.
        if ds == disease and dr in emb_vectors:
            similar_drugs = {en for en, _sim in emb_vectors.most_similar(dr, topn=topn)}
            if drug in similar_drugs:
                print(dr, ' ', drug)
                g.add((DRUGB[dr], BIOLINK['treats'], OMIM[ds]))
                g.add((DRUGB[dr], BIOLINK['similar_to'], DRUGB[drug]))
        # Disease side: the query drug treats `ds`, which is close to the query disease.
        if dr == drug and ds in emb_vectors:
            # Was `word_vectors`, a name defined nowhere, which raised NameError
            # as soon as this branch was reached.
            similar_diseases = {en for en, _sim in emb_vectors.most_similar(ds, topn=topn)}
            if disease in similar_diseases:
                print(ds, ' ', disease)
                g.add((DRUGB[dr], BIOLINK['treats'], OMIM[ds]))
                g.add((OMIM[ds], BIOLINK['similar_to'], OMIM[disease]))
    return g

In [68]:
g1 = get_evidence_path_for_pairs('DB00570','236000', drug_fp_vectors)

DB00563   DB00570
DB00541   DB00570


In [69]:
# Disease-side evidence needs the loaded disease embeddings. The pivoted
# DataFrame `disease_hp_df` has no most_similar(), and `in` on a DataFrame
# tests column labels, so passing it made every pair get skipped silently.
g2 = get_evidence_path_for_pairs('DB00570', '236000', disease_hp_vectors)

In [70]:
# Print each (subject, predicate, object) triple of the graph built from the drug embeddings.
for triple in g1:
    print(*triple)

https://identifiers.org/drugbank:DB00541 https://w3id.org/biolink/vocab/similar_to https://identifiers.org/drugbank:DB00570
https://identifiers.org/drugbank:DB00541 https://w3id.org/biolink/vocab/treats https://identifiers.org/omim:236000
https://identifiers.org/drugbank:DB00563 https://w3id.org/biolink/vocab/similar_to https://identifiers.org/drugbank:DB00570
https://identifiers.org/drugbank:DB00563 https://w3id.org/biolink/vocab/treats https://identifiers.org/omim:236000


In [71]:
# Print each (subject, predicate, object) triple held in g2.
for triple in g2:
    print(*triple)