# Create Translator benchmark

Addresses https://github.com/TranslatorSRI/Benchmarks/issues/1

In [1]:
import networkx as nx
import pandas as pd
import yaml

from collections import defaultdict
from data_tools.files import download
from pathlib import Path

  from tqdm.autonotebook import tqdm


In [2]:
# Make the output folders
this_name = '5_create_translator_benchmark'
out_dir = Path('../2_pipeline').joinpath(this_name, 'out').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

data_dir = Path('../0_data/external').resolve()
data_dir.mkdir(parents=True, exist_ok=True)

In [3]:
DMDB_URL = 'https://raw.githubusercontent.com/SuLab/DrugMechDB/main/indication_paths.yaml'
download(DMDB_URL, data_dir.joinpath('indication_paths.yaml'), redownload=False)

File indication_paths.yaml exits. Skipping...


In [4]:
with open(data_dir.joinpath('indication_paths.yaml'), 'r') as fh:
        ind = yaml.safe_load(fh)

In [5]:
### functions copied from 1_basic_dmdb_analysis.ipynb

def path_to_tup(path):
    return (path['graph']['drugbank'], path['graph']['disease_mesh'])

def path_to_G(path):
    return nx.node_link_graph(path)                                                                                        

def get_all_paths(path):
    source_id = path['links'][0]['source']                                                                              
    target_ids = list(set([l['target'] for l in path['links']]) - set([l['source'] for l in path['links']]))
    G = path_to_G(path)
    this_paths = list(chain(*[list(nx.all_simple_paths(G, source_id, target_id)) for target_id in target_ids]))         
    return this_paths

def get_id_to_type(G):
    id_to_type = {}
    for n in G.nodes.data():
        id_to_type[n[0]] = n[1]['label']
    return id_to_type

def get_id_to_name(G):
    id_to_name = {}
    for n in G.nodes.data():
        id_to_name[n[0]] = n[1]['name']
    return id_to_name

def add_metaedges(G):
    id_to_type = get_id_to_type(G)
    for e in G.edges:
        G.edges[e]['metaedge'] = id_to_type[e[0]] + ' - ' + e[2] + ' - ' + id_to_type[e[1]]
    return G

def add_meanode_pairs(G):
    id_to_type = get_id_to_type(G)
    for e in G.edges:
        G.edges[e]['mn_pair'] = id_to_type[e[0]] + ' - ' + id_to_type[e[1]]
    return G

def get_targets(G):
    drug = list(G.edges)[0][0]
    targets = []
    for e in G.edges:
        if e[0] == drug:
            targets.append(e[1])
    return targets

def get_target_metaedges(G):
    drug = list(G.edges)[0][0]
    target_mes = []
    if 'metaedge' not in G.edges[list(G.edges)[0]]:
        G = add_metaedges(G)
    
    for e in G.edges:
        if e[0] == drug:
            target_mes.append(G.edges[e]['metaedge'])
    return target_mes

In [6]:
target_metapath = ['Drug', 'Protein','BiologicalProcess','Disease']
df = pd.DataFrame(columns=target_metapath)

for i, p in enumerate(ind):
    G = path_to_G(p)
    metapath = [n[1]['label'] for n in G.nodes.data()]
    if metapath != target_metapath:
        continue
    new_record = {}
    for idx in range(len(target_metapath)):
        new_record[target_metapath[idx]] = list(G.nodes)[idx]
    
    df = df.append([new_record])  


In [7]:
df


Unnamed: 0,Drug,Protein,BiologicalProcess,Disease
0,MESH:D000865,UniProt:P35367,GO:0034776,MESH:D012223
0,MESH:C004649,UniProt:P35367,GO:0034776,MESH:D003233
0,MESH:D005640,UniProt:P23945,GO:0030728,MESH:D016649
0,MESH:C415771,UniProt:P48357,GO:0060612,MESH:D008060
0,MESH:D017336,UniProt:P35367,GO:0034776,MESH:D065631
...,...,...,...,...
0,MESH:C411345,UniProt:P00742,GO:0072378,MESH:D011655
0,MESH:C586847,UniProt:Q9UM73,GO:0008283,MESH:D002289
0,MESH:C030814,UniProt:P38435,GO:0007596,MESH:D006475
0,MESH:C000589393,UniProt:P16234,GO:0008283,MESH:D012509


In [8]:
df.to_csv(out_dir.joinpath('translator_benchmark.txt'),index=None,sep="\t")