In [1]:
from load_data import LoadData
from get_diffusion import Diffusion

import pandas as pd

In [3]:
# --- Project helper objects -------------------------------------------------
# LoadData supplies the name/ID dictionaries used below; Diffusion is used by
# compute_coverage_diff to fetch and rank diffusion profiles.
dataloader = LoadData()
diffusion = Diffusion()

# --- Name <-> ID lookup tables, one pair per entity type ---------------------
# Each get_dict call is unpacked into two objects; judging by the target names
# they are an id->name mapping followed by a name->id mapping (confirm against
# LoadData.get_dict).
(protein_id2name, protein_name2id) = dataloader.get_dict(type='protein')
(drug_id2name, drug_name2id) = dataloader.get_dict(type='drug')
(indication_id2name, indication_name2id) = dataloader.get_dict(type='indication')
(biof_id2name, biof_name2id) = dataloader.get_dict(type='biological_function')

# --- Raw edge tables (tab-separated files) -----------------------------------
# The coverage functions in this notebook read the 'node_1' and 'node_2'
# columns of ind2ptn, drug2ptn and ptn2biof.
ind2ptn = pd.read_csv(
    'data/raw/2_indication_to_protein.tsv',
    sep='\t',
)
drug2ptn = pd.read_csv(
    'data/raw/1_drug_to_protein.tsv',
    sep='\t',
)
ptn2biof = pd.read_csv(
    'data/raw/4_protein_to_biological_function.tsv',
    sep='\t',
)
biof2biof = pd.read_csv(
    'data/raw/5_biological_function_to_biological_function.tsv',
    sep='\t',
)

### PPI network에서만 coverage 확인 (Gene Ontology term 제외)

여기서 coverage란?
- 어떤 entity (indication 또는 drug)와 연관있는 protein의 집합들이 서로 얼마나 겹치는지 check하는 것.

In [5]:
# Naive coverage for one indication and two drugs, using only direct links.
def compute_coverage_naive(ind, drug1, drug2):
    """Count the directly linked nodes shared by an indication and two drugs.

    Each name is resolved to an ID through ``indication_name2id`` or
    ``drug_name2id``. For every entity, the ``node_2`` values of the rows whose
    ``node_1`` equals that ID are collected (from ``ind2ptn`` for the
    indication, from ``drug2ptn`` for the drugs) and de-duplicated.

    Args:
        ind: Indication name; must be a key of ``indication_name2id``.
        drug1: First drug name; must be a key of ``drug_name2id``.
        drug2: Second drug name; must be a key of ``drug_name2id``.

    Returns:
        A 3-tuple of intersection sizes, in this order:
        (indication & drug1, indication & drug2, drug1 & drug2).

    Raises:
        KeyError: If a name is not present in its lookup dictionary.
    """
    # Resolve every name up front so an unknown name fails before any filtering.
    indication_key = indication_name2id[ind]
    first_drug_key = drug_name2id[drug1]
    second_drug_key = drug_name2id[drug2]

    def _linked_nodes(edges, source_id):
        # Distinct node_2 values on the rows whose node_1 equals source_id.
        matches = edges.loc[edges['node_1'] == source_id, 'node_2']
        return set(matches.tolist())

    indication_nodes = _linked_nodes(ind2ptn, indication_key)
    first_drug_nodes = _linked_nodes(drug2ptn, first_drug_key)
    second_drug_nodes = _linked_nodes(drug2ptn, second_drug_key)

    return (
        len(indication_nodes & first_drug_nodes),
        len(indication_nodes & second_drug_nodes),
        len(first_drug_nodes & second_drug_nodes),
    )

In [6]:
# Coverage computed from the top-k nodes of each entity's diffusion profile.
def compute_coverage_diff(ind, drug1, drug2, k=10):
    """Count the top-k diffusion nodes shared by an indication and two drugs.

    For each entity the name is resolved to an ID, its profile is fetched with
    ``diffusion.get_prof(id_=...)`` and passed to
    ``diffusion.rank_top_k_nodes(prof=..., k=k)``. What a profile contains and
    how nodes are ranked is defined by the ``Diffusion`` helper, not here.

    Args:
        ind: Indication name; must be a key of ``indication_name2id``.
        drug1: First drug name; must be a key of ``drug_name2id``.
        drug2: Second drug name; must be a key of ``drug_name2id``.
        k: Forwarded unchanged to ``rank_top_k_nodes`` (default 10).

    Returns:
        A 3-tuple of intersection sizes between the ranked node collections:
        (indication & drug1, indication & drug2, drug1 & drug2).

    Raises:
        KeyError: If a name is not present in its lookup dictionary.
    """
    def _ranked_nodes(name2id, entity_name):
        # Resolve the name, then fetch and rank that entity's profile.
        entity_id = name2id[entity_name]
        return diffusion.rank_top_k_nodes(prof=diffusion.get_prof(id_=entity_id), k=k)

    # Processed strictly in the order indication -> drug1 -> drug2.
    indication_ranked = _ranked_nodes(indication_name2id, ind)
    first_drug_ranked = _ranked_nodes(drug_name2id, drug1)
    second_drug_ranked = _ranked_nodes(drug_name2id, drug2)

    indication_nodes = set(indication_ranked)
    first_drug_nodes = set(first_drug_ranked)
    second_drug_nodes = set(second_drug_ranked)

    return (
        len(indication_nodes & first_drug_nodes),
        len(indication_nodes & second_drug_nodes),
        len(first_drug_nodes & second_drug_nodes),
    )

In [9]:
# Example: direct-link overlap counts for one indication and two drugs.
# The result is (indication & drug1, indication & drug2, drug1 & drug2).
compute_coverage_naive(
    ind='Hypertensive disease',
    drug1='acebutolol',
    drug2='amlodipine',
)

(1, 3, 0)

In [11]:
# Example: overlap of the top-20 diffusion-profile nodes (k overrides the default of 10).
# The result is (indication & drug1, indication & drug2, drug1 & drug2).
compute_coverage_diff(
    ind='Hypertensive disease',
    drug1='timolol',
    drug2='hydrochlorothiazide',
    k=20,
)

(0, 1, 0)

### Gene Ontology term의 coverage 확인

여기서 coverage란?
- 어떤 entity (indication 또는 drug)와 연관된 protein 집합들에 대해, 각 protein들이 연결된 GO term 집합들끼리 얼마나 겹치는지 check.

In [12]:
# Coverage measured on Gene Ontology terms instead of on proteins.
def compute_coverage_gene_ontology(ind, drug1, drug2):
    """Count the annotation terms shared by an indication and two drugs.

    Works in two hops. First, for each entity, the distinct ``node_2`` values
    of the rows whose ``node_1`` equals the entity ID are taken from
    ``ind2ptn`` (indication) or ``drug2ptn`` (drugs). Second, the ``node_2``
    values of every ``ptn2biof`` row whose ``node_1`` is among those first-hop
    values are gathered into a set. Going by the table names, the first hop
    yields proteins and the second biological-function (GO) terms.

    Args:
        ind: Indication name; must be a key of ``indication_name2id``.
        drug1: First drug name; must be a key of ``drug_name2id``.
        drug2: Second drug name; must be a key of ``drug_name2id``.

    Returns:
        A 3-tuple of intersection sizes between the second-hop sets:
        (indication & drug1, indication & drug2, drug1 & drug2).

    Raises:
        KeyError: If a name is not present in its lookup dictionary.
    """
    indication_key = indication_name2id[ind]
    first_drug_key = drug_name2id[drug1]
    second_drug_key = drug_name2id[drug2]

    def _unique_targets(edges, source_id):
        # First hop: de-duplicated node_2 values linked to source_id.
        linked = edges.loc[edges['node_1'] == source_id, 'node_2']
        return list(set(linked.tolist()))

    def _annotation_terms(first_hop_ids):
        # Second hop: every ptn2biof node_2 attached to any of the given IDs.
        rows = ptn2biof['node_1'].isin(first_hop_ids)
        return set(ptn2biof.loc[rows, 'node_2'].tolist())

    indication_targets = _unique_targets(ind2ptn, indication_key)
    first_drug_targets = _unique_targets(drug2ptn, first_drug_key)
    second_drug_targets = _unique_targets(drug2ptn, second_drug_key)

    indication_terms = _annotation_terms(indication_targets)
    first_drug_terms = _annotation_terms(first_drug_targets)
    second_drug_terms = _annotation_terms(second_drug_targets)

    return (
        len(indication_terms & first_drug_terms),
        len(indication_terms & second_drug_terms),
        len(first_drug_terms & second_drug_terms),
    )

In [14]:
# Example: counts of shared annotation terms for one indication and two drugs.
# The result is (indication & drug1, indication & drug2, drug1 & drug2).
compute_coverage_gene_ontology(
    ind='hyperlipidemia',
    drug1='rosuvastatin',
    drug2='fenofibrate',
)

(0, 5, 0)