In [1]:
import numpy as np 
import json
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split

In [2]:
ppi = nx.read_graphml("../ppi_network.graphml")
nodes = list(ppi.nodes())
nodes, len(nodes)

(['ENSP00000000233',
  'ENSP00000158762',
  'ENSP00000357048',
  'ENSP00000262305',
  'ENSP00000329419',
  'ENSP00000469035',
  'ENSP00000256682',
  'ENSP00000314615',
  'ENSP00000331342',
  'ENSP00000389095',
  'ENSP00000480301',
  'ENSP00000261890',
  'ENSP00000434442',
  'ENSP00000249923',
  'ENSP00000306010',
  'ENSP00000429900',
  'ENSP00000482620',
  'ENSP00000356737',
  'ENSP00000324287',
  'ENSP00000263245',
  'ENSP00000296557',
  'ENSP00000440005',
  'ENSP00000418915',
  'ENSP00000359000',
  'ENSP00000484121',
  'ENSP00000307634',
  'ENSP00000281419',
  'ENSP00000449270',
  'ENSP00000000412',
  'ENSP00000438085',
  'ENSP00000376792',
  'ENSP00000386443',
  'ENSP00000311962',
  'ENSP00000371175',
  'ENSP00000349437',
  'ENSP00000221957',
  'ENSP00000001008',
  'ENSP00000351646',
  'ENSP00000350815',
  'ENSP00000444810',
  'ENSP00000401645',
  'ENSP00000437125',
  'ENSP00000359385',
  'ENSP00000431512',
  'ENSP00000378199',
  'ENSP00000354558',
  'ENSP00000451828',
  'ENSP000004

In [10]:
df = pd.read_csv("../kegg/data_with_atc_kegg.csv")
shortest_path = json.load(open("../shortest_path_length_dict.json"))
drug_targets = json.load(open("../data_target_ENSP.json"))
targets_all_mentioned = set()
for d in drug_targets:
    targets_all_mentioned.update(d["target_ENSP"])
targets_all_mentioned

{'ENSP00000358327',
 'ENSP00000426909',
 'ENSP00000325822',
 'ENSP00000237696',
 'ENSP00000386029',
 'ENSP00000222286',
 'ENSP00000231420',
 'ENSP00000259457',
 'ENSP00000271450',
 'ENSP00000396688',
 'ENSP00000482548',
 'ENSP00000481450',
 'ENSP00000419087',
 'ENSP00000347754',
 'ENSP00000399968',
 'ENSP00000467262',
 'ENSP00000242057',
 'ENSP00000306490',
 'ENSP00000295802',
 'ENSP00000264381',
 'ENSP00000335632',
 'ENSP00000436714',
 'ENSP00000334910',
 'ENSP00000420161',
 'ENSP00000354982',
 'ENSP00000349437',
 'ENSP00000339916',
 'ENSP00000303057',
 'ENSP00000366410',
 'ENSP00000315602',
 'ENSP00000219271',
 'ENSP00000332887',
 'ENSP00000371420',
 'ENSP00000219599',
 'ENSP00000410402',
 'ENSP00000216962',
 'ENSP00000482773',
 'ENSP00000370867',
 'ENSP00000361635',
 'ENSP00000345997',
 'ENSP00000362057',
 'ENSP00000477796',
 'ENSP00000246069',
 'ENSP00000385675',
 'ENSP00000216271',
 'ENSP00000265171',
 'ENSP00000357470',
 'ENSP00000264428',
 'ENSP00000349320',
 'ENSP00000445306',


In [11]:
#check if all the targets mentioned in the drug_targets are in the ppi network
targets_not_in_ppi = []
for t in targets_all_mentioned:
    if t not in nodes:
        targets_not_in_ppi.append(t)
targets_not_in_ppi, len(targets_not_in_ppi)

(['ENSP00000237696',
  'ENSP00000303057',
  'ENSP00000457180',
  'ENSP00000478755',
  'ENSP00000297375',
  'ENSP00000317257',
  'ENSP00000290705',
  'ENSP00000287275',
  'ENSP00000379964',
  'ENSP00000499251',
  'ENSP00000498855',
  'ENSP00000264930',
  'ENSP00000347767',
  'ENSP00000337146',
  'ENSP00000244314',
  'ENSP00000344829',
  'ENSP00000295087',
  'ENSP00000372857',
  'ENSP00000478639',
  'ENSP00000348911',
  'ENSP00000292609',
  'ENSP00000239451',
  'ENSP00000385763',
  'ENSP00000392549',
  'ENSP00000303887',
  'ENSP00000345133',
  'ENSP00000376807',
  'ENSP00000351035',
  'ENSP00000367032',
  'ENSP00000319991',
  'ENSP00000220763',
  'ENSP00000255858',
  'ENSP00000417085',
  'ENSP00000264908',
  'ENSP00000483467',
  'ENSP00000215812',
  'ENSP00000369739',
  'ENSP00000084798',
  'ENSP00000343248',
  'ENSP00000260229',
  'ENSP00000406773',
  'ENSP00000340660',
  'ENSP00000306340',
  'ENSP00000459799',
  'ENSP00000384932',
  'ENSP00000420849',
  'ENSP00000370648',
  'ENSP000002

In [12]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
drug_targets = json.load(open("../data_target_ENSP.json"))
targets_in_test = set()

def get_target(drug):
    for d in drug_targets:
        if d["dg_id"] == drug:
            return d["target_ENSP"]
    return []

for i, row in test.iterrows():
    targets_in_test.update(get_target(row["dg_id"]))
targets_in_test, len(targets_in_test)


({'ENSP00000003084',
  'ENSP00000004531',
  'ENSP00000176183',
  'ENSP00000215375',
  'ENSP00000216797',
  'ENSP00000217971',
  'ENSP00000218099',
  'ENSP00000219240',
  'ENSP00000219431',
  'ENSP00000219833',
  'ENSP00000221130',
  'ENSP00000221476',
  'ENSP00000222982',
  'ENSP00000225823',
  'ENSP00000225831',
  'ENSP00000226021',
  'ENSP00000227507',
  'ENSP00000228468',
  'ENSP00000229135',
  'ENSP00000231509',
  'ENSP00000231948',
  'ENSP00000233838',
  'ENSP00000234071',
  'ENSP00000234961',
  'ENSP00000241052',
  'ENSP00000241356',
  'ENSP00000242057',
  'ENSP00000243347',
  'ENSP00000248437',
  'ENSP00000248594',
  'ENSP00000249071',
  'ENSP00000250615',
  'ENSP00000250699',
  'ENSP00000251595',
  'ENSP00000253122',
  'ENSP00000254066',
  'ENSP00000254667',
  'ENSP00000255380',
  'ENSP00000256906',
  'ENSP00000258400',
  'ENSP00000261416',
  'ENSP00000261707',
  'ENSP00000261733',
  'ENSP00000261751',
  'ENSP00000262186',
  'ENSP00000262209',
  'ENSP00000262461',
  'ENSP000002

In [40]:
targets_for_GR = set()
drug_with_targets = []
for i, row in df.iterrows():
    targets = get_target(row["dg_id"])
    if len(targets) > 0:
        drug_with_targets.append(row["dg_id"])
        targets_for_GR.update(targets)

test_ = 0
for i, row in test.iterrows():
    targets = get_target(row["dg_id"])
    if len(targets) > 0:
        test_ += 1
test_, len(test)

(157, 164)

In [13]:
for t in targets_in_test:
    if t not in nodes:
        print(t)

ENSP00000376807
ENSP00000367032
ENSP00000417085
ENSP00000384932
ENSP00000290705
ENSP00000369822


In [21]:
# cal the shortest path between all the targets in the test set
shortest_path_test = np.full((len(targets_in_test), len(targets_in_test)), np.inf)
targets_in_test = list(targets_in_test)
for i in range(len(targets_in_test)):
    try:
        shortest_path_test_list = nx.single_source_dijkstra_path_length(ppi, targets_in_test[i])
        for j in range(i+1, len(targets_in_test)):
            if targets_in_test[j] in shortest_path_test_list:
                shortest_path_test[i, j] = shortest_path_test[j, i] = shortest_path_test_list[targets_in_test[j]]
            else:
                shortest_path_test[i, j] = shortest_path_test[j, i] = np.inf
        shortest_path_test[i, i] = 0
    except nx.NodeNotFound:
        continue
np.save("ppi_shortest_path.npy", shortest_path_test)
shortest_path_test

array([[   0., 3028., 2890., ..., 3008., 2192., 2375.],
       [3028.,    0., 2963., ..., 3051., 2913., 2548.],
       [2890., 2963.,    0., ..., 1669.,  748., 2972.],
       ...,
       [3008., 3051., 1669., ...,    0., 1709., 2603.],
       [2192., 2913.,  748., ..., 1709.,    0., 2890.],
       [2375., 2548., 2972., ..., 2603., 2890.,    0.]])

In [31]:
idx_of_targets_in_test = {t: i for i, t in enumerate(targets_in_test)}
idx_of_targets_in_test

{'ENSP00000353292': 0,
 'ENSP00000370128': 1,
 'ENSP00000347754': 2,
 'ENSP00000376807': 3,
 'ENSP00000299178': 4,
 'ENSP00000242057': 5,
 'ENSP00000306490': 6,
 'ENSP00000391592': 7,
 'ENSP00000363417': 8,
 'ENSP00000176183': 9,
 'ENSP00000385026': 10,
 'ENSP00000332296': 11,
 'ENSP00000265593': 12,
 'ENSP00000476228': 13,
 'ENSP00000227507': 14,
 'ENSP00000315602': 15,
 'ENSP00000367032': 16,
 'ENSP00000329380': 17,
 'ENSP00000342952': 18,
 'ENSP00000269305': 19,
 'ENSP00000372750': 20,
 'ENSP00000004531': 21,
 'ENSP00000264428': 22,
 'ENSP00000349320': 23,
 'ENSP00000445306': 24,
 'ENSP00000263980': 25,
 'ENSP00000308541': 26,
 'ENSP00000414303': 27,
 'ENSP00000345708': 28,
 'ENSP00000363822': 29,
 'ENSP00000430656': 30,
 'ENSP00000430432': 31,
 'ENSP00000428994': 32,
 'ENSP00000274024': 33,
 'ENSP00000478561': 34,
 'ENSP00000378426': 35,
 'ENSP00000458585': 36,
 'ENSP00000380638': 37,
 'ENSP00000282249': 38,
 'ENSP00000226021': 39,
 'ENSP00000370473': 40,
 'ENSP00000393097': 41,
 '

In [42]:
def drug_target_interaction(drug, target):
    res = 0
    drug_targets = get_target(drug)
    for p in drug_targets:
        length = shortest_path_test[idx_of_targets_in_test[target], idx_of_targets_in_test[p]]
        length = length / 1000
        res += np.exp(-(length**2))
    return res

def GR_score(drug1, drug2):
    drug1_targets = get_target(drug1)
    drug2_targets = get_target(drug2)   
    num_of_drug1_targets = len(drug1_targets)
    num_of_drug2_targets = len(drug2_targets)
    if num_of_drug1_targets == 0 or num_of_drug2_targets == 0:
        return 0
    score = np.sum([drug_target_interaction(drug1, t) for t in drug2_targets]) / (num_of_drug1_targets + num_of_drug2_targets)
    return score

GR_matrix = np.full((len(test), len(test)), np.inf)
for i in range(len(test)):
    for j in range(i, len(test)):
        GR_matrix[i, j] = GR_matrix[j, i] = GR_score(test.iloc[i]["dg_id"], test.iloc[j]["dg_id"])
np.save("GR_matrix.npy", GR_matrix)
GR_matrix


array([[5.00000000e-01, 1.93316605e-03, 4.19194098e-03, ...,
        1.87117512e-02, 5.13832496e-03, 2.66517250e-02],
       [1.93316605e-03, 5.00000000e-01, 3.82532571e-03, ...,
        2.85627143e-03, 2.45620423e-03, 3.54255774e-03],
       [4.19194098e-03, 3.82532571e-03, 8.11165561e-01, ...,
        1.09857826e-01, 2.26525250e-04, 6.14441272e-03],
       ...,
       [1.87117512e-02, 2.85627143e-03, 1.09857826e-01, ...,
        9.11958500e-01, 1.97957169e-02, 9.57456658e-02],
       [5.13832496e-03, 2.45620423e-03, 2.26525250e-04, ...,
        1.97957169e-02, 7.73809360e-01, 3.60716646e-01],
       [2.66517250e-02, 3.54255774e-03, 6.14441272e-03, ...,
        9.57456658e-02, 3.60716646e-01, 1.11948173e+00]])

In [None]:
len_of_all_targets = len(targets_all_mentioned)
len_of_targets_in_test = len(targets_in_test)
shortest_path_matrix = np.full((len_of_targets_in_test, len_of_all_targets), -1)
idx_of_targets_in_test = {t: i for i, t in enumerate(targets_in_test)}
idx_of_all_targets = {t: i for i, t in enumerate(targets_all_mentioned)}
with open("protein_to_idx_all.json","w") as f:
    json.dump(idx_of_all_targets, f, indent=4)
with open("protein_to_idx_test.json","w") as f:
    json.dump(idx_of_targets_in_test, f, indent=4)

In [21]:
targets_in_test = list(targets_in_test)
targets_all_mentioned = list(targets_all_mentioned)

In [None]:
for source in targets_in_test:
    try:
        shortest_path_to_all = nx.single_source_dijkstra_path_length(ppi, source)
        for target in targets_all_mentioned:
            if target in shortest_path_to_all:
                shortest_path_matrix[idx_of_targets_in_test[source], idx_of_all_targets[target]] = shortest_path_to_all[target]
    except nx.NodeNotFound:
        continue    


array([[   0, 3043, 3073, ..., 2625, 2802, 3158],
       [2217, 2416, 1490, ..., 1735, 2181, 1610],
       [3073, 3022,    0, ..., 3020, 2111, 3100],
       ...,
       [2548, 3189, 3080, ..., 2992, 3007, 3070],
       [2470, 2739, 2372, ..., 2517, 2425, 3116],
       [2580, 2309, 2312, ..., 2344, 2344, 3029]])

In [36]:
train

Unnamed: 0,dg_id,dg_name,dg_atc_codes,dg_atc_levels,kegg_cid
450,DB00962,Zaleplon,['N05CF03'],"[['N05CF', 'N05C', 'N05', 'N']]",C07484
549,DB01127,Econazole,"['G01AF05', 'D01AC03', 'G01AF20', 'G01AF55']","[['G01AF', 'G01A', 'G01', 'G'], ['D01AC', 'D01...",C08068
296,DB00683,Midazolam,['N05CD08'],"[['N05CD', 'N05C', 'N05', 'N']]",C07524
741,DB06715,Potassium Iodide,"['S01XA04', 'V03AB21', 'R05CA02']","[['S01XA', 'S01X', 'S01', 'S'], ['V03AB', 'V03...",C08219
79,DB00266,Dicoumarol,['B01AA01'],"[['B01AA', 'B01A', 'B01', 'B']]",C00796
...,...,...,...,...,...
71,DB00247,Methysergide,['N02CA04'],"[['N02CA', 'N02C', 'N02', 'N']]",C07199
106,DB00318,Codeine,"['N02AA79', 'R05DA04', 'N02AA59', 'N02AJ08', '...","[['N02AA', 'N02A', 'N02', 'N'], ['R05DA', 'R05...",C06174
270,DB00630,Alendronic acid,"['M05BB03', 'M05BA04', 'M05BB06', 'M05BB05']","[['M05BB', 'M05B', 'M05', 'M'], ['M05BA', 'M05...",C07752
435,DB00934,Maprotiline,['N06AA21'],"[['N06AA', 'N06A', 'N06', 'N']]",C07107


In [43]:
target_drug_relevance_matrix = np.full((len_of_targets_in_test, len(train)), -1, dtype=np.float64)

def drug_target_interaction(drug, target):
    res = 0
    drug_targets = get_target(drug)
    for p in drug_targets:
        length = shortest_path_matrix[idx_of_targets_in_test[target], idx_of_all_targets[p]]
        if length != -1:
            length = length / 1000
            res += np.exp(-(length**2))
    return res

for target in targets_in_test:
    thisidx = idx_of_targets_in_test[target]
    j = 0
    for i, row in train.iterrows():
        target_drug_relevance_matrix[thisidx, j] = drug_target_interaction(row["dg_id"], target)
        j += 1

target_drug_relevance_matrix


array([[3.80693904e-05, 9.41968064e-02, 5.47505196e-03, ...,
        3.88377995e-04, 1.01386695e-01, 1.02242191e-01],
       [8.18981475e-03, 1.18994144e-01, 1.45028876e-01, ...,
        2.44204268e-01, 3.32379538e-01, 4.40132252e-01],
       [7.87308094e-05, 1.67153000e-04, 4.41059662e-03, ...,
        3.74146134e-01, 4.08539877e-02, 8.05936208e-02],
       ...,
       [1.33133885e-01, 1.63253067e-04, 2.66356356e-02, ...,
        4.54341789e-03, 1.13333461e-01, 2.03934080e-01],
       [5.57599780e-03, 4.38791986e-03, 4.38896886e-02, ...,
        1.11348455e-02, 7.10178641e-01, 1.43389780e-01],
       [6.10550778e-03, 7.46620083e-03, 1.28723150e-01, ...,
        9.45305074e-03, 1.83574758e+00, 8.56942310e-02]])

In [44]:
np.save("target_drug_relevance_matrix.npy", target_drug_relevance_matrix)

In [3]:
import numpy as np

matrix = np.load("target_drug_relevance_matrix.npy")
# check if nan exists
np.isnan(matrix).any()

False

In [5]:
test_idx = json.load(open("protein_to_idx_test.json"))
all_idx = json.load(open("protein_to_idx_all.json"))
len(matrix), len(test_idx), len(all_idx)

(328, 328, 2739)

In [7]:
test_in_all_idx = []
for k in test_idx.keys():
    test_in_all_idx.append(all_idx[k])

In [18]:
has_zero_row = np.where(np.all(matrix == 0, axis=1))[0]
has_zero_row

array([ 20,  27,  46, 187, 206, 287], dtype=int64)