## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load PharmagKB Data

In [2]:
df = pd.read_table('Input/PharmagKB.tsv')

In [16]:
df.head()

Unnamed: 0,stitch_id,drug,UMLS ID,event,rr,log2rr,t_statistic,pvalue,observed,expected,bg_correction,sider,future_aers,medeffect
0,CID000000076,dehydroepiandrosterone,C0000737,abdominal pain,2.25,1.169925,6.537095,6.156712e-07,9,4.0,0.002849,0,0,0
1,CID000000076,dehydroepiandrosterone,C0001622,hyperadrenalism,11.0,3.459432,4.782699,0.001644408,2,0.181818,4.1e-05,0,0,0
2,CID000000076,dehydroepiandrosterone,C0001623,adrenal insufficiency,2.2,1.137504,4.315199,0.009884952,2,0.909091,0.002491,0,0,0
3,CID000000076,dehydroepiandrosterone,C0002792,anaphylactic reaction,2.588235,1.371969,4.590918,0.00335538,4,1.545455,0.000503,0,0,0
4,CID000000076,dehydroepiandrosterone,C0002940,aneurysm,7.333333,2.874469,4.598374,0.003399457,2,0.272727,0.000135,0,0,0


In [6]:
df = df.dropna(subset=['umls_id']) 

In [12]:
df = df.rename(index=str, columns = {'umls_id':'UMLS ID'})

In [7]:
len(df['stitch_id'].unique())

1332

#### Load LINCS Small Molecules

In [4]:
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv', encoding = 'ISO-8859-1')

## Get Drug Name from Stitch API and make dictionary

In [8]:
stitch_ID_df = df['stitch_id']
stitch_ID_df = stitch_ID_df.drop_duplicates()
stitch_ID_df.shape

(1332,)

In [9]:
stitch_ID_list = stitch_ID_df.tolist()
failed_to_get_name = 0
drug_name_dict = {}

for stitch_ID in stitch_ID_list:
    url = 'http://stitch.embl.de/api/json/resolve?identifier=' + stitch_ID
    response = requests.get(url)
    try:
        response.json()
    except ValueError:
        continue
    if 'annotation' in response.json()[0].keys():
        drug_name = response.json()[0]['annotation']
        drug_name_dict[stitch_ID] = drug_name
    else:
        failed_to_get_name +=1
    
    if len(drug_name_dict) % 50 == 0:
        print(stitch_ID_list.index(stitch_ID))

print(failed_to_get_name)
print(len(drug_name_dict))
print(len(stitch_ID_list))

50
106
160
229
291
353
408
473
533
593
654
706
771
823
887
954
1025
1089
1162
1254
0
1043
1332


## Get PubChem ID and Map to Stitch ID

In [11]:
failed_to_get_CID = 0
CID_dict = {}
count = 0

for ID, name in drug_name_dict.items():
    count += 1
    name = name.replace(' ', '%20')
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + name + '/cids/JSON'
    response = requests.get(url)
    try:
        response.json()
    except ValueError:
        continue
    if 'IdentifierList' in response.json().keys():
        CID = response.json()['IdentifierList']['CID'][0]
        CID_dict[ID] = CID
    else:
        failed_to_get_CID += 1
        
    if len(CID_dict) % 100 == 0:
        print(count)


print(failed_to_get_CID)
print(len(CID_dict))
print(len(drug_name_dict))

118
227
334
440
544
652
773
882
1004
111
932
1043


In [23]:
CIDs = []
for index, row in df.iterrows():
    stitch_id = row.loc['stitch_id']
    if stitch_id not in CID_dict:
        df.drop(index, inplace = True)
    else: 
        CIDs.append(int(CID_dict[stitch_id]))

df.loc[:,'CIDs'] = pd.Series(np.array(CIDs), index=df.index)

In [24]:
index

'438800'

## Filter by LINCS Approved Small Molecules

In [25]:
count = 0
for index, row in df.iterrows():
    CID = row.loc['CIDs']
    if CID not in lincs['SM_PubChem_CID'].values:
        df.drop(index, inplace = True)
        count += 1
        
print(count)

118998


In [26]:
df.head()

Unnamed: 0,stitch_id,drug,UMLS ID,event,rr,log2rr,t_statistic,pvalue,observed,expected,bg_correction,sider,future_aers,medeffect,CIDs
940,CID000000143,leucovorin,C0000737,abdominal pain,7.484085,2.903826,24.469162,2.568504e-58,513,68.545455,0.004799,0,1,0,6006
941,CID000000143,leucovorin,C0001263,abdominal infection,8.8,3.137504,6.321542,4.06283e-06,8,0.909091,1.3e-05,0,0,0,6006
942,CID000000143,leucovorin,C0002736,als,11.0,3.459432,4.802784,0.008064572,3,0.272727,0.000254,0,0,0,6006
943,CID000000143,leucovorin,C0002792,anaphylactic reaction,7.973799,2.995267,17.467515,5.022008e-38,166,20.818182,0.000301,0,0,0,6006
944,CID000000143,leucovorin,C0002962,angina,7.980392,2.99646,10.498235,1.486924e-17,74,9.272727,0.002103,0,0,0,6006


## Make Binary Matrix

In [27]:
grouped_df = df.groupby(['UMLS ID'])['CIDs'].apply(lambda x: ','.join(x.astype(str))).reset_index()

In [28]:
grouped_df.set_index('UMLS ID', inplace=True)
grouped_df = grouped_df.sort_index()
len(df['UMLS ID'].unique())

9112

In [29]:
grouped_matrix = grouped_df.iloc[:,0].str.get_dummies(sep=',')

In [30]:
grouped_matrix.head()

Unnamed: 0_level_0,10182969,1046,104741,104865,10531,1054,10660,107807,110634,110635,...,8982,91610,92722,92727,936,938,941650,9417,9651,9878
UMLS ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0000727,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
C0000731,0,0,1,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
C0000733,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000734,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000735,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
grouped_matrix.shape

(9112, 528)

## Save Binary Matrix

In [32]:
filename = 'Output/PharmagKB_SE_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT and Save as GMT

In [33]:
drugset_library = []
for index, row in grouped_matrix.iterrows():
    drugset = []
    drugset.append(index)
    drugset.append('')
    for i in range(grouped_matrix.shape[1]):
        if row.iloc[i]==1:
            drugset.append(row.index[i])
    if len(drugset) >= 7:
        drugset_library.append(drugset)
dsl = np.array(drugset_library)

In [34]:
len(max(dsl, key=len))

307

In [35]:
filename = 'Output/PharmagKB_SE_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
with open(filename,'w',encoding='utf-8') as f:
    for row in dsl:
        np.savetxt(f, [row], fmt = '%s',delimiter = '\t')