## Import Libraries

In [1]:
import datetime
import os
import sys
from urllib.parse import quote

import chardet
import numpy as np
import pandas as pd
import requests
from scipy.spatial.distance import jaccard, pdist, squareform

## Load Data

#### Load SIDER Indications Data

In [6]:
# The input file is read without a header row; these names are applied to its
# seven columns in order.
indication_columns = [
    'STITCH ID',
    'UMLS Concept ID Label',
    'Method Detection',
    'Concept Name',
    'Concept Type',
    'UMLS Concept ID MedDRA',
    'MedDRA Concept Name',
]
df_with_types = pd.read_table('Input/meddra_all_indications.tsv', names=indication_columns)

In [11]:
df_with_types.head(10)

Unnamed: 0,STITCH ID,UMLS Concept ID Label,Method Detection,Concept Name,Concept Type,UMLS Concept ID MedDRA,MedDRA Concept Name
0,CID100000085,C0015544,text_mention,Failure to Thrive,LLT,C0015544,Failure to thrive
1,CID100000085,C0015544,text_mention,Failure to Thrive,PT,C0015544,Failure to thrive
2,CID100000085,C0020615,text_mention,Hypoglycemia,LLT,C0020615,Hypoglycaemia
3,CID100000085,C0020615,text_mention,Hypoglycemia,PT,C0020615,Hypoglycaemia
4,CID100000085,C0022661,NLP_indication,"Kidney Failure, Chronic",LLT,C0022661,Renal failure chronic
5,CID100000085,C0022661,NLP_indication,"Kidney Failure, Chronic",PT,C0022661,Renal failure chronic
6,CID100000085,C0025521,NLP_indication,Inborn Errors of Metabolism,LLT,C0025521,Inborn error of metabolism
7,CID100000085,C0025521,NLP_indication,Inborn Errors of Metabolism,PT,C0025521,Inborn error of metabolism
8,CID100000085,C0026827,text_mention,Muscle hypotonia,LLT,C0026827,Hypotonia
9,CID100000085,C0026827,text_mention,Muscle hypotonia,PT,C0026827,Hypotonia


In [15]:
df_with_types.shape

(30835, 7)

#### Load LINCS Small Molecules

In [8]:
# Load the LINCS small-molecule table. The encoding is passed explicitly as
# ISO-8859-1 instead of relying on pandas' default.
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv', encoding = 'ISO-8859-1')

In [9]:
lincs.head()

Unnamed: 0,SM_Name,SM_LINCS_ID,SM_Alternative_Name,SM_PubChem_CID,SM_SMILES_Parent,SM_SMILES_Batch,SM_InChi_Parent,SM_Molecular_Mass,MOLECULAR_FORMULA,SM_ChEBI_ID
0,Dichlobenil,LSM-19017,,3031.0,Clc1cccc(Cl)c1C#N,,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,172.01,C7H3Cl2N,943.0
1,AC1NWAJC,LSM-43967,Vulpinic acid,5701993.0,COC(=O)\C(=C\1/OC(=O)C(C1=O)c2ccccc2)\c3ccccc3,,InChI=1S/C19H14O5/c1-23-18(21)15(13-10-6-3-7-1...,322.31,C19H14O5,
2,Sinapic Acid Methyl Ether,LSM-44124,,735755.0,COc1cc(\C=C\C(=O)O)cc(OC)c1OC,,InChI=1S/C12H14O5/c1-15-9-6-8(4-5-11(13)14)7-1...,238.24,C12H14O5,
3,Ferulic acid,LSM-44126,Ferulic acid,445858.0,COc1cc(\C=C\C(=O)O)ccc1O,,InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-1...,194.18,C10H10O4,17620.0
4,Pinosylvin Methyl Ether,LSM-43902,,5281719.0,COc1cc(O)cc(\C=C\c2ccccc2)c1,,InChI=1S/C15H14O2/c1-17-15-10-13(9-14(16)11-15...,226.27,C15H14O2,8227.0


## Drop Duplicates in the DF

In [10]:
# Remove the 'Concept Type' column so that rows differing only in that column
# become identical and can be collapsed by a later de-duplication.
df = df_with_types.drop(columns='Concept Type')

In [12]:
# Keep the first occurrence of every fully identical row.
df = df.loc[~df.duplicated(keep='first')]

In [13]:
df.head()

Unnamed: 0,STITCH ID,UMLS Concept ID Label,Method Detection,Concept Name,UMLS Concept ID MedDRA,MedDRA Concept Name
0,CID100000085,C0015544,text_mention,Failure to Thrive,C0015544,Failure to thrive
2,CID100000085,C0020615,text_mention,Hypoglycemia,C0020615,Hypoglycaemia
4,CID100000085,C0022661,NLP_indication,"Kidney Failure, Chronic",C0022661,Renal failure chronic
6,CID100000085,C0025521,NLP_indication,Inborn Errors of Metabolism,C0025521,Inborn error of metabolism
8,CID100000085,C0026827,text_mention,Muscle hypotonia,C0026827,Hypotonia


In [14]:
df.shape

(19001, 6)

In [16]:
len(df['STITCH ID'].unique())

1437

## Get Drug Names from the STITCH API and Build a Dictionary

In [22]:
# One entry per distinct STITCH ID, in order of first appearance.
stitch_ID_df = df['STITCH ID'].drop_duplicates()
stitch_ID_df.shape

(1437,)

In [None]:
STITCH_RESOLVE_URL = 'http://stitch.embl.de/api/json/resolve?identifier='
STITCH_TIMEOUT_SECONDS = 30


def fetch_stitch_name(stitch_id, timeout=STITCH_TIMEOUT_SECONDS):
    """Return the 'annotation' of the first STITCH match for `stitch_id`.

    Returns None when the request fails, the body is not valid JSON, the
    reply is not a non-empty list of objects, or the first match carries no
    'annotation' value. The lookup is best-effort: it never raises for a
    single bad ID so the surrounding loop can keep going.
    """
    try:
        response = requests.get(STITCH_RESOLVE_URL + stitch_id, timeout=timeout)
        matches = response.json()  # parse the body exactly once
    except (requests.RequestException, ValueError):
        return None
    if isinstance(matches, list) and matches and isinstance(matches[0], dict):
        return matches[0].get('annotation')
    return None


stitch_ID_list = stitch_ID_df.tolist()
failed_to_get_name = 0
drug_name_dict = {}

for position, stitch_ID in enumerate(stitch_ID_list, start=1):
    drug_name = fetch_stitch_name(stitch_ID)
    if drug_name is None:
        # Every ID that yields no name is counted, including transport and
        # JSON-decoding failures.
        failed_to_get_name += 1
    else:
        drug_name_dict[stitch_ID] = drug_name

    # Progress is tied to the number of IDs processed, so it prints once per
    # 50 requests regardless of how many lookups succeeded.
    if position % 50 == 0:
        print(position)

print(failed_to_get_name)
print(len(drug_name_dict))
print(len(stitch_ID_list))

## Get PubChem ID and Map to Stitch ID

In [None]:
PUBCHEM_NAME_TO_CID_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/cids/JSON'
PUBCHEM_TIMEOUT_SECONDS = 30


def fetch_pubchem_cid(name, timeout=PUBCHEM_TIMEOUT_SECONDS):
    """Return the first PubChem CID listed for the compound `name`.

    Returns None when the request fails, the body is not valid JSON, or the
    reply has no 'IdentifierList' -> 'CID' entry. The name is fully
    percent-encoded so that reserved characters (not just spaces) cannot
    alter the request path.
    NOTE(review): names containing '/' may still be rejected by the service
    even when encoded - confirm against the PUG REST documentation.
    """
    url = PUBCHEM_NAME_TO_CID_URL % quote(name, safe='')
    try:
        response = requests.get(url, timeout=timeout)
        payload = response.json()  # parse the body exactly once
    except (requests.RequestException, ValueError):
        return None
    try:
        return payload['IdentifierList']['CID'][0]
    except (KeyError, IndexError, TypeError):
        return None


failed_to_get_CID = 0
CID_dict = {}
count = 0

for count, (ID, name) in enumerate(drug_name_dict.items(), start=1):
    CID = fetch_pubchem_cid(name)
    if CID is None:
        # Every name that yields no CID is counted, including transport and
        # JSON-decoding failures.
        failed_to_get_CID += 1
    else:
        CID_dict[ID] = CID

    # Progress is tied to the number of names processed, so it prints once
    # per 50 requests regardless of how many lookups succeeded.
    if count % 50 == 0:
        print(count)


print(failed_to_get_CID)
print(len(CID_dict))
print(len(drug_name_dict))

In [28]:
# Keep only the rows whose STITCH ID was resolved to a PubChem CID, then attach
# that CID as an integer column. The selection is done with a single boolean
# mask instead of dropping rows from the frame while iterating over it.
stitch_ids_with_cid = list(CID_dict)
df = df.loc[df['STITCH ID'].isin(stitch_ids_with_cid)].copy()
df['CIDs'] = df['STITCH ID'].map(CID_dict).astype(int)

# Plain-list view of the new column, kept under the name the cell has always
# defined.
CIDs = df['CIDs'].tolist()

In [29]:
df.shape

(16687, 7)

## Filter by LINCS Approved Small Molecules

In [30]:
# Keep only rows whose CID appears in the LINCS table. Missing LINCS CIDs are
# discarded first; they can never match a row. One vectorised membership test
# replaces the per-row array scan and the in-place drops during iteration.
lincs_cids = lincs['SM_PubChem_CID'].dropna()
in_lincs = df['CIDs'].isin(lincs_cids)

# Number of rows removed by the filter.
count = int((~in_lincs).sum())
df = df.loc[in_lincs].copy()

print(count)

5962


In [35]:
df.head()

Unnamed: 0,STITCH ID,UMLS Concept ID Label,Method Detection,Concept Name,UMLS Concept ID MedDRA,MedDRA Concept Name,CIDs
53,CID100000137,C0022602,NLP_indication,Actinic keratosis,C0022602,Actinic keratosis,137
55,CID100000137,C0022602,NLP_indication,Actinic keratosis,C0022603,Seborrhoeic keratosis,137
56,CID100000137,C0162568,NLP_precondition,Erythropoietic Protoporphyria,C0162568,Erythropoietic protoporphyria,137
57,CID100000137,C0162568,NLP_precondition,Erythropoietic Protoporphyria,C0853026,Porphyria non-acute,137
58,CID100000137,C0555198,NLP_indication,Malignant Glioma,C0555198,Malignant glioma,137


## Make Binary Matrix

In [36]:
def _join_cids(cids):
    """Render a group of CIDs as one comma-separated string."""
    return ','.join(cids.astype(str))


# One row per MedDRA concept ID, holding all of its CIDs as a single string.
cids_by_concept = df.groupby(['UMLS Concept ID MedDRA'])['CIDs']
grouped_df = cids_by_concept.apply(_join_cids).reset_index()

In [37]:
# Index the grouped frame by concept ID and order it by that index.
grouped_df = grouped_df.set_index('UMLS Concept ID MedDRA').sort_index()
# Number of distinct concept IDs in the ungrouped frame (missing values count
# as one distinct value, as with `unique()`).
df['UMLS Concept ID MedDRA'].nunique(dropna=False)

2339

In [38]:
# Expand the comma-separated CID strings (the first column of grouped_df) into
# one 0/1 indicator column per CID.
cid_strings = grouped_df.iloc[:, 0]
grouped_matrix = cid_strings.str.get_dummies(sep=',')

In [39]:
grouped_matrix.head()

Unnamed: 0_level_0,10096344,10113978,10182969,10184653,1046,104741,104758,104850,104865,10531,...,9869929,9875401,9878,9880,9904,9924495,9926791,9930049,9941444,9966051
UMLS Concept ID MedDRA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0000729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000731,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000768,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
grouped_matrix.shape

(2338, 708)

## Save Binary Matrix

In [41]:
# Year and month of the run, e.g. '2020_01', used to version the output file.
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = 'Output/Sider_Indications_%s.tsv.zip' % month_stamp

# The file name ends in '.zip', so write a real ZIP archive (previously the
# content was gzip-compressed, which ZIP tools cannot open). The member inside
# the archive is named as a plain '.tsv' file.
grouped_matrix.to_csv(
    filename,
    sep='\t',
    compression={
        'method': 'zip',
        'archive_name': 'Sider_Indications_%s.tsv' % month_stamp,
    },
)

## Convert Binary Matrix to GMT and Save as GMT

In [42]:
# Each drug set is a list of the form [term, description, member, member, ...];
# the description field is left empty. Sets with fewer than this many members
# are left out of the library.
MIN_DRUGS_PER_SET = 5

drugset_library = []
for term, membership in grouped_matrix.iterrows():
    # Column labels whose indicator is 1 are the members of this term.
    members = membership[membership == 1].index.tolist()
    if len(members) >= MIN_DRUGS_PER_SET:
        drugset_library.append([term, ''] + members)

# The rows have different lengths, so the array must be built with
# dtype=object; without it recent NumPy versions refuse to create a ragged
# array.
dsl = np.array(drugset_library, dtype=object)

In [43]:
len(max(dsl, key=len))

98

In [44]:
# Year and month of the run ('YYYY_MM') used to version the output file.
month_stamp = str(datetime.date.today())[0:7].replace('-', '_')
filename = 'Output/Sider_Indications_DrugSetLibrary_%s.gmt' % month_stamp

# One drug set per line, fields separated by tabs.
with open(filename, 'w', encoding='utf-8') as gmt_file:
    for drugset in dsl:
        gmt_file.write('\t'.join('%s' % field for field in drugset) + '\n')