## Import Libraries

In [1]:
import datetime
import os
import sys
from urllib.parse import quote

import chardet
import numpy as np
import pandas as pd
import requests
from scipy.spatial.distance import squareform, pdist, jaccard

## Load Data

#### Load DrugIndicationDB data

In [13]:
# Load the full DrugIndicationDB export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the truncated warning left in this
# cell's output looks like pandas' mixed-type DtypeWarning, which this option
# is the documented remedy for -- confirm on the next run.
df_everything = pd.read_csv(
    'Input/drug_indication_database.csv',
    encoding='ISO-8859-1',
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# Preview the first rows of the DrugIndicationDB table.
df_everything.head()

Unnamed: 0,DID record unique ID,source name,source record ID or other metadata,raw drug name,Preferred CAS#,PT,source,match type,"PT (""name"")",ChEBI ID#,...,semantic type 4.1,entry term match type.1,entry term.1,PT.3,CUI.1,entry term type.1,semantic type 1.2,semantic type 2.2,semantic type 3.2,semantic type 4.2
0,1_000001,NDFRT,C0016157,Fish Oils,8016-13-5,Fish oil,ChemID+,<syn per source>,,,...,,,,,,,,,,
1,1_000002,NDFRT,C0016157,Fish Oils,8016-13-5,Fish oil,ChemID+,<syn per source>,,,...,,,,,,,,,,
2,1_000003,NDFRT,C0016157,Fish Oils,8016-13-5,Fish oil,ChemID+,<syn per source>,,,...,,,,,,,,,,
3,1_000004,NDFRT,C0016157,Fish Oils,8016-13-5,Fish oil,ChemID+,<syn per source>,,,...,,,,,,,,,,
4,1_000005,NDFRT,C0016157,Fish Oils,8016-13-5,Fish oil,ChemID+,<syn per source>,,,...,,,,,,,,,,


#### Load LINCS Small Molecules

In [7]:
# Read the LINCS small-molecule table (Latin-1 encoded CSV).
lincs_csv_path = 'Input/LINCS_SmallMolecules.csv'
lincs = pd.read_csv(lincs_csv_path, encoding='ISO-8859-1')

In [8]:
# Preview the first rows of the LINCS small-molecule table.
lincs.head()

Unnamed: 0,SM_Name,SM_LINCS_ID,SM_Alternative_Name,SM_PubChem_CID,SM_SMILES_Parent,SM_SMILES_Batch,SM_InChi_Parent,SM_Molecular_Mass,MOLECULAR_FORMULA,SM_ChEBI_ID
0,Dichlobenil,LSM-19017,,3031.0,Clc1cccc(Cl)c1C#N,,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,172.01,C7H3Cl2N,943.0
1,AC1NWAJC,LSM-43967,Vulpinic acid,5701993.0,COC(=O)\C(=C\1/OC(=O)C(C1=O)c2ccccc2)\c3ccccc3,,InChI=1S/C19H14O5/c1-23-18(21)15(13-10-6-3-7-1...,322.31,C19H14O5,
2,Sinapic Acid Methyl Ether,LSM-44124,,735755.0,COc1cc(\C=C\C(=O)O)cc(OC)c1OC,,InChI=1S/C12H14O5/c1-15-9-6-8(4-5-11(13)14)7-1...,238.24,C12H14O5,
3,Ferulic acid,LSM-44126,Ferulic acid,445858.0,COc1cc(\C=C\C(=O)O)ccc1O,,InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-1...,194.18,C10H10O4,17620.0
4,Pinosylvin Methyl Ether,LSM-43902,,5281719.0,COc1cc(O)cc(\C=C\c2ccccc2)c1,,InChI=1S/C15H14O2/c1-17-15-10-13(9-14(16)11-15...,226.27,C15H14O2,8227.0


## Drop Duplicates in DF

In [11]:
# Keep only the drug-name and indication-CUI columns for the rest of the notebook.
df = df_everything.loc[:, ['raw drug name', 'CUI_indication']]

In [16]:
# (rows, columns) of the two-column frame before de-duplication.
df.shape

(192838, 2)

In [17]:
# Keep the first occurrence of every (drug name, indication) pair.
is_repeat = df.duplicated()
df = df[~is_repeat]

In [20]:
# (rows, columns) after duplicate (drug name, indication) pairs were removed.
df.shape

(178941, 2)

In [19]:
# Discard rows that have no indication CUI.
has_indication = df['CUI_indication'].notnull()
df = df[has_indication]

In [21]:
# Number of distinct raw drug names (a missing name, if any, counts as one value).
df['raw drug name'].nunique(dropna=False)

34137

## Get PubChemID and Map to Drug Name

In [22]:
# Distinct raw drug names, in order of first appearance.
namesdf = df['raw drug name'].drop_duplicates()
namesdf.shape

(34137,)

In [None]:
# Resolve every distinct drug name to a PubChem compound ID (CID) through the
# PUG REST name endpoint. Results accumulate in CID_dict (drug name -> first
# CID returned); names that cannot be resolved are only counted.
nameslist = namesdf.tolist()
failed_to_get_CID = 0

# Start from an empty cache on a fresh kernel, but keep an existing CID_dict so
# that an interrupted run can be resumed by re-executing this cell.
try:
    CID_dict
except NameError:
    CID_dict = {}

PUBCHEM_CID_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{}/cids/JSON'

for position, name in enumerate(nameslist):
    # Report progress by loop position so exactly one line is printed per
    # 1000 names, whether or not the lookups succeed.
    if position % 1000 == 0:
        print(position)

    if name in CID_dict:
        continue
    if not isinstance(name, str):
        # Not a string (e.g. a missing value): nothing to look up.
        failed_to_get_CID += 1
        continue

    # Percent-encode the whole name, not just spaces, so characters such as
    # '/', '#', '?' or '%' cannot change the request path.
    url = PUBCHEM_CID_URL.format(quote(name, safe=''))
    try:
        payload = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError):
        # Network failure, timeout, or a body that is not valid JSON.
        failed_to_get_CID += 1
        continue

    cids = None
    if isinstance(payload, dict):
        cids = payload.get('IdentifierList', {}).get('CID')

    if cids:
        # Key by the original, un-encoded name so later lookups by
        # 'raw drug name' match exactly.
        CID_dict[name] = cids[0]
    else:
        failed_to_get_CID += 1

print(failed_to_get_CID)
print(len(CID_dict))
print(len(nameslist))

In [32]:
# Number of drug names for which a PubChem CID was stored.
len(CID_dict)

26107

In [33]:
# Keep only rows whose drug name was resolved to a PubChem CID and attach that
# CID as an integer 'CIDs' column. Done with vectorised operations rather than
# dropping rows one at a time inside an iterrows() loop, which rebuilt the
# frame once per removed row.
has_cid = df['raw drug name'].isin(list(CID_dict))
df = df[has_cid].copy()
df['CIDs'] = df['raw drug name'].map(CID_dict).astype(int)

# Kept for compatibility with the original cell, which also exposed the
# per-row CIDs as a plain list.
CIDs = df['CIDs'].tolist()

## Filter by LINCS Approved Small Molecules

In [34]:
# Keep only rows whose CID appears in the LINCS small-molecule table and report
# how many rows were removed. A single isin() call replaces the per-row linear
# scan of lincs['SM_PubChem_CID'].values and the per-row in-place drop.
# CIDs are compared as floats because SM_PubChem_CID is a float column (it
# contains missing values); missing LINCS CIDs are excluded from the lookup.
lincs_cids = lincs['SM_PubChem_CID'].dropna()
in_lincs = df['CIDs'].astype('float64').isin(lincs_cids)

count = int((~in_lincs).sum())
df = df[in_lincs].copy()

print(count)

70423


In [37]:
# Preview the filtered drug-name / indication / CID rows.
df.head()

Unnamed: 0,raw drug name,CUI_indication,CIDs
17,thiazolidine-4-carboxylic acid,C0032343,9934
18,thiazolidine-4-carboxylic acid,C0006826,9934
19,thiazolidine-4-carboxylic acid,C0599059,9934
20,thiazolidine-4-carboxylic acid,C0007222,9934
21,thiazolidine-4-carboxylic acid,C0678771,9934


## Make Binary Matrix

In [38]:
def _comma_join(cids):
    """Return the CIDs of one indication as a single comma-separated string."""
    return ','.join(cids.astype(str))


# One row per indication CUI, holding all of its CIDs as a comma-separated string.
grouped_df = (
    df.groupby(['CUI_indication'])['CIDs']
    .apply(_comma_join)
    .reset_index()
)

In [39]:
# Index the per-indication table by CUI and sort it by that index.
grouped_df = grouped_df.set_index('CUI_indication').sort_index()
# Number of distinct indication CUIs remaining in df.
df['CUI_indication'].nunique(dropna=False)

4237

In [40]:
# Expand each comma-separated CID string into 0/1 indicator columns:
# rows are indication CUIs, columns are CIDs (as strings).
cid_strings = grouped_df.iloc[:, 0]
grouped_matrix = cid_strings.str.get_dummies(sep=',')
grouped_matrix.head()

Unnamed: 0_level_0,10020353,10026128,10052040,10074640,10090485,10096344,10113978,10117987,10127622,10133,...,9931954,9933475,9934,9935681,9939609,9941444,9949641,9952884,9955,9966051
CUI_indication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
<No Term>,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000727,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000735,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0000737,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# (number of indication rows, number of CID columns) of the binary matrix.
grouped_matrix.shape

(4237, 2168)

## Save Binary Matrix

In [42]:
# Save the binary matrix as a gzip-compressed TSV stamped with the current
# year and month (YYYY_MM).
# The extension is '.tsv.gz' so it matches the gzip data actually written; the
# previous '.tsv.zip' name did not describe the file's real format.
os.makedirs('Output', exist_ok=True)  # to_csv does not create missing directories
filename = 'Output/DrugIndicationDB_Indications_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT and Save as GMT

In [43]:
# Convert the binary matrix into GMT-style rows:
#   [indication CUI, '' (empty description field), CID, CID, ...]
# Only indications with at least MIN_DRUGS_PER_SET drugs are kept; this is the
# same cut-off as the original `len(drugset) >= 7`, which counted the two
# leading fields as well.
MIN_DRUGS_PER_SET = 5

drugset_library = []
for indication, membership in grouped_matrix.iterrows():
    drugs = membership[membership == 1].index.tolist()
    if len(drugs) >= MIN_DRUGS_PER_SET:
        drugset_library.append([indication, ''] + drugs)

# The rows have different lengths, so store them in a 1-D object array filled
# element by element. Calling np.array() directly on ragged lists raises
# ValueError on recent NumPy releases.
dsl = np.empty(len(drugset_library), dtype=object)
for position, drugset in enumerate(drugset_library):
    dsl[position] = drugset

In [44]:
# Length of the longest row in the library (the two leading fields included).
max(len(drugset) for drugset in dsl)

424

In [45]:
# Write the drug-set library as a GMT file: one tab-separated line per
# indication, stamped with the current year and month (YYYY_MM).
os.makedirs('Output', exist_ok=True)  # open() does not create missing directories
filename = 'Output/DrugIndicationDB_Indications_DrugSetLibrary_%s.gmt' % datetime.date.today().strftime('%Y_%m')
with open(filename, 'w', encoding='utf-8') as f:
    for row in dsl:
        # Join the fields directly instead of routing each row through
        # np.savetxt, which had to coerce every row into a 2-D array first.
        f.write('\t'.join(str(field) for field in row) + '\n')