## Import Libraries

In [126]:
import datetime
import os
import sys
from urllib.parse import quote

import chardet
import numpy as np
import pandas as pd
import requests
from scipy.spatial.distance import squareform, pdist, jaccard

## Load Data

#### Load DrugRepurposingHub Data

In [144]:
# Drug Repurposing Hub annotation table (tab-separated text).
repurposing_path = 'Input/repurposing_drugs_20170327.txt'
df = pd.read_csv(repurposing_path, sep='\t', engine='python')

In [145]:
# Preview the first rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,pert_iname,clinical_phase,moa,target
0,(1E)-1-(2-hydroxy-5-methylphenyl)-1-dodecanone...,Preclinical,,
1,A-317491,Preclinical,purinergic receptor antagonist,P2RX3
2,A-33903,Phase 2,,
3,A-366,Preclinical,histone lysine methyltransferase inhibitor,EHMT1|EHMT2
4,A-674563,Preclinical,AKT inhibitor,AKT1|PKIA|PRKACA


#### Load Targets Mapping File

In [146]:
# Two-column, header-less mapping from an old gene symbol to its updated symbol.
mapping_path = 'Input/mappingFile_2017.txt'
target_update = pd.read_csv(
    mapping_path,
    sep='\t',
    names=['Old Targets', 'Updated Targets'],
    engine='python',
)

In [147]:
# Preview the symbol mapping.
target_update.head(n=5)

Unnamed: 0,Old Targets,Updated Targets
0,A1BG,A1BG
1,A1BG-AS1,A1BG-AS1
2,NCRNA00181,A1BG-AS1
3,A1BGAS,A1BG-AS1
4,A1BG-AS,A1BG-AS1


#### Load LINCS Small Molecules

In [148]:
# LINCS small-molecule table; the file is read as Latin-1 (ISO-8859-1).
lincs_path = 'Input/LINCS_SmallMolecules.csv'
lincs = pd.read_csv(lincs_path, encoding='ISO-8859-1')

In [149]:
# Preview the LINCS table.
lincs.head(n=5)

Unnamed: 0,SM_Name,SM_LINCS_ID,SM_Alternative_Name,SM_PubChem_CID,SM_SMILES_Parent,SM_SMILES_Batch,SM_InChi_Parent,SM_Molecular_Mass,MOLECULAR_FORMULA,SM_ChEBI_ID
0,Dichlobenil,LSM-19017,,3031.0,Clc1cccc(Cl)c1C#N,,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,172.01,C7H3Cl2N,943.0
1,AC1NWAJC,LSM-43967,Vulpinic acid,5701993.0,COC(=O)\C(=C\1/OC(=O)C(C1=O)c2ccccc2)\c3ccccc3,,InChI=1S/C19H14O5/c1-23-18(21)15(13-10-6-3-7-1...,322.31,C19H14O5,
2,Sinapic Acid Methyl Ether,LSM-44124,,735755.0,COc1cc(\C=C\C(=O)O)cc(OC)c1OC,,InChI=1S/C12H14O5/c1-15-9-6-8(4-5-11(13)14)7-1...,238.24,C12H14O5,
3,Ferulic acid,LSM-44126,Ferulic acid,445858.0,COc1cc(\C=C\C(=O)O)ccc1O,,InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-1...,194.18,C10H10O4,17620.0
4,Pinosylvin Methyl Ether,LSM-43902,,5281719.0,COc1cc(O)cc(\C=C\c2ccccc2)c1,,InChI=1S/C15H14O2/c1-17-15-10-13(9-14(16)11-15...,226.27,C15H14O2,8227.0


## Get PubChemID and Map to Drug Name

In [80]:
# One entry per distinct drug name; the shape shows how many lookups are needed.
namesdf = df['pert_iname'].drop_duplicates()
namesdf.shape

(5628,)

In [None]:
# Look up a PubChem compound ID (CID) for every unique drug name through the
# PUG REST endpoint compound/name/<name>/cids/JSON.
#
# Results:
#   CID_dict          -- drug name -> first CID returned for that name
#   failed_to_get_CID -- number of names for which no CID could be obtained
nameslist = namesdf.tolist()
failed_to_get_CID = 0
CID_dict = {}

for position, name in enumerate(nameslist, start=1):
    # Percent-encode the whole name (not only spaces) so that characters such
    # as '/', '#', '?' or '+' cannot change the meaning of the request path.
    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'
           + quote(name, safe='') + '/cids/JSON')
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        payload = requests.get(url, timeout=30).json()
    except (requests.exceptions.RequestException, ValueError):
        # Network error or a body that is not JSON: treat it as a failed
        # lookup so that failures + successes always equals len(nameslist).
        payload = {}

    if 'IdentifierList' in payload:
        CID_dict[name] = payload['IdentifierList']['CID'][0]
    else:
        failed_to_get_CID += 1

    # Progress indicator: number of names processed so far.
    if position % 100 == 0:
        print(position)

print(failed_to_get_CID)
print(len(CID_dict))
print(len(nameslist))

In [150]:
# Keep only the drugs for which a PubChem CID was found and store that CID in
# a new integer 'CIDs' column. Done with a mask and a map instead of dropping
# rows from the frame while iterating over it.
has_cid = df['pert_iname'].isin(list(CID_dict))
df = df.loc[has_cid].copy()
df['CIDs'] = df['pert_iname'].map(CID_dict).astype(int)

# Same list the row-by-row version built, kept for any later cell that uses it.
CIDs = df['CIDs'].tolist()

## Filter by LINCS Approved Small Molecules

In [151]:
# Keep only the drugs whose CID appears in the LINCS small-molecule table and
# report how many rows were removed. A single isin() replaces the per-row
# array scan and the in-place drops performed during iteration.
in_lincs = df['CIDs'].isin(lincs['SM_PubChem_CID'])
count = int((~in_lincs).sum())
df = df.loc[in_lincs].copy()

print(count)

1852


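# For Targets, run this code

Note: this section overwrites `df` (rows without a target are dropped and the `target` column is renamed). To build the MOA library afterwards, first re-run the cells from "Load Data" through "Filter by LINCS Approved Small Molecules" so that `df` is restored.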

In [85]:
# Discard drugs that have no annotated target.
df = df.dropna(axis=0, subset=['target'])

## Fix DrugRepurposing targets column

In [86]:
# Expand rows whose 'target' holds several '|'-separated genes into one row
# per gene. The expanded rows are collected in appended_df (list of row lists
# in df's column order) and the original multi-gene rows are removed from df.
# A boolean mask is used instead of dropping rows while iterating the frame.
gene_index = df.columns.get_loc('target')
has_multiple = df['target'].str.contains('|', regex=False, na=False)

appended_df = []
for values in df.loc[has_multiple].values.tolist():
    for gene in values[gene_index].split('|'):
        expanded = list(values)
        expanded[gene_index] = gene
        appended_df.append(expanded)

# Number of original rows that were expanded.
how_many = int(has_multiple.sum())
df = df.loc[~has_multiple].copy()


print(len(appended_df))
print(how_many)

4948
924


In [87]:
# Turn the expanded rows into a frame with the same columns as df.
columnnames = df.columns.tolist()
fix_gene_df = pd.DataFrame(data=appended_df, columns=columnnames)

In [88]:
# Preview the one-gene-per-row expansion.
fix_gene_df.head(n=5)

Unnamed: 0,pert_iname,clinical_phase,moa,target,CIDs
0,AC-55649,Preclinical,retinoid receptor agonist,RARA,1714884
1,AC-55649,Preclinical,retinoid receptor agonist,RARB,1714884
2,acemetacin,Launched,cyclooxygenase inhibitor,PTGS1,1981
3,acemetacin,Launched,cyclooxygenase inhibitor,PTGS2,1981
4,acetohexamide,Launched,ATP channel blocker,ABCC8,1989


In [89]:
# Add the expanded one-gene-per-row records back to df.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same result.
df = pd.concat([df, fix_gene_df])

In [90]:
# Preview df after the expanded rows were added.
df.head(n=5)

Unnamed: 0,pert_iname,clinical_phase,moa,target,CIDs
12,ABC-294640,Phase 1/Phase 2,sphingosine kinase inhibitor,SPHK2,15604015
36,aceclofenac,Launched,prostanoid receptor antagonist,PTGS2,71771
39,acefylline,Launched,adenosine receptor agonist,ADORA1,69550
53,acetyl-farnesyl-cysteine,Launched,methyltransferase inhibitor,PPARG,6438381
61,acipimox,Launched,cholesterol inhibitor,HCAR2,5310993


## Update Target Names

In [91]:
# Rename 'target' to 'Old Targets' and use it as the index so it can serve as
# the join key. The previous row labels are discarded by set_index.
df = df.rename(columns={'target': 'Old Targets'}).set_index('Old Targets')

In [92]:
# Index the mapping table by the old symbol, matching df's index name.
target_update = target_update.set_index('Old Targets')

In [93]:
# Left join on the 'Old Targets' index level: every drug row is kept and gains
# an 'Updated Targets' column (missing when the symbol is not in the mapping).
df = df.merge(target_update, how='left', on='Old Targets')
df.shape

(5616, 5)

In [94]:
# Move 'Old Targets' back to a column, rename the updated symbol column to
# 'Targets' (row labels are converted to strings by index=str), and drop rows
# whose symbol had no entry in the mapping.
df = (
    df.reset_index()
      .rename(index=str, columns={'Updated Targets': 'Targets'})
      .dropna(subset=['Targets'])
)

In [95]:
# Row/column count after removing rows without an updated target symbol.
df.shape

(5614, 6)

## Targets Binary Matrix

In [96]:
# For each target, build one comma-separated string of the CIDs of its drugs.
grouped_df = (
    df.assign(CIDs=df['CIDs'].astype(str))
      .groupby('Targets')['CIDs']
      .agg(','.join)
      .reset_index()
)

In [97]:
# Index by target, sort alphabetically, and show the number of distinct targets.
grouped_df = grouped_df.set_index('Targets').sort_index()
df['Targets'].nunique(dropna=False)

1161

In [98]:
# Split each comma-separated CID string into 0/1 indicator columns:
# rows are targets, columns are CIDs.
grouped_matrix = grouped_df['CIDs'].str.get_dummies(sep=',')

In [99]:
# Preview the target-by-CID indicator matrix.
grouped_matrix.head(n=5)

Unnamed: 0_level_0,10000456,100016,10029385,10071166,10071196,10074640,10077147,10096344,10113978,10117987,...,9952773,9952884,9953599,9953769,9956119,9956637,9960285,9966051,9967941,9989505
Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCA1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCB1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCB11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCB4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# (number of targets, number of distinct CIDs)
grouped_matrix.shape

(1161, 1589)

## Save Targets Binary Matrix

In [101]:
# Save the target-by-CID matrix as a compressed TSV stamped with the current
# year and month (YYYY_MM).
# The file name ends in ".zip", so write a real zip archive; the earlier
# compression='gzip' stored gzip data under a .zip name, which zip tools and
# extension-based readers cannot open.
filename = 'Output/DrugRepurposing_Targets_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
grouped_matrix.to_csv(filename, sep='\t', compression='zip')

## Convert Binary Matrix to GMT and Save as GMT

In [102]:
# Convert the indicator matrix into GMT-style records:
#   [term, '' (empty description), member_1, member_2, ...]
# Only terms with at least MIN_DRUGS_PER_SET members are kept (the original
# test `len(drugset) >= 7` counted the two leading fields as well).
MIN_DRUGS_PER_SET = 5

drugset_library = []
for term, membership in grouped_matrix.iterrows():
    members = membership[membership == 1].index.tolist()
    if len(members) >= MIN_DRUGS_PER_SET:
        drugset_library.append([term, ''] + members)

# The records have different lengths. np.array() on such ragged input raises
# ValueError on NumPy >= 1.24, so fill a 1-D object array explicitly.
dsl = np.empty(len(drugset_library), dtype=object)
for position, drugset in enumerate(drugset_library):
    dsl[position] = drugset

In [103]:
# Length of the longest record (including the term and description fields).
max(len(drugset) for drugset in dsl)

83

In [104]:
# Write the drug-set library as a GMT file: one tab-separated record per line,
# with the current year and month (YYYY_MM) in the file name.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = 'Output/DrugRepurposing_Targets_DrugSetLibrary_%s.gmt' % date_tag
with open(filename, 'w', encoding='utf-8') as f:
    for row in dsl:
        f.write('\t'.join(str(field) for field in row) + '\n')

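# For MOA, run this code

Note: run this section on the `df` produced by "Filter by LINCS Approved Small Molecules". If the Targets section has been run, re-run the notebook from "Load Data" through the LINCS filter first, because the Targets section overwrites `df`.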

In [152]:
# Discard drugs that have no annotated mechanism of action.
df = df.dropna(axis=0, subset=['moa'])

In [153]:
# Preview the rows that carry an MOA annotation.
df.head(n=5)

Unnamed: 0,pert_iname,clinical_phase,moa,target,CIDs
6,A-769662,Preclinical,AMPK activator,,54708532
12,ABC-294640,Phase 1/Phase 2,sphingosine kinase inhibitor,SPHK2,15604015
25,AC-55649,Preclinical,retinoid receptor agonist,RARA|RARB,1714884
36,aceclofenac,Launched,prostanoid receptor antagonist,PTGS2,71771
39,acefylline,Launched,adenosine receptor agonist,ADORA1,69550


In [154]:
# Rename the 'moa' column to 'MOA'; index=str also converts the row labels to
# strings.
moa_column_map = {'moa': 'MOA'}
df = df.rename(index=str, columns=moa_column_map)

## Fix MOA Column

In [155]:
# Expand rows whose 'MOA' holds several '|'-separated mechanisms into one row
# per mechanism. The expanded rows are collected in appended_df (list of row
# lists in df's column order) and the original multi-MOA rows are removed from
# df. A boolean mask is used instead of dropping rows while iterating the frame.
moa_index = df.columns.get_loc('MOA')
has_multiple = df['MOA'].str.contains('|', regex=False, na=False)

appended_df = []
for values in df.loc[has_multiple].values.tolist():
    for mechanism in values[moa_index].split('|'):
        expanded = list(values)
        expanded[moa_index] = mechanism
        appended_df.append(expanded)

# Number of original rows that were expanded.
how_many = int(has_multiple.sum())
df = df.loc[~has_multiple].copy()


print(len(appended_df))
print(how_many)

518
221


In [156]:
# Turn the expanded rows into a frame with the same columns as df and preview it.
columnnames = df.columns.tolist()
fix_moa_df = pd.DataFrame(data=appended_df, columns=columnnames)
fix_moa_df.head(n=5)

Unnamed: 0,pert_iname,clinical_phase,MOA,target,CIDs
0,adatanserin,Phase 2,serotonin receptor agonist,HTR1A|HTR2A,130918
1,adatanserin,Phase 2,serotonin receptor antagonist,HTR1A|HTR2A,130918
2,agomelatine,Launched,melatonin receptor agonist,HTR2A|HTR2B|HTR2C|MTNR1A|MTNR1B,82148
3,agomelatine,Launched,serotonin receptor antagonist,HTR2A|HTR2B|HTR2C|MTNR1A|MTNR1B,82148
4,AM-404,Preclinical,cyclooxygenase inhibitor,CNR1|CNR2|FAAH|TRPV1,6604822


In [157]:
# Add the expanded one-MOA-per-row records back to df.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same result.
df = pd.concat([df, fix_moa_df])
fix_moa_df.head()

Unnamed: 0,pert_iname,clinical_phase,MOA,target,CIDs
0,adatanserin,Phase 2,serotonin receptor agonist,HTR1A|HTR2A,130918
1,adatanserin,Phase 2,serotonin receptor antagonist,HTR1A|HTR2A,130918
2,agomelatine,Launched,melatonin receptor agonist,HTR2A|HTR2B|HTR2C|MTNR1A|MTNR1B,82148
3,agomelatine,Launched,serotonin receptor antagonist,HTR2A|HTR2B|HTR2C|MTNR1A|MTNR1B,82148
4,AM-404,Preclinical,cyclooxygenase inhibitor,CNR1|CNR2|FAAH|TRPV1,6604822


## MOA Binary Matrix

In [158]:
# For each MOA, build one comma-separated string of the CIDs of its drugs.
grouped_df = (
    df.assign(CIDs=df['CIDs'].astype(str))
      .groupby('MOA')['CIDs']
      .agg(','.join)
      .reset_index()
)

In [159]:
# Index by MOA, sort alphabetically, and show the number of distinct MOAs.
grouped_df = grouped_df.set_index('MOA').sort_index()
df['MOA'].nunique(dropna=False)

573

In [160]:
# Split each comma-separated CID string into 0/1 indicator columns:
# rows are MOAs, columns are CIDs.
grouped_matrix = grouped_df['CIDs'].str.get_dummies(sep=',')
grouped_matrix.head(n=5)

Unnamed: 0_level_0,10000456,100016,10029385,10071166,10071196,10074640,10077147,10096344,10113978,10117987,...,9952884,9953599,9953769,9956119,9956222,9956637,9960285,9966051,9967941,9989505
MOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11-beta hydroxysteroid dehydrogenase inhibitor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3-ketoacyl CoA thiolase inhibitor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5 alpha reductase inhibitor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACAT inhibitor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AGE inhibitor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
# (number of MOAs, number of distinct CIDs)
grouped_matrix.shape

(573, 1896)

## Save MOA Binary Matrix

In [162]:
# Save the MOA-by-CID matrix as a compressed TSV stamped with the current
# year and month (YYYY_MM).
# The file name ends in ".zip", so write a real zip archive; the earlier
# compression='gzip' stored gzip data under a .zip name, which zip tools and
# extension-based readers cannot open.
filename = 'Output/DrugRepurposing_MOA_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
grouped_matrix.to_csv(filename, sep='\t', compression='zip')

## Convert Binary Matrix to GMT and Save as GMT

In [163]:
# Convert the indicator matrix into GMT-style records:
#   [term, '' (empty description), member_1, member_2, ...]
# Only terms with at least MIN_DRUGS_PER_SET members are kept (the original
# test `len(drugset) >= 7` counted the two leading fields as well).
MIN_DRUGS_PER_SET = 5

drugset_library = []
for term, membership in grouped_matrix.iterrows():
    members = membership[membership == 1].index.tolist()
    if len(members) >= MIN_DRUGS_PER_SET:
        drugset_library.append([term, ''] + members)

# The records have different lengths. np.array() on such ragged input raises
# ValueError on NumPy >= 1.24, so fill a 1-D object array explicitly.
dsl = np.empty(len(drugset_library), dtype=object)
for position, drugset in enumerate(drugset_library):
    dsl[position] = drugset

In [164]:
# Length of the longest record (including the term and description fields).
max(len(drugset) for drugset in dsl)

58

In [165]:
# Write the drug-set library as a GMT file: one tab-separated record per line,
# with the current year and month (YYYY_MM) in the file name.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = 'Output/DrugRepurposing_MOA_DrugSetLibrary_%s.gmt' % date_tag
with open(filename, 'w', encoding='utf-8') as f:
    for row in dsl:
        f.write('\t'.join(str(field) for field in row) + '\n')