## Import Libraries

In [94]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load DrugCentral Data

In [173]:
df = pd.read_table('Input/drugcentral_targets.tsv')

In [174]:
df.head()

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_COMMENT,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM
0,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,,WOMBAT-PK,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens
1,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.706859,,IC50,"DRUGMATRIX: CYP450, 2D6 enzyme inhibition (sub...",DRUG MATRIX,=,,,,,,Tclin,Homo sapiens
2,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.89,,IC50,Inhibition of wild-type human ERG channel expr...,CHEMBL,=,,,https://www.ebi.ac.uk/chembl/compound/inspect/...,,,Tclin,Homo sapiens
3,levobupivacaine,4,Potassium voltage-gated channel subfamily D me...,Ion channel,Q9UK17,KCND3,KCND3_HUMAN,4.5,,IC50,,WOMBAT-PK,=,,,,,,Tclin,Homo sapiens
4,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,,WOMBAT-PK,,,,,,,Tclin,Homo sapiens


In [175]:
df.shape

(16702, 20)

In [176]:
df = df.dropna(subset=['GENE']) 

#### Load Targets Mapping File

In [177]:
target_update = pd.read_table('Input/mappingFile_2017.txt', names = ['Old Targets','Updated Targets'], engine='python')

In [178]:
target_update.head()

Unnamed: 0,Old Targets,Updated Targets
0,A1BG,A1BG
1,A1BG-AS1,A1BG-AS1
2,NCRNA00181,A1BG-AS1
3,A1BGAS,A1BG-AS1
4,A1BG-AS,A1BG-AS1


In [179]:
target_update.set_index('Old Targets',inplace = True)

#### Load LINCS Small Molecules

In [180]:
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv',encoding='ISO-8859-1')

In [181]:
lincs.head()

Unnamed: 0,SM_Name,SM_LINCS_ID,SM_Alternative_Name,SM_PubChem_CID,SM_SMILES_Parent,SM_SMILES_Batch,SM_InChi_Parent,SM_Molecular_Mass,MOLECULAR_FORMULA,SM_ChEBI_ID
0,Dichlobenil,LSM-19017,,3031.0,Clc1cccc(Cl)c1C#N,,InChI=1S/C7H3Cl2N/c8-6-2-1-3-7(9)5(6)4-10/h1-3H,172.01,C7H3Cl2N,943.0
1,AC1NWAJC,LSM-43967,Vulpinic acid,5701993.0,COC(=O)\C(=C\1/OC(=O)C(C1=O)c2ccccc2)\c3ccccc3,,InChI=1S/C19H14O5/c1-23-18(21)15(13-10-6-3-7-1...,322.31,C19H14O5,
2,Sinapic Acid Methyl Ether,LSM-44124,,735755.0,COc1cc(\C=C\C(=O)O)cc(OC)c1OC,,InChI=1S/C12H14O5/c1-15-9-6-8(4-5-11(13)14)7-1...,238.24,C12H14O5,
3,Ferulic acid,LSM-44126,Ferulic acid,445858.0,COc1cc(\C=C\C(=O)O)ccc1O,,InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-1...,194.18,C10H10O4,17620.0
4,Pinosylvin Methyl Ether,LSM-43902,,5281719.0,COc1cc(O)cc(\C=C\c2ccccc2)c1,,InChI=1S/C15H14O2/c1-17-15-10-13(9-14(16)11-15...,226.27,C15H14O2,8227.0


## Fix DrugCentral GENE column

In [182]:
# how_many = 0
appended_df = []
gene_index = np.where(df.columns.values=='GENE')[0][0]

for index, row in df.iterrows():
    gene_group = row.loc['GENE']
    if '|' in gene_group:
        gene_split = gene_group.split('|')
        for i in gene_split:
            row_as_list = row.values.tolist()
            row_as_list[gene_index] = i
            appended_df.append(row_as_list)
        df.drop(index, inplace = True)
#         how_many += 1

print(len(appended_df))
print(how_many)

4695
1130


In [183]:
columnnames = list(df.columns.values)
fix_gene_df = pd.DataFrame(appended_df,columns = columnnames)

In [184]:
fix_gene_df.head()

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_COMMENT,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM
0,(S)-nicardipine,5,Voltage-gated L-type calcium channel,Ion channel,Q01668|Q13936,CACNA1C,CAC1C_HUMAN|CAC1D_HUMAN,,,,Mechanism of Action,DRUG LABEL,,1.0,DRUG LABEL,http://www.accessdata.fda.gov/drugsatfda_docs/...,http://www.accessdata.fda.gov/drugsatfda_docs/...,BLOCKER,Tclin|Tclin,Homo sapiens
1,(S)-nicardipine,5,Voltage-gated L-type calcium channel,Ion channel,Q01668|Q13936,CACNA1D,CAC1C_HUMAN|CAC1D_HUMAN,,,,Mechanism of Action,DRUG LABEL,,1.0,DRUG LABEL,http://www.accessdata.fda.gov/drugsatfda_docs/...,http://www.accessdata.fda.gov/drugsatfda_docs/...,BLOCKER,Tclin|Tclin,Homo sapiens
2,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN1,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
3,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN2A,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens
4,acamprosate,38,Glutamate [NMDA] receptor,Ion channel,O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5,GRIN2B,NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMA...,,,,Mechanism of Action; CHEMBL2094124; PROTEIN CO...,CHEMBL,,1.0,CHEMBL,https://www.ebi.ac.uk/chembl/compound/inspect/...,https://www.ebi.ac.uk/chembl/compound/inspect/...,ANTAGONIST,Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin,Homo sapiens


In [185]:
fix_gene_df.shape

(4695, 20)

In [186]:
df = df.append(fix_gene_df)

In [187]:
df.head()

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_COMMENT,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM
0,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,,WOMBAT-PK,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens
1,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CYP2D6,CP2D6_HUMAN,6.706859,,IC50,"DRUGMATRIX: CYP450, 2D6 enzyme inhibition (sub...",DRUG MATRIX,=,,,,,,Tclin,Homo sapiens
2,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.89,,IC50,Inhibition of wild-type human ERG channel expr...,CHEMBL,=,,,https://www.ebi.ac.uk/chembl/compound/inspect/...,,,Tclin,Homo sapiens
3,levobupivacaine,4,Potassium voltage-gated channel subfamily D me...,Ion channel,Q9UK17,KCND3,KCND3_HUMAN,4.5,,IC50,,WOMBAT-PK,=,,,,,,Tclin,Homo sapiens
4,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PTGER1,PE2R1_HUMAN,,,,,WOMBAT-PK,,,,,,,Tclin,Homo sapiens


## Update Target Names

In [196]:
df = df.rename(index=str, columns = {'GENE':'Old Targets'})

In [189]:
df.set_index('Old Targets', inplace = True)

In [190]:
df = pd.merge(df, target_update, how= 'left', on = 'Old Targets')
df.shape

(19854, 20)

In [191]:
df.reset_index(inplace=True)

In [192]:
df = df.rename(index=str, columns = {'Updated Targets':'Targets'})

In [193]:
df = df.dropna(subset=['Targets']) 

In [194]:
df.shape

(18510, 21)

## Get PubChemID and Map to drug name

In [123]:
namesdf = df['DRUG_NAME']
namesdf = namesdf.drop_duplicates()
namesdf.shape

(2172,)

In [122]:
nameslist = namesdf.tolist()
failed_to_get_CID = 0
CID_dict = {}

for name in nameslist:
    name = name.replace(' ','%20')
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + name + '/cids/JSON'
    response = requests.get(url)
    if 'IdentifierList' in response.json().keys():
        CID = response.json()['IdentifierList']['CID'][0]
        name = name.replace('%20', ' ')
        CID_dict[name] = CID
    else:
        failed_to_get_CID += 1
        
    if len(CID_dict) % 100 == 0:
        name = name.replace('%20',' ')
        print(nameslist.index(name))

print(failed_to_get_CID)
print(len(CID_dict))
print(len(nameslist))

105
215
321
431
538
647
753
864
969
1074
1178
1281
1388
1492
1596
1703
1810
1985
1986
1987
1988
2108
230
1942
2172


In [150]:
CIDs = []
for index, row in df.iterrows():
    drugname = row.loc['DRUG_NAME']
    if drugname not in CID_dict:
        df.drop(index, inplace = True)
    else: 
        CIDs.append(int(CID_dict[drugname]))

df.loc[:,'CIDs'] = pd.Series(np.array(CIDs), index=df.index)

In [151]:
df.head()

Unnamed: 0,Old Targets,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM,Targets,CIDs
0,SCN4A,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A_HUMAN,,,,...,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens,SCN4A,92253
1,CYP2D6,levobupivacaine,4,Cytochrome P450 2D6,Enzyme,P10635,CP2D6_HUMAN,6.706859,,IC50,...,=,,,,,,Tclin,Homo sapiens,CYP2D6,92253
2,KCNH2,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2_HUMAN,4.89,,IC50,...,=,,,https://www.ebi.ac.uk/chembl/compound/inspect/...,,,Tclin,Homo sapiens,KCNH2,92253
3,KCND3,levobupivacaine,4,Potassium voltage-gated channel subfamily D me...,Ion channel,Q9UK17,KCND3_HUMAN,4.5,,IC50,...,=,,,,,,Tclin,Homo sapiens,KCND3,92253
4,PTGER1,levobupivacaine,4,Prostaglandin E2 receptor EP1 subtype,GPCR,P34995,PE2R1_HUMAN,,,,...,,,,,,,Tclin,Homo sapiens,PTGER1,92253


In [152]:
df.shape

(17233, 22)

## Filter by LINCS approved Small Molecules

In [153]:
count = 0
for index, row in df.iterrows():
    CID = row.loc['CIDs']
    if CID not in lincs['SM_PubChem_CID'].values:
        df.drop(index, inplace = True)
        count += 1
        
print(count)

4715


In [157]:
df['CIDs'][2391]

36811

In [158]:
df.shape

(12518, 22)

## Make Binary Matrix

In [161]:
grouped_df = df.groupby(['Targets'])['CIDs'].apply(lambda x: ','.join(x.astype(str))).reset_index()

In [163]:
grouped_df.set_index('Targets', inplace=True)
grouped_df = grouped_df.sort_index()
len(df['Targets'].unique())

1383

In [164]:
grouped_matrix = grouped_df.iloc[:,0].str.get_dummies(sep=',')

In [165]:
grouped_matrix.head()

Unnamed: 0_level_0,10071196,10096344,10113978,10133,10182969,10184653,10219,1046,104741,104758,...,9867642,9869929,9875401,9878,9880,9904,9926791,9930049,9941444,9966051
Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAK1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCA1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCB1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABCB11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [166]:
grouped_matrix.shape

(1383, 1086)

## Save Binary Matrix

In [167]:
filename = 'Output/DrugCentral_Targets_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT and Save as GMT

In [169]:
drugset_library = []
for index, row in grouped_matrix.iterrows():
    drugset = []
    drugset.append(index)
    drugset.append('')
    for i in range(grouped_matrix.shape[1]):
        if row.iloc[i]==1:
            drugset.append(row.index[i])
    if len(drugset) >= 7:
        drugset_library.append(drugset)
dsl = np.array(drugset_library)

In [170]:
len(max(dsl, key=len))

149

In [171]:
print(dsl)

[list(['AAK1', '', '10113978', '11626560', '176870', '25126798', '5328940', '5329102', '6450551'])
 list(['ABCB1', '', '11243969', '124087', '126941', '13342', '16362', '193962', '208908', '2157', '2170', '2247', '2267', '2520', '2726', '2733526', '2812', '2895', '2913', '30323', '3151', '3152', '3157', '31703', '3241', '36314', '36462', '3715', '3827', '39186', '392622', '3957', '3961', '40692', '4192', '4212', '42890', '441074', '441243', '441336', '447043', '4474', '456201', '4594', '4932', '4946', '5280343', '5284373', '5284514', '5284616', '5291', '5405', '55245', '5568', '56959', '5732', '5770', '5994', '60149', '60663', '60700', '60795', '60838', '6167', '64143', '6442177', '68770', '6918097'])
 list(['ABCB11', '', '104865', '12560', '1548887', '32798', '3397', '3474', '3488', '36314', '4449', '4485', '4659569', '4829', '4891', '5284373', '5284513', '5472', '5591', '71277', '77999'])
 list(['ABCC1', '', '126941', '13342', '16362', '193962', '2333', '2520', '30323', '31703', '364

In [172]:
filename = 'Output/DrugCentral_Targets_DrugSetLibrary_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
with open(filename,'w',encoding='utf-8') as f:
    for row in dsl:
        np.savetxt(f, [row], fmt = '%s',delimiter = '\t')   