## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load BindingDB data

In [2]:
raw_df = pd.read_table('Input/BindingDB_All.tsv', quoting=3, encoding='utf-8',error_bad_lines=False)

b'Skipping line 546861: expected 193 fields, saw 265\n'
b'Skipping line 950663: expected 193 fields, saw 241\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [206]:
raw_df.head()

Unnamed: 0,BindingDB Reactant_set_id,Ligand SMILES,Ligand InChI,Ligand InChI Key,BindingDB MonomerID,BindingDB Ligand Name,Target Name Assigned by Curator or DataSource,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,UniProt (SwissProt) Recommended Name of Target Chain.12,UniProt (SwissProt) Entry Name of Target Chain.12,UniProt (SwissProt) Primary ID of Target Chain.12,UniProt (SwissProt) Secondary ID(s) of Target Chain.12,UniProt (SwissProt) Alternative ID(s) of Target Chain.12,UniProt (TrEMBL) Submitted Name of Target Chain.12,UniProt (TrEMBL) Entry Name of Target Chain.12,UniProt (TrEMBL) Primary ID of Target Chain.12,UniProt (TrEMBL) Secondary ID(s) of Target Chain.12,UniProt (TrEMBL) Alternative ID(s) of Target Chain.12
0,1,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1,InChI=1S/C22H24BrFN4O2/c1-28-7-5-14(6-8-28)12-...,UHTHHESEBZOYNR-UHFFFAOYSA-N,21,"6-[(4R,5S,6S,7R)-4,7-dibenzyl-3-(5-carboxypent...",HIV-1 Protease,Human immunodeficiency virus 1,0.24,,...,,,,,,,,,,
1,2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,UZLMEAPBHYEHAC-UNTBESQGSA-N,22,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",HIV-1 Protease,Human immunodeficiency virus 1,0.25,,...,,,,,,,,,,
2,3,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,HYNYUFZPPJMPOB-UTWJFGBXSA-N,23,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",HIV-1 Protease,Human immunodeficiency virus 1,0.41,,...,,,,,,,,,,
3,4,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,InChI=1S/C29H40N2O4/c32-18-10-2-1-9-17-30-25(1...,YXVAZXDWVZTGGD-VIJSPRBVSA-N,24,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",HIV-1 Protease,Human immunodeficiency virus 1,0.8,,...,,,,,,,,,,
4,5,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,InChI=1S/C28H38N2O4/c31-17-9-3-8-16-29-24(18-2...,WWTSWTPNILRSJX-XDZXDJIYSA-N,25,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",HIV-1 Protease,Human immunodeficiency virus 1,0.99,,...,,,,,,,,,,


#### Load PDB to UniProt file

In [4]:
pdb_uniprot = pd.read_table('Input/pdbsws_chain.txt',names = ['info'])

In [5]:
pdb_uniprot.head()

Unnamed: 0,info
0,101m A P02185
1,102l A P00720
2,102m A P02185
3,103l A P00720
4,103m A P02185


#### Load UniProt to Gene file

In [6]:
uniprot = pd.read_table('Input/gene_to_uniprot.txt')

#### Load Target Update File

In [7]:
target_update = pd.read_table('Input/mappingFile_2017.txt', names = ['Old Targets','Updated Targets'], engine='python')

#### Load LINCS Small Molecules

In [8]:
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv',encoding='ISO-8859-1')

## Make df of PCID and PDB ID

In [10]:
raw_df['PDB ID(s) of Target Chain'].head()

0    1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1...
1    1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1...
2    1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1...
3    1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1...
4    1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1...
Name: PDB ID(s) of Target Chain, dtype: object

In [11]:
raw_df['PubChem CID'][1]

5327236.0

In [12]:
df = raw_df[['PubChem CID','PDB ID(s) of Target Chain']]

In [13]:
df.head(15)

Unnamed: 0,PubChem CID,PDB ID(s) of Target Chain
0,3081361.0,"1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1..."
1,5327236.0,"1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1..."
2,5327235.0,"1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1..."
3,5327234.0,"1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1..."
4,3009319.0,"1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1..."
5,5327233.0,"1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1..."
6,463335.0,"1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1..."
7,3009298.0,"1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1..."
8,463323.0,"1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1..."
9,469353.0,"1AJV,1AJX,1C70,1D4H,1D4I,1D4J,1DIF,1EBW,1EBZ,1..."


In [14]:
df = df.dropna(subset=['PubChem CID'])
df = df.dropna(subset=['PDB ID(s) of Target Chain']) 

In [15]:
df['PubChem CID']=df['PubChem CID'].apply(lambda x: int(float(x)))

In [16]:
df = df.rename(index=str, columns = {'PDB ID(s) of Target Chain':'PDB'})

In [17]:
df.shape

(841704, 2)

## Fix PDB ID column

In [18]:
how_many = 0
appended_df = []
prot_index = np.where(df.columns.values=='PDB')[0][0]

for index, row in df.iterrows():
    prot_group = row.loc['PDB']
    if ',' in prot_group:
        prot_split = prot_group.split(',')
        for i in prot_split:
            row_as_list = row.values.tolist()
            row_as_list[prot_index] = i
            appended_df.append(row_as_list)
        df.drop(index, inplace = True)
        how_many += 1


print(len(appended_df))
print(how_many)

27615029
727948


In [19]:
df.head()

Unnamed: 0,PubChem CID,PDB
236,53308627,3LZS
237,53308628,3LZS
238,53308629,3LZS
239,53308630,3LZS
240,53308631,3LZS


In [20]:
columnnames = list(df.columns.values)
fix_gene_df = pd.DataFrame(appended_df,columns = columnnames)

In [21]:
fix_gene_df.head(10)

Unnamed: 0,PubChem CID,PDB
0,3081361,1AJV
1,3081361,1AJX
2,3081361,1C70
3,3081361,1D4H
4,3081361,1D4I
5,3081361,1D4J
6,3081361,1DIF
7,3081361,1EBW
8,3081361,1EBZ
9,3081361,1EC0


In [22]:
df = df.append(fix_gene_df)

In [23]:
df.shape

(27728785, 2)

## Fix PDB Column in PDB File

In [24]:
pdb_uniprot_fixed = pdb_uniprot['info'].apply(lambda x: pd.Series(x.split(' ')))

In [25]:
pdb_uniprot_fixed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,101m,A,P02185,,,,,,
1,102l,A,P00720,,,,,,
2,102m,A,P02185,,,,,,
3,103l,A,P00720,,,,,,
4,103m,A,P02185,,,,,,


In [26]:
pdb_uniprot_fixed = pdb_uniprot_fixed.rename(index=str, columns = {0:'PDB ID'})
pdb_uniprot_fixed = pdb_uniprot_fixed.rename(index=str, columns = {1:'Letter'})
pdb_uniprot_fixed = pdb_uniprot_fixed.rename(index=str, columns = {2:'UniProt'})

In [27]:
pdb_uniprot = pdb_uniprot_fixed[['PDB ID','Letter','UniProt']]

In [28]:
pdb_uniprot.head()

Unnamed: 0,PDB ID,Letter,UniProt
0,101m,A,P02185
1,102l,A,P00720
2,102m,A,P02185
3,103l,A,P00720
4,103m,A,P02185


## Map PDB ID to UniProt ID

In [29]:
pdb_uniprot['PDB ID']= pdb_uniprot['PDB ID'].str.upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
'1D4I'in list(pdb_uniprot['PDB ID'])

True

In [31]:
pdb_uniprot.set_index('PDB ID', inplace = True)
pdb_uniprot.drop_duplicates()
pdb_uniprot.head()

Unnamed: 0_level_0,Letter,UniProt
PDB ID,Unnamed: 1_level_1,Unnamed: 2_level_1
101M,A,P02185
102L,A,P00720
102M,A,P02185
103L,A,P00720
103M,A,P02185


In [115]:
pdb_uniprot.shape

(344378, 2)

In [118]:
pdb_df = df['PDB']
pdb_df = pdb_df.drop_duplicates()
type(pdb_df)

pandas.core.series.Series

In [127]:
## Make dictionary for the PDBs actually in the df- don't need to check Uniprot df every time
pdb_dict = {}

for pdb in pdb_df:
    if pdb in list(pdb_uniprot.index):
        if type(pdb_uniprot['UniProt'][pdb]) != str:
            pdb_dict[pdb] = list(pdb_uniprot.loc[:,'UniProt'][pdb])[0]
        else:
            pdb_dict[pdb] = pdb_uniprot.loc[:,'UniProt'][pdb]
    if len(pdb_dict) % 1000 == 0:
        print(len(pdb_dict))

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000


In [78]:
df.reset_index(inplace= True)

In [151]:
df['UniProt ID'] = None

for index,row in df.iterrows():
    pdb_id = row.loc['PDB']
    if pdb_id in list(pdb_dict.keys()):
        df.at[index,'UniProt ID'] = pdb_dict[pdb_id]
    if index % 10000 == 0:
        print(index)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370000
1380000
13

10350000
10360000
10370000
10380000
10390000
10400000
10410000
10420000
10430000
10440000
10450000
10460000
10470000
10480000
10490000
10500000
10510000
10520000
10530000
10540000
10550000
10560000
10570000
10580000
10590000
10600000
10610000
10620000
10630000
10640000
10650000
10660000
10670000
10680000
10690000
10700000
10710000
10720000
10730000
10740000
10750000
10760000
10770000
10780000
10790000
10800000
10810000
10820000
10830000
10840000
10850000
10860000
10870000
10880000
10890000
10900000
10910000
10920000
10930000
10940000
10950000
10960000
10970000
10980000
10990000
11000000
11010000
11020000
11030000
11040000
11050000
11060000
11070000
11080000
11090000
11100000
11110000
11120000
11130000
11140000
11150000
11160000
11170000
11180000
11190000
11200000
11210000
11220000
11230000
11240000
11250000
11260000
11270000
11280000
11290000
11300000
11310000
11320000
11330000
11340000
11350000
11360000
11370000
11380000
11390000
11400000
11410000
11420000
11430000
11440000
11450000
1

19460000
19470000
19480000
19490000
19500000
19510000
19520000
19530000
19540000
19550000
19560000
19570000
19580000
19590000
19600000
19610000
19620000
19630000
19640000
19650000
19660000
19670000
19680000
19690000
19700000
19710000
19720000
19730000
19740000
19750000
19760000
19770000
19780000
19790000
19800000
19810000
19820000
19830000
19840000
19850000
19860000
19870000
19880000
19890000
19900000
19910000
19920000
19930000
19940000
19950000
19960000
19970000
19980000
19990000
20000000
20010000
20020000
20030000
20040000
20050000
20060000
20070000
20080000
20090000
20100000
20110000
20120000
20130000
20140000
20150000
20160000
20170000
20180000
20190000
20200000
20210000
20220000
20230000
20240000
20250000
20260000
20270000
20280000
20290000
20300000
20310000
20320000
20330000
20340000
20350000
20360000
20370000
20380000
20390000
20400000
20410000
20420000
20430000
20440000
20450000
20460000
20470000
20480000
20490000
20500000
20510000
20520000
20530000
20540000
20550000
20560000
2

In [153]:
df.shape

(27728785, 5)

In [154]:
df = df.dropna(subset=['UniProt ID']) 

In [155]:
df.shape

(27004035, 5)

In [157]:
df = df[['PubChem CID','UniProt ID']]

In [158]:
df.head()

Unnamed: 0,PubChem CID,UniProt ID
0,53308627,P24740
1,53308628,P24740
2,53308629,P24740
3,53308630,P24740
4,53308631,P24740


In [159]:
df=df.drop_duplicates()

In [160]:
df.shape

(1338842, 2)

## Map UniProt ID to Gene Target

In [161]:
df.set_index('UniProt ID', inplace = True)
uniprot.set_index('UniProt ID',inplace=True)
uniprot.head()

Unnamed: 0_level_0,Approved Symbol
UniProt ID,Unnamed: 1_level_1
P04217,A1BG
,A1BG-AS1
Q9NQ94,A1CF
P01023,A2M
,A2M-AS1


In [162]:
df.head()

Unnamed: 0_level_0,PubChem CID
UniProt ID,Unnamed: 1_level_1
P24740,53308627
P24740,53308628
P24740,53308629
P24740,53308630
P24740,53308631


In [166]:
df = pd.merge(df, uniprot, how= 'left', on = 'UniProt ID')
df = df.rename(index=str, columns = {'Approved Symbol':'Old Targets'})
df.reset_index(inplace=True)
df.head()

Unnamed: 0,index,UniProt ID,PubChem CID,Old Targets,Old Targets.1
0,0,P24740,53308627,,
1,1,P24740,53308628,,
2,2,P24740,53308629,,
3,3,P24740,53308630,,
4,4,P24740,53308631,,
5,5,P24740,53308632,,
6,6,P24740,53308633,,
7,7,P24740,53308634,,
8,8,P24740,53308635,,
9,9,P24740,53308636,,


In [169]:
df = df.dropna(subset=['Old Targets']) 
df.shape

(732879, 5)

In [170]:
df.head()

Unnamed: 0,index,UniProt ID,PubChem CID,Old Targets,Old Targets.1
89,89,P24723,5287736,PRKCH,PRKCH
90,90,P24723,5327919,PRKCH,PRKCH
91,91,P24723,5327920,PRKCH,PRKCH
92,92,P24723,5327921,PRKCH,PRKCH
93,93,P24723,5327922,PRKCH,PRKCH


## Update Target Names

In [178]:
df=df.iloc[:,[0,1,2]]
df.head()

Unnamed: 0,PubChem CID,UniProt ID,Old Targets
89,5287736,P24723,PRKCH
90,5327919,P24723,PRKCH
91,5327920,P24723,PRKCH
92,5327921,P24723,PRKCH
93,5327922,P24723,PRKCH


In [179]:
df.set_index('Old Targets', inplace = True)
target_update.set_index('Old Targets', inplace = True)
df = pd.merge(df, target_update, how= 'left', on = 'Old Targets')
df.reset_index(inplace=True)
df = df.rename(index=str, columns = {'Updated Targets':'Targets'})
df = df.dropna(subset=['Targets']) 
df.shape

(732879, 4)

## Filter by LINCS Approved Small Molecules

In [193]:
# count = 0
for index, row in df.iterrows():
    CID = row.loc['PubChem CID']
    if CID not in lincs['SM_PubChem_CID'].values:
        df.drop(index, inplace = True)
        count += 1
        
print(count)

708783


In [194]:
count,index

(708783, '732878')

In [198]:
df.head()

Unnamed: 0,Old Targets,PubChem CID,UniProt ID,Targets
48,CCND1,5330790,P24385,CCND1
56,CDK5,3820,Q00535,CDK5
151,CDK5,3641059,Q00535,CDK5
165,CDK5,5326843,Q00535,CDK5
178,CDK5,5318433,Q00535,CDK5


In [196]:
df.shape

(24096, 4)

## Make Binary Matrix

In [199]:
grouped_df = df.groupby(['Targets'])['PubChem CID'].apply(lambda x: ','.join(x.astype(str))).reset_index()
grouped_df.set_index('Targets', inplace=True)
grouped_df = grouped_df.sort_index()
len(df['Targets'].unique())

916

In [200]:
grouped_matrix = grouped_df.iloc[:,0].str.get_dummies(sep=',')
grouped_matrix.head()

Unnamed: 0_level_0,10026128,10029385,10040286,10050129,10068193,10074640,10077147,10090485,10096344,10109069,...,9949093,9949641,9952773,9954280,9955,9956119,9956145,9966051,9967941,997475
Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAK1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AASDHPPT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABL1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABL2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACACA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [201]:
grouped_matrix.shape

(916, 2508)

## Save Binary Matrix

In [202]:
filename = 'Output/BindingDB_Targets_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT

In [203]:
drugset_library = []
for index, row in grouped_matrix.iterrows():
    
    drugset = []
    drugset.append(index)
    drugset.append('')
    for i in range(grouped_matrix.shape[1]):
        if row.iloc[i]==1:
            drugset.append(row.index[i])
    if len(drugset) >= 7:
        drugset_library.append(drugset)
    
dsl = np.array(drugset_library)

In [204]:
len(max(dsl, key=len))

193

In [205]:
filename = 'Output/BindingDB_Targets_DrugSetLibrary_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
with open(filename,'w',encoding='utf-8') as f:
    for row in dsl:
        np.savetxt(f, [row], fmt = '%s',delimiter = '\t')   