## RDKIT Molecular Fingerprint Matrices
#### ALL DATABASES ACCESSED 12/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import os

import numpy as np
import pandas as pd
from rdkit import Avalon
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import AllChem
from rdkit.Chem import ChemicalFeatures
from rdkit.Chem import Draw
from rdkit.Chem import MACCSkeys
from rdkit.Chem import PandasTools
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.Pharm2D import Gobbi_Pharm2D,Generate
from rdkit.Chem.Pharm2D.SigFactory import SigFactory

In [2]:
# Hop into the scripts directory, import export_script, then hop back.
# Presumably done so the project-local module is importable from the notebook.
# NOTE(review): the return path assumes the kernel was started in notebooks/RDKIT — confirm.
os.chdir('../../scripts')
# NOTE(review): the star import hides which names export_script provides; none are
# referenced in the cells visible here — consider importing the needed names explicitly.
from export_script import *
os.chdir('../notebooks/RDKIT')

#### Input file : drugmonizome_metadata.tsv (master drug list generated from Drug Metadata Aggregation.ipynb)

In [3]:
# Load only the columns this notebook needs from the tab-separated master drug list.
metadata_columns = ['Common name', 'InChI Key', 'Canonical_SMILES']
df = pd.read_csv(
    '../../metadata/drugmonizome_metadata.tsv',
    sep='\t',
    usecols=metadata_columns,
)

In [4]:
# Preview the first rows of the three loaded metadata columns.
df.head()

Unnamed: 0,Common name,InChI Key,Canonical_SMILES
0,Bivalirudin,OIRCOABEOLEUMC-GEJPAHFPSA-N,CCC(C)C(C(=O)N1CCCC1C(=O)NC(CCC(=O)O)C(=O)NC(C...
1,Leuprolide,GFIJNRVAKGFPGQ-LIJARHBVSA-N,CCNC(=O)C1CCCN1C(=O)C(CCCN=C(N)N)NC(=O)C(CC(C)...
2,Goserelin,BLCLNMBMMGCOAS-URPVMXJPSA-N,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NN...
3,Gramicidin D,NDAYQJDHGXTBJL-MWWSRJDJSA-N,CC(C)CC(C(=O)NC(C)C(=O)NC(C(C)C)C(=O)NC(C(C)C)...
4,Desmopressin,NFLWUMRGJYTJIN-PNIOQBSNSA-N,C1CC(N(C1)C(=O)C2CSSCCC(=O)NC(C(=O)NC(C(=O)NC(...


In [5]:
# Discard every row that is missing a value in any of the loaded columns
# (name, InChI Key or SMILES).
df = df.dropna(axis=0, how='any')

In [6]:
# Parse each canonical SMILES string into an RDKit molecule object.
# SMILES that RDKit rejects (see the parser errors printed below) yield an
# empty entry, which the following cell removes.
molecule = list(map(Chem.MolFromSmiles, df['Canonical_SMILES']))

# Re-use df's own index so each molecule stays aligned with its source row.
df.loc[:, 'Molecule'] = pd.Series(molecule, index=df.index)

RDKit ERROR: [11:09:02] Explicit valence for atom # 7 Cl, 5, is greater than permitted
RDKit ERROR: [11:09:03] SMILES Parse Error: syntax error while parsing: [O+]#[C-][Re+]|1(|n2cccc3ccc4cccn|1c4c23)([C-]#[O+])[C-]#[O+]
RDKit ERROR: [11:09:03] SMILES Parse Error: Failed parsing SMILES '[O+]#[C-][Re+]|1(|n2cccc3ccc4cccn|1c4c23)([C-]#[O+])[C-]#[O+]' for input: '[O+]#[C-][Re+]|1(|n2cccc3ccc4cccn|1c4c23)([C-]#[O+])[C-]#[O+]'
RDKit ERROR: [11:09:04] Explicit valence for atom # 1 Cl, 4, is greater than permitted


In [7]:
# Drop invalid molecular representation.
# Only the 'Molecule' column is inspected: rows whose SMILES could not be parsed
# carry a missing value there. Restricting the check makes the intent explicit and
# keeps this cell from silently discarding rows for unrelated missing values.
df = df.dropna(subset=['Molecule'])

In [8]:
# InChI Keys of the drugs that survived filtering; used as the row index of
# every fingerprint matrix built below.
all_drugs = df['InChI Key'].tolist()

In [9]:
# Switch the working directory to the output folder; the relative to_csv paths
# used later in this notebook resolve against it.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves the working directory a second time.
os.chdir('../../data/RDKIT')

### Morgan Fingerprints

In [10]:
# Morgan (circular) fingerprint bit vector for every molecule.
# Change MORGAN_RADIUS (and pass useFeatures=True) as needed.
# The previous version handed one shared `info` dict to every call via bitInfo;
# a single dict cannot hold a usable per-molecule mapping and it was never read
# in this section, so it has been dropped.
MORGAN_RADIUS = 4
morg_fps = [AllChem.GetMorganFingerprintAsBitVect(x, MORGAN_RADIUS) for x in df['Molecule']]

In [11]:
# Expand each Morgan bit vector into a dense NumPy row.
# ConvertToNumpyArray fills (and resizes) the array it is handed, so a fresh
# float placeholder is allocated for every fingerprint.
morg_np_fps = []
for bit_vector in morg_fps:
    dense_bits = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bit_vector, dense_bits)
    morg_np_fps.append(dense_bits)

# One row per drug, keyed by InChI Key.
morg_df = pd.DataFrame(morg_np_fps, index=all_drugs)

In [12]:
# Name every bit column 'Morgan_<bit position>'.
# Labels are generated from the column count, not from the current labels, so
# re-running this cell cannot stack prefixes (e.g. 'Morgan_Morgan_0').
column_labels = ['Morgan_' + str(bit) for bit in range(morg_df.shape[1])]
morg_df.columns = column_labels

In [13]:
# Preview the Morgan matrix: rows are InChI Keys, columns are the prefixed bit positions.
morg_df.head()

Unnamed: 0,Morgan_0,Morgan_1,Morgan_2,Morgan_3,Morgan_4,Morgan_5,Morgan_6,Morgan_7,Morgan_8,Morgan_9,...,Morgan_2038,Morgan_2039,Morgan_2040,Morgan_2041,Morgan_2042,Morgan_2043,Morgan_2044,Morgan_2045,Morgan_2046,Morgan_2047
OIRCOABEOLEUMC-GEJPAHFPSA-N,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
GFIJNRVAKGFPGQ-LIJARHBVSA-N,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BLCLNMBMMGCOAS-URPVMXJPSA-N,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NDAYQJDHGXTBJL-MWWSRJDJSA-N,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NFLWUMRGJYTJIN-PNIOQBSNSA-N,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Write the Morgan matrix as a tab-separated file. The relative path resolves
# against the current working directory (set by the earlier os.chdir cell).
morg_df.to_csv('RDKIT_morgan_fp_matrix.tsv', sep = '\t')

### Avalon Fingerprints

In [15]:
# Avalon fingerprint bit vector for every molecule, using GetAvalonFP's defaults.
# pyAvalonTools is imported explicitly in the import cell: reaching it as
# `Avalon.pyAvalonTools` only works if something else has already loaded the
# submodule, which `from rdkit import Avalon` does not guarantee by itself.
avalon_fps = [pyAvalonTools.GetAvalonFP(x) for x in df['Molecule']]

In [16]:
# Expand each Avalon bit vector into a dense NumPy row.
# ConvertToNumpyArray fills (and resizes) the array it is handed, so a fresh
# float placeholder is allocated for every fingerprint.
avalon_np_fps = []
for bit_vector in avalon_fps:
    dense_bits = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bit_vector, dense_bits)
    avalon_np_fps.append(dense_bits)

# One row per drug, keyed by InChI Key.
avalon_df = pd.DataFrame(avalon_np_fps, index=all_drugs)

In [17]:
# Name every bit column 'Avalon_<bit position>'.
# Labels are generated from the column count, not from the current labels, so
# re-running this cell cannot stack prefixes (e.g. 'Avalon_Avalon_0').
column_labels = ['Avalon_' + str(bit) for bit in range(avalon_df.shape[1])]
avalon_df.columns = column_labels

In [18]:
# Preview the Avalon matrix: rows are InChI Keys, columns are the prefixed bit positions.
avalon_df.head()

Unnamed: 0,Avalon_0,Avalon_1,Avalon_2,Avalon_3,Avalon_4,Avalon_5,Avalon_6,Avalon_7,Avalon_8,Avalon_9,...,Avalon_502,Avalon_503,Avalon_504,Avalon_505,Avalon_506,Avalon_507,Avalon_508,Avalon_509,Avalon_510,Avalon_511
OIRCOABEOLEUMC-GEJPAHFPSA-N,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
GFIJNRVAKGFPGQ-LIJARHBVSA-N,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
BLCLNMBMMGCOAS-URPVMXJPSA-N,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
NDAYQJDHGXTBJL-MWWSRJDJSA-N,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
NFLWUMRGJYTJIN-PNIOQBSNSA-N,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [19]:
# Write the Avalon matrix as a tab-separated file. The relative path resolves
# against the current working directory (set by the earlier os.chdir cell).
avalon_df.to_csv('RDKIT_avalon_fp_matrix.tsv', sep = '\t')

### Atom Pair Fingerprints

In [20]:
# Hashed atom-pair fingerprint (as a bit vector) for every molecule,
# computed with the function's default settings.
atom_pairs_fps = [
    rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol)
    for mol in df['Molecule']
]

In [21]:
# Expand each atom-pair bit vector into a dense NumPy row.
# ConvertToNumpyArray fills (and resizes) the array it is handed, so a fresh
# float placeholder is allocated for every fingerprint.
ap_np_fps = []
for bit_vector in atom_pairs_fps:
    dense_bits = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bit_vector, dense_bits)
    ap_np_fps.append(dense_bits)

# One row per drug, keyed by InChI Key.
ap_df = pd.DataFrame(ap_np_fps, index=all_drugs)

In [22]:
# Name every bit column 'AtomPair_<bit position>'.
# Labels are generated from the column count, not from the current labels, so
# re-running this cell cannot stack prefixes (e.g. 'AtomPair_AtomPair_0').
column_labels = ['AtomPair_' + str(bit) for bit in range(ap_df.shape[1])]
ap_df.columns = column_labels

In [23]:
# Preview the atom-pair matrix: rows are InChI Keys, columns are the prefixed bit positions.
ap_df.head()

Unnamed: 0,AtomPair_0,AtomPair_1,AtomPair_2,AtomPair_3,AtomPair_4,AtomPair_5,AtomPair_6,AtomPair_7,AtomPair_8,AtomPair_9,...,AtomPair_2038,AtomPair_2039,AtomPair_2040,AtomPair_2041,AtomPair_2042,AtomPair_2043,AtomPair_2044,AtomPair_2045,AtomPair_2046,AtomPair_2047
OIRCOABEOLEUMC-GEJPAHFPSA-N,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
GFIJNRVAKGFPGQ-LIJARHBVSA-N,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
BLCLNMBMMGCOAS-URPVMXJPSA-N,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
NDAYQJDHGXTBJL-MWWSRJDJSA-N,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
NFLWUMRGJYTJIN-PNIOQBSNSA-N,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


In [24]:
# Write the atom-pair matrix as a tab-separated file. The relative path resolves
# against the current working directory (set by the earlier os.chdir cell).
ap_df.to_csv('RDKIT_atompair_fp_matrix.tsv', sep = '\t')

### Topological Fingerprints

In [25]:
# Topological fingerprint for every molecule.
# The fingerprinter settings are collected in one place so they are easy to
# read and adjust; they are forwarded unchanged to FingerprintMol.
top_fp_settings = dict(
    minPath=1,
    maxPath=7,
    fpSize=2048,
    bitsPerHash=2,
    useHs=True,
    tgtDensity=0,
    minSize=128,
)
top_fps = [FingerprintMols.FingerprintMol(mol, **top_fp_settings) for mol in df['Molecule']]


In [26]:
# Expand each topological bit vector into a dense NumPy row.
# ConvertToNumpyArray fills (and resizes) the array it is handed, so a fresh
# float placeholder is allocated for every fingerprint.
top_np_fps = []
for bit_vector in top_fps:
    dense_bits = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bit_vector, dense_bits)
    top_np_fps.append(dense_bits)

# One row per drug, keyed by InChI Key.
top_df = pd.DataFrame(top_np_fps, index=all_drugs)

In [27]:
# Name every bit column 'Topological_<bit position>'.
# Labels are generated from the column count, not from the current labels, so
# re-running this cell cannot stack prefixes (e.g. 'Topological_Topological_0').
column_labels = ['Topological_' + str(bit) for bit in range(top_df.shape[1])]
top_df.columns = column_labels

In [28]:
# Preview the topological matrix: rows are InChI Keys, columns are the prefixed bit positions.
top_df.head()

Unnamed: 0,Topological_0,Topological_1,Topological_2,Topological_3,Topological_4,Topological_5,Topological_6,Topological_7,Topological_8,Topological_9,...,Topological_2038,Topological_2039,Topological_2040,Topological_2041,Topological_2042,Topological_2043,Topological_2044,Topological_2045,Topological_2046,Topological_2047
OIRCOABEOLEUMC-GEJPAHFPSA-N,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
GFIJNRVAKGFPGQ-LIJARHBVSA-N,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
BLCLNMBMMGCOAS-URPVMXJPSA-N,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
NDAYQJDHGXTBJL-MWWSRJDJSA-N,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
NFLWUMRGJYTJIN-PNIOQBSNSA-N,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [29]:
# Write the topological matrix as a tab-separated file. The relative path resolves
# against the current working directory (set by the earlier os.chdir cell).
top_df.to_csv('RDKIT_topological_fp_matrix.tsv', sep = '\t')

### RDKit Fingerprints

In [30]:
# RDKit fingerprint for every molecule, computed with maxPath=2
# (the value reflected in the 'RDKf2_' column prefix used for this matrix).
rdk_fps = [
    Chem.RDKFingerprint(mol, maxPath=2)
    for mol in df['Molecule']
]

In [31]:
# Expand each RDKit bit vector into a dense NumPy row.
# ConvertToNumpyArray fills (and resizes) the array it is handed, so a fresh
# float placeholder is allocated for every fingerprint.
rdk_np_fps = []
for bit_vector in rdk_fps:
    dense_bits = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bit_vector, dense_bits)
    rdk_np_fps.append(dense_bits)

# One row per drug, keyed by InChI Key.
rdk_df = pd.DataFrame(rdk_np_fps, index=all_drugs)

In [32]:
# Name every bit column 'RDKf2_<bit position>'.
# Labels are generated from the column count, not from the current labels, so
# re-running this cell cannot stack prefixes (e.g. 'RDKf2_RDKf2_0').
column_labels = ['RDKf2_' + str(bit) for bit in range(rdk_df.shape[1])]
rdk_df.columns = column_labels

In [33]:
# Write the RDKit fingerprint matrix as a tab-separated file. The relative path
# resolves against the current working directory (set by the earlier os.chdir cell).
rdk_df.to_csv('RDKIT_rdkit_fp_matrix.tsv', sep = '\t')