## RDKIT Molecular Fingerprints Drug-Set Library
### Drug-set labels: Molecular Fingerprints
#### ALL DATABASES ACCESSED 12/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit import Avalon
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Pharm2D import Gobbi_Pharm2D,Generate
from rdkit.Chem.Pharm2D.SigFactory import SigFactory
from rdkit.Chem import ChemicalFeatures
from rdkit.Chem.Fingerprints import FingerprintMols
import os

In [2]:
# Load the shared export helpers from the repository's scripts directory.
# NOTE(review): export_script presumably provides gmt_formatter and
# library_counts, which are called later in this notebook — confirm.
_notebook_dir = os.getcwd()
os.chdir('../../scripts')
try:
    from export_script import *
finally:
    # Return to wherever the notebook started, even if the import fails, so
    # the relative paths used by later cells keep resolving correctly.
    os.chdir(_notebook_dir)

#### Input file: drugmonizome_metadata.tsv (master drug list generated from Drug Metadata Aggregation.ipynb)

In [3]:
# Load only the name, identifier and structure columns from the
# tab-separated master drug metadata table.
metadata_columns = ['Common name', 'InChI Key', 'Canonical_SMILES']
df = pd.read_csv(
    '../../metadata/drugmonizome_metadata.tsv',
    sep='\t',
    usecols=metadata_columns,
)

In [4]:
# Preview the first rows to confirm the three requested columns were loaded.
df.head()

Unnamed: 0,Common name,InChI Key,Canonical_SMILES
0,Bivalirudin,OIRCOABEOLEUMC-GEJPAHFPSA-N,CCC(C)C(C(=O)N1CCCC1C(=O)NC(CCC(=O)O)C(=O)NC(C...
1,Leuprolide,GFIJNRVAKGFPGQ-LIJARHBVSA-N,CCNC(=O)C1CCCN1C(=O)C(CCCN=C(N)N)NC(=O)C(CC(C)...
2,Goserelin,BLCLNMBMMGCOAS-URPVMXJPSA-N,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NN...
3,Gramicidin D,NDAYQJDHGXTBJL-MWWSRJDJSA-N,CC(C)CC(C(=O)NC(C)C(=O)NC(C(C)C)C(=O)NC(C(C)C)...
4,Desmopressin,NFLWUMRGJYTJIN-PNIOQBSNSA-N,C1CC(N(C1)C(=O)C2CSSCCC(=O)NC(C(=O)NC(C(=O)NC(...


In [5]:
# Discard every row that is missing a value in any of the loaded columns
# (name, InChI Key or SMILES).
df = df.dropna(axis=0, how='any')

In [6]:
# Parse each SMILES string into an RDKit molecule and store the result next to
# its source row (the Series is aligned on df's own index).
molecule = list(map(Chem.MolFromSmiles, df['Canonical_SMILES']))
df['Molecule'] = pd.Series(molecule, index=df.index)

In [7]:
# Remove rows whose SMILES string could not be resolved into a molecule
# (these show up as missing values in the 'Molecule' column).
df = df.dropna(axis=0, how='any')

In [8]:
# InChI Keys of every drug that survived filtering, in row order; used below
# as the row index of the fingerprint matrix.
all_drugs = df['InChI Key'].tolist()

### MACCS Key Fingerprints

In [9]:
# Compute one MACCS key bit vector per molecule, preserving row order.
maccs_fps = list(map(MACCSkeys.GenMACCSKeys, df['Molecule']))

In [10]:
def _fingerprint_to_array(bit_vector):
    """Return the bits of an RDKit fingerprint as a 1-D float NumPy array."""
    bits = np.zeros((1,))
    # ConvertToNumpyArray writes into `bits` in place; the array comes back
    # with one entry per fingerprint bit.
    DataStructs.ConvertToNumpyArray(bit_vector, bits)
    return bits


# One binary row per drug, in the same order as `all_drugs`.
maccs_np_fps = [_fingerprint_to_array(fp) for fp in maccs_fps]

# Assemble the drug x bit matrix, labelling rows with the drugs' InChI Keys.
maccs_df = pd.DataFrame(maccs_np_fps, index=all_drugs)

In [11]:
# Label each fingerprint bit as 'MACCS<position>'.
# The labels are derived from the column position rather than from the current
# column names, so re-running this cell yields the same labels instead of
# stacking the prefix ('MACCSMACCS0', ...).
column_labels = ['MACCS' + str(bit) for bit in range(maccs_df.shape[1])]

maccs_df.columns = column_labels

In [12]:
# Preview the fingerprint matrix: rows are InChI Keys, columns are MACCS bits.
maccs_df.head()

Unnamed: 0,MACCS0,MACCS1,MACCS2,MACCS3,MACCS4,MACCS5,MACCS6,MACCS7,MACCS8,MACCS9,...,MACCS157,MACCS158,MACCS159,MACCS160,MACCS161,MACCS162,MACCS163,MACCS164,MACCS165,MACCS166
OIRCOABEOLEUMC-GEJPAHFPSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
GFIJNRVAKGFPGQ-LIJARHBVSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
BLCLNMBMMGCOAS-URPVMXJPSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
NDAYQJDHGXTBJL-MWWSRJDJSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
NFLWUMRGJYTJIN-PNIOQBSNSA-N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


### Export as binary matrix

In [13]:
# Switch to the output directory so the export cells below can write using
# bare file names.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel resolves it from the new location — confirm the
# notebook is only ever run top to bottom.
os.chdir('../../data/RDKIT')

In [14]:
# Write the drug x MACCS-bit matrix (with its InChI Key index) as a
# tab-separated file in the current working directory.
maccs_df.to_csv('RDKIT_maccs_fp_matrix.tsv', sep='\t')

### Exporting as drug-set library

In [15]:
# Build the drug-set library: one term per MACCS bit, mapped to the unique
# drugs (InChI Keys) that have that bit set.
# Terms with fewer than MIN_DRUGS_PER_TERM unique drugs are left out.
MIN_DRUGS_PER_TERM = 5

drugsetlibrary = {}
for col in maccs_df.columns:
    # De-duplicate once; the index may contain repeated InChI Keys.
    members = set(maccs_df.index[maccs_df[col] == 1])
    if len(members) >= MIN_DRUGS_PER_TERM:
        # Sort so the member order (and therefore the exported file) is the
        # same on every run; plain set iteration order for strings varies
        # between Python processes.
        drugsetlibrary[col] = sorted(members)

In [16]:
# Export the drug-set library under the given file name.
# NOTE(review): gmt_formatter is not defined in this notebook; presumably it
# comes from export_script and writes a GMT file — confirm.
gmt_formatter(drugsetlibrary, 'RDKIT_maccs_fingerprints_drugsetlibrary.gmt')

### Library counts

In [17]:
# Report summary statistics for the library (unique drugs, terms,
# associations and average drugs per term, as shown in the output below).
# NOTE(review): library_counts is not defined in this notebook; presumably it
# comes from export_script — confirm.
library_counts(drugsetlibrary)

14308 unique drugs
163 unique association terms
665070 unique associations
4080.1840490797545 average drugs per term
