## RDKIT Molecular Fingerprints Drug-Set Library
### Drug-set labels: Molecular Fingerprints
#### ALL DATABASES ACCESSED 12/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit import Avalon
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Pharm2D import Gobbi_Pharm2D,Generate
from rdkit.Chem.Pharm2D.SigFactory import SigFactory
from rdkit.Chem import ChemicalFeatures
from rdkit.Chem.Fingerprints import FingerprintMols
import os

In [2]:
# export_script lives in the sibling ../scripts directory. Step into it just
# long enough for the import to resolve, then return to the directory the
# notebook started in. Restoring the saved path (instead of a hard-coded
# '../RDKIT') works regardless of the notebook folder's name, and the
# try/finally guarantees the relative data paths used below are not broken
# if the import fails.
_notebook_dir = os.getcwd()
os.chdir('../scripts')
try:
    from export_script import *
finally:
    os.chdir(_notebook_dir)

#### Input file : merged_drug_metadata.tsv (master drug list generated from Drug Metadata Aggregation.ipynb)

In [3]:
# Load the tab-separated master drug list
metadata_path = '../mapping_files/merged_drug_metadata.tsv'
df = pd.read_csv(metadata_path, delimiter='\t')

In [4]:
# Discard the metadata columns that are not used in this notebook
unused_columns = ['Synonyms', 'PubChemID', 'InChiKey', 'Molecular_formula']
df = df.drop(unused_columns, axis=1)

In [5]:
# Preview the first five rows of the drug table
df.head(n=5)

Unnamed: 0,Drug_name,Canonical_SMILES
0,goserelin,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NN...
1,cetrorelix,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NC...
2,ciclosporin,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...
3,octreotide,CC(C1C(=O)NC(CSSCC(C(=O)NC(C(=O)NC(C(=O)NC(C(=...
4,choline,C[N+](C)(C)CCO


In [6]:
# Parse every SMILES string into an RDKit molecule. Chem.MolFromSmiles returns
# None for a string it cannot parse, but it raises on a non-string argument
# (e.g. the float NaN pandas stores for a blank cell), so such entries are
# mapped to None here as well and get removed by the dropna step that follows.
molecule = [
    Chem.MolFromSmiles(x) if isinstance(x, str) else None
    for x in df['Canonical_SMILES']
]
df['Molecule'] = pd.Series(molecule, index=df.index)

In [7]:
# Remove every row that has a missing value in any column; this includes
# rows whose 'Molecule' entry is missing because the SMILES string could
# not be resolved
df = df.dropna(axis='index', how='any')

In [8]:
# Names of the drugs that remain, in row order
all_drugs = df['Drug_name'].tolist()

### MACCS Key Fingerprints

In [9]:
# Compute one MACCS-keys bit vector per molecule, in row order
maccs_fps = list(map(MACCSkeys.GenMACCSKeys, df['Molecule']))

In [10]:
def _maccs_to_array(fp):
    """Return the bits of one RDKit fingerprint as a numpy array."""
    arr = np.zeros((1,))
    # ConvertToNumpyArray writes the fingerprint's bits into arr in place
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr


# Expand every MACCS bit vector into a numeric array
maccs_np_fps = [_maccs_to_array(fp) for fp in maccs_fps]

# Assemble the matrix: one row per drug, one column per fingerprint bit
maccs_df = pd.DataFrame(maccs_np_fps, index=all_drugs)

In [11]:
# Prefix each positional column label with 'MACCS' (0 -> 'MACCS0', ...)
column_labels = ['MACCS' + str(col) for col in maccs_df.columns]
maccs_df.columns = column_labels

In [12]:
# Preview the first five rows of the MACCS fingerprint matrix
maccs_df.head(n=5)

Unnamed: 0,MACCS0,MACCS1,MACCS2,MACCS3,MACCS4,MACCS5,MACCS6,MACCS7,MACCS8,MACCS9,...,MACCS157,MACCS158,MACCS159,MACCS160,MACCS161,MACCS162,MACCS163,MACCS164,MACCS165,MACCS166
goserelin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
cetrorelix,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
ciclosporin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
octreotide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
choline,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


### Export as binary matrix

In [13]:
# Save the drug x fingerprint-bit matrix as a tab-separated file
matrix_path = '../data/RDKIT/RDKIT_maccs_fp_matrix.tsv'
maccs_df.to_csv(matrix_path, sep='\t')

### Exporting as drug-set library

In [14]:
# Write the drug-set library: one line per MACCS bit, listing every drug whose
# fingerprint has that bit set. Bits shared by fewer than five drugs are
# skipped. Each line is the term followed by the drug names, every field
# terminated by a tab, then a newline.
#
# The file is opened once in write mode so that re-running this cell replaces
# the library instead of appending a duplicate copy of every line, and the
# size threshold is checked directly so that a drug whose name happens to be
# 'NA' cannot cause its set to be dropped.
library_path = '../data/RDKIT/RDKIT_maccs_fingerprints_drugsetlibrary.txt'
min_set_size = 5

with open(library_path, 'w') as f:
    for col in maccs_df.columns:
        # Drugs (row labels) for which this fingerprint bit equals 1
        drugs = maccs_df.index[(maccs_df[col] == 1).values].tolist()
        if len(drugs) < min_set_size:
            continue
        fields = [col] + drugs
        f.write(''.join('{0}\t'.format(elem) for elem in fields) + '\n')

### Library counts

In [15]:
import csv

# Read the drug-set library back into a {term: [drug, ...]} dictionary.
# Each line of the file written above is the term followed immediately by its
# drugs, every field terminated by a tab. The drugs therefore start at column 1
# (slicing from column 2 would silently drop the first drug of every set), and
# the trailing tab yields an empty last field that must not be counted as a
# drug. Filtering empty fields handles that. Blank lines are skipped.
with open('../data/RDKIT/RDKIT_maccs_fingerprints_drugsetlibrary.txt', 'r') as f:
    reader = csv.reader(f, delimiter = '\t')
    drugsetlibrary = {
        line[0]: [drug for drug in line[1:] if drug]
        for line in reader
        if line
    }

# library_counts comes from export_script (star-imported near the top of the notebook)
library_counts(drugsetlibrary)

43941 unique drugs
163 unique association terms
2241402 unique associations
13750.932515337423 average drugs per term
