## Drugmonizome ETL: RDKit

##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu


In [None]:
# appyter init
# Initialise appyter's notebook magics (the %%appyter cells used later on).
# The lambda binds the built-in `globals` as a default argument and calls it
# when invoked, so appyter receives a callable that returns this notebook's
# global namespace.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import zipfile
import datetime

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit import Avalon
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Fingerprints import FingerprintMols

import drugmonizome.utility_functions as uf

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits
# to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record when the notebook was executed and with which interpreter.
run_date = datetime.date.today()
print(f'This notebook was run on: {run_date} \nPython version: {sys.version}')

### Initializing Notebook

In [None]:
%%appyter hide_code

# Declare the 'data' section (titled 'Upload Data') that the input fields of
# this notebook attach to through section='data'.
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval

# Introductory text attached to the 'data' section.
{% do DescriptionField(
    name='description',
    text='This notebook allows for the conversion of SMILES string representations of small molecules into molecular fingerprints',
    section='data'
) %}

# Input file of SMILES strings. The constraint only accepts names ending in
# .txt; 'smiles.txt' is the default and is listed with an example URL.
{% set data_file = FileField(
    constraint='.*\.txt$',
    name='SMILES representations of small molecules', 
    label='SMILES representations of small molecules (.txt)',
    description = 'SMILES string representations of small molecules should be in a text file and separated by newlines. \
              If no file is selected, a default list will be used.',
    default='smiles.txt',
    examples={
        'smiles.txt': 'https://appyters.maayanlab.cloud/storage/Drugmonizome_ETL_Appyters/RDKIT_smiles.txt'
    },
    section='data'
) %}

# Fingerprinting method selector (default 'MACCS Keys'). Its value decides
# which branch of the fingerprint-generation cell is rendered, and it is also
# used to name the output folder and the column labels.
{% set fingerprint_type = ChoiceField(
    name='fingerprint_type',
    label='Choose chemical fingerprinting method',
    choices=[
        'MACCS Keys',
        'Morgan Fingerprints',
        'Avalon Fingerprints',
        'AtomPair Fingerprints',
        'Topological Fingerprints',
        'RDKit Fingerprints',
    ],
    default= 'MACCS Keys',
    section='data'
) %}

### Create Output Path

In [None]:
%%appyter code_exec

output_name = 'rdkit_' + '{{ fingerprint_type }}'
path = 'output/drugmonizome_rdkit_' + '{{ fingerprint_type }}'
if not os.path.exists(path):
    os.makedirs(path)

### Load Data

In [None]:
%%appyter code_exec
df = pd.read_table({{data_file}})
df.columns = ['SMILES']
df.drop_duplicates(inplace = True)
df.head()

In [None]:
# Parse every SMILES string into an RDKit molecule and store the result in a
# new 'Molecule' column aligned on the existing row index.
molecule = list(map(Chem.MolFromSmiles, df['SMILES']))
df['Molecule'] = pd.Series(molecule, index=df.index)

In [None]:
# Discard rows with a missing value, i.e. SMILES that produced no molecule
df = df.dropna()
df.shape

In [None]:
# SMILES strings of the remaining molecules; used as the row index of the
# fingerprint dataframe built afterwards
all_drugs = df['SMILES'].tolist()

In [None]:
%%appyter markdown
### {{ fingerprint_type }}

In [None]:
%%appyter code_exec
{%if 'MACCS Keys' == fingerprint_type.value%}
# Generating molecule bit vectors 
maccs_fps = [MACCSkeys.GenMACCSKeys(x) for x in df['Molecule']]
# Converting bit vectors into binary array 
maccs_np_fps = []
for fp in maccs_fps:
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    maccs_np_fps.append(arr)
df_fps = pd.DataFrame(maccs_np_fps, index = all_drugs)

{%elif 'Morgan Fingerprints' == fingerprint_type.value%}
# change radius and useFeatures = True as needed 
info = {}
morg_fps = [AllChem.GetMorganFingerprintAsBitVect(x,4,bitInfo=info) for x in df['Molecule']]
# Converting bit vectors into binary array 
morg_np_fps = []
for fp in morg_fps:
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    morg_np_fps.append(arr)
df_fps = pd.DataFrame(morg_np_fps, index = all_drugs)

{%elif 'Avalon Fingerprints' == fingerprint_type.value%}
avalon_fps = [Avalon.pyAvalonTools.GetAvalonFP(x) for x in df['Molecule']]
avalon_np_fps = []
for fp in avalon_fps:
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    avalon_np_fps.append(arr)
df_fps = pd.DataFrame(avalon_np_fps, index = all_drugs)

{%elif 'AtomPair Fingerprints' == fingerprint_type.value%}
atom_pairs_fps = [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(x) for x in df['Molecule']]
ap_np_fps = []
for fp in atom_pairs_fps:
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    ap_np_fps.append(arr)
df_fps = pd.DataFrame(ap_np_fps, index = all_drugs)

{%elif 'Topological Fingerprints' == fingerprint_type.value%}
top_fps = [FingerprintMols.FingerprintMol(x , minPath = 1,
                                          maxPath = 7, fpSize = 2048, bitsPerHash = 2,
                                          useHs = True, tgtDensity = 0, minSize = 128) for x in df['Molecule']]
top_np_fps = []
for fp in top_fps:
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    top_np_fps.append(arr)
df_fps = pd.DataFrame(top_np_fps, index = all_drugs)

{%elif 'RDKit Fingerprints' == fingerprint_type.value%}
rdk_fps = [Chem.RDKFingerprint(x,maxPath=2) for x in df['Molecule']]
rdk_np_fps = []
for fp in rdk_fps:
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    rdk_np_fps.append(arr)
df_fps = pd.DataFrame(rdk_np_fps, index = all_drugs)

{%else%}
{%endif%}


# Renaming column labels #
column_labels = []
for col in df_fps.columns:
    column_labels.append('{{fingerprint_type}}'.split(' ')[0]+ str(col))
df_fps.columns = column_labels
df_fps.shape

## Analyze Data

### Export binary matrix

In [None]:
# Preview the first rows of the fingerprint matrix.
df_fps.head()

In [None]:
# Persist the fingerprint matrix through the project helper, requesting npz
# compression and a uint8 dtype.
binary_matrix_name = output_name + '_binary_matrix'
uf.save_data(
    df_fps,
    path,
    binary_matrix_name,
    compression='npz',
    dtype=np.uint8,
)

### Create drug and attribute set library

In [None]:
# Write the 'drug' set library derived from the fingerprint matrix.
drug_setlib_name = output_name + '_drug_setlibrary'
uf.save_setlib(df_fps, 'drug', path, drug_setlib_name)

In [None]:
# Write the 'attribute' set library derived from the fingerprint matrix.
attribute_setlib_name = output_name + '_attribute_setlibrary'
uf.save_setlib(df_fps, 'attribute', path, attribute_setlib_name)

### Create Attribute Similarity Matrix

In [None]:
# Jaccard similarity on the transposed matrix, so fingerprint bits are the
# rows (presumably similarity is computed between rows; confirm in
# uf.similarity_matrix).
attribute_similarity_matrix = uf.similarity_matrix(
    df_fps.transpose(), 'jaccard', sparse=True
)
attribute_similarity_matrix.head()

In [None]:
# Persist the attribute similarity matrix, flagged as symmetric, with npz
# compression and a float32 dtype.
attribute_similarity_name = output_name + '_attribute_similarity_matrix'
uf.save_data(
    attribute_similarity_matrix,
    path,
    attribute_similarity_name,
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

### Create Drug Similarity Matrix

In [None]:
# Jaccard similarity on the fingerprint matrix as is, with drugs as the rows.
drug_similarity_matrix = uf.similarity_matrix(
    df_fps, 'jaccard', sparse=True
)
drug_similarity_matrix.head()

In [None]:
# Persist the drug similarity matrix, flagged as symmetric, with npz
# compression and a float32 dtype.
drug_similarity_name = output_name + '_drug_similarity_matrix'
uf.save_data(
    drug_similarity_matrix,
    path,
    drug_similarity_name,
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

### Create download folder with all outputs

In [None]:
# Bundle the output directory for download.
# NOTE(review): presumably this writes the output_archive.zip that the
# download link in this notebook points to; confirm in
# drugmonizome.utility_functions.
uf.archive(path)

### Link to the output folder: [Download](./output_archive.zip)