

# Calculating molecular descriptors and fingerprints using RDKit and Mordred

In [None]:
!pip install rdkit-pypi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 KB[0m [31m851.1 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.*
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for colle

In [None]:

import csv
import os

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

In [None]:
# NOTE(review): `time` is not referenced by any cell visible in this notebook.
import time
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only; may prompt for authorization).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that holds the input dataset CSV files.
inputs_path = '/content/drive/MyDrive/DTBA/datasets'
# Directory on the mounted Drive intended for the generated feature files.
outputs_path = '/content/drive/MyDrive/DTBA/features'

In [None]:
# Import the dataset (it must contain a column named 'smiles').

dataset = pd.read_csv(f'{inputs_path}/KIBA_dta_df_drugs_pkd.csv')
dataset.shape

(118254, 5)

In [None]:
# Preview the first rows to check the column layout.
dataset.head()

Unnamed: 0,Chembl-ID,ID,smiles,sequence,kd
0,CHEMBL1087421,O00141,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,11.1
1,CHEMBL1087421,O14920,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC...,11.1
2,CHEMBL1087421,O15111,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC...,11.1
3,CHEMBL1087421,P00533,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,11.1
4,CHEMBL1087421,P04626,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,11.1


## 1. Generate canonical SMILES

In [None]:
def canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for every entry of ``smiles``.

    Parameters
    ----------
    smiles : iterable of str
        Input SMILES strings (e.g. a pandas Series).

    Returns
    -------
    list of str
        Canonical SMILES, in the same order and with the same length as the
        input.

    Raises
    ------
    ValueError
        If an entry cannot be parsed by RDKit. ``Chem.MolFromSmiles`` returns
        ``None`` in that case, which would otherwise surface later as an
        opaque error from ``Chem.MolToSmiles``.
    """
    # The same SMILES string can occur on many rows; convert each distinct
    # string only once and reuse the result.
    canonical_by_input = {}
    result = []
    for position, smi in enumerate(smiles):
        if smi not in canonical_by_input:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError(
                    f"Could not parse SMILES at position {position}: {smi!r}"
                )
            canonical_by_input[smi] = Chem.MolToSmiles(mol)
        result.append(canonical_by_input[smi])
    return result

In [None]:
# Canonicalize every SMILES in the dataset; the length shown below should equal the row count.
Canon_SMILES = canonical_smiles(dataset.smiles)
len(Canon_SMILES)

118254

In [None]:
# Replace the 'smiles' column with the canonical SMILES.
# Note: this overwrites the original strings in `dataset` in place.
dataset['smiles'] = Canon_SMILES
dataset

Unnamed: 0,Chembl-ID,ID,smiles,sequence,kd
0,CHEMBL1087421,O00141,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,11.10000
1,CHEMBL1087421,O14920,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC...,11.10000
2,CHEMBL1087421,O15111,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC...,11.10000
3,CHEMBL1087421,P00533,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,11.10000
4,CHEMBL1087421,P04626,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,11.10000
...,...,...,...,...,...
118249,CHEMBL230654,Q13554,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,MATTVTCTRFTDEYQLYEDIGKGAFSVVRRCVKLCTGHEYAAKIIN...,10.49794
118250,CHEMBL230654,Q13555,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,MATTATCTRFTDDYQLFEELGKGAFSVVRRCVKKTSTQEYAAKIIN...,10.49794
118251,CHEMBL230654,Q13557,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,MASTTTCTRFTDEYQLFEELGKGAFSVVRRCMKIPTGQEYAAKIIN...,10.49794
118252,CHEMBL230654,Q16539,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKT...,10.49794


## 2. Calculate descriptors using RDKit

### a. Molecular descriptors

In [None]:
def RDkit_descriptors(smiles):
    """Compute all RDKit molecular descriptors for each SMILES string.

    Every descriptor listed in ``Descriptors._descList`` is calculated
    (roughly 200; the exact number depends on the installed RDKit version).
    Explicit hydrogens are added to each molecule before calculation.

    Parameters
    ----------
    smiles : iterable of str
        Input SMILES strings (e.g. a pandas Series).

    Returns
    -------
    Mol_descriptors : list
        One sequence of descriptor values per input entry, in input order.
    desc_names : sequence of str
        Descriptor names, aligned with the values of each entry.

    Raises
    ------
    ValueError
        If an entry cannot be parsed by RDKit (``Chem.MolFromSmiles``
        returns ``None``).
    """
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [entry[0] for entry in Descriptors._descList]
    )
    desc_names = calc.GetDescriptorNames()

    # Descriptor calculation is expensive and the same SMILES can occur on
    # many rows, so each distinct string is processed only once.
    values_by_smiles = {}
    Mol_descriptors = []
    for position, smi in enumerate(smiles):
        if smi not in values_by_smiles:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError(
                    f"Could not parse SMILES at position {position}: {smi!r}"
                )
            # Add hydrogens to the molecule before computing descriptors.
            mol = Chem.AddHs(mol)
            values_by_smiles[smi] = calc.CalcDescriptors(mol)
        Mol_descriptors.append(values_by_smiles[smi])
    return Mol_descriptors, desc_names

# Compute the descriptors for every row's SMILES (runs over the whole dataset).
Mol_descriptors,desc_names = RDkit_descriptors(dataset['smiles'])

In [None]:
# One row per dataset entry, one column per RDKit descriptor.
df_with_200_descriptors = pd.DataFrame(Mol_descriptors, columns=desc_names)

# Save next to the other features on Drive. The file is named after the
# dataset that was actually loaded (KIBA); it was previously labelled DAVIS
# and written to Colab's ephemeral /content/sample_data directory.
os.makedirs(outputs_path, exist_ok=True)
df_with_200_descriptors.to_csv(f'{outputs_path}/KIBA_dta_df_drugs_pkd_descriptors.csv', index=True, header=True)

### b. Fingerprints

In [None]:
# fpts size can be adjusted to 1024 to avoid too 0s.
def morgan_fpts(data, radius=2, n_bits=64):
    """Compute Morgan fingerprints as bit arrays for each SMILES string.

    Parameters
    ----------
    data : iterable of str
        Input SMILES strings (e.g. a pandas Series).
    radius : int, optional
        Morgan fingerprint radius. Defaults to 2.
    n_bits : int, optional
        Length of the folded bit vector. Defaults to 64, which is very short;
        a larger size such as 1024 reduces bit collisions at the cost of
        sparser vectors.

    Returns
    -------
    numpy.ndarray
        Array with one fingerprint row per input entry, in input order.

    Raises
    ------
    ValueError
        If an entry cannot be parsed by RDKit (``Chem.MolFromSmiles``
        returns ``None``).
    """
    # The same SMILES can occur on many rows; fingerprint each distinct
    # string only once.
    bits_by_smiles = {}
    rows = []
    for position, smi in enumerate(data):
        if smi not in bits_by_smiles:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError(
                    f"Could not parse SMILES at position {position}: {smi!r}"
                )
            bitvect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            bits_by_smiles[smi] = np.array(bitvect)
        rows.append(bits_by_smiles[smi])
    return np.array(rows)

In [None]:
# Fingerprint every row's SMILES, then display the shape of the resulting array.
Morgan_fpts = morgan_fpts(dataset['smiles'])
Morgan_fpts.shape

In [None]:
# One row per dataset entry, one column (Col_0, Col_1, ...) per fingerprint bit.
Morgan_fingerprints = pd.DataFrame(Morgan_fpts, columns=['Col_{}'.format(i) for i in range(Morgan_fpts.shape[1])])

# Save next to the other features on Drive. The file is named after the
# dataset that was actually loaded (KIBA); it was previously labelled DAVIS
# and written to Colab's ephemeral /content/sample_data directory.
os.makedirs(outputs_path, exist_ok=True)
Morgan_fingerprints.to_csv(f'{outputs_path}/KIBA_dta_df_drugs_pkd_morganfingerprints.csv', index=True, header=True)
Morgan_fingerprints

In [2]:
# Sources:
# RDKit documentation: https://www.rdkit.org/docs/
# Tutorial: https://www.youtube.com/watch?v=9i9SY6Nd1Zw