In [None]:
# wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
# !pip install padelpy

In [1]:
import pandas as pd
bace_inhibitors = pd.read_csv('clean_bace_selected_features.csv')
bace_inhibitors.shape

(461, 6)

In [4]:
# bace_inhibitors['BindingDB Reactant_set_id'].duplicated().sum()
bace_inhibitors['BindingDB MonomerID'].duplicated().sum()

112

In [6]:
bace_inhibitors = bace_inhibitors.drop_duplicates(subset=['BindingDB MonomerID'], keep= 'first')
bace_inhibitors.shape

(349, 6)

### Descriptors

In [8]:
# Prepare dataset for calculating padel descriptor

smiles_n_id = pd.concat([bace_inhibitors['Ligand SMILES'],bace_inhibitors['BindingDB Reactant_set_id']], axis=1)
smiles_n_id.to_csv('molecule.smi', sep='\t', index=False, header=False)
smiles_n_id.shape

(349, 2)

In [9]:
# solve ddll error => https://github.com/conda/conda/issues/11795#issuecomment-1335666474
# this worked
# pip install rdkit --user

In [61]:
# pip install rdkit-pypi

In [4]:
# https://github.com/mordred-descriptor/mordred
# pip install mordred

In [9]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors 

In [10]:
# check canonicity of the ligand smile notation using RDKit
def generate_canonical_smiles(mol_smiles):
    mols = [Chem.MolFromSmiles(mol_smile) for mol_smile in mol_smiles]
    canonical_smiles = [Chem.MolToSmiles(mol) for mol in mols] 
    return canonical_smiles 

In [12]:
canonical_smile = generate_canonical_smiles(bace_inhibitors['Ligand SMILES'])
canonical_smile[0:5]

['O=C(O)c1cc(/C=C/c2ccc(/C=C/c3ccc(O)c(C(=O)O)c3)c(Br)c2)ccc1O',
 'O=C(O)c1cc(/C=C\\c2ccc(/C=C\\c3ccc(O)c(C(=O)O)c3)c(Br)c2)ccc1O',
 'O=C(O)c1cc(N=Nc2ccc(-c3ccc(N=Nc4ccc(O)c(C(=O)O)c4)cc3)cc2)ccc1O',
 'COc1ccc(/C=C/c2ccc(/C=C/c3ccc(OC)c(C(=O)O)c3)c(I)c2)cc1C(=O)O',
 'O=C(O)c1cc(/C=C\\c2ccc(/C=C/c3ccc(O)c(C(=O)O)c3)cc2Br)ccc1O']

In [13]:
bace_inhibitors['Ligand SMILES'] = canonical_smile
bace_inhibitors.shape

(349, 6)

In [31]:
bace_inhibitors.to_csv('clean_bace_inhibitors.csv', index=False)

In [14]:
# check for list of duplicated smiles
duplicated_smiles = bace_inhibitors[bace_inhibitors['Ligand SMILES'].duplicated()]['Ligand SMILES'].values
len(duplicated_smiles)

0

In [15]:
# list of duplicated files
bace_inhibitors[bace_inhibitors['Ligand SMILES'].isin(duplicated_smiles)].sort_values(by=['Ligand SMILES'])

Unnamed: 0,BindingDB Reactant_set_id,Ligand SMILES,BindingDB MonomerID,BindingDB Ligand Name,Target Name,Ki (nM)


### RDKit descriptors


In [55]:
# list of rdKit decriptors and length (209 in total)
# Descriptors._descList
# len(Descriptors._descList)

In [16]:
# list of rdkit descriptors
rdkit_descriptor_names = [descriptor[0] for descriptor in Descriptors._descList]

# initialize rdkit descriptor calculator with passing list of rdkit descriptor names as an arguement
# returns an instance of the descriptot calculator
rdkit_descriptors = MoleculeDescriptors.MolecularDescriptorCalculator(rdkit_descriptor_names);
rdkit_descriptors


<rdkit.ML.Descriptors.MoleculeDescriptors.MolecularDescriptorCalculator at 0x21c0641ea40>

In [28]:
# pass in the list of molecules into the calculate descriptors method
# returns the list of generated descriptors

def generate_rdkit_descriptors (smile_list):
    
    # list of rdkit descriptors
    rdkit_descriptor_names = [descriptor[0] for descriptor in Descriptors._descList]
    
    # initialize rdkit descriptor calculator with passing list of rdkit descriptor names as an arguement
    # returns an instance of the descriptot calculator
    rdkit_descriptors = MoleculeDescriptors.MolecularDescriptorCalculator(rdkit_descriptor_names);
    
    # convert smile to rdkit molecule.
    # returns a list of rdkit molecules
    ligands = [Chem.MolFromSmiles(ligand_smile) for ligand_smile in smile_list]
    
    print(ligands)
    
    desc_name = rdkit_descriptors.GetDescriptorNames()
    
    # list of descriptors
    ligand_rdkit_descriptors = []
    
    #loop through each ligand
    for ligand in ligands:
        
        # add hydrogens to each ligand
        ligand = Chem.AddHs(ligand)
        
        # calculate descriptors for each ligand
        ligand_descriptor = rdkit_descriptors.CalcDescriptors(ligand)
        
        # append ligand_descriptor to  ligand_rdkit_descriptors list
        ligand_rdkit_descriptors.append(ligand_descriptor)
        
#     print(ligand_descriptor, desc_name)
    return  ligand_rdkit_descriptors, desc_name
        
        


In [29]:
ligand_descriptors, descriptor_names = generate_rdkit_descriptors(bace_inhibitors['Ligand SMILES'])

[<rdkit.Chem.rdchem.Mol object at 0x0000021C108E33E0>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E1150>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E15B0>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E3680>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E3C30>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E3A70>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E38B0>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E3370>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E3530>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E30D0>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E25E0>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E1E00>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E2110>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E18C0>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E1540>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E10E0>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E0D60>, <rdkit.Chem.rdchem.Mol object at 0x0000021C108E0740>, <rdkit.Chem.rdchem.Mol obje

In [30]:
descriptor_df = pd.DataFrame(ligand_descriptors, columns = descriptor_names)
descriptor_df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,12.142662,12.142662,0.540127,-1.601878,0.341105,481.298,464.162,480.020850,156,0,...,0,0,0,0,0,0,0,0,0,0
1,12.142662,12.142662,0.540127,-1.601878,0.341105,481.298,464.162,480.020850,156,0,...,0,0,0,0,0,0,0,0,0,0
2,12.064142,12.064142,0.761169,-1.562466,0.207180,482.452,464.308,482.122634,178,0,...,0,0,0,0,0,0,0,0,0,0
3,12.400966,12.400966,0.466225,-3.331026,0.259569,556.352,535.184,556.038286,168,0,...,0,0,0,0,0,0,0,0,0,0
4,12.142662,12.142662,0.540127,-1.601878,0.341105,481.298,464.162,480.020850,156,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,8.324513,8.324513,0.313384,-3.496620,0.683595,318.464,292.256,318.209599,124,0,...,0,0,0,0,0,0,0,0,0,0
345,13.693231,13.693231,1.158710,-4.384810,0.552430,252.285,235.149,252.116173,98,0,...,0,0,0,0,0,0,0,0,0,0
346,11.974850,11.974850,0.475639,-3.746287,0.417437,299.282,286.178,299.079373,112,0,...,0,0,0,0,0,0,0,0,0,0
347,8.258090,8.258090,0.102040,-3.428397,0.772520,353.203,337.075,353.027662,94,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
import pandas as pd

bace_inhibitors = pd.read_csv('clean_bace_inhibitors.csv')
bace_inhibitors

combined_bace_inhibitors = pd.concat([bace_inhibitors, descriptor_df], axis=1)
combined_bace_inhibitors.to_csv('combined_bace_inhibitors_ki_and_rdkit_descriptors.csv', index=False)
combined_bace_inhibitors.shape


(349, 215)

### Mordred descriptors

In [62]:
import pandas as pd

bace_inhibitors = pd.read_csv('clean_bace_inhibitors.csv')
bace_inhibitors.shape

(349, 6)

In [46]:
from mordred import Calculator, descriptors
calc = Calculator(descriptors, ignore_3D=False)
len(calc)


1613

In [56]:
from mordred import Calculator, descriptors
from rdkit import Chem

def generate_mordred_descriptors(smile_molecules):
    
    # generate molecules from rdkit
    ligands = [Chem.MolFromSmiles(smile) for smile in smile_molecules]
          
    # generate descriptors
    calc = Calculator(descriptors, ignore_3D=False)
    
    return calc.pandas(ligands)

In [57]:
mordered_desc = generate_mordred_descriptors(bace_inhibitors['Ligand SMILES'])
mordered_desc

 20%|██████████████████▌                                                                           | 69/349 [00:05<00:20, 13.82it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 32%|██████████████████████████████                                                               | 113/349 [00:10<00:48,  4.84it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 43%|███████████████████████████████████████▋                                                     | 149/349 [00:11<00:15, 12.85it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 48%|████████████████████████████████████████████▊                                                | 168/349 [00:13<00:18, 10.05it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 66%|█████████████████████████████████████████████████████████████▎                               | 230/349 [00:19<00:13,  9.01it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 349/349 [00:34<00:00, 10.14it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,23.898052,17.722337,2,0,38.786181,2.352319,4.704637,38.786181,1.251167,4.345471,...,10.246616,66.622526,480.020850,10.000434,3302,48,158.0,182.0,11.472222,6.805556
1,23.898052,17.722337,2,0,38.786181,2.352319,4.704637,38.786181,1.251167,4.345471,...,10.246616,66.622526,480.020850,10.000434,3302,48,158.0,182.0,11.472222,6.805556
2,28.031303,18.801352,2,0,46.231486,2.354504,4.709008,46.231486,1.284208,4.502712,...,10.399981,72.422166,482.122634,8.928197,5336,56,186.0,215.0,11.833333,7.888889
3,25.093486,18.643069,2,0,41.841513,2.361244,4.722488,41.841513,1.267925,4.403148,...,10.299273,68.872284,556.038286,10.297005,3932,52,166.0,192.0,11.972222,7.472222
4,23.898052,17.722337,2,0,38.786181,2.352319,4.704637,38.786181,1.251167,4.345471,...,10.246616,66.622526,480.020850,10.000434,3302,48,158.0,182.0,11.472222,6.805556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,18.034348,13.285507,0,0,30.331531,2.263370,4.526739,30.331531,1.263814,4.069893,...,9.695294,57.578011,318.209599,6.364192,1846,31,114.0,126.0,8.166667,5.472222
345,12.906262,11.231958,0,0,22.144514,2.316799,4.633597,22.144514,1.230251,3.759697,...,9.313529,49.733700,252.116173,7.203319,704,22,80.0,87.0,6.944444,4.361111
346,17.217852,12.602268,0,0,28.287891,2.443002,4.886004,28.287891,1.285813,4.020533,...,9.959348,56.047219,299.079373,8.545125,1191,32,114.0,131.0,6.777778,4.805556
347,13.722758,11.139535,0,0,22.853672,2.285395,4.570790,22.853672,1.269648,3.798861,...,9.499197,50.416113,353.027662,10.383167,691,23,88.0,98.0,6.055556,4.027778


In [58]:
combined_bace_inhibitors = pd.concat([bace_inhibitors, mordered_desc], axis=1)
combined_bace_inhibitors.to_csv('combined_bace_inhibitors_ki_and_mordred_descriptors.csv', index=False)
combined_bace_inhibitors.shape


(349, 1832)