<span style="color:Red">MinMax diversity</span> is a particular case of the <span style="color:Red">Kennard-Stone algorithm</span>. The purpose of these algorithms is to split a dataset into two parts based on user-selected factors, e.g. molecular descriptors. In cheminformatics, the main use of this procedure is to obtain a test set (or a validation or optimization set) for machine learning, such as QSAR modelling.

The gist of the algorithm is an iterative selection of new training-set candidates from the remaining dataset, such that every new candidate has the highest of the lowest dissimilarities between the candidates and all of the training-set compounds. The procedure is repeated until the desired split ratio (e.g. 75% training, 25% test) is achieved. The goal of the splitting is to create a training set that covers the full diversity of the overall dataset, whereas the test set does not contain compounds that are too different from the training set. <u>It is important to mention that the results of the splitting depend strongly on the similarity metric and the descriptor space.</u>

In [1]:
import rdkit as rd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
import copy
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import MACCSkeys
from rdkit.Chem import GraphDescriptors
from rdkit import DataStructs
from rdkit.ML.Descriptors import MoleculeDescriptors
import numpy as np
import pandas as pd
import pickle

In [2]:
from rdkit.Chem import rdFMCS
# Read the COX-2 inhibitor SDF into a DataFrame; the RDKit molecule objects
# are stored in the 'Mol' column.
cox2_sdf = r'assets/COX2_inhibitors_final.sdf'
df = PandasTools.LoadSDF(cox2_sdf, molColName='Mol')
# Store the activity column as integers (the modelling target used later).
df = df.astype({"Inhibition, %": int})

In [8]:
### naming the descr df
descr_names = ['NumAromaticRings', 'NumValenceElectrons', 'BalabanJ', 'MW']


def _descriptor_row(mol):
    """Return the four whole-molecule descriptors of *mol*, in ``descr_names`` order.

    BalabanJ is rounded to 2 decimals and the exact molecular weight to 1.
    """
    return [
        rdMolDescriptors.CalcNumAromaticRings(mol),
        Descriptors.NumValenceElectrons(mol),
        round(GraphDescriptors.BalabanJ(mol), 2),
        round(rdMolDescriptors.CalcExactMolWt(mol), 1),
    ]


# Iterate over the molecules themselves rather than looking them up with
# df['Mol'][i]: that lookup is label-based and only works while df still has
# its default 0..n-1 index. Rows stay in df order, so they line up with the
# CHEMBLID index below.
descr_df = pd.DataFrame(
    [_descriptor_row(mol) for mol in df['Mol']],
    index=df['CHEMBLID'],
    columns=descr_names,
    dtype="float64",
)
descr_df.head()

Unnamed: 0_level_0,NumAromaticRings,NumValenceElectrons,BalabanJ,MW
CHEMBLID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CHEMBL366429,2.0,126.0,2.02,365.1
CHEMBL176216,2.0,146.0,1.79,458.0
CHEMBL174680,3.0,136.0,1.86,380.1
CHEMBL176357,3.0,138.0,1.83,421.0
CHEMBL369840,2.0,142.0,2.13,465.9


In [9]:
### structural keys
# One row per molecule, one column per fingerprint bit (the buffer is sized
# for a 167-bit MACCS fingerprint).
finger_df = np.full((len(df.index), 167), 0, dtype="float64")
# Iterate over the molecules themselves rather than looking them up with
# df['Mol'][i]: that lookup is label-based and only works while df still has
# its default 0..n-1 index.
for row, mol in enumerate(df['Mol']):
    finger_df[row, :] = np.array(rdMolDescriptors.GetMACCSKeysFingerprint(mol))
# Drop column 0 (the empty column) while wrapping the bits in a DataFrame
# indexed by CHEMBLID; 166 key columns remain.
finger_df = pd.DataFrame(finger_df[:, 1:], index=df['CHEMBLID'])
# Label the key columns with the entries of RDKit's MACCS pattern table.
# NOTE(review): the labels are tuples and are not unique -- the rendered table
# shows "(?, 0)" for both the first and the last key -- so selecting a column
# by label can return more than one column.
MACCSkeys_names = list(MACCSkeys.smartsPatts.values())
finger_df.columns = MACCSkeys_names

In [17]:
# capstone descr
# Path is relative to the notebook's working directory, the same way the SDF
# is read from 'assets/', instead of a user-specific absolute path that only
# exists on one machine.
# NOTE(review): assumes the notebook is run from the project root -- confirm.
capstone_desc_path = r"assets/chap8_output_halogen.csv"
# Keep only the HalogenBondDonor flag, indexed by CHEMBLID so it can be
# aligned with the other descriptor tables.
capstone_desc = pd.read_csv(capstone_desc_path, index_col='CHEMBLID')[['HalogenBondDonor']]
capstone_desc.head()

Unnamed: 0_level_0,HalogenBondDonor
CHEMBLID,Unnamed: 1_level_1
CHEMBL366429,False
CHEMBL176216,True
CHEMBL174680,False
CHEMBL176357,True
CHEMBL369840,True


In [26]:
### merge descr spaces
# Join the three descriptor tables side by side; rows are aligned on their
# shared CHEMBLID index. Column order: halogen-bond flag, whole-molecule
# descriptors, MACCS keys.
descriptor_blocks = [capstone_desc, descr_df, finger_df]
descr_df_full = pd.concat(descriptor_blocks, axis="columns")
descr_df_full.head()

Unnamed: 0_level_0,HalogenBondDonor,NumAromaticRings,NumValenceElectrons,BalabanJ,MW,"(?, 0)","([#104], 0)","([#32,#33,#34,#50,#51,#52,#82,#83,#84], 0)","([Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr], 0)","([Sc,Ti,Y,Zr,Hf], 0)",...,"([#6]-[#8], 0)","([#6]-[#7], 0)","([#8], 1)","([C;H3,H4], 0)","([#7], 0)","(a, 0)","(*1~*~*~*~*~*~1, 0)","([#8], 0)","([R], 0)","(?, 0)"
CHEMBLID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEMBL366429,False,2.0,126.0,2.02,365.1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL176216,True,2.0,146.0,1.79,458.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL174680,False,3.0,136.0,1.86,380.1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL176357,True,3.0,138.0,1.83,421.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL369840,True,2.0,142.0,2.13,465.9,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [27]:
### descr space output
# Save the merged descriptor table (CHEMBLID index included) for later use.
# The path is relative to the notebook's working directory, the same way the
# SDF is read from 'assets/', instead of a user-specific absolute path that
# only exists on one machine.
# NOTE(review): assumes the notebook is run from the project root -- confirm.
descr_df_full.to_csv(r"assets/chap10_full_desc.csv")

### Kennard-Stone algorithm description
1. The first two points are the pair of compounds with the largest Euclidean distance in the dataset.
2. Each next point is the remaining candidate whose smallest Euclidean distance to the already selected training-set compounds is the largest (the "highest of the lowest" distances).
3. Step 2 is repeated until the desired training-to-test ratio is reached.

### source: R. W. Kennard & L. A. Stone (1969) Computer Aided Design of Experiments, Technometrics, 11:1, 137-148. doi:10.1080/00401706.1969.10490666

In [28]:
# Show every column label of the merged descriptor table on a single line,
# separated by spaces.
column_labels = [str(label) for label in descr_df_full.columns]
print(" ".join(column_labels))

HalogenBondDonor NumAromaticRings NumValenceElectrons BalabanJ MW ('?', 0) ('[#104]', 0) ('[#32,#33,#34,#50,#51,#52,#82,#83,#84]', 0) ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0) ('[Sc,Ti,Y,Zr,Hf]', 0) ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0) ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0) ('[!#6;!#1]1~*~*~*~1', 0) ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0) ('[Be,Mg,Ca,Sr,Ba,Ra]', 0) ('*1~*~*~*~1', 0) ('[Cu,Zn,Ag,Cd,Au,Hg]', 0) ('[#8]~[#7](~[#6])~[#6]', 0) ('[#16]-[#16]', 0) ('[#8]~[#6](~[#8])~[#8]', 0) ('[!#6;!#1]1~*~*~1', 0) ('[#6]#[#6]', 0) ('[#5,#13,#31,#49,#81]', 0) ('*1~*~*~*~*~*~*~1', 0) ('[#14]', 0) ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0) ('*1~*~*~1', 0) ('[#7]~[#6](~[#8])~[#8]', 0) ('[#7]-[#8]', 0) ('[#7]~[#6](~[#7])~[#7]', 0) ('[#6]=;@[#6](@*)@*', 0) ('[I]', 0) ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0) ('[#15]', 0) ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0) ('[!#6;!#1]~[F,Cl,Br,I]', 0) ('[#6]~[#16]~[#7]', 0) ('[#7]~[#16]', 0) ('[CH2]=*', 0) ('[Li,Na,K,Rb,Cs,Fr]', 0) ('[#16R]', 0) ('[#7]~[#6

In [29]:
# from kennard
# Preview the first rows of the merged descriptor table that will be passed
# to the Kennard-Stone split.
descr_df_full.head()

Unnamed: 0_level_0,HalogenBondDonor,NumAromaticRings,NumValenceElectrons,BalabanJ,MW,"(?, 0)","([#104], 0)","([#32,#33,#34,#50,#51,#52,#82,#83,#84], 0)","([Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr], 0)","([Sc,Ti,Y,Zr,Hf], 0)",...,"([#6]-[#8], 0)","([#6]-[#7], 0)","([#8], 1)","([C;H3,H4], 0)","([#7], 0)","(a, 0)","(*1~*~*~*~*~*~1, 0)","([#8], 0)","([R], 0)","(?, 0)"
CHEMBLID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEMBL366429,False,2.0,126.0,2.02,365.1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL176216,True,2.0,146.0,1.79,458.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL174680,False,3.0,136.0,1.86,380.1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL176357,True,3.0,138.0,1.83,421.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL369840,True,2.0,142.0,2.13,465.9,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


## Kennard stone algo

In [None]:
# install kennard_stone
# conda activate neovarsity
# conda install kennard-stone

In [None]:
# Compare how the two tables identify their rows before they are split together.
print(df.columns) # df keeps CHEMBLID as an ordinary column; its row index is presumably plain integers -- TODO confirm
descr_df_full.head() # descr_df_full rows are indexed by CHEMBLID

Index(['SMILES', 'CHEMBLID', 'Inhibition, %', 'ID', 'Mol'], dtype='object')


Unnamed: 0_level_0,HalogenBondDonor,NumAromaticRings,NumValenceElectrons,BalabanJ,MW,"(?, 0)","([#104], 0)","([#32,#33,#34,#50,#51,#52,#82,#83,#84], 0)","([Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr], 0)","([Sc,Ti,Y,Zr,Hf], 0)",...,"([#6]-[#8], 0)","([#6]-[#7], 0)","([#8], 1)","([C;H3,H4], 0)","([#7], 0)","(a, 0)","(*1~*~*~*~*~*~1, 0)","([#8], 0)","([R], 0)","(?, 0)"
CHEMBLID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEMBL366429,False,2.0,126.0,2.02,365.1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL176216,True,2.0,146.0,1.79,458.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL174680,False,3.0,136.0,1.86,380.1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL176357,True,3.0,138.0,1.83,421.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
CHEMBL369840,True,2.0,142.0,2.13,465.9,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [32]:
from kennard_stone import train_test_split

In [None]:
# Kennard-Stone split of the descriptor table (X) and the inhibition values (y),
# using the library's default split ratio (no test_size is passed).
# NOTE(review): X is indexed by CHEMBLID while y carries df's own index, so the
# two are presumably paired by row position -- confirm the row order matches.
# NOTE(review): the descriptors are on very different scales (MW in the
# hundreds vs. 0/1 structural keys); a distance-based selection may be
# dominated by the large-valued columns -- consider whether scaling is wanted.
x_train, x_test, y_train, y_test = train_test_split(
    descr_df_full,
    df['Inhibition, %'],
)