## 1. Load dataset

The example datasets contain molecular structures (SMILES) and measured bioactivities (pKi or IC50, expressed so that higher values mean higher activity). Each SMILES string is converted to an RDKit Mol object.

In [1]:
import pandas as pd
from rdkit import Chem

In [2]:
# The CSV files have no header row, so pandas labels the columns 0, 1, ...
train_csv = 'data/CHEMBL1871_Ki/train.csv'
test_csv = 'data/CHEMBL1871_Ki/test.csv'

data_train = pd.read_csv(train_csv, header=None)
data_test = pd.read_csv(test_csv, header=None)

In [3]:
# Column 0 of each frame is read as the SMILES strings, column 1 as the
# measured property; both are converted to plain Python lists.
smi_train = data_train[0].to_list()
prop_train = data_train[1].to_list()

smi_test = data_test[0].to_list()
prop_test = data_test[1].to_list()

In [4]:
# Parse every training SMILES once, then keep only the (molecule, label)
# pairs whose parse result is truthy, so both lists stay aligned.
parsed_train = [(Chem.MolFromSmiles(smi), prop) for smi, prop in zip(smi_train, prop_train)]

mols_train = [mol for mol, _ in parsed_train if mol]
y_train = [prop for mol, prop in parsed_train if mol]

In [5]:
# Parse every test SMILES once, then keep only the (molecule, label)
# pairs whose parse result is truthy, so both lists stay aligned.
parsed_test = [(Chem.MolFromSmiles(smi), prop) for smi, prop in zip(smi_test, prop_test)]

mols_test = [mol for mol, _ in parsed_test if mol]
y_test = [prop for mol, prop in parsed_test if mol]

## 1.5 Reduce the dataset size for faster pipeline (for playing around)

In [6]:
# Optional: uncomment the two lines below to keep only the first 30 training
# and the first 10 test molecules (with their labels), which shortens every
# later step of the pipeline. Leave them commented out for the full run.
# mols_train, y_train = mols_train[:30], y_train[:30]
# mols_test, y_test = mols_test[:10], y_test[:10]

## 2. Conformer generation

For each molecule, an ensemble of conformers is generated. Molecules for which conformer generation failed are then filtered out of both the training and the test set. The generated conformers can be accessed with mol.GetConformers(), or a single conformer by its ID, e.g. mol.GetConformer(0).

In [7]:
from qsarmil.conformer import RDKitConformerGenerator

from qsarmil.utils.logging import FailedConformer, FailedDescriptor

In [8]:
# Settings passed to the conformer generator.
# NOTE(review): num_conf presumably is the number of conformers per molecule and
# num_cpu the number of parallel workers - confirm against the qsarmil docs.
conformer_settings = {'num_conf': 10, 'num_cpu': 40}
conf_gen = RDKitConformerGenerator(**conformer_settings)

In [9]:
# Generate a conformer ensemble for every training molecule. Entries for which
# generation failed are FailedConformer instances and are dropped below.
confs_train = conf_gen.run(mols_train)

# Filter molecules, conformers and labels together. Filtering the labels but
# not the molecules would make a re-run of this cell pair full-length
# conformers with already-shortened labels and silently misalign them.
kept_train = [
    (mol, conf, label)
    for mol, conf, label in zip(mols_train, confs_train, y_train)
    if not isinstance(conf, FailedConformer)
]
if not kept_train:
    # zip(*[]) cannot be unpacked; fail with an explicit message instead.
    raise ValueError("Conformer generation failed for every training molecule")

mols_train, confs_train, y_train = (list(column) for column in zip(*kept_train))

Generating conformers: 100%|██████████████████████████████████████████████████████████| 525/525 [05:26<00:00,  1.61it/s]


In [10]:
# Generate a conformer ensemble for every test molecule. Entries for which
# generation failed are FailedConformer instances and are dropped below.
confs_test = conf_gen.run(mols_test)

# Filter molecules, conformers and labels together. Filtering the labels but
# not the molecules would make a re-run of this cell pair full-length
# conformers with already-shortened labels and silently misalign them.
kept_test = [
    (mol, conf, label)
    for mol, conf, label in zip(mols_test, confs_test, y_test)
    if not isinstance(conf, FailedConformer)
]
if not kept_test:
    # zip(*[]) cannot be unpacked; fail with an explicit message instead.
    raise ValueError("Conformer generation failed for every test molecule")

mols_test, confs_test, y_test = (list(column) for column in zip(*kept_test))

Generating conformers: 100%|██████████████████████████████████████████████████████████| 134/134 [04:14<00:00,  1.90s/it]


## 3. Descriptor calculation

Then, 3D descriptors are calculated for each molecule from its associated conformers. A descriptor wrapper is used here, which is designed to apply descriptor calculators from external packages. The resulting descriptors are a list of 2D arrays (bags), which are then scaled.

In [11]:
from qsarmil.descriptor.rdkit import (RDKitGEOM, 
                                      RDKitAUTOCORR, 
                                      RDKitRDF, 
                                      RDKitMORSE, 
                                      RDKitWHIM, 
                                      RDKitGETAWAY)

from molfeat.calc import Pharmacophore3D, USRDescriptors, ElectroShapeDescriptors

from qsarmil.descriptor.wrapper import DescriptorWrapper

from qsarmil.mil.preprocessing import BagMinMaxScaler

In [12]:
# Wrap the molfeat 3D pharmacophore calculator so it can be applied to the
# conformer ensembles generated above.
pharmacophore_calc = Pharmacophore3D()
desc_calc = DescriptorWrapper(pharmacophore_calc)

In [13]:
# Compute the descriptors for the training and the test conformers
# with the same calculator.
x_train, x_test = desc_calc.transform(confs_train), desc_calc.transform(confs_test)

In [14]:
# The scaler is fitted on the training descriptors only and then applied
# unchanged to both splits, so no test information enters the scaling.
scaler = BagMinMaxScaler()
scaler.fit(x_train)

x_train_scaled, x_test_scaled = scaler.transform(x_train), scaler.transform(x_test)

## 4. Model training

In [15]:
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor

from qsarmil.mil.wrapper import InstanceWrapper, BagWrapper
from qsarmil.mil.network.regressor import InstanceNetworkRegressor, BagNetworkRegressor

from qsarmil.mil.network.regressor import (AttentionNetworkRegressor, 
                                           SelfAttentionNetworkRegressor,
                                           GatedAttentionNetworkRegressor,
                                           TemperatureAttentionNetworkRegressor,
                                           GaussianPoolingNetworkRegressor,
                                           DynamicPoolingNetworkRegressor)

In [16]:
# Keyword arguments shared by all network-based regressors; they are
# forwarded unchanged via **network_hparams when the models are built.
# NOTE(review): init_cuda=False presumably keeps training on the CPU - confirm.
network_hparams = dict(
    hidden_layer_sizes=(256, 128, 64),
    num_epoch=300,
    batch_size=128,
    learning_rate=0.001,
    weight_decay=0.001,
    instance_weight_dropout=0.01,
    init_cuda=False,
    verbose=False,
)

In [17]:
# Fixed seed so that the random-forest baselines give the same result on every
# run. The network regressors below are not seeded here; whether they accept a
# seed is not visible from this notebook.
RF_SEED = 42

# (name, model) pairs that are trained and evaluated in the next cell.
# Each entry gets its own estimator instance so that no fitted state is shared.
method_list = [
    # Wrappers around a classical estimator with different pooling strategies
    ("MeanInstanceWrapper", InstanceWrapper(RandomForestRegressor(random_state=RF_SEED), pool="mean")),
    ("MaxInstanceWrapper", InstanceWrapper(RandomForestRegressor(random_state=RF_SEED), pool="max")),
    ("MeanBagWrapper", BagWrapper(RandomForestRegressor(random_state=RF_SEED), pool="mean")),
    ("MaxBagWrapper", BagWrapper(RandomForestRegressor(random_state=RF_SEED), pool="max")),
    ("MinBagWrapper", BagWrapper(RandomForestRegressor(random_state=RF_SEED), pool="min")),
    ("ExtremeBagWrapper", BagWrapper(RandomForestRegressor(random_state=RF_SEED), pool="extreme")),
    # Instance- and bag-level networks with fixed pooling
    ("MeanInstanceNetwork", InstanceNetworkRegressor(**network_hparams, pool="mean")),
    ("MaxInstanceNetwork", InstanceNetworkRegressor(**network_hparams, pool="max")),
    ("MeanBagNetwork", BagNetworkRegressor(**network_hparams, pool="mean")),
    ("MaxBagNetwork", BagNetworkRegressor(**network_hparams, pool="max")),
    # Attention- and pooling-based networks
    ("AttentionNetworkRegressor", AttentionNetworkRegressor(**network_hparams)),
    ("SelfAttentionNetworkRegressor", SelfAttentionNetworkRegressor(**network_hparams)),
    ("GatedAttentionNetworkRegressor", GatedAttentionNetworkRegressor(**network_hparams)),
    ("TemperatureAttentionNetworkRegressor", TemperatureAttentionNetworkRegressor(**network_hparams)),
    ("GaussianPoolingNetworkRegressor", GaussianPoolingNetworkRegressor(**network_hparams)),
    ("DynamicPoolingNetworkRegressor", DynamicPoolingNetworkRegressor(**network_hparams)),
]

In [18]:
# Fit every method on the scaled training bags and score it on the test bags.
# The scores are collected in a dict and the DataFrame is built once at the
# end, instead of enlarging an empty frame one cell at a time; this keeps the
# column dtype float and yields an "ACC" column even if method_list is empty.
# NOTE: the column keeps the name "ACC" because the ranking cell sorts by it,
# but the stored values are R^2 scores (r2_score), not accuracies.
r2_by_method = {}
for method_name, model in method_list:
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)
    r2_by_method[method_name] = r2_score(y_test, y_pred)

res_df = pd.DataFrame({"ACC": pd.Series(r2_by_method, dtype="float64")})

In [19]:
# Rank the methods from the highest to the lowest test-set score
# (the "ACC" column holds the r2_score values computed above).
res_df.sort_values("ACC", ascending=False)

Unnamed: 0,ACC
ExtremeBagWrapper,0.511966
MaxBagWrapper,0.488062
MeanBagWrapper,0.483324
DynamicPoolingNetworkRegressor,0.481889
MeanInstanceNetwork,0.471364
SelfAttentionNetworkRegressor,0.47006
AttentionNetworkRegressor,0.468548
MeanBagNetwork,0.467231
GaussianPoolingNetworkRegressor,0.453823
MeanInstanceWrapper,0.449591


## 5. Key Instance Detection

Some MIL algorithms can identify key instances (those that provide a get_instance_weights method). In this section, AttentionNetworkRegressor is used to estimate the conformer weights. Several different 3D descriptors are used to see how the weight distribution depends on the representation type.

**Conclusion:** With the currently available representations, the weight distribution is not conclusive (almost uniform).

In [20]:
from qsarmil.descriptor.rdkit import (RDKitGEOM, 
                                      RDKitAUTOCORR, 
                                      RDKitRDF, 
                                      RDKitMORSE, 
                                      RDKitWHIM, 
                                      RDKitGETAWAY)

from molfeat.calc import (Pharmacophore3D, 
                          USRDescriptors, 
                          ElectroShapeDescriptors)

from qsarmil.descriptor.wrapper import DescriptorWrapper

In [21]:
# 3D descriptor calculators to compare, keyed by the column name that is
# used for them in the weight tables below.
calculators = [
    ("RDKitGEOM", RDKitGEOM()),
    ("RDKitAUTOCORR", RDKitAUTOCORR()),
    ("RDKitRDF", RDKitRDF()),
    ("RDKitMORSE", RDKitMORSE()),
    ("RDKitWHIM", RDKitWHIM()),
    ("RDKitGETAWAY", RDKitGETAWAY()),
    ("MolFeatPmapper", Pharmacophore3D(factory='pmapper')),
    ("MolFeatUSRD", USRDescriptors()),
    ("MolFeatElectroShape", ElectroShapeDescriptors()),
]

# Every calculator is wrapped so that it can be applied to conformer ensembles.
desc_list = [(name, DescriptorWrapper(calculator)) for name, calculator in calculators]

In [22]:
# Keyword arguments for the attention network trained once per descriptor
# type; they are forwarded unchanged via **network_hparams.
network_hparams = {
    "hidden_layer_sizes": (256, 128, 64),
    "num_epoch": 300,
    "batch_size": 128,
    "learning_rate": 0.001,
    "weight_decay": 0.001,
    "instance_weight_dropout": 0.01,
    "init_cuda": False,
    "verbose": False,
}

In [23]:
# One weight table per test molecule: rows are conformers, and one column is
# added for every descriptor type processed in the loop below.
w_list = [pd.DataFrame() for _ in confs_test]

for desc_name, desc_calc in desc_list:

    # Compute the current descriptor for both splits.
    x_train, x_test = desc_calc.transform(confs_train), desc_calc.transform(confs_test)

    # Scale with a scaler fitted on the training descriptors only.
    scaler = BagMinMaxScaler()
    scaler.fit(x_train)
    x_train_scaled, x_test_scaled = scaler.transform(x_train), scaler.transform(x_test)

    # Train a fresh attention network for this descriptor type.
    model = AttentionNetworkRegressor(**network_hparams)
    model.fit(x_train_scaled, y_train)

    # Store the instance weights of every test molecule as a new column and
    # label the rows Conformer_1 ... Conformer_n.
    w_pred = model.get_instance_weights(x_test_scaled)
    for mol_weights, mol_table in zip(w_pred, w_list):
        conformer_labels = [f"Conformer_{pos}" for pos in range(1, len(mol_weights) + 1)]
        mol_table[desc_name] = mol_weights
        mol_table.index = conformer_labels

In [24]:
# Conformer weights of test molecule 0, rounded to two decimals for display.
w_list[0].round(decimals=2)

Unnamed: 0,RDKitGEOM,RDKitAUTOCORR,RDKitRDF,RDKitMORSE,RDKitWHIM,RDKitGETAWAY,MolFeatPmapper,MolFeatUSRD,MolFeatElectroShape
Conformer_1,0.1,0.1,0.11,0.08,0.03,0.1,0.1,0.13,0.09
Conformer_2,0.1,0.1,0.12,0.12,0.1,0.1,0.09,0.11,0.12
Conformer_3,0.1,0.1,0.1,0.1,0.12,0.1,0.1,0.08,0.09
Conformer_4,0.1,0.1,0.08,0.1,0.06,0.1,0.1,0.16,0.09
Conformer_5,0.1,0.1,0.06,0.1,0.14,0.1,0.1,0.07,0.11
Conformer_6,0.1,0.1,0.08,0.11,0.13,0.1,0.1,0.07,0.11
Conformer_7,0.1,0.1,0.08,0.11,0.13,0.1,0.1,0.07,0.11
Conformer_8,0.1,0.1,0.11,0.09,0.12,0.1,0.11,0.08,0.09
Conformer_9,0.1,0.1,0.15,0.11,0.13,0.1,0.1,0.09,0.1
Conformer_10,0.1,0.1,0.11,0.08,0.03,0.1,0.1,0.13,0.1


In [25]:
# Conformer weights of test molecule 1, rounded to two decimals for display.
w_list[1].round(decimals=2)

Unnamed: 0,RDKitGEOM,RDKitAUTOCORR,RDKitRDF,RDKitMORSE,RDKitWHIM,RDKitGETAWAY,MolFeatPmapper,MolFeatUSRD,MolFeatElectroShape
Conformer_1,0.1,0.1,0.11,0.1,0.09,0.1,0.11,0.08,0.11
Conformer_2,0.1,0.1,0.1,0.11,0.08,0.1,0.1,0.12,0.17
Conformer_3,0.1,0.1,0.05,0.09,0.08,0.1,0.11,0.13,0.19
Conformer_4,0.1,0.1,0.07,0.1,0.1,0.1,0.1,0.13,0.02
Conformer_5,0.1,0.1,0.1,0.1,0.13,0.1,0.1,0.08,0.07
Conformer_6,0.1,0.1,0.07,0.1,0.09,0.1,0.09,0.07,0.1
Conformer_7,0.1,0.1,0.13,0.09,0.12,0.1,0.1,0.09,0.08
Conformer_8,0.1,0.1,0.13,0.09,0.12,0.1,0.1,0.09,0.08
Conformer_9,0.1,0.1,0.15,0.12,0.1,0.1,0.09,0.09,0.07
Conformer_10,0.1,0.1,0.1,0.1,0.09,0.1,0.11,0.11,0.13
