## 1. Load dataset

The example datasets contain molecular structures (SMILES) and measured bioactivities (pKi or pIC50) – the higher, the better. Each SMILES string is converted to an RDKit Mol object.

In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem

from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import train_test_split

# Data
from huggingface_hub import hf_hub_download

In [None]:
def reg_to_clf(y, threshold=6):
    """Binarize continuous activity values into 0/1 class labels.

    Parameters
    ----------
    y : array-like
        Continuous activity values.
    threshold : float, default 6
        Values strictly greater than ``threshold`` are labelled 1,
        everything else (including values equal to it) is labelled 0.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels with the same shape as ``y``.
    """
    return np.where(np.asarray(y) > threshold, 1, 0)

def accuracy_metric(y_true, y_pred, task=None):
    """Score predictions with the metric that matches the task.

    Parameters
    ----------
    y_true : array-like
        Reference values or labels.
    y_pred : array-like
        Predicted values or labels.
    task : {"classification", "regression"}
        Selects the metric: ``accuracy_score`` for classification,
        ``r2_score`` for regression.

    Returns
    -------
    float
        The value returned by the selected scikit-learn metric.

    Raises
    ------
    ValueError
        If ``task`` is not one of the supported names. Previously this case
        silently returned ``None``, which showed up as a missing score.
    """
    if task == "classification":
        return accuracy_score(y_true, y_pred)
    if task == "regression":
        return r2_score(y_true, y_pred)
    raise ValueError(
        f"Unknown task {task!r}; expected 'classification' or 'regression'"
    )

In [None]:
# Task switch read by the label preparation and model training cells below.
# Keep exactly one of the two assignments active.
TASK = "regression"
# TASK = "classification"

In [None]:
# Hugging Face Hub dataset repository that hosts the example CSV files.
REPO_ID = "KagakuData/notebooks"

# Fixed seed so that the train/test split (and therefore every score reported
# further down) is reproducible across "Restart Kernel -> Run All".
SPLIT_SEED = 42

csv_path = hf_hub_download(REPO_ID, filename="chembl/CHEMBL279.csv", repo_type="dataset")
# Read without a header row; columns are addressed by integer position downstream.
data = pd.read_csv(csv_path, header=None)

# Hold out 20% of the rows for testing.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# Column 0 is used as the SMILES string and column 2 as the activity value.
smi_train = data_train[0].to_list()
prop_train = data_train[2].to_list()
smi_test = data_test[0].to_list()
prop_test = data_test[2].to_list()

# For classification, replace the continuous activities with binary labels.
if TASK == "classification":
    prop_train = reg_to_clf(prop_train)
    prop_test = reg_to_clf(prop_test)

In [None]:
# Parse the training SMILES. Entries for which RDKit yields a falsy result
# (presumably None for unparsable SMILES) are dropped together with their
# label so that mols_train and y_train stay aligned.
mols_train = []
y_train = []
for smi, prop in zip(smi_train, prop_train):
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        continue
    mols_train.append(mol)
    y_train.append(prop)

In [None]:
# Parse the test SMILES. Entries for which RDKit yields a falsy result
# (presumably None for unparsable SMILES) are dropped together with their
# label so that mols_test and y_test stay aligned.
mols_test = []
y_test = []
for smi, prop in zip(smi_test, prop_test):
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        continue
    mols_test.append(mol)
    y_test.append(prop)

## 1.5 Reduce the dataset size for faster pipeline (for playing around)

In [None]:
# mols_train, y_train = mols_train[:80], y_train[:80]
# mols_test, y_test = mols_test[:20], y_test[:20]

## 2. Conformer generation

For each molecule, an ensemble of conformers is generated. Molecules for which conformer generation failed are then filtered out of both the training and test sets. The generated conformers can be accessed with `mol.GetConformers()` (or a single conformer by ID with `mol.GetConformer(0)`).

In [None]:
from qsarmil.conformer import RDKitConformerGenerator

from qsarmil.utils.logging import FailedConformer, FailedDescriptor

In [None]:
# Shared conformer generator used for both the training and the test molecules.
# NOTE(review): num_conf presumably sets the number of conformers per molecule
# and num_cpu the number of worker processes — confirm against qsarmil. The
# value 40 is hard-coded and may need lowering on smaller machines.
conf_gen = RDKitConformerGenerator(num_conf=10, num_cpu=40)

In [None]:
# Generate conformers for the training molecules. The filtering below relies
# on conf_gen.run returning one entry per input molecule, with FailedConformer
# placeholders marking failures (TODO confirm against qsarmil).
confs_train = list(conf_gen.run(mols_train))

# Guard against silent label misalignment, e.g. when this cell is re-run after
# y_train has already been filtered by a previous execution.
if len(confs_train) != len(y_train):
    raise ValueError(
        f"Got {len(confs_train)} conformer results for {len(y_train)} training labels; "
        "re-run the notebook from the dataset loading cells."
    )

# Drop failed molecules together with their labels. Building the two lists
# separately (instead of unpacking zip(*pairs)) also works when nothing survives.
kept_train = [(c, y) for c, y in zip(confs_train, y_train) if not isinstance(c, FailedConformer)]
confs_train = [c for c, _ in kept_train]
y_train = [y for _, y in kept_train]

In [None]:
# Generate conformers for the test molecules. The filtering below relies on
# conf_gen.run returning one entry per input molecule, with FailedConformer
# placeholders marking failures (TODO confirm against qsarmil).
confs_test = list(conf_gen.run(mols_test))

# Guard against silent label misalignment, e.g. when this cell is re-run after
# y_test has already been filtered by a previous execution.
if len(confs_test) != len(y_test):
    raise ValueError(
        f"Got {len(confs_test)} conformer results for {len(y_test)} test labels; "
        "re-run the notebook from the dataset loading cells."
    )

# Drop failed molecules together with their labels. Building the two lists
# separately (instead of unpacking zip(*pairs)) also works when nothing survives.
kept_test = [(c, y) for c, y in zip(confs_test, y_test) if not isinstance(c, FailedConformer)]
confs_test = [c for c, _ in kept_test]
y_test = [y for _, y in kept_test]

## 3. Descriptor calculation

Then, for each molecule with associated conformers, 3D descriptors are calculated. Here, a descriptor wrapper is used, which is designed to apply descriptor calculators from external packages. The resulting descriptors are a list of 2D arrays (bags). Finally, the descriptors are scaled.

In [None]:
from qsarmil.descriptor.rdkit import (RDKitGEOM, 
                                      RDKitAUTOCORR, 
                                      RDKitRDF, 
                                      RDKitMORSE, 
                                      RDKitWHIM, 
                                      RDKitGETAWAY)

from molfeat.calc import Pharmacophore3D, USRDescriptors, ElectroShapeDescriptors

from qsarmil.descriptor.wrapper import DescriptorWrapper

from milearn.preprocessing import BagMinMaxScaler

In [None]:
# Wrap the RDKitRDF descriptor calculator so it can be applied to the conformer sets.
# Any of the other calculators imported above can presumably be swapped in here — TODO confirm.
desc_calc = DescriptorWrapper(RDKitRDF())

In [None]:
# Compute descriptors for the filtered training and test conformer sets
# with the same calculator.
x_train = desc_calc.transform(confs_train)
x_test = desc_calc.transform(confs_test)

In [None]:
scaler = BagMinMaxScaler()

# Fit the scaler on the training descriptors only ...
scaler.fit(x_train)

# ... then apply that same fitted scaler to both the training and the test descriptors.
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

## 4. Model training

In [None]:
import logging
import warnings
# Suppress all Python warnings and limit the Lightning loggers to errors.
# NOTE(review): the blanket "ignore" filter also hides warnings from every
# other library used in this notebook.
warnings.filterwarnings("ignore")
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
logging.getLogger("lightning").setLevel(logging.ERROR)

import time
import torch
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# MNIST dataset creation
from milearn.data.mnist import load_mnist, create_bags_or, create_bags_and, create_bags_xor, create_bags_reg

# Preprocessing
from milearn.preprocessing import BagMinMaxScaler

# Network hparams
from milearn.network.module.hopt import DEFAULT_PARAM_GRID

# MIL wrappers
from milearn.network.regressor import BagWrapperMLPNetworkRegressor, InstanceWrapperMLPNetworkRegressor
from milearn.network.classifier import BagWrapperMLPNetworkClassifier, InstanceWrapperMLPNetworkClassifier

# MIL networks
from milearn.network.regressor import (InstanceNetworkRegressor,
                                       BagNetworkRegressor,
                                       AdditiveAttentionNetworkRegressor,
                                       SelfAttentionNetworkRegressor,
                                       HopfieldAttentionNetworkRegressor,
                                       DynamicPoolingNetworkRegressor)

from milearn.network.classifier import (InstanceNetworkClassifier,
                                        BagNetworkClassifier,
                                        AdditiveAttentionNetworkClassifier,
                                        SelfAttentionNetworkClassifier,
                                        HopfieldAttentionNetworkClassifier,
                                        DynamicPoolingNetworkClassifier)

# Utils
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# (display name, model instance) pairs evaluated when TASK == "regression".
regressor_list = [
    # --- wrapper MIL networks ---
    ("MeanBagWrapperMLPNetworkRegressor", BagWrapperMLPNetworkRegressor(pool="mean")),
    ("MeanInstanceWrapperMLPNetworkRegressor", InstanceWrapperMLPNetworkRegressor(pool="mean")),
    # --- classic MIL networks ---
    ("MeanBagNetworkRegressor", BagNetworkRegressor(pool="mean")),
    ("MeanInstanceNetworkRegressor", InstanceNetworkRegressor(pool="mean")),
    # --- attention MIL networks ---
    ("AdditiveAttentionNetworkRegressor", AdditiveAttentionNetworkRegressor()),
    ("SelfAttentionNetworkRegressor", SelfAttentionNetworkRegressor()),
    ("HopfieldAttentionNetworkRegressor", HopfieldAttentionNetworkRegressor()),
    # --- other MIL networks ---
    ("DynamicPoolingNetworkRegressor", DynamicPoolingNetworkRegressor()),
]

# (display name, model instance) pairs evaluated when TASK == "classification".
classifier_list = [
    # --- wrapper MIL networks ---
    ("MeanBagWrapperMLPNetworkClassifier", BagWrapperMLPNetworkClassifier(pool="mean")),
    ("MeanInstanceWrapperMLPNetworkClassifier", InstanceWrapperMLPNetworkClassifier(pool="mean")),
    # --- classic MIL networks ---
    ("MeanBagNetworkClassifier", BagNetworkClassifier(pool="mean")),
    ("MeanInstanceNetworkClassifier", InstanceNetworkClassifier(pool="mean")),
    # --- attention MIL networks ---
    ("AdditiveAttentionNetworkClassifier", AdditiveAttentionNetworkClassifier()),
    ("SelfAttentionNetworkClassifier", SelfAttentionNetworkClassifier()),
    ("HopfieldAttentionNetworkClassifier", HopfieldAttentionNetworkClassifier()),
    # --- other MIL networks ---
    ("DynamicPoolingNetworkClassifier", DynamicPoolingNetworkClassifier()),
]

In [None]:
# Pick the model family that matches the configured task. Fail loudly on an
# unrecognised TASK instead of hitting a NameError or, worse, silently reusing
# a stale `method_list` left in the kernel by an earlier run.
if TASK == "regression":
    method_list = regressor_list
elif TASK == "classification":
    method_list = classifier_list
else:
    raise ValueError(f"Unsupported TASK {TASK!r}; expected 'regression' or 'classification'")

# One row per method. The "ACC" column holds whatever accuracy_metric returns
# for the task: accuracy for classification, R^2 for regression.
res_df = pd.DataFrame()
for method_name, model in method_list:
    print(method_name)

    # model.hopt(x_train_scaled, y_train, param_grid=DEFAULT_PARAM_GRID, verbose=True)
    model.fit(x_train_scaled, y_train)

    if TASK == "regression":
        y_pred = model.predict(x_test_scaled)
    else:
        # TASK == "classification" (validated above): threshold the model
        # output at 0.5 to obtain hard 0/1 labels.
        y_prob = model.predict(x_test_scaled)
        y_pred = np.where(y_prob > 0.5, 1, 0)

    res_df.loc[method_name, "ACC"] = accuracy_metric(y_test, y_pred, task=TASK)

In [None]:
# Display the methods ranked from highest to lowest score.
res_df.sort_values(by="ACC", ascending=False)