### Introduction

Each bag contains a fixed number of MNIST digits (e.g., 5). The label is a sum of digits in the bag. The task is to predict the sum of digits and the weights of contribution of each digit. Perfectly, the bigger the digit is, the higher the predicted weight it should have.

**Instance:** One MNIST digit image.

**Bag**: A collection of digits (e.g., a list of 5 MNIST digits).

**Label:** A sum of digits in the bag.

**Key instance:** All digits.

In [1]:
import logging
import warnings
warnings.filterwarnings("ignore")
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)
logging.getLogger("lightning").setLevel(logging.ERROR)

import time
import torch
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# MNIST dataset creation
from milearn.data.mnist import load_mnist, create_bags_or, create_bags_and, create_bags_xor, create_bags_reg

# Preprocessing
from milearn.preprocessing import BagMinMaxScaler

# Network hparams
from milearn.network.module.hopt import DEFAULT_PARAM_GRID

# Wrappers
from milearn.wrapper import BagWrapper, InstanceWrapper
from sklearn.linear_model import (
    SGDRegressor,
    Ridge,
    Lasso,
    ElasticNet,
    LinearRegression
)
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    BaggingRegressor
)
from sklearn.svm import LinearSVR

# MIL network wrappers
from milearn.network.regressor import BagWrapperMLPNetworkRegressor, InstanceWrapperMLPNetworkRegressor
from milearn.network.classifier import BagWrapperMLPNetworkClassifier, InstanceWrapperMLPNetworkClassifier

# MIL networks
from milearn.network.regressor import (InstanceNetworkRegressor,
                                       BagNetworkRegressor,
                                       AdditiveAttentionNetworkRegressor,
                                       SelfAttentionNetworkRegressor,
                                       HopfieldAttentionNetworkRegressor,
                                       DynamicPoolingNetworkRegressor)

# Utils
from sklearn.metrics import r2_score, accuracy_score
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split

# Prediction visualisation
from milearn.data.mnist import visualize_bag_with_weights

# QSARcons
from qsarcons.consensus import RandomSearchRegressor, SystematicSearchRegressor, GeneticSearchRegressor

### Key Instance Detection Ranking Accuracy for Regression

This function evaluates how well a model's predicted attention weights rank the important instances in a bag, by computing the Spearman rank correlation between:

* The true importance ranking (represented here by the digit values)

* The predicted importance scores (attention weights)

### 1. Create MIL dataset

In [6]:
# bag_size = 5
# num_bags = 1000

# data, targets = load_mnist()
# bags, labels, key = create_bags_reg(data, targets, bag_size=bag_size, num_bags=num_bags, bag_agg="mean", random_state=42)

from huggingface_hub import hf_hub_download
from rdkit import Chem
from qsarmil.fragment.rdkit import FragmentGenerator
from rdkit.Chem import Descriptors, rdMolDescriptors
from rdkit.Chem import AllChem
PROPERTY_FUNCTIONS = {
    "LogP": Descriptors.MolLogP,
    "MolWt": Descriptors.MolWt,
    "TPSA": rdMolDescriptors.CalcTPSA,
    "NumHDonors": Descriptors.NumHDonors,
    "NumHAcceptors": Descriptors.NumHAcceptors,
    "MolMR": Descriptors.MolMR,
    "NumRotatableBonds": Descriptors.NumRotatableBonds,
    "RingCount": Descriptors.RingCount,
    "FractionCSP3": Descriptors.FractionCSP3,
}
REPO_ID = "KagakuData/notebooks"
csv_path = hf_hub_download(REPO_ID, filename="chembl/CHEMBL217.csv", repo_type="dataset")
data = pd.read_csv(csv_path, header=None)
smiles = list(data[0])
mols = [Chem.MolFromSmiles(s) for s in smiles]
# generate fragments
frag_gen = FragmentGenerator(num_cpu=40, verbose=True)
frags = frag_gen.run(mols)
# sample bags with multiple fragments
bag_size = 5
rng = np.random.RandomState(42)
frags = [mol for mol in frags if len(mol) > bag_size]
frags = [rng.choice(mol, size=bag_size, replace=False).tolist() for mol in frags]
property_name = "LogP"
get_property = PROPERTY_FUNCTIONS[property_name]
contribs = [[get_property(f) for f in mol] for mol in frags]
props = [sum(m) for m in contribs]
# Supported RDKit molecular property functions
def compute_fragment_descriptors(frags, n_bits=128, radius=2):
    bags_descriptors = []
    for frag in frags:
        descs = [np.array(AllChem.GetMorganFingerprintAsBitVect(f, radius, nBits=n_bits)) for f in frag]
        bags_descriptors.append(descs)
    return bags_descriptors
desc = compute_fragment_descriptors(frags)

Generating fragments: 100%|████████████████████████████████████████████████████████| 5012/5012 [00:36<00:00, 138.28it/s]


In [7]:
# train/test split
x_train, x_test, y_train, y_test, key_train, key_test, frg_train, frg_test = train_test_split(desc, props, contribs, frags, random_state=42)

x_train, x_val, y_train, y_val, key_train, key_val = train_test_split(x_train, y_train, key_train, random_state=42)

# features scaling
scaler = BagMinMaxScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

### 2. Build multiple models

In [8]:
wrapper_list = []
for Method in [SGDRegressor, Ridge, Lasso, ElasticNet, LinearRegression, 
               HistGradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, LinearSVR]:
    
    wrapper_list.append(
        (f"MeanBagWrapperRegressor[{Method.__name__}]", BagWrapper(Method(), pool="mean"))
    )

    wrapper_list.append(
        (f"MeanInstanceWrapperRegressor[{Method.__name__}]", InstanceWrapper(Method(), pool="mean"))
    )

In [9]:
regressor_list = [

        # wrapper mil networks
        ("MeanBagWrapperMLPNetworkRegressor", BagWrapperMLPNetworkRegressor(pool="mean")),
        ("MeanInstanceWrapperMLPNetworkRegressor", InstanceWrapperMLPNetworkRegressor(pool="mean")),
    
        # classic mil networks
        ("MeanBagNetworkRegressor", BagNetworkRegressor(pool="mean")),
        ("MeanInstanceNetworkRegressor", InstanceNetworkRegressor(pool="mean")),

        # attention mil networks
        ("AdditiveAttentionNetworkRegressor", AdditiveAttentionNetworkRegressor()),
        ("SelfAttentionNetworkRegressor", SelfAttentionNetworkRegressor()),
        ("HopfieldAttentionNetworkRegressor", HopfieldAttentionNetworkRegressor()),

        # other mil networks
        ("DynamicPoolingNetworkRegressor", DynamicPoolingNetworkRegressor()),
    ]

regressor_list.extend(wrapper_list)

In [10]:
# build models
df_val, df_test = pd.DataFrame(), pd.DataFrame()
for name, model in regressor_list:
    print(f"Training model: '{name}'")

    # train model
    # model.hopt(x_train_scaled, y_train, param_grid=DEFAULT_PARAM_GRID, verbose=False)
    
    model.fit(x_train_scaled, y_train)
    
    # predict
    df_val[name] = model.predict(x_val_scaled)
    df_test[name] = model.predict(x_test_scaled)

Training model: 'MeanBagWrapperMLPNetworkRegressor'
Training model: 'MeanInstanceWrapperMLPNetworkRegressor'
Training model: 'MeanBagNetworkRegressor'
Training model: 'MeanInstanceNetworkRegressor'
Training model: 'AdditiveAttentionNetworkRegressor'
Training model: 'SelfAttentionNetworkRegressor'
Training model: 'HopfieldAttentionNetworkRegressor'
Training model: 'DynamicPoolingNetworkRegressor'
Training model: 'MeanBagWrapperRegressor[SGDRegressor]'
Training model: 'MeanInstanceWrapperRegressor[SGDRegressor]'
Training model: 'MeanBagWrapperRegressor[Ridge]'
Training model: 'MeanInstanceWrapperRegressor[Ridge]'
Training model: 'MeanBagWrapperRegressor[Lasso]'
Training model: 'MeanInstanceWrapperRegressor[Lasso]'
Training model: 'MeanBagWrapperRegressor[ElasticNet]'
Training model: 'MeanInstanceWrapperRegressor[ElasticNet]'
Training model: 'MeanBagWrapperRegressor[LinearRegression]'
Training model: 'MeanInstanceWrapperRegressor[LinearRegression]'
Training model: 'MeanBagWrapperRegressor

### 3. Model consensus search

In [11]:
metric = "auto"
cons_size = "auto"
cons_size_candidates = [3, 5, 10]

In [12]:
cons_methods = [
    ("Best", SystematicSearchRegressor(cons_size=1, metric=metric, cons_size_candidates=cons_size_candidates)),
    
    ("All", SystematicSearchRegressor(cons_size=10000, metric=metric, cons_size_candidates=cons_size_candidates)), 
    
    ("Random", RandomSearchRegressor(cons_size=cons_size, n_iter=1000, metric=metric, cons_size_candidates=cons_size_candidates)), 
    
    ("Systematic", SystematicSearchRegressor(cons_size=cons_size, metric=metric, cons_size_candidates=cons_size_candidates)),
    
    ("Genetic", GeneticSearchRegressor(cons_size=cons_size, n_iter=50, pop_size=50, mut_prob=0.2, metric=metric, 
                                       cons_size_candidates=cons_size_candidates))
]

In [13]:
y_val, y_test = pd.Series(y_val), pd.Series(y_test)

for name, cons_searcher in cons_methods:
    
    # run search
    best_cons = cons_searcher.run(df_val, y_val)
    print(name, len(best_cons))
    
    # make val and test predictions
    pred_val = cons_searcher._consensus_predict(df_val[best_cons])
    pred_test = cons_searcher._consensus_predict(df_test[best_cons])
    
    # write prediction accuracy metric
    df_val[name] = pred_val
    df_test[name] = pred_test

Best 1
All 23
Random 3
Systematic 5
Genetic 5


In [14]:
res = pd.DataFrame()
for model in df_val.columns[2:]:
    res.loc[model, "R2"] = r2_score(y_val, df_val[model])
res.sort_values(by="R2", ascending=False).head(10).round(2)

Unnamed: 0,R2
Genetic,0.95
Systematic,0.95
Random,0.95
Best,0.94
AdditiveAttentionNetworkRegressor,0.94
SelfAttentionNetworkRegressor,0.93
MeanInstanceNetworkRegressor,0.92
MeanBagNetworkRegressor,0.92
MeanBagWrapperRegressor[HistGradientBoostingRegressor],0.84
MeanBagWrapperRegressor[LinearRegression],0.82


In [15]:
res = pd.DataFrame()
for model in df_test.columns[2:]:
    res.loc[model, "R2"] = r2_score(y_val, df_val[model])
res.sort_values(by="R2", ascending=False).head(10).round(2)

Unnamed: 0,R2
Genetic,0.95
Systematic,0.95
Random,0.95
Best,0.94
AdditiveAttentionNetworkRegressor,0.94
SelfAttentionNetworkRegressor,0.93
MeanInstanceNetworkRegressor,0.92
MeanBagNetworkRegressor,0.92
MeanBagWrapperRegressor[HistGradientBoostingRegressor],0.84
MeanBagWrapperRegressor[LinearRegression],0.82
