In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

from qsarcons.consensus import RandomSearchRegressor, SystematicSearchRegressor, GeneticSearchRegressor
from qsarcons.stacking import StackingRegressor

### Load built QSAR models

The input data for consensus/stacking methods is a pandas DataFrame where each column is a model prediction. For benchmarking purposes, we need two tables:

- Validation set predictions – they are used to find an optimal consensus or stacking ensemble
- Test set predictions – they are needed to evaluate the performance of the consensus/stacking

### Input file format

The input CSV files (`train.csv` and `test.csv`) are expected to have the following format:
- **Column 0** – SMILES (or any other molecule ID)
- **Column 1** – Observed/Experimental property/activity values
- **Columns 2+** – Model predictions (from individual QSAR models)

In [2]:
# Validation-set predictions: used to search for the consensus / fit the stacking model
df_val = pd.read_csv("CHEMBL1785/train.csv")
# Test-set predictions: used to evaluate the selected consensus / stacking model
df_test = pd.read_csv("CHEMBL1785/test.csv")

In [3]:
# preview the first five rows of the test-set prediction table
df_test.head(n=5)

Unnamed: 0,SMILES,Y_TRUE,atompair-count|BayesianRidge,atompair-count|DecisionTreeRegressor,atompair-count|ElasticNet,atompair-count|HuberRegressor,atompair-count|KNeighborsRegressor,atompair-count|Lasso,atompair-count|MLPRegressor,atompair-count|PLSRegression,...,topological|ElasticNet,topological|HuberRegressor,topological|KNeighborsRegressor,topological|Lasso,topological|MLPRegressor,topological|PLSRegression,topological|RandomForestRegressor,topological|RidgeRegression,topological|SVR,topological|XGBRegressor
0,COCCOc1nc(NS(=O)(=O)NCc2ccccc2)c(Oc2ccccc2OC)c...,6.256,6.61002,6.3062,6.890746,7.309935,6.475773,6.829488,6.304784,6.444161,...,6.812998,5.728646,6.475603,6.46852,5.356423,6.243217,6.494189,6.314992,6.418615,6.628993
1,COc1cnc(O[C@H](C(=O)O)[C@@]2(c3ccccc3)NCC(=O)N...,5.034,5.543897,5.364375,5.313822,5.699822,5.631566,5.259852,4.50762,5.451443,...,5.720473,5.176263,5.306299,5.679028,5.425185,5.808187,5.386879,5.766536,5.120113,5.258121
2,CCOc1ccc2c(c1)c(=O)c(Cc1cccc(C(=O)O)c1)c(C(=O)...,5.506,6.069682,5.642833,6.2229,6.447204,5.95081,6.151965,6.365187,5.857042,...,5.792353,6.1203,5.804565,5.804876,6.102401,5.799607,5.865963,5.821107,6.008066,5.883258
3,CCOc1ccc2c(c1)c(-c1ccc(OC)cc1OC)c(C(=O)O)c(=O)...,6.268,6.046365,7.0014,6.30107,6.842113,6.001984,6.292202,6.563304,5.84838,...,5.795346,6.059051,5.908972,5.922968,6.206772,5.821332,5.954252,5.898207,5.992958,6.383691
4,Cc1ccc(S(=O)(=O)Nc2onc(C)c2Br)cc1,5.398,4.952641,5.333167,5.100456,5.492576,4.59869,5.098454,5.08333,4.882305,...,4.951271,5.060801,5.13164,4.970824,5.07225,4.818004,5.091551,4.989698,5.410053,5.154193


In [4]:
# Column 0 is the molecule ID (SMILES) and column 1 the observed value;
# all remaining columns are individual model predictions.
true_val, true_test = df_val.iloc[:, 1], df_test.iloc[:, 1]
x_val, x_test = df_val.iloc[:, 2:], df_test.iloc[:, 2:]

### Build Consensus / Stacking

For consensus methods, there are two main parameters that must be predefined:

**1. Metric**

This is the *prediction accuracy metric* to be optimized. The following regression metrics are currently supported:

- ``r2`` – Coefficient of determination
- ``rmse`` – Root mean squared error
- ``mae`` – Mean absolute error
- ``spearmanr`` – Spearman rank correlation coefficient

For benchmarking purposes, it is recommended to set the optimized metric to match the one used for estimating prediction accuracy on the test set.

**2. Consensus Size**

The *consensus size* determines how many models are included in the consensus. It can be defined in two ways:

- **Manual selection**: for example, ``cons_size=10``
- **Automatic selection**: e.g., ``cons_size="auto", cons_size_candidates=[3, 5, 7, 9, 12, 15]``, where the best consensus size is chosen based on metric values computed on the validation set.

In [5]:
# metric optimized by the consensus searchers below (passed to them as `metric=metric`)
metric = "r2"

In [6]:
# comparison table filled in below: rows are the "val"/"test" splits, one column per ensemble method
df_comp = pd.DataFrame()

### Random consensus

In a random search for the optimal consensus, a subset of models is chosen at random and its prediction accuracy is computed. This procedure is repeated ``n_iter`` times, and the subset (consensus) with the highest prediction accuracy on the validation set is chosen as the best consensus.

In [7]:
# Random search: sample model subsets n_iter times and keep the subset that
# scores best on the validation set.
# NOTE(review): no seed is set here, so the selected consensus may differ
# between runs — check whether the searcher accepts a seed.
cons_searcher = RandomSearchRegressor(
    cons_size="auto",
    n_iter=5000,
    metric=metric,
)
best_cons = cons_searcher.run(x_val, true_val)

# names of the models that make up the selected consensus
best_cons.to_list()

['atompair-count|XGBRegressor', 'pharm2D-gobbi|SVR', 'fcfp-count|XGBRegressor']

In [8]:
# consensus prediction = unweighted row-wise mean over the selected models
pred_val = x_val[best_cons].mean(axis="columns")
pred_test = x_test[best_cons].mean(axis="columns")

# record R2 for both splits (r2_score is hard-coded here, matching metric = "r2")
for split, y_true, y_pred in (
    ("val", true_val, pred_val),
    ("test", true_test, pred_test),
):
    df_comp.loc[split, "Random"] = r2_score(y_true, y_pred)

# show the comparison table so far
df_comp

Unnamed: 0,Random
val,0.810415
test,0.843243


### Systematic consensus

In a systematic search for optimal consensus, all models are sorted according to their prediction accuracy on the validation set, and then the first *N* models (*N*=``cons_size``) are chosen as the best consensus.

In [9]:
# Systematic search: rank the models by validation-set accuracy and take the
# top-N as the consensus (N is chosen automatically via cons_size="auto").
cons_searcher = SystematicSearchRegressor(
    cons_size="auto",
    metric=metric,
)
best_cons = cons_searcher.run(x_val, true_val)

# names of the models that make up the selected consensus
best_cons.to_list()

['atompair-count|XGBRegressor',
 'ecfp-count|XGBRegressor',
 'fcfp-count|XGBRegressor',
 'atompair-count|RandomForestRegressor',
 'topological|RandomForestRegressor',
 'ecfp|XGBRegressor',
 'fcfp|XGBRegressor',
 'ecfp-count|RandomForestRegressor',
 'topological|XGBRegressor',
 'rdkit|SVR',
 'atompair-count|BayesianRidge']

In [10]:
# consensus prediction = unweighted row-wise mean over the selected models
pred_val = x_val[best_cons].mean(axis="columns")
pred_test = x_test[best_cons].mean(axis="columns")

# record R2 for both splits (r2_score is hard-coded here, matching metric = "r2")
for split, y_true, y_pred in (
    ("val", true_val, pred_val),
    ("test", true_test, pred_test),
):
    df_comp.loc[split, "Systematic"] = r2_score(y_true, y_pred)

# show the comparison table so far
df_comp

Unnamed: 0,Random,Systematic
val,0.810415,0.807979
test,0.843243,0.845686


### Genetic consensus

In a genetic search for the optimal consensus, the best consensus of models is found using a genetic algorithm. In this case, the objective function is the prediction accuracy of a consensus (an individual in the population).

In [11]:
# Genetic search: a genetic algorithm looks for the consensus with the best
# validation-set accuracy.
# NOTE(review): no seed is set here, so the selected consensus may differ
# between runs — check whether the searcher accepts a seed.
cons_searcher = GeneticSearchRegressor(
    cons_size="auto",
    n_iter=100,
    pop_size=50,
    mut_prob=0.1,
    metric=metric,
)
best_cons = cons_searcher.run(x_val, true_val)

# names of the models that make up the selected consensus
best_cons.to_list()

['atompair-count|XGBRegressor',
 'atompair-count|Lasso',
 'rdkit|MLPRegressor',
 'scaffoldkeys|KNeighborsRegressor',
 'topological|XGBRegressor',
 'pharm2D-gobbi|Lasso',
 'ecfp-count|XGBRegressor']

In [12]:
# consensus prediction = unweighted row-wise mean over the selected models
pred_val = x_val[best_cons].mean(axis="columns")
pred_test = x_test[best_cons].mean(axis="columns")

# record R2 for both splits (r2_score is hard-coded here, matching metric = "r2")
for split, y_true, y_pred in (
    ("val", true_val, pred_val),
    ("test", true_test, pred_test),
):
    df_comp.loc[split, "Genetic"] = r2_score(y_true, y_pred)

# show the comparison table so far
df_comp

Unnamed: 0,Random,Systematic,Genetic
val,0.810415,0.807979,0.819317
test,0.843243,0.845686,0.849826


### Stacking ensemble

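In stacking ensembling, a meta-model is trained on the validation set predictions and then used to predict the property on the test set. Because the meta-model is fitted on the validation set itself, its validation score is optimistic; the test score is the one to compare against the consensus methods.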

In [13]:
# Fit a random-forest meta-model on the validation-set predictions.
# random_state is fixed so the stacking scores are reproducible across
# "Restart & Run All"; an unseeded forest gives different results on each run.
stack_searcher = StackingRegressor(method=RandomForestRegressor(random_state=42))
# trailing ";" suppresses the object repr, which contains a memory address that changes every run
stack_searcher.run(x_val, true_val);

<qsarcons.stacking.StackingRegressor at 0x7f4435bfa660>

In [14]:
# predictions of the fitted stacking model on both splits
# NOTE: the meta-model was fitted on (x_val, true_val), so the "val" score below is
# measured on its own training data and is optimistic; compare methods on "test".
pred_val = stack_searcher.predict(x_val)
pred_test = stack_searcher.predict(x_test)

# record R2 for both splits (r2_score is hard-coded here, matching metric = "r2")
for split, y_true, y_pred in (
    ("val", true_val, pred_val),
    ("test", true_test, pred_test),
):
    df_comp.loc[split, "Stacking"] = r2_score(y_true, y_pred)

# show the final comparison table, rounded to two decimals
df_comp.round(2)

Unnamed: 0,Random,Systematic,Genetic,Stacking
val,0.81,0.81,0.82,0.98
test,0.84,0.85,0.85,0.85
