In [1]:
from pathlib import Path

import pandas as pd
import polaris
from polaris.hub.client import PolarisHubClient
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from qsarcons.consensus import RandomSearchRegressor, SystematicSearchRegressor, GeneticSearchRegressor
from qsarcons.lazy import LazyML

  from .autonotebook import tqdm as notebook_tqdm


### 1. Load data

In [3]:
# Fetch the benchmark definition from the Polaris Hub.
benchmark = polaris.load_benchmark("polaris/adme-fang-solu-1")

# Convert the benchmark's train and test subsets into pandas DataFrames.
data_train, data_test = (
    subset.as_dataframe() for subset in benchmark.get_train_test_split()
)

# SMILES strings and target values of the full training set, taken before
# the validation hold-out below.
# NOTE(review): neither list is referenced by the later cells visible here.
smi_train = data_train["smiles"].to_list()
prop_train = data_train["LOG_SOLUBILITY"].to_list()

# Hold out 20% of the training rows for validation; the seed is fixed so the
# split is reproducible. `data_train` is rebound to the remaining 80%.
data_train, data_val = train_test_split(data_train, random_state=42, test_size=0.2)

### 2. Build multiple 2D models

In [4]:
# Add a placeholder target column of zeros to the test frame.
# NOTE(review): presumably LazyML expects a property column in every frame it
# is given; the column is called "LogS" here while the training target is
# "LOG_SOLUBILITY" -- confirm LazyML does not rely on the column name.
data_test["LogS"] = 0

# Building all models is slow (about 20 minutes in the recorded run), so only
# do it when the prediction tables read by the consensus step are missing.
# This keeps "Restart Kernel -> Run All" working on a fresh checkout without
# retraining on every run. Delete the folder to force a rebuild.
# NOTE(review): assumes LazyML writes val.csv and test.csv into
# `output_folder`, as the later read_csv calls imply -- confirm.
output_folder = "logs_bench"
cached_predictions = [Path(output_folder) / "val.csv", Path(output_folder) / "test.csv"]

if not all(path.exists() for path in cached_predictions):
    lazy_ml = LazyML(task="regression", output_folder=output_folder, verbose=True)
    lazy_ml.run(data_train, data_val, data_test)

Total models to build: 152


100%|█████████████████████████████████████████████████████████████████████████████████| 152/152 [20:45<00:00,  8.19s/it]

All models completed.





<qsarcons.lazy.LazyML at 0x7f423d838610>

### 3. Build model consensus

In [5]:
# Settings shared by the consensus searchers. The string "auto" is passed to
# qsarcons unchanged; its exact meaning is defined by the library (TODO confirm).
metric = cons_size = "auto"

In [7]:
# Consensus search strategies to compare, as (label, searcher) pairs in the
# order they will be run. "Best" is a systematic search restricted to a
# consensus of size 1.
cons_methods = list(
    {
        "Best": SystematicSearchRegressor(cons_size=1, metric=metric),
        "Random": RandomSearchRegressor(cons_size=cons_size, n_iter=1000, metric=metric),
        "Systematic": SystematicSearchRegressor(cons_size=cons_size, metric=metric),
        "Genetic": GeneticSearchRegressor(
            cons_size=cons_size,
            n_iter=50,
            pop_size=50,
            mut_prob=0.2,
            metric=metric,
        ),
    }.items()
)

In [10]:
# Read the per-model prediction tables for the validation and test sets.
df_val, df_test = map(pd.read_csv, ("logs_bench/val.csv", "logs_bench/test.csv"))

# Column 0 is the SMILES string and column 1 the true property value;
# every column from index 2 onwards holds one model's predictions.
true_val = df_val.iloc[:, 1]
x_val = df_val.iloc[:, 2:]
x_test = df_test.iloc[:, 2:]

In [11]:
for label, searcher in cons_methods:

    # Run the search on the validation predictions; the result is used below
    # to select the model columns that form the consensus.
    selected = searcher.run(x_val, true_val)

    # Store the consensus prediction of the selected models as a new column,
    # named after the strategy, in both the validation and the test table.
    # NOTE(review): `_consensus_predict` is a private method of the searcher.
    for table, predictions in ((df_val, x_val), (df_test, x_test)):
        table[label] = searcher._consensus_predict(predictions[selected])

### 4. Summarize results

In [15]:
# Validation R2 for every prediction column (individual models plus the
# consensus columns appended by the search loop), indexed by column name.
# NOTE: the consensus columns were selected on this same validation data, so
# their R2 here is optimistic; the benchmark test score is the fair comparison.
# Built in one step from a dict rather than grown cell by cell with .loc, so
# the "R2" column exists (and can be sorted on) even when there are no models.
res = pd.Series(
    {model: r2_score(df_val["Y_TRUE"], df_val[model]) for model in df_val.columns[2:]},
    name="R2",
    dtype=float,
).to_frame()

In [19]:
# Rank all models and consensus strategies, best validation R2 first.
res.sort_values("R2", ascending=False)

Unnamed: 0,R2
Genetic,0.470755
Systematic,0.447144
Random,0.437362
BasicML|desc2D|MLPRegressor,0.392734
BasicML|desc2D|SVR,0.391462
...,...
BasicML|desc2D|DecisionTreeRegressor,-0.584921
BasicML|fcfp-count|DecisionTreeRegressor,-0.608773
BasicML|rdkit|RidgeRegression,-0.622639
BasicML|ecfp-count|DecisionTreeRegressor,-0.700100


In [32]:
# Score the test-set predictions of the "Genetic" consensus (the top entry in
# the recorded validation ranking) with the benchmark's own evaluation.
y_pred = df_test["Genetic"].tolist()
results = benchmark.evaluate(y_pred)

# Display the returned results object.
results

test_set,target_label,scores
test,LOG_SOLUBILITY,explained_var0.35711736389720994pearsonr0.6004002542933552mean_squared_error0.3491398448007377r20.3560427891695943mean_absolute_error0.4238736241606715spearmanr0.48347265962427716
explained_var,0.35711736389720994,
pearsonr,0.6004002542933552,
mean_squared_error,0.3491398448007377,
r2,0.3560427891695943,
mean_absolute_error,0.4238736241606715,
spearmanr,0.48347265962427716,
benchmark_artifact_id,polaris/adme-fang-solu-1,
benchmark_name,,
benchmark_owner,,

test_set,target_label,scores
test,LOG_SOLUBILITY,explained_var0.35711736389720994pearsonr0.6004002542933552mean_squared_error0.3491398448007377r20.3560427891695943mean_absolute_error0.4238736241606715spearmanr0.48347265962427716
explained_var,0.35711736389720994,
pearsonr,0.6004002542933552,
mean_squared_error,0.3491398448007377,
r2,0.3560427891695943,
mean_absolute_error,0.4238736241606715,
spearmanr,0.48347265962427716,

0,1
explained_var,0.3571173638972099
pearsonr,0.6004002542933552
mean_squared_error,0.3491398448007377
r2,0.3560427891695943
mean_absolute_error,0.4238736241606715
spearmanr,0.4834726596242771


In [33]:
# Log in to the Polaris Hub so the results can be uploaded.
# SECURITY: login() prompts for an authorization token and the typed value is
# echoed into this cell's output. Clear the output before saving or sharing
# the notebook, and revoke any token that has already been committed.
with PolarisHubClient() as client:
    client.login()

Please enter the authorization token:  [REDACTED]


In [34]:
# Name the result set, then publish it to the Polaris Hub under the given
# owner account. The name is assigned before the upload call.
results.name = "QSARcons"
results.upload_to_hub(owner="dzankov")