# Regression models
First we import everything we need and define the helper functions. Then we load the data. Finally, we tune the hyperparameters of each model with Optuna and report its metrics.

In [34]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,root_mean_squared_error
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

# Module-level fingerprint generators, created once and reused by mol_feat
# for every molecule.
MFPGEN = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=1024)  # Morgan, radius 2, 1024 bits
AFPGEN = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=2048)  # atom pair, 2048 bits
RFPGEN = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=2048)  # RDKit fingerprint, 2048 bits
TTPGEN = rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=2048)  # topological torsion, 2048 bits

def calculate_descriptors(mol: Chem.Mol, missingVal: float | None = 0.0) -> dict:
    """Calculate the full list of descriptors for a molecule.
    adapted from
    https://github.com/jonswain/tabpfn-tdc/blob/main/submission.py#L12
    """
    
    res = []
    for nm, fn in Descriptors._descList:
        try:
            if nm!="Ipc": # this one creates crazy values so we exclude it
                val = fn(mol)
        except:
            val = missingVal
        res.append(val)
    return res

def mol_feat(mol,morgan=True,tt=True,ap=True,descs=True,rdfp=True):
    """
    Build one flat feature vector for a molecule.

    Each flag switches one feature family on or off. The enabled families
    are concatenated in this fixed order:
    - morgan: Morgan fingerprint (MFPGEN)
    - tt:     topological torsion fingerprint (TTPGEN)
    - ap:     atom pair fingerprint (AFPGEN)
    - descs:  RDKit descriptors (calculate_descriptors)
    - rdfp:   RDKit fingerprint (RFPGEN)

    Raises AssertionError if mol is None.
    Returns the np.hstack of all enabled feature blocks.
    """
    assert mol is not None, "Invalid molecule."

    # (enabled, producer) pairs; the lambdas keep the work lazy so that a
    # disabled family is never computed.
    producers = (
        (morgan, lambda: MFPGEN.GetFingerprintAsNumPy(mol)),
        (tt, lambda: TTPGEN.GetFingerprintAsNumPy(mol)),
        (ap, lambda: AFPGEN.GetFingerprintAsNumPy(mol)),
        (descs, lambda: calculate_descriptors(mol)),
        (rdfp, lambda: RFPGEN.GetFingerprintAsNumPy(mol)),
    )
    blocks = [produce() for enabled, produce in producers if enabled]

    # Concatenate all blocks into a single vector
    return np.hstack(blocks)


def build_regression_model(X,y,mode="RandomForest",model_options=None):
    """Create a regressor of the requested kind and fit it on ``X``, ``y``.

    Args:
        X: Training features, passed straight to ``model.fit``.
        y: Training targets, passed straight to ``model.fit``.
        mode: One of "RandomForest", "XGBoost", "Ridge" or "SVM".
        model_options: Keyword arguments for the estimator constructor.
            When omitted, the tree models get
            ``{"n_estimators":10,"max_depth":3,"random_state":123}`` and
            Ridge/SVM get no extra arguments (those keys are not valid for
            them).

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``mode`` is not supported. Previously this case only
            printed a message and then failed with UnboundLocalError at
            ``model.fit``.
    """
    if model_options is None:
        # Fresh dict per call instead of a shared mutable default argument.
        if mode in ("RandomForest", "XGBoost"):
            model_options = {"n_estimators":10,"max_depth":3,"random_state":123}
        else:
            model_options = {}

    if mode == "RandomForest":
        model = RandomForestRegressor(**model_options)
    elif mode == "XGBoost":
        model = XGBRegressor(**model_options)
    elif mode == "Ridge":
        model = Ridge(**model_options)
    elif mode == "SVM":
        model = SVR(**model_options)
    else:
        raise ValueError(f"mode not supported: {mode!r}")
    model.fit(X,y)
    return model

def data_splitter(X,y,mode="Random",test_ratio=0.2,seed=123):
    """Split features and targets into train and test parts.

    Args:
        X: Features.
        y: Targets.
        mode: Splitting strategy; only "Random" is implemented.
        test_ratio: Passed to ``train_test_split`` as ``test_size``.
        seed: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``mode`` is not supported. Previously this case only
            printed a message and then failed with UnboundLocalError at the
            ``return``.
    """
    if mode != "Random":
        raise ValueError(f"mode not supported: {mode!r}")
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=test_ratio,random_state=seed)
    return X_train,X_test,y_train,y_test

# Benzene parsed from SMILES.
# NOTE(review): presumably a leftover smoke test; `m` is not read by any cell visible here.
m = Chem.MolFromSmiles("c1ccccc1")

# functions for optimizing models

In [17]:

import optuna
import sklearn
from sklearn.metrics import make_scorer


def xgb_objective(trial):
    """Optuna objective: mean 5-fold CV R2 of an XGBoost regressor.

    Reads the module-level ``X_train`` and ``y_train``, so the feature set
    being tuned is whatever was assigned to them most recently.

    Args:
        trial: Optuna trial used to sample ``xgb_max_depth``,
            ``xgb_n_estimators``, ``xgb_eta`` and ``xgb_gamma``.

    Returns:
        Mean of the five per-fold R2 scores.
    """
    xgb_max_depth = trial.suggest_int("xgb_max_depth", 2, 32, log=True)
    xgb_n_estimators = trial.suggest_int("xgb_n_estimators", 10, 1000, log=True)
    # NOTE(review): the range starts at 0, so very small learning rates can be
    # sampled together with few trees; consider a positive lower bound.
    xgb_eta = trial.suggest_float("xgb_eta", 0, 1)
    xgb_gamma = trial.suggest_float("xgb_gamma", 0, 1)

    # Hand cross_val_score an unfitted estimator: it fits a clone per fold, so
    # the former extra fit on the whole training set was wasted work per trial.
    model = XGBRegressor(n_estimators=xgb_n_estimators, max_depth=xgb_max_depth,
                         random_state=102, eta=xgb_eta, gamma=xgb_gamma)

    score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
    r2score_avg = score.mean()
    return r2score_avg

def rf_objective(trial):
    """Optuna objective: mean 5-fold CV R2 of a random forest regressor.

    Reads the module-level ``X_train`` and ``y_train``, so the feature set
    being tuned is whatever was assigned to them most recently.

    Args:
        trial: Optuna trial used to sample ``rf_max_depth`` and
            ``rf_n_estimators``.

    Returns:
        Mean of the five per-fold R2 scores.
    """
    rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 10, 1000, log=True)

    # Hand cross_val_score an unfitted estimator: it fits a clone per fold, so
    # the former extra fit on the whole training set was wasted work per trial.
    model = RandomForestRegressor(n_estimators=rf_n_estimators, max_depth=rf_max_depth,
                                  random_state=102)

    score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
    r2score_avg = score.mean()
    return r2score_avg


def svm_objective(trial):
    """Optuna objective: mean 5-fold CV R2 of an RBF-kernel SVR.

    Reads the module-level ``X_train`` and ``y_train``, so the feature set
    being tuned is whatever was assigned to them most recently.

    Args:
        trial: Optuna trial used to sample ``svm_gamma`` and ``svm_C``.

    Returns:
        Mean of the five per-fold R2 scores.
    """
    svm_gamma = trial.suggest_float("svm_gamma", 0.001, 1000, log=True)
    svm_C = trial.suggest_int("svm_C", 1, 1000, log=True)

    # Hand cross_val_score an unfitted estimator: it fits a clone per fold, so
    # the former extra fit on the whole training set was wasted work per trial.
    model = SVR(kernel="rbf", gamma=svm_gamma, C=svm_C)  # SVR is given no random_state

    score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
    r2score_avg = score.mean()
    return r2score_avg

# Data loading

In [6]:
# Load the ChEMBL export and keep only rows that have an exact ("=")
# measurement, a pChEMBL value and a SMILES string that RDKit can parse.
df_chembl = pd.read_csv("data/PDL1-CHEMBL.csv",sep=";")
mols = []
y = []
# Walk the three columns in lockstep instead of looking each cell up with an
# enumerate counter, which only worked because the index is the default 0..n-1.
for smiles, relation, pchembl in zip(
    df_chembl["Smiles"],
    df_chembl["Standard Relation"],
    df_chembl["pChEMBL Value"],
):
    if relation != "'='":
        continue
    # Skip missing activities (replaces the `float(x) == x` NaN trick).
    # Assumes the column was parsed as numeric — TODO confirm.
    if pd.isna(pchembl):
        continue
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # SMILES could not be parsed
        continue
    y.append(pchembl)
    mols.append(mol)

# SVM

## First, for ecfp

In [9]:
# Feature set: Morgan fingerprint only; random split with seed=123.
X = [mol_feat(m,descs=False,morgan=True,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [19]:
# Maximise the mean CV R2 returned by svm_objective over 200 trials (n_jobs=20).
# NOTE(review): no sampler seed is set, so the best trial may differ between runs.
study = optuna.create_study(direction="maximize")
study.optimize(svm_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

[I 2025-05-22 15:51:13,994] A new study created in memory with name: no-name-0ea2752a-2645-4b79-b856-d01aee515ac2
[I 2025-05-22 15:51:18,596] Trial 6 finished with value: 0.4969789641559272 and parameters: {'svm_gamma': 0.0753237834965103, 'svm_C': 1}. Best is trial 6 with value: 0.4969789641559272.
[I 2025-05-22 15:51:18,726] Trial 7 finished with value: 0.11628674664052793 and parameters: {'svm_gamma': 43.857580467610255, 'svm_C': 21}. Best is trial 6 with value: 0.4969789641559272.
[I 2025-05-22 15:51:19,728] Trial 4 finished with value: 0.1180530345790494 and parameters: {'svm_gamma': 0.9266290709737017, 'svm_C': 14}. Best is trial 6 with value: 0.4969789641559272.
[I 2025-05-22 15:51:19,769] Trial 12 finished with value: 0.11627640240011941 and parameters: {'svm_gamma': 3.162149550530004, 'svm_C': 900}. Best is trial 6 with value: 0.4969789641559272.
[I 2025-05-22 15:51:19,815] Trial 2 finished with value: 0.11628506911056517 and parameters: {'svm_gamma': 7.012455096578942, 'svm_C

FrozenTrial(number=35, state=TrialState.COMPLETE, values=[0.6477673229600946], datetime_start=datetime.datetime(2025, 5, 22, 15, 51, 24, 973925), datetime_complete=datetime.datetime(2025, 5, 22, 15, 51, 33, 125295), params={'svm_gamma': 0.014251599739363175, 'svm_C': 6}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'svm_gamma': FloatDistribution(high=1000.0, log=True, low=0.001, step=None), 'svm_C': IntDistribution(high=1000, log=True, low=1, step=1)}, trial_id=35, value=None)


In [43]:
# Refit on the training split with the best hyper-parameters from `study`.
params = study.best_trial.params
model = build_regression_model(X_train, y_train, mode="SVM",
                               model_options={"kernel":"rbf", "gamma":params["svm_gamma"],"C":params["svm_C"]}) # SVM doesn't have a random state

print("SVM ECFP metrics:")
# 5-fold cross-validation on the training split, scored with R2.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
print("CV r2",round(score.mean(),3))

# Same cross-validation, scored with RMSE.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(root_mean_squared_error))
print("CV,rmse",round(score.mean(),3))

# Metrics on the test split, using the model fitted above.
y_pred = model.predict(X_test)
print("test r2  ",round(r2_score(y_test,y_pred),3))
print("test RMSE",round(root_mean_squared_error(y_test,y_pred),3))

SVM ECFP metrics:
CV r2 0.648
CV,rmse 0.654
test r2   0.697
test RMSE 0.597


## same thing for RDKitFP

In [44]:
# Feature set: RDKit fingerprint only; random split with seed=123.
X = [mol_feat(m,descs=False,morgan=False,rdfp=True,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [45]:
# Maximise the mean CV R2 returned by svm_objective over 200 trials (n_jobs=20).
# NOTE(review): no sampler seed is set, so the best trial may differ between runs.
study = optuna.create_study(direction="maximize")
study.optimize(svm_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

[I 2025-05-22 16:11:24,606] A new study created in memory with name: no-name-2f706876-626f-4395-86a7-0dc859ffc5a8
[I 2025-05-22 16:11:34,597] Trial 12 finished with value: 0.6247774592384122 and parameters: {'svm_gamma': 0.001660765284060667, 'svm_C': 2}. Best is trial 12 with value: 0.6247774592384122.
[I 2025-05-22 16:11:34,943] Trial 1 finished with value: 0.661792977201517 and parameters: {'svm_gamma': 0.0011128876816272914, 'svm_C': 526}. Best is trial 1 with value: 0.661792977201517.
[I 2025-05-22 16:11:35,551] Trial 11 finished with value: 0.12256058489954832 and parameters: {'svm_gamma': 0.5917193649943088, 'svm_C': 399}. Best is trial 1 with value: 0.661792977201517.
[I 2025-05-22 16:11:35,707] Trial 6 finished with value: 0.668188468966919 and parameters: {'svm_gamma': 0.00284672031916224, 'svm_C': 332}. Best is trial 6 with value: 0.668188468966919.
[I 2025-05-22 16:11:40,064] Trial 3 finished with value: 0.12086620386915052 and parameters: {'svm_gamma': 299.63505047040775, 

FrozenTrial(number=150, state=TrialState.COMPLETE, values=[0.6772288677584273], datetime_start=datetime.datetime(2025, 5, 22, 16, 14, 30, 290719), datetime_complete=datetime.datetime(2025, 5, 22, 16, 14, 56, 439704), params={'svm_gamma': 0.0019065561129150673, 'svm_C': 13}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'svm_gamma': FloatDistribution(high=1000.0, log=True, low=0.001, step=None), 'svm_C': IntDistribution(high=1000, log=True, low=1, step=1)}, trial_id=150, value=None)


In [46]:
# Refit on the training split with the best hyper-parameters from `study`.
params = study.best_trial.params
model = build_regression_model(X_train, y_train, mode="SVM",
                               model_options={"kernel":"rbf", "gamma":params["svm_gamma"],"C":params["svm_C"]}) # SVM doesn't have a random state

print("SVM RDK metrics:")
# 5-fold cross-validation on the training split, scored with R2.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
print("CV r2",round(score.mean(),3))

# Same cross-validation, scored with RMSE.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(root_mean_squared_error))
print("CV,rmse",round(score.mean(),3))

# Metrics on the test split, using the model fitted above.
y_pred = model.predict(X_test)
print("test r2  ",round(r2_score(y_test,y_pred),3))
print("test RMSE",round(root_mean_squared_error(y_test,y_pred),3))

SVM RDK metrics:
CV r2 0.677
CV,rmse 0.625
test r2   0.716
test RMSE 0.578


## Finally, for physchem descriptors

In [49]:
# Feature set: RDKit descriptors only; random split with seed=123.
X = [mol_feat(m,descs=True,morgan=False,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [50]:
# Maximise the mean CV R2 returned by svm_objective over 200 trials (n_jobs=20).
# NOTE(review): no sampler seed is set, so the best trial may differ between runs.
study = optuna.create_study(direction="maximize")
study.optimize(svm_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

[I 2025-05-22 16:17:34,229] A new study created in memory with name: no-name-df5b21f2-b512-4592-b81b-2c27775776d2
[I 2025-05-22 16:17:34,618] Trial 1 finished with value: 0.08790187148837354 and parameters: {'svm_gamma': 0.5990013482363401, 'svm_C': 1}. Best is trial 1 with value: 0.08790187148837354.
[I 2025-05-22 16:17:34,681] Trial 0 finished with value: 0.10697759536940281 and parameters: {'svm_gamma': 1.0200992397433146, 'svm_C': 166}. Best is trial 0 with value: 0.10697759536940281.
[I 2025-05-22 16:17:34,758] Trial 2 finished with value: 0.0886510128464463 and parameters: {'svm_gamma': 580.9923161925122, 'svm_C': 1}. Best is trial 0 with value: 0.10697759536940281.
[I 2025-05-22 16:17:34,875] Trial 10 finished with value: 0.1084884426536304 and parameters: {'svm_gamma': 0.013328971713295907, 'svm_C': 480}. Best is trial 10 with value: 0.1084884426536304.
[I 2025-05-22 16:17:35,056] Trial 11 finished with value: 0.10689552430367255 and parameters: {'svm_gamma': 0.0264258741734652

FrozenTrial(number=124, state=TrialState.COMPLETE, values=[0.23527995812576935], datetime_start=datetime.datetime(2025, 5, 22, 16, 17, 40, 382748), datetime_complete=datetime.datetime(2025, 5, 22, 16, 17, 41, 240563), params={'svm_gamma': 0.001001691229647634, 'svm_C': 3}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'svm_gamma': FloatDistribution(high=1000.0, log=True, low=0.001, step=None), 'svm_C': IntDistribution(high=1000, log=True, low=1, step=1)}, trial_id=124, value=None)


In [51]:
# Refit on the training split with the best hyper-parameters from `study`.
params = study.best_trial.params
model = build_regression_model(X_train, y_train, mode="SVM",
                               model_options={"kernel":"rbf", "gamma":params["svm_gamma"],"C":params["svm_C"]}) # SVM doesn't have a random state

# Label fixed: this cell evaluates the physchem descriptor features, but the
# header was copied from the RDKit fingerprint cell ("SVM RDK metrics:").
print("SVM Physchem metrics:")
# 5-fold cross-validation on the training split, scored with R2.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
print("CV r2",round(score.mean(),3))

# Same cross-validation, scored with RMSE.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(root_mean_squared_error))
print("CV,rmse",round(score.mean(),3))

# Metrics on the test split, using the model fitted above.
y_pred = model.predict(X_test)
print("test r2  ",round(r2_score(y_test,y_pred),3))
print("test RMSE",round(root_mean_squared_error(y_test,y_pred),3))

SVM RDK metrics:
CV r2 0.235
CV,rmse 0.963
test r2   0.314
test RMSE 0.898


# RF
## first ECFP

In [53]:
# Feature set: Morgan fingerprint only; random split with seed=123.
X = [mol_feat(m,descs=False,morgan=True,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [55]:
# Maximise the mean CV R2 returned by rf_objective over 200 trials (n_jobs=20).
# NOTE(review): no sampler seed is set, so the best trial may differ between runs.
study = optuna.create_study(direction="maximize")
study.optimize(rf_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

[I 2025-05-22 16:20:25,203] A new study created in memory with name: no-name-35b350e5-40ca-485b-b1fc-9159f1df3b71
[I 2025-05-22 16:20:26,296] Trial 7 finished with value: 0.37205686920957765 and parameters: {'rf_max_depth': 4, 'rf_n_estimators': 13}. Best is trial 7 with value: 0.37205686920957765.
[I 2025-05-22 16:20:26,656] Trial 3 finished with value: 0.6161640315146796 and parameters: {'rf_max_depth': 19, 'rf_n_estimators': 13}. Best is trial 3 with value: 0.6161640315146796.
[I 2025-05-22 16:20:26,675] Trial 14 finished with value: 0.5679731504204211 and parameters: {'rf_max_depth': 10, 'rf_n_estimators': 14}. Best is trial 3 with value: 0.6161640315146796.
[I 2025-05-22 16:20:26,799] Trial 19 finished with value: 0.3084933701951313 and parameters: {'rf_max_depth': 3, 'rf_n_estimators': 22}. Best is trial 3 with value: 0.6161640315146796.
[I 2025-05-22 16:20:27,194] Trial 12 finished with value: 0.2259626149082023 and parameters: {'rf_max_depth': 2, 'rf_n_estimators': 44}. Best is

FrozenTrial(number=31, state=TrialState.COMPLETE, values=[0.6454504591167856], datetime_start=datetime.datetime(2025, 5, 22, 16, 20, 29, 18430), datetime_complete=datetime.datetime(2025, 5, 22, 16, 21, 0, 714906), params={'rf_max_depth': 29, 'rf_n_estimators': 273}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'rf_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'rf_n_estimators': IntDistribution(high=1000, log=True, low=10, step=1)}, trial_id=31, value=None)


In [56]:
# Refit on the training split with the best hyper-parameters from `study`.
params = study.best_trial.params
model = build_regression_model(X_train, y_train, mode="RandomForest",
                               model_options={"n_estimators":params["rf_n_estimators"], "max_depth":params["rf_max_depth"], "random_state":102})

print("RF ECFP metrics:")
# 5-fold cross-validation on the training split, scored with R2.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
print("CV r2",round(score.mean(),3))

# Same cross-validation, scored with RMSE.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(root_mean_squared_error))
print("CV,rmse",round(score.mean(),3))

# Metrics on the test split, using the model fitted above.
y_pred = model.predict(X_test)
print("test r2  ",round(r2_score(y_test,y_pred),3))
print("test RMSE",round(root_mean_squared_error(y_test,y_pred),3))

RF ECFP metrics:
CV r2 0.645
CV,rmse 0.655
test r2   0.694
test RMSE 0.6


## RDKitFP

In [58]:
# Feature set: RDKit fingerprint only; random split with seed=123.
X = [mol_feat(m,descs=False,morgan=False,rdfp=True,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [59]:
# Maximise the mean CV R2 returned by rf_objective over 200 trials (n_jobs=20).
# NOTE(review): no sampler seed is set, so the best trial may differ between runs.
study = optuna.create_study(direction="maximize")
study.optimize(rf_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

[I 2025-05-22 16:42:18,238] A new study created in memory with name: no-name-16953b26-d6f3-4b6d-aa8c-1b7d219263ab
[I 2025-05-22 16:42:19,825] Trial 3 finished with value: 0.3297877903169061 and parameters: {'rf_max_depth': 2, 'rf_n_estimators': 11}. Best is trial 3 with value: 0.3297877903169061.
[I 2025-05-22 16:42:21,144] Trial 11 finished with value: 0.49946938373722516 and parameters: {'rf_max_depth': 4, 'rf_n_estimators': 14}. Best is trial 11 with value: 0.49946938373722516.
[I 2025-05-22 16:42:21,247] Trial 2 finished with value: 0.3297546629688103 and parameters: {'rf_max_depth': 2, 'rf_n_estimators': 29}. Best is trial 11 with value: 0.49946938373722516.
[I 2025-05-22 16:42:22,696] Trial 13 finished with value: 0.5973723216459341 and parameters: {'rf_max_depth': 29, 'rf_n_estimators': 11}. Best is trial 13 with value: 0.5973723216459341.
[I 2025-05-22 16:42:23,132] Trial 16 finished with value: 0.434730158333966 and parameters: {'rf_max_depth': 3, 'rf_n_estimators': 37}. Best 

FrozenTrial(number=28, state=TrialState.COMPLETE, values=[0.6450903060817954], datetime_start=datetime.datetime(2025, 5, 22, 16, 42, 26, 722025), datetime_complete=datetime.datetime(2025, 5, 22, 16, 44, 38, 485995), params={'rf_max_depth': 14, 'rf_n_estimators': 302}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'rf_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'rf_n_estimators': IntDistribution(high=1000, log=True, low=10, step=1)}, trial_id=28, value=None)


In [60]:
# Refit on the training split with the best hyper-parameters from `study`.
params = study.best_trial.params
model = build_regression_model(X_train, y_train, mode="RandomForest",
                               model_options={"n_estimators":params["rf_n_estimators"], "max_depth":params["rf_max_depth"], "random_state":102})

print("RF RDKP metrics:")
# 5-fold cross-validation on the training split, scored with R2.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
print("CV r2",round(score.mean(),3))

# Same cross-validation, scored with RMSE.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(root_mean_squared_error))
print("CV,rmse",round(score.mean(),3))

# Metrics on the test split, using the model fitted above.
y_pred = model.predict(X_test)
print("test r2  ",round(r2_score(y_test,y_pred),3))
print("test RMSE",round(root_mean_squared_error(y_test,y_pred),3))

RF RDKP metrics:
CV r2 0.645
CV,rmse 0.655
test r2   0.687
test RMSE 0.606


## Physchem descriptors

In [61]:
# Feature set: RDKit descriptors only; random split with seed=123.
X = [mol_feat(m,descs=True,morgan=False,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [62]:
# Maximise the mean CV R2 returned by rf_objective over 200 trials (n_jobs=20).
# NOTE(review): no sampler seed is set, so the best trial may differ between runs.
study = optuna.create_study(direction="maximize")
study.optimize(rf_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

[I 2025-05-22 17:16:54,948] A new study created in memory with name: no-name-24f08aab-50ff-4aa7-846a-9f5ef207bcfd
[I 2025-05-22 17:16:55,756] Trial 0 finished with value: 0.4331267551113653 and parameters: {'rf_max_depth': 4, 'rf_n_estimators': 13}. Best is trial 0 with value: 0.4331267551113653.
[I 2025-05-22 17:16:55,998] Trial 16 finished with value: 0.3549140637876064 and parameters: {'rf_max_depth': 3, 'rf_n_estimators': 18}. Best is trial 0 with value: 0.4331267551113653.
[I 2025-05-22 17:16:56,442] Trial 5 finished with value: 0.5526414871856253 and parameters: {'rf_max_depth': 13, 'rf_n_estimators': 13}. Best is trial 5 with value: 0.5526414871856253.
[I 2025-05-22 17:16:56,449] Trial 4 finished with value: 0.5403612671121292 and parameters: {'rf_max_depth': 7, 'rf_n_estimators': 19}. Best is trial 5 with value: 0.5526414871856253.
[I 2025-05-22 17:16:56,500] Trial 12 finished with value: 0.5340538692136063 and parameters: {'rf_max_depth': 20, 'rf_n_estimators': 12}. Best is tr

FrozenTrial(number=153, state=TrialState.COMPLETE, values=[0.5854260355286521], datetime_start=datetime.datetime(2025, 5, 22, 17, 27, 5, 702722), datetime_complete=datetime.datetime(2025, 5, 22, 17, 28, 45, 793073), params={'rf_max_depth': 13, 'rf_n_estimators': 448}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'rf_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'rf_n_estimators': IntDistribution(high=1000, log=True, low=10, step=1)}, trial_id=153, value=None)


In [66]:
# Refit on the training split with the best hyper-parameters from `study`.
params = study.best_trial.params
model = build_regression_model(X_train, y_train, mode="RandomForest",
                               model_options={"n_estimators":params["rf_n_estimators"], "max_depth":params["rf_max_depth"], "random_state":102})

# Label fixed: this cell evaluates the physchem descriptor features, but the
# header was copied from the RDKit fingerprint cell ("RF RDKP metrics:").
print("RF Physchem metrics:")
# 5-fold cross-validation on the training split, scored with R2.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
print("CV r2",round(score.mean(),3))

# Same cross-validation, scored with RMSE.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(root_mean_squared_error))
print("CV,rmse",round(score.mean(),3))

# Metrics on the test split, using the model fitted above.
y_pred = model.predict(X_test)
print("test r2  ",round(r2_score(y_test,y_pred),3))
print("test RMSE",round(root_mean_squared_error(y_test,y_pred),3))

RF RDKP metrics:
CV r2 0.585
CV,rmse 0.709
test r2   0.631
test RMSE 0.659


# Ridge

In [68]:


# Ridge is fitted with default settings (model_options={}), so there is no
# Optuna search here. The three feature sets differ only in which mol_feat
# flag is switched on, so they share one loop instead of three pasted stanzas.
# The printed output is the same as before.
for ridge_label, ridge_flags in (
    ("Ridge ECFP metrics:", {"morgan": True, "rdfp": False, "descs": False}),
    ("Ridge RDKFP metrics:", {"morgan": False, "rdfp": True, "descs": False}),
    ("Ridge Physchem metrics:", {"morgan": False, "rdfp": False, "descs": True}),
):
    X = [mol_feat(m, ap=False, tt=False, **ridge_flags) for m in mols]
    X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)
    model = build_regression_model(X_train, y_train, mode="Ridge",model_options={})

    print(ridge_label)
    # 5-fold cross-validation on the training split, scored with R2.
    score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
    print("CV r2",round(score.mean(),3))

    # Same cross-validation, scored with RMSE.
    score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(root_mean_squared_error))
    print("CV,rmse",round(score.mean(),3))

    # Metrics on the test split, using the model fitted above.
    y_pred = model.predict(X_test)
    print("test r2  ",round(r2_score(y_test,y_pred),3))
    print("test RMSE",round(root_mean_squared_error(y_test,y_pred),3))

Ridge ECFP metrics:
CV r2 0.528
CV,rmse 0.755
test r2   0.571
test RMSE 0.711
Ridge RDKFP metrics:
CV r2 0.512
CV,rmse 0.768
test r2   0.591
test RMSE 0.694
Ridge Physchem metrics:
CV r2 0.423
CV,rmse 0.835
test r2   0.459
test RMSE 0.798


# XGB
## ECFP

In [69]:
# Feature set: Morgan fingerprint only; random split with seed=123.
X = [mol_feat(m,descs=False,morgan=True,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [72]:
# Maximise the mean CV R2 returned by xgb_objective over 200 trials (n_jobs=20).
# NOTE(review): no sampler seed is set, so the best trial may differ between runs.
study = optuna.create_study(direction="maximize")
study.optimize(xgb_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

[I 2025-05-22 17:47:29,101] A new study created in memory with name: no-name-cd14dfc5-f393-4f2d-9252-96de7c2286e0
[I 2025-05-22 17:47:30,592] Trial 19 finished with value: 0.45406142473220823 and parameters: {'xgb_max_depth': 2, 'xgb_n_estimators': 13, 'xgb_eta': 0.41890426264496783, 'xgb_gamma': 0.47862566164620046}. Best is trial 19 with value: 0.45406142473220823.
[I 2025-05-22 17:47:30,727] Trial 1 finished with value: 0.5498203635215759 and parameters: {'xgb_max_depth': 3, 'xgb_n_estimators': 15, 'xgb_eta': 0.6942340799296597, 'xgb_gamma': 0.8061541712085071}. Best is trial 1 with value: 0.5498203635215759.
[I 2025-05-22 17:47:30,968] Trial 4 finished with value: 0.5124192118644715 and parameters: {'xgb_max_depth': 3, 'xgb_n_estimators': 14, 'xgb_eta': 0.36667514174299465, 'xgb_gamma': 0.6212920147749799}. Best is trial 1 with value: 0.5498203635215759.
[I 2025-05-22 17:47:31,703] Trial 15 finished with value: 0.5388504505157471 and parameters: {'xgb_max_depth': 2, 'xgb_n_estimato

FrozenTrial(number=100, state=TrialState.COMPLETE, values=[0.6545852303504944], datetime_start=datetime.datetime(2025, 5, 22, 17, 54, 20, 785375), datetime_complete=datetime.datetime(2025, 5, 22, 17, 57, 41, 919702), params={'xgb_max_depth': 27, 'xgb_n_estimators': 407, 'xgb_eta': 0.08191719418621277, 'xgb_gamma': 0.16571242315214585}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'xgb_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'xgb_n_estimators': IntDistribution(high=1000, log=True, low=10, step=1), 'xgb_eta': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'xgb_gamma': FloatDistribution(high=1.0, log=False, low=0.0, step=None)}, trial_id=100, value=None)


In [75]:
# Refit on the training split with the best hyper-parameters from `study`.
params = study.best_trial.params
model = build_regression_model(X_train, y_train, mode="XGBoost",
                               model_options={"n_estimators":params["xgb_n_estimators"], "max_depth":params["xgb_max_depth"], "random_state":102, "eta":params["xgb_eta"], "gamma":params["xgb_gamma"]})

print("XG ECFP metrics:")
# 5-fold cross-validation on the training split, scored with R2.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
print("CV r2",round(score.mean(),3))

# Same cross-validation, scored with RMSE.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(root_mean_squared_error))
print("CV,rmse",round(score.mean(),3))

# Metrics on the test split, using the model fitted above.
y_pred = model.predict(X_test)
print("test r2  ",round(r2_score(y_test,y_pred),3))
print("test RMSE",round(root_mean_squared_error(y_test,y_pred),3))

XG ECFP metrics:
CV r2 0.655
CV,rmse 0.647
test r2   0.681
test RMSE 0.613


## RDKitFP

In [76]:
# Feature set: RDKit fingerprint only; random split with seed=123.
X = [mol_feat(m,descs=False,morgan=False,rdfp=True,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [77]:
# Maximise the mean CV R2 returned by xgb_objective over 200 trials (n_jobs=20).
# NOTE(review): no sampler seed is set, so the best trial may differ between runs.
study = optuna.create_study(direction="maximize")
study.optimize(xgb_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

[I 2025-05-22 18:16:56,012] A new study created in memory with name: no-name-75889488-d6eb-4430-a275-8e5f0b723a4a
[I 2025-05-22 18:16:58,399] Trial 4 finished with value: 0.5510579466819763 and parameters: {'xgb_max_depth': 2, 'xgb_n_estimators': 36, 'xgb_eta': 0.23127791058377323, 'xgb_gamma': 0.4128480778586474}. Best is trial 4 with value: 0.5510579466819763.
[I 2025-05-22 18:16:58,704] Trial 11 finished with value: -0.9206721305847168 and parameters: {'xgb_max_depth': 4, 'xgb_n_estimators': 18, 'xgb_eta': 0.08647300277585934, 'xgb_gamma': 0.15711731963120634}. Best is trial 4 with value: 0.5510579466819763.
[I 2025-05-22 18:16:59,291] Trial 9 finished with value: 0.5931994676589966 and parameters: {'xgb_max_depth': 5, 'xgb_n_estimators': 17, 'xgb_eta': 0.3049080389975495, 'xgb_gamma': 0.8666898054606663}. Best is trial 9 with value: 0.5931994676589966.
[I 2025-05-22 18:16:59,776] Trial 3 finished with value: 0.5157374262809753 and parameters: {'xgb_max_depth': 5, 'xgb_n_estimators'

FrozenTrial(number=123, state=TrialState.COMPLETE, values=[0.6569361209869384], datetime_start=datetime.datetime(2025, 5, 22, 18, 32, 16, 144155), datetime_complete=datetime.datetime(2025, 5, 22, 18, 34, 7, 36307), params={'xgb_max_depth': 3, 'xgb_n_estimators': 710, 'xgb_eta': 0.10074111462849814, 'xgb_gamma': 0.22788601139926482}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'xgb_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'xgb_n_estimators': IntDistribution(high=1000, log=True, low=10, step=1), 'xgb_eta': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'xgb_gamma': FloatDistribution(high=1.0, log=False, low=0.0, step=None)}, trial_id=123, value=None)


In [78]:
# Refit on the training split with the best hyper-parameters from `study`.
params = study.best_trial.params
model = build_regression_model(X_train, y_train, mode="XGBoost",
                               model_options={"n_estimators":params["xgb_n_estimators"], "max_depth":params["xgb_max_depth"], "random_state":102, "eta":params["xgb_eta"], "gamma":params["xgb_gamma"]})

print("XG RDKitFP metrics:")
# 5-fold cross-validation on the training split, scored with R2.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
print("CV r2",round(score.mean(),3))

# Same cross-validation, scored with RMSE.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(root_mean_squared_error))
print("CV,rmse",round(score.mean(),3))

# Metrics on the test split, using the model fitted above.
y_pred = model.predict(X_test)
print("test r2  ",round(r2_score(y_test,y_pred),3))
print("test RMSE",round(root_mean_squared_error(y_test,y_pred),3))

XG RDKitFP metrics:
CV r2 0.657
CV,rmse 0.644
test r2   0.704
test RMSE 0.59


## Physchem descriptors

In [82]:
# Feature set: RDKit descriptors only; random split with seed=123.
X = [mol_feat(m,descs=True,morgan=False,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [83]:
# Maximise the mean CV R2 returned by xgb_objective over 200 trials (n_jobs=20).
# NOTE(review): no sampler seed is set, so the best trial may differ between runs.
study = optuna.create_study(direction="maximize")
study.optimize(xgb_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

[I 2025-05-22 19:08:19,538] A new study created in memory with name: no-name-d3dcf556-0ace-46bd-a1f4-a4f9e6a13589
[I 2025-05-22 19:08:22,201] Trial 0 finished with value: -23.587374114990233 and parameters: {'xgb_max_depth': 3, 'xgb_n_estimators': 12, 'xgb_eta': 0.01639650070843446, 'xgb_gamma': 0.8701191930708282}. Best is trial 0 with value: -23.587374114990233.
[I 2025-05-22 19:08:23,095] Trial 18 finished with value: 0.4507203340530396 and parameters: {'xgb_max_depth': 5, 'xgb_n_estimators': 34, 'xgb_eta': 0.8328029084170703, 'xgb_gamma': 0.9751494135279756}. Best is trial 18 with value: 0.4507203340530396.
[I 2025-05-22 19:08:23,125] Trial 9 finished with value: 0.5326296329498291 and parameters: {'xgb_max_depth': 8, 'xgb_n_estimators': 30, 'xgb_eta': 0.3218829309827407, 'xgb_gamma': 0.1496504455742248}. Best is trial 9 with value: 0.5326296329498291.
[I 2025-05-22 19:08:23,253] Trial 1 finished with value: 0.5311794877052307 and parameters: {'xgb_max_depth': 2, 'xgb_n_estimators'

FrozenTrial(number=74, state=TrialState.COMPLETE, values=[0.6186170697212219], datetime_start=datetime.datetime(2025, 5, 22, 19, 9, 33, 969173), datetime_complete=datetime.datetime(2025, 5, 22, 19, 9, 58, 442956), params={'xgb_max_depth': 4, 'xgb_n_estimators': 268, 'xgb_eta': 0.21811393204399238, 'xgb_gamma': 0.08092309944556364}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'xgb_max_depth': IntDistribution(high=32, log=True, low=2, step=1), 'xgb_n_estimators': IntDistribution(high=1000, log=True, low=10, step=1), 'xgb_eta': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'xgb_gamma': FloatDistribution(high=1.0, log=False, low=0.0, step=None)}, trial_id=74, value=None)


In [84]:
# Refit on the training split with the best hyper-parameters from `study`.
params = study.best_trial.params
model = build_regression_model(X_train, y_train, mode="XGBoost",
                               model_options={"n_estimators":params["xgb_n_estimators"], "max_depth":params["xgb_max_depth"], "random_state":102, "eta":params["xgb_eta"], "gamma":params["xgb_gamma"]})

print("XG Physchem metrics:")
# 5-fold cross-validation on the training split, scored with R2.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
print("CV r2",round(score.mean(),3))

# Same cross-validation, scored with RMSE.
score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(root_mean_squared_error))
print("CV,rmse",round(score.mean(),3))

# Metrics on the test split, using the model fitted above.
y_pred = model.predict(X_test)
print("test r2  ",round(r2_score(y_test,y_pred),3))
print("test RMSE",round(root_mean_squared_error(y_test,y_pred),3))

XG Physchem metrics:
CV r2 0.619
CV,rmse 0.679
test r2   0.601
test RMSE 0.685
