# Regression models
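First we import everything we need and define the helpers for featurization, model building and the Optuna objectives. Then we load the data. Finally, for each model type and feature set, we tune the hyperparameters with Optuna (Ridge is fitted with its default settings) and report cross-validation and test-set metrics.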

In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,root_mean_squared_error
from xgboost import XGBRegressor
import optuna
import sklearn
from sklearn.metrics import make_scorer

In [None]:
# Module-level fingerprint generators, built once and reused by mol_feat().
# Morgan generator: radius=2, fpSize=1024.
MFPGEN = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=1024)
# Atom-pair, RDKit and topological-torsion generators, all with fpSize=2048.
AFPGEN = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=2048)
RFPGEN = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=2048)
TTPGEN = rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=2048)

def calculate_descriptors(mol: Chem.Mol, missingVal: float | None = 0.0) -> list:
    """Calculate the RDKit descriptors for a molecule, excluding ``Ipc``.

    Adapted from
    https://github.com/jonswain/tabpfn-tdc/blob/main/submission.py#L12

    Args:
        mol: Molecule to describe.
        missingVal: Value stored for any descriptor whose calculation raises.

    Returns:
        A list with one value per entry of ``Descriptors._descList`` (in that
        order), except for ``Ipc``, which is skipped entirely.
    """
    res = []
    for nm, fn in Descriptors._descList:
        # "Ipc" creates crazy values, so it is excluded. Skipping it here
        # (rather than falling through to the append) avoids re-appending the
        # previous descriptor's value in its place.
        if nm == "Ipc":
            continue
        try:
            val = fn(mol)
        except Exception:
            # Best effort: a failing descriptor must not abort featurization.
            val = missingVal
        res.append(val)
    return res

def mol_feat(mol,morgan=True,tt=True,ap=True,descs=True,rdfp=True):
    """Build one feature vector for a molecule from the selected feature types.

    The enabled blocks are concatenated in this fixed order:
    Morgan fingerprint, topological-torsion fingerprint, atom-pair
    fingerprint, RDKit descriptors, RDKit fingerprint.

    Args:
        mol: Molecule to featurize; must not be None.
        morgan: Include the fingerprint from ``MFPGEN``.
        tt: Include the fingerprint from ``TTPGEN``.
        ap: Include the fingerprint from ``AFPGEN``.
        descs: Include the values from ``calculate_descriptors``.
        rdfp: Include the fingerprint from ``RFPGEN``.

    Returns:
        A 1-D NumPy array holding all selected features.

    Raises:
        ValueError: If ``mol`` is None or every feature flag is False.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if mol is None:
        raise ValueError("Invalid molecule.")

    features = []
    if morgan:
        features.append(MFPGEN.GetFingerprintAsNumPy(mol))
    if tt:
        features.append(TTPGEN.GetFingerprintAsNumPy(mol))
    if ap:
        features.append(AFPGEN.GetFingerprintAsNumPy(mol))
    if descs:
        features.append(calculate_descriptors(mol))
    if rdfp:
        features.append(RFPGEN.GetFingerprintAsNumPy(mol))

    # np.hstack would also raise ValueError here, but with an opaque message.
    if not features:
        raise ValueError("At least one feature type must be enabled.")

    # Concatenate all features into a single vector
    return np.hstack(features)


def build_regression_model(X,y,mode="RandomForest",model_options=None):
    """Create a regressor of the requested type and fit it on ``(X, y)``.

    Args:
        X: Training feature matrix.
        y: Training targets.
        mode: One of "RandomForest", "XGBoost", "Ridge" or "SVM".
        model_options: Keyword arguments forwarded to the estimator's
            constructor. When omitted, the historical defaults
            ``{"n_estimators": 10, "max_depth": 3, "random_state": 123}``
            are used.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``mode`` is not one of the supported names.
    """
    # Validate first so a typo fails with a clear error instead of the
    # UnboundLocalError that used to follow the "mode not supported" print.
    if mode not in ("RandomForest", "XGBoost", "Ridge", "SVM"):
        raise ValueError(f"mode not supported: {mode!r}")

    # None sentinel instead of a dict default shared between calls.
    if model_options is None:
        model_options = {"n_estimators": 10, "max_depth": 3, "random_state": 123}

    if mode == "RandomForest":
        model = RandomForestRegressor(**model_options)
    elif mode == "XGBoost":
        model = XGBRegressor(**model_options)
    elif mode == "Ridge":
        model = Ridge(**model_options)
    else:  # mode == "SVM"
        model = SVR(**model_options)

    model.fit(X,y)
    return model

def data_splitter(X,y,mode="Random",test_ratio=0.2,seed=123):
    """Split features and targets into train and test subsets.

    Args:
        X: Feature matrix.
        y: Targets aligned with ``X``.
        mode: Splitting strategy; only "Random" is supported.
        test_ratio: Fraction of the samples assigned to the test subset.
        seed: Random state passed to ``train_test_split``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``mode`` is not "Random".
    """
    # Fail with a clear error; previously this printed a message and then
    # crashed with UnboundLocalError on the return statement.
    if mode != "Random":
        raise ValueError(f"mode not supported: {mode!r}")
    return train_test_split(X,y,test_size=test_ratio,random_state=seed)

# Benzene parsed from SMILES. `m` is not read again in the visible notebook
# (the later list comprehensions bind their own `m`) — presumably a leftover
# from interactive testing of mol_feat; confirm before removing.
m = Chem.MolFromSmiles("c1ccccc1")

# Functions for optimizing models

In [17]:
def xgb_objective(trial):
    """Optuna objective for XGBoost: mean 5-fold CV R² on the training split.

    Reads the notebook globals ``X_train`` and ``y_train`` at call time.

    Args:
        trial: Optuna trial used to sample ``xgb_max_depth``,
            ``xgb_n_estimators``, ``xgb_eta`` and ``xgb_gamma``.

    Returns:
        The mean R² over the five cross-validation folds (to be maximized).
    """
    # Keep the suggest_* calls in this order so sampling is unchanged.
    xgb_max_depth = trial.suggest_int("xgb_max_depth", 2, 32, log=True)
    xgb_n_estimators = trial.suggest_int("xgb_n_estimators", 10, 1000, log=True)
    xgb_eta = trial.suggest_float("xgb_eta", 0, 1)
    xgb_gamma = trial.suggest_float("xgb_gamma", 0, 1)

    # Build the estimator without fitting it: cross_val_score clones and
    # refits it on every fold, so a prior fit on X_train is wasted work.
    model = XGBRegressor(
        n_estimators=xgb_n_estimators,
        max_depth=xgb_max_depth,
        random_state=102,
        eta=xgb_eta,
        gamma=xgb_gamma,
    )

    score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
    return score.mean()

def rf_objective(trial):
    """Optuna objective for random forest: mean 5-fold CV R² on the training split.

    Reads the notebook globals ``X_train`` and ``y_train`` at call time.

    Args:
        trial: Optuna trial used to sample ``rf_max_depth`` and
            ``rf_n_estimators``.

    Returns:
        The mean R² over the five cross-validation folds (to be maximized).
    """
    # Keep the suggest_* calls in this order so sampling is unchanged.
    rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 10, 1000, log=True)

    # Build the estimator without fitting it: cross_val_score clones and
    # refits it on every fold, so a prior fit on X_train is wasted work.
    model = RandomForestRegressor(
        n_estimators=rf_n_estimators,
        max_depth=rf_max_depth,
        random_state=102,
    )

    score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
    return score.mean()


def svm_objective(trial):
    """Optuna objective for an RBF-kernel SVR: mean 5-fold CV R² on the training split.

    Reads the notebook globals ``X_train`` and ``y_train`` at call time.

    Args:
        trial: Optuna trial used to sample ``svm_gamma`` and ``svm_C``.

    Returns:
        The mean R² over the five cross-validation folds (to be maximized).
    """
    # Keep the suggest_* calls in this order so sampling is unchanged.
    svm_gamma = trial.suggest_float("svm_gamma", 0.001, 1000, log=True)
    svm_C = trial.suggest_int("svm_C", 1, 1000, log=True)

    # Build the estimator without fitting it: cross_val_score clones and
    # refits it on every fold, so a prior fit on X_train is wasted work.
    # SVR takes no random_state.
    model = SVR(kernel="rbf", gamma=svm_gamma, C=svm_C)

    score = sklearn.model_selection.cross_val_score(model, X_train, y_train, n_jobs=-1, cv=5,scoring=make_scorer(r2_score))
    return score.mean()

# Data loading

In [3]:
# Load the ChEMBL export (semicolon-separated).
df_chembl = pd.read_csv("data/PDL1-CHEMBL.csv",sep=";")

# Collect (molecule, pChEMBL) pairs, keeping a row only when
#   * its "Standard Relation" is the literal string "'='",
#   * its pChEMBL value is present (the old `float(x) == x` test was a
#     NaN check in disguise), and
#   * RDKit can parse its SMILES.
mols = []
y = []
exact_rows = df_chembl[df_chembl["Standard Relation"] == "'='"]
for smiles, pchembl in zip(exact_rows["Smiles"], exact_rows["pChEMBL Value"]):
    if pd.isna(pchembl):
        continue
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        continue
    mols.append(mol)
    y.append(pchembl)

# SVM

## First, for ecfp

In [9]:
# Featurize every molecule with the Morgan fingerprint only, then make the
# random train/test split (data_splitter's default test_ratio=0.2, seed 123).
X = [mol_feat(m,descs=False,morgan=True,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [None]:
# Tune the SVR on the current X_train/y_train: 200 trials maximizing the
# value returned by svm_objective.
# NOTE(review): no sampler seed is set, so the best trial may differ between
# runs; n_jobs=20 here combined with n_jobs=-1 inside the objective may
# oversubscribe the CPU — confirm on the target machine.
study = optuna.create_study(direction="maximize")
study.optimize(svm_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

In [43]:
# Refit an RBF SVR on the training split with the best Optuna parameters.
params = study.best_trial.params
model = build_regression_model(
    X_train,
    y_train,
    mode="SVM",
    model_options={"kernel": "rbf", "gamma": params["svm_gamma"], "C": params["svm_C"]},  # SVR has no random state
)

print("SVM ECFP metrics:")

# 5-fold cross-validation on the training split, one pass per metric.
for cv_label, cv_metric in (("CV r2", r2_score), ("CV,rmse", root_mean_squared_error)):
    score = sklearn.model_selection.cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=5, scoring=make_scorer(cv_metric)
    )
    print(cv_label, round(score.mean(), 3))

# Same two metrics on the held-out test split.
y_pred = model.predict(X_test)
for test_label, test_metric in (("test r2  ", r2_score), ("test RMSE", root_mean_squared_error)):
    print(test_label, round(test_metric(y_test, y_pred), 3))

SVM ECFP metrics:
CV r2 0.648
CV,rmse 0.654
test r2   0.697
test RMSE 0.597


## Same thing for RDKitFP

In [44]:
# Featurize every molecule with the RDKit fingerprint only, then make the
# random train/test split (data_splitter's default test_ratio=0.2, seed 123).
X = [mol_feat(m,descs=False,morgan=False,rdfp=True,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [None]:
# Tune the SVR on the RDKit-fingerprint split: 200 trials maximizing the
# value returned by svm_objective. This rebinds `study`, replacing the
# previous search.
study = optuna.create_study(direction="maximize")
study.optimize(svm_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

In [46]:
# Refit an RBF SVR on the training split with the best Optuna parameters.
params = study.best_trial.params
model = build_regression_model(
    X_train,
    y_train,
    mode="SVM",
    model_options={"kernel": "rbf", "gamma": params["svm_gamma"], "C": params["svm_C"]},  # SVR has no random state
)

print("SVM RDK metrics:")

# 5-fold cross-validation on the training split, one pass per metric.
for cv_label, cv_metric in (("CV r2", r2_score), ("CV,rmse", root_mean_squared_error)):
    score = sklearn.model_selection.cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=5, scoring=make_scorer(cv_metric)
    )
    print(cv_label, round(score.mean(), 3))

# Same two metrics on the held-out test split.
y_pred = model.predict(X_test)
for test_label, test_metric in (("test r2  ", r2_score), ("test RMSE", root_mean_squared_error)):
    print(test_label, round(test_metric(y_test, y_pred), 3))

SVM RDK metrics:
CV r2 0.677
CV,rmse 0.625
test r2   0.716
test RMSE 0.578


## Finally, for physchem descriptors

In [49]:
# Featurize every molecule with the RDKit descriptors only, then make the
# random train/test split (data_splitter's default test_ratio=0.2, seed 123).
# NOTE(review): the descriptors are passed to the RBF SVR unscaled; this
# presumably contributes to the much lower scores recorded for this run —
# consider standardizing the features and confirm.
X = [mol_feat(m,descs=True,morgan=False,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [None]:
# Tune the SVR on the descriptor split: 200 trials maximizing the value
# returned by svm_objective. This rebinds `study`, replacing the previous
# search.
study = optuna.create_study(direction="maximize")
study.optimize(svm_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

In [51]:
# Refit an RBF SVR on the training split with the best Optuna parameters.
params = study.best_trial.params
model = build_regression_model(
    X_train,
    y_train,
    mode="SVM",
    model_options={"kernel": "rbf", "gamma": params["svm_gamma"], "C": params["svm_C"]},  # SVR has no random state
)

# This run uses the physchem descriptors; the header previously read
# "SVM RDK metrics:", copied from the RDKit-fingerprint cell.
print("SVM Physchem metrics:")

# 5-fold cross-validation on the training split, one pass per metric.
for cv_label, cv_metric in (("CV r2", r2_score), ("CV,rmse", root_mean_squared_error)):
    score = sklearn.model_selection.cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=5, scoring=make_scorer(cv_metric)
    )
    print(cv_label, round(score.mean(), 3))

# Same two metrics on the held-out test split.
y_pred = model.predict(X_test)
for test_label, test_metric in (("test r2  ", r2_score), ("test RMSE", root_mean_squared_error)):
    print(test_label, round(test_metric(y_test, y_pred), 3))

SVM RDK metrics:
CV r2 0.235
CV,rmse 0.963
test r2   0.314
test RMSE 0.898


# RF
## First ECFP

In [53]:
# Featurize every molecule with the Morgan fingerprint only, then make the
# random train/test split (data_splitter's default test_ratio=0.2, seed 123).
X = [mol_feat(m,descs=False,morgan=True,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [None]:
# Tune the random forest on the Morgan-fingerprint split: 200 trials
# maximizing the value returned by rf_objective. This rebinds `study`.
study = optuna.create_study(direction="maximize")
study.optimize(rf_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

In [56]:
# Refit a random forest on the training split with the best Optuna parameters.
params = study.best_trial.params
model = build_regression_model(
    X_train,
    y_train,
    mode="RandomForest",
    model_options={
        "n_estimators": params["rf_n_estimators"],
        "max_depth": params["rf_max_depth"],
        "random_state": 102,
    },
)

print("RF ECFP metrics:")

# 5-fold cross-validation on the training split, one pass per metric.
for cv_label, cv_metric in (("CV r2", r2_score), ("CV,rmse", root_mean_squared_error)):
    score = sklearn.model_selection.cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=5, scoring=make_scorer(cv_metric)
    )
    print(cv_label, round(score.mean(), 3))

# Same two metrics on the held-out test split.
y_pred = model.predict(X_test)
for test_label, test_metric in (("test r2  ", r2_score), ("test RMSE", root_mean_squared_error)):
    print(test_label, round(test_metric(y_test, y_pred), 3))

RF ECFP metrics:
CV r2 0.645
CV,rmse 0.655
test r2   0.694
test RMSE 0.6


## RDKitFP

In [58]:
# Featurize every molecule with the RDKit fingerprint only, then make the
# random train/test split (data_splitter's default test_ratio=0.2, seed 123).
X = [mol_feat(m,descs=False,morgan=False,rdfp=True,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [None]:
# Tune the random forest on the RDKit-fingerprint split: 200 trials
# maximizing the value returned by rf_objective. This rebinds `study`.
study = optuna.create_study(direction="maximize")
study.optimize(rf_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

In [60]:
# Refit a random forest on the training split with the best Optuna parameters.
params = study.best_trial.params
model = build_regression_model(
    X_train,
    y_train,
    mode="RandomForest",
    model_options={
        "n_estimators": params["rf_n_estimators"],
        "max_depth": params["rf_max_depth"],
        "random_state": 102,
    },
)

print("RF RDKP metrics:")

# 5-fold cross-validation on the training split, one pass per metric.
for cv_label, cv_metric in (("CV r2", r2_score), ("CV,rmse", root_mean_squared_error)):
    score = sklearn.model_selection.cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=5, scoring=make_scorer(cv_metric)
    )
    print(cv_label, round(score.mean(), 3))

# Same two metrics on the held-out test split.
y_pred = model.predict(X_test)
for test_label, test_metric in (("test r2  ", r2_score), ("test RMSE", root_mean_squared_error)):
    print(test_label, round(test_metric(y_test, y_pred), 3))

RF RDKP metrics:
CV r2 0.645
CV,rmse 0.655
test r2   0.687
test RMSE 0.606


## Physchem

In [61]:
# Featurize every molecule with the RDKit descriptors only, then make the
# random train/test split (data_splitter's default test_ratio=0.2, seed 123).
X = [mol_feat(m,descs=True,morgan=False,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [None]:
# Tune the random forest on the descriptor split: 200 trials maximizing the
# value returned by rf_objective. This rebinds `study`.
study = optuna.create_study(direction="maximize")
study.optimize(rf_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

In [66]:
# Refit a random forest on the training split with the best Optuna parameters.
params = study.best_trial.params
model = build_regression_model(
    X_train,
    y_train,
    mode="RandomForest",
    model_options={
        "n_estimators": params["rf_n_estimators"],
        "max_depth": params["rf_max_depth"],
        "random_state": 102,
    },
)

# This run uses the physchem descriptors; the header previously read
# "RF RDKP metrics:", copied from the RDKit-fingerprint cell.
print("RF Physchem metrics:")

# 5-fold cross-validation on the training split, one pass per metric.
for cv_label, cv_metric in (("CV r2", r2_score), ("CV,rmse", root_mean_squared_error)):
    score = sklearn.model_selection.cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=5, scoring=make_scorer(cv_metric)
    )
    print(cv_label, round(score.mean(), 3))

# Same two metrics on the held-out test split.
y_pred = model.predict(X_test)
for test_label, test_metric in (("test r2  ", r2_score), ("test RMSE", root_mean_squared_error)):
    print(test_label, round(test_metric(y_test, y_pred), 3))

RF RDKP metrics:
CV r2 0.585
CV,rmse 0.709
test r2   0.631
test RMSE 0.659


# Ridge

In [68]:
# Ridge baseline with the estimator's default settings (model_options={}),
# evaluated on each feature set in turn: Morgan fingerprint, RDKit
# fingerprint, then RDKit descriptors. After the loop the globals X,
# X_train, ..., model and y_pred hold the descriptor run.
for ridge_title, ridge_flags in (
    ("Ridge ECFP metrics:", {"morgan": True, "rdfp": False, "descs": False}),
    ("Ridge RDKFP metrics:", {"morgan": False, "rdfp": True, "descs": False}),
    ("Ridge Physchem metrics:", {"morgan": False, "rdfp": False, "descs": True}),
):
    X = [mol_feat(m, ap=False, tt=False, **ridge_flags) for m in mols]
    X_train, X_test, y_train, y_test = data_splitter(X, y, seed=123)
    model = build_regression_model(X_train, y_train, mode="Ridge", model_options={})

    print(ridge_title)

    # 5-fold cross-validation on the training split, one pass per metric.
    for cv_label, cv_metric in (("CV r2", r2_score), ("CV,rmse", root_mean_squared_error)):
        score = sklearn.model_selection.cross_val_score(
            model, X_train, y_train, n_jobs=-1, cv=5, scoring=make_scorer(cv_metric)
        )
        print(cv_label, round(score.mean(), 3))

    # Same two metrics on the held-out test split.
    y_pred = model.predict(X_test)
    for test_label, test_metric in (("test r2  ", r2_score), ("test RMSE", root_mean_squared_error)):
        print(test_label, round(test_metric(y_test, y_pred), 3))

Ridge ECFP metrics:
CV r2 0.528
CV,rmse 0.755
test r2   0.571
test RMSE 0.711
Ridge RDKFP metrics:
CV r2 0.512
CV,rmse 0.768
test r2   0.591
test RMSE 0.694
Ridge Physchem metrics:
CV r2 0.423
CV,rmse 0.835
test r2   0.459
test RMSE 0.798


# XGB
## ECFP

In [69]:
# Featurize every molecule with the Morgan fingerprint only, then make the
# random train/test split (data_splitter's default test_ratio=0.2, seed 123).
X = [mol_feat(m,descs=False,morgan=True,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [None]:
# Tune XGBoost on the Morgan-fingerprint split: 200 trials maximizing the
# value returned by xgb_objective. This rebinds `study`.
study = optuna.create_study(direction="maximize")
study.optimize(xgb_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

In [75]:
# Refit XGBoost on the training split with the best Optuna parameters.
params = study.best_trial.params
model = build_regression_model(
    X_train,
    y_train,
    mode="XGBoost",
    model_options={
        "n_estimators": params["xgb_n_estimators"],
        "max_depth": params["xgb_max_depth"],
        "random_state": 102,
        "eta": params["xgb_eta"],
        "gamma": params["xgb_gamma"],
    },
)

print("XG ECFP metrics:")

# 5-fold cross-validation on the training split, one pass per metric.
for cv_label, cv_metric in (("CV r2", r2_score), ("CV,rmse", root_mean_squared_error)):
    score = sklearn.model_selection.cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=5, scoring=make_scorer(cv_metric)
    )
    print(cv_label, round(score.mean(), 3))

# Same two metrics on the held-out test split.
y_pred = model.predict(X_test)
for test_label, test_metric in (("test r2  ", r2_score), ("test RMSE", root_mean_squared_error)):
    print(test_label, round(test_metric(y_test, y_pred), 3))

XG ECFP metrics:
CV r2 0.655
CV,rmse 0.647
test r2   0.681
test RMSE 0.613


## RDKitFP

In [76]:
# Featurize every molecule with the RDKit fingerprint only, then make the
# random train/test split (data_splitter's default test_ratio=0.2, seed 123).
X = [mol_feat(m,descs=False,morgan=False,rdfp=True,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [None]:
# Tune XGBoost on the RDKit-fingerprint split: 200 trials maximizing the
# value returned by xgb_objective. This rebinds `study`.
study = optuna.create_study(direction="maximize")
study.optimize(xgb_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

In [78]:
# Refit XGBoost on the training split with the best Optuna parameters.
params = study.best_trial.params
model = build_regression_model(
    X_train,
    y_train,
    mode="XGBoost",
    model_options={
        "n_estimators": params["xgb_n_estimators"],
        "max_depth": params["xgb_max_depth"],
        "random_state": 102,
        "eta": params["xgb_eta"],
        "gamma": params["xgb_gamma"],
    },
)

print("XG RDKitFP metrics:")

# 5-fold cross-validation on the training split, one pass per metric.
for cv_label, cv_metric in (("CV r2", r2_score), ("CV,rmse", root_mean_squared_error)):
    score = sklearn.model_selection.cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=5, scoring=make_scorer(cv_metric)
    )
    print(cv_label, round(score.mean(), 3))

# Same two metrics on the held-out test split.
y_pred = model.predict(X_test)
for test_label, test_metric in (("test r2  ", r2_score), ("test RMSE", root_mean_squared_error)):
    print(test_label, round(test_metric(y_test, y_pred), 3))

XG RDKitFP metrics:
CV r2 0.657
CV,rmse 0.644
test r2   0.704
test RMSE 0.59


## Physchem

In [82]:
# Featurize every molecule with the RDKit descriptors only, then make the
# random train/test split (data_splitter's default test_ratio=0.2, seed 123).
X = [mol_feat(m,descs=True,morgan=False,rdfp=False,ap=False,tt=False) for m in mols]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)

In [None]:
# Tune XGBoost on the descriptor split: 200 trials maximizing the value
# returned by xgb_objective. This rebinds `study`.
study = optuna.create_study(direction="maximize")
study.optimize(xgb_objective, n_trials=200, n_jobs=20)
print(study.best_trial)

In [84]:
# Refit XGBoost on the training split with the best Optuna parameters.
params = study.best_trial.params
model = build_regression_model(
    X_train,
    y_train,
    mode="XGBoost",
    model_options={
        "n_estimators": params["xgb_n_estimators"],
        "max_depth": params["xgb_max_depth"],
        "random_state": 102,
        "eta": params["xgb_eta"],
        "gamma": params["xgb_gamma"],
    },
)

print("XG Physchem metrics:")

# 5-fold cross-validation on the training split, one pass per metric.
for cv_label, cv_metric in (("CV r2", r2_score), ("CV,rmse", root_mean_squared_error)):
    score = sklearn.model_selection.cross_val_score(
        model, X_train, y_train, n_jobs=-1, cv=5, scoring=make_scorer(cv_metric)
    )
    print(cv_label, round(score.mean(), 3))

# Same two metrics on the held-out test split.
y_pred = model.predict(X_test)
for test_label, test_metric in (("test r2  ", r2_score), ("test RMSE", root_mean_squared_error)):
    print(test_label, round(test_metric(y_test, y_pred), 3))

XG Physchem metrics:
CV r2 0.619
CV,rmse 0.679
test r2   0.601
test RMSE 0.685
