In [1]:
# this is copypasted from Classifiers.ipynb
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,root_mean_squared_error
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

MFPGEN = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)
AFPGEN = rdFingerprintGenerator.GetAtomPairGenerator(fpSize=2048)
RFPGEN = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=2048)
TTPGEN = rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize=2048)

def calculate_descriptors(mol: Chem.Mol, missingVal: float | None = 0.0) -> dict:
    """Calculate the full list of descriptors for a molecule.
    adapted from
    https://github.com/jonswain/tabpfn-tdc/blob/main/submission.py#L12
    """
    
    res = []
    for nm, fn in Descriptors._descList:
        try:
            if nm!="Ipc": # this one creates crazy values so we exclude it
                val = fn(mol)
        except:
            val = missingVal
        res.append(val)
    return res

def mol_feat(mol,morgan=True,tt=True,ap=True,descs=True,rdfp=True):
    """
    Extracts features by combining:
    - Morgan Fingerprints
    - RDKit Descriptors
    - Topological Torsion Fingerprints
    Returns a concatenated NumPy array of all features.
    """
    assert mol is not None, "Invalid molecule."
    features = []

    #combine the features:
    if morgan:
        morgan_fp = MFPGEN.GetFingerprintAsNumPy(mol)
        features.append(morgan_fp)
    if tt:
        torsion_fp = TTPGEN.GetFingerprintAsNumPy(mol)
        features.append(torsion_fp)
    if ap:
        ap_fp = AFPGEN.GetFingerprintAsNumPy(mol)
        features.append(ap_fp)
    if descs:
        rdkit_desc = calculate_descriptors(mol)  
        features.append(rdkit_desc)
    if rdfp:
        rdkit_fp = RFPGEN.GetFingerprintAsNumPy(mol)
        features.append(rdkit_fp)

    # Concatenate all features into a single vector
    combined_features = np.hstack(features)
    
    return combined_features


def build_classification_model(X,y,mode="RandomForest",model_options={"n_estimators":10,"max_depth":3,"random_state":123}):
    if mode == "RandomForest":
        model = RandomForestClassifier(**model_options)
    elif mode == "XGBoost":
        model = XGBClassifier(**model_options)
    elif mode == "Ridge":
        model = RidgeClassifier(**model_options)
    elif mode == "SVM":
        model = SVC(**model_options)
    else:
        print("mode not supported")
    model.fit(X,y)
    return model

def data_splitter(X,y,mode="Random",test_ratio=0.2,seed=123):
    if mode == "Random":
        X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=test_ratio,random_state=seed)
    else:
        print("mode not supported")    
    return X_train,X_test,y_train,y_test

m = Chem.MolFromSmiles("c1ccccc1")

df_chembl = pd.read_csv("data/PDL1-CHEMBL.csv", sep=";")
mols = []
y = []

threshold = 6.0  

for i, row in df_chembl.iterrows():
    smiles = row["Smiles"]
    try:
        pchembl = float(row["pChEMBL Value"])
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            y.append(1 if pchembl >= threshold else 0)
            mols.append(mol)
    except:
        continue  
X_chembl_ecfp = [mol_feat(m, descs=False, morgan=True, rdfp=False, ap=False, tt=False) for m in mols]
X_chembl_rdk = [mol_feat(m, descs=False, morgan=False, rdfp=True, ap=False, tt=False) for m in mols]
y_chembl = np.array(y)
df_paper = pd.read_excel("data/pharmaceuticals-1710563-supplementary.xlsx")
smiles_paper = list(set(df_paper[df_paper["Unnamed: 2"] == "ACTIVE"]["Unnamed: 3"]))
smiles_paper_decoys = list(set(df_paper[df_paper["Unnamed: 2"] == "DECOY"]["Unnamed: 3"]))
mols = [Chem.MolFromSmiles(smi) for smi in smiles_paper+smiles_paper_decoys]
X_paper_ecfp = [mol_feat(m, descs=False, morgan=True, rdfp=False, ap=False, tt=False) for m in mols]
X_paper_rdk = [mol_feat(m, descs=False, morgan=False, rdfp=True, ap=False, tt=False) for m in mols]
y_paper = [1]*len(smiles_paper)+[0]*len(smiles_paper_decoys)

# Task 1
Take best model type and hyperparameters but apply to paper set. and also report metrics on chembl
best parameters were `'xgb_max_depth': 5, 'xgb_n_estimators': 586, 'xgb_eta': 0.3250187938100933, 'xgb_gamma': 0.5954578340544949`

In [2]:
X_features = {"ecfp":X_paper_ecfp,"rdk":X_paper_rdk}
y = y_paper

In [5]:
X = X_features["rdk"]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)
xgb_rdk_model = build_classification_model(X_train, y_train, mode="XGBoost",
                                       model_options={"n_jobs":-1, "n_estimators":586, "max_depth":5,"eta":0.325,"gamma":0.595,"random_state":102})

In [7]:
import sklearn
from sklearn.metrics import make_scorer, matthews_corrcoef, balanced_accuracy_score

# print the metrics
print(f"XGB RDK metrics:")
score = sklearn.model_selection.cross_val_score(xgb_rdk_model, X_train, y_train, cv=5,scoring=make_scorer(matthews_corrcoef))
print("CV MCC ",round(score.mean(),3))

score = sklearn.model_selection.cross_val_score(xgb_rdk_model, X_train, y_train, cv=5,scoring=make_scorer(balanced_accuracy_score))
print("CV,bAcc",round(score.mean(),3))

y_pred = rf_ecfp_model.predict(X_test)
print("test MCC ",round(matthews_corrcoef(y_test,y_pred),3))
print("test bAcc",round(balanced_accuracy_score(y_test,y_pred),3))

XGB RDK metrics:
CV MCC  0.997
CV,bAcc 0.997


NameError: name 'rf_ecfp_model' is not defined

In [9]:
y_pred = xgb_rdk_model.predict(X_test)
print("test MCC ",round(matthews_corrcoef(y_test,y_pred),3))
print("test bAcc",round(balanced_accuracy_score(y_test,y_pred),3))

test MCC  1.0
test bAcc 1.0


In [10]:

y_pred = xgb_rdk_model.predict(X_chembl_rdk)
print("ChEMBL MCC ",round(matthews_corrcoef(y_chembl,y_pred),3))
print("test bAcc",round(balanced_accuracy_score(y_chembl,y_pred),3))

ChEMBL MCC  0.181
test bAcc 0.587


In [12]:
from sklearn.metrics import roc_auc_score
y_pred = xgb_rdk_model.predict_proba(X_chembl_rdk)
print("AUC bAcc",round(roc_auc_score(y_chembl,[yy[1] for yy in y_pred]),3))

AUC bAcc 0.594


# Task 2
Build model on ChEMBL, test it on paper

In [15]:
X_features = {"ecfp":X_chembl_ecfp,"rdk":X_chembl_rdk}
y = y_chembl

In [16]:
X = X_features["rdk"]
X_train, X_test, y_train, y_test = data_splitter(X, y,seed=123)
xgb_rdk_model = build_classification_model(X_train, y_train, mode="XGBoost",
                                       model_options={"n_jobs":-1, "n_estimators":586, "max_depth":5,"eta":0.325,"gamma":0.595,"random_state":102})

In [17]:
import sklearn
from sklearn.metrics import make_scorer, matthews_corrcoef, balanced_accuracy_score

# print the metrics
print(f"XGB RDK metrics:")
score = sklearn.model_selection.cross_val_score(xgb_rdk_model, X_train, y_train, cv=5,scoring=make_scorer(matthews_corrcoef))
print("CV MCC ",round(score.mean(),3))

score = sklearn.model_selection.cross_val_score(xgb_rdk_model, X_train, y_train, cv=5,scoring=make_scorer(balanced_accuracy_score))
print("CV,bAcc",round(score.mean(),3))

y_pred = xgb_rdk_model.predict(X_test)
print("test MCC ",round(matthews_corrcoef(y_test,y_pred),3))
print("test bAcc",round(balanced_accuracy_score(y_test,y_pred),3))

XGB RDK metrics:
CV MCC  0.683
CV,bAcc 0.836
test MCC  0.662
test bAcc 0.828


In [18]:

y_pred = xgb_rdk_model.predict(X_paper_rdk)
print("ChEMBL MCC ",round(matthews_corrcoef(y_paper,y_pred),3))
print("test bAcc",round(balanced_accuracy_score(y_paper,y_pred),3))

ChEMBL MCC  0.287
test bAcc 0.813


In [19]:
from sklearn.metrics import roc_auc_score
y_pred = xgb_rdk_model.predict_proba(X_paper_rdk)
print("AUC bAcc",round(roc_auc_score(y_paper,[yy[1] for yy in y_pred]),3))

AUC bAcc 0.898
