# **Load Libraries**

In [None]:
import os
import numpy as np
import pandas as pd
import oddt
from oddt.fingerprints import PLEC
from scipy import stats
from sklearn import preprocessing
import pickle
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import matthews_corrcoef, precision_recall_curve, accuracy_score, auc
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.utils import parallel_backend
from xgboost.sklearn import XGBClassifier
from rdkit import Chem
from rdkit.Chem import AllChem
import deepchem as dc
from deepchem.utils import download_url, load_from_disk
from deepchem.utils.vina_utils import prepare_inputs
from deepchem.models import AtomicConvModel
from deepchem.feat import RdkitGridFeaturizer
from joblib import Parallel, delayed
from tqdm import tqdm
import glob
import tempfile

# **Load Data**

In [3]:
# training set true actives
plec_train_true_actives = pd.read_csv('Path_to_csv')
grid_train_true_actives = pd.read_csv('Path_to_csv')


# test sets true actives
plec_test_true_actives = pd.read_csv('Path_to_csv')
grid_test_true_actives = pd.read_csv('Path_to_csv')

# **Load Decoys**

In [None]:
# training set random_decoys
plec_train_random_decoys = pd.read_csv('Path_to_csv')
grid_train_random_decoys = pd.read_csv('Path_to_csv')


# test sets random_decoys
plec_test_random_decoys = pd.read_csv('Path_to_csv')
grid_test_random_decoys = pd.read_csv('Path_to_csv')



# training set deepcoy decoys
plec_train_deepcoy_decoys = pd.read_csv('Path_to_csv')
grid_train_deepcoy_decoys = pd.read_csv('Path_to_csv')


# test sets deepcoy decoys
plec_test_deepcoy_decoys = pd.read_csv('Path_to_csv')
grid_test_deepcoy_decoys = pd.read_csv('Path_to_csv')

# Generate train set with true actives and Random Decoys

In [42]:
plec_train_random = pd.concat([plec_train_true_actives,plec_train_random_decoys])
grid_train_random = pd.concat([grid_train_true_actives,grid_train_random_decoys])

# Generate test set true actives and deepcoys decoys

In [45]:
# test set with deepcoys
test_deepcoy_plec = pd.concat([plec_test_true_actives,plec_test_deepcoy_decoys])
test_deepcoy_grid = pd.concat([grid_test_true_actives,grid_test_deepcoy_decoys])

# Generate test set with true actives and random decoys

In [51]:
# test set with randomdecoyss
test_randomdecoys_plec = pd.concat([plec_test_true_actives,plec_test_randomdecoys])
test_randomdecoys_grid = pd.concat([grid_test_true_actives,grid_test_randomdecoys])

In [54]:
# train
X_plec_train, y_plec_train = plec_train_random.drop(['class', 'potency','index'], axis= 1), plec_train_random['class']
X_grid_train, y_grid_train = grid_train_random.drop(['class', 'potency'], axis= 1), grid_train_random['class']


# test set deepcoys
X_test_deepcoy_plec, y_test_deepcoy_plec = test_deepcoy_plec.drop(['class', 'potency','index'], axis= 1), test_deepcoy_plec['class']
X_test_deepcoy_grid, y_test_deepcoy_grid = test_deepcoy_grid.drop(['class', 'potency'], axis= 1), test_deepcoy_grid['class']

# test set randomdecoys
X_test_randomdecoys_plec, y_test_randomdecoys_plec = test_randomdecoys_plec.drop(['class', 'potency','index'], axis= 1), test_randomdecoys_plec['class']
X_test_randomdecoys_grid, y_test_randomdecoys_grid = test_randomdecoys_grid.drop(['class', 'potency','index'], axis= 1), test_randomdecoys_grid['class']

In [57]:
y_plec_train = y_plec_train.map({'active': 1, 'inactive': 0})
y_grid_train = y_grid_train.map({'active': 1, 'inactive': 0})



y_test_deepcoy_plec = y_test_deepcoy_plec.map({'active': 1, 'inactive': 0})
y_test_deepcoy_grid = y_test_deepcoy_grid.map({'active': 1, 'inactive': 0})


y_test_randomdecoys_plec = y_test_randomdecoys_plec.map({'active': 1, 'inactive': 0})
y_test_randomdecoys_grid = y_test_randomdecoys_grid.map({'active': 1, 'inactive': 0})

# **Training with PLEC features**

# **RandomForest**

In [23]:
# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the random forest classifier with the given parameters
    rf_plec = RandomForestClassifier(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_split=int(params['min_samples_split']),
        min_samples_leaf=int(params['min_samples_leaf']),
        max_features=params['max_features'],
        random_state=60, # Set random state for reproducibility
        n_jobs=40     
    )

    # Perform 5-fold cross-validation and calculate PR-AUC
    cv_scores = cross_val_score(rf_plec, X_plec_train, y_plec_train, cv=5, scoring='average_precision')
    mean_pr_auc = np.mean(cv_scores)
    
    # Return the negative mean PR-AUC as the loss
    return {'loss': -mean_pr_auc, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 20, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])
}

# Run the optimization
trials = Trials()
best_params_rf_plec_P4 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Adjust the number of evaluations as needed
    trials=trials,
    rstate=np.random.default_rng(60) # Set random state for reproducibility

)
    
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_rf_plec_P4.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_rf_plec_P4, file)
with open(file_path, 'rb') as file:
    best_params_rf_plec_P4 = pickle.load(file)
    

# Print the best hyperparameters found
print('Best hyperparameters:', best_params_rf_plec_P4)

100%|██████████| 50/50 [46:03<00:00, 55.27s/trial, best loss: -0.9791700800560438]   
Best hyperparameters: {'max_depth': 19.0, 'max_features': 1, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_estimators': 190.0}


# **save the best hyperparameters found**

In [110]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_rf_plec_P4.pkl")
with open(file_path, 'rb') as file:
    best_params_rf_plec_P4 = pickle.load(file)

In [111]:
# Convert the hyperparameter choices to the proper format
if best_params_rf_plec_P4['max_features'] == 0:
    best_params_rf_plec_P4['max_features'] = 'sqrt'
elif best_params_rf_plec_P4['max_features'] == 1:
    best_params_rf_plec_P4['max_features'] = 'log2'
else:
    best_params_rf_plec_P4['max_features'] = None

# **train the RF ten times and save th results**

In [112]:
# Train the final model with the best hyperparameters
PR_AUCs_plec_rf_P1_hard = []
PR_AUCs_plec_rf_P2_hard = []
for i in range(1,11):
    rf_plec = RandomForestClassifier(
        n_estimators=int(best_params_rf_plec_P4['n_estimators']),
        max_depth=int(best_params_rf_plec_P4['max_depth']),
        min_samples_split=int(best_params_rf_plec_P4['min_samples_split']),
        min_samples_leaf=int(best_params_rf_plec_P4['min_samples_leaf']),
        max_features=best_params_rf_plec_P4['max_features'],
        random_state=i
    )
    
    # Fit the model
    rf_plec.fit(X_plec_train, y_plec_train)
    
    #Test the RF model on the test molecules:
    prediction_test_rf_plec_class_P1_hard = rf_plec.predict(X_test_randomdecoys_plec_hard)
    prediction_test_rf_plec_prob_P1_hard = rf_plec.predict_proba(X_test_randomdecoys_plec_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_rf_P1_hard = pd.DataFrame({"Active_Prob": prediction_test_rf_plec_prob_P1_hard[:, 1],
                                   "Inactive_Prob": prediction_test_rf_plec_prob_P1_hard[:, 0],
                                   "Predicted_Class": prediction_test_rf_plec_class_P1_hard,
                                   "Real_Class": y_test_randomdecoys_plec_hard})
    #plec_result_rf_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/plec_result_rf_P1_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_rf_P1_hard['Real_Class'], plec_result_rf_P1_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_rf_P1_hard.append(pr_auc)

    #Test the RF model on the test molecules:
    prediction_test_rf_plec_class_P2_hard = rf_plec.predict(X_test_deepcoy_plec_hard)
    prediction_test_rf_plec_prob_P2_hard = rf_plec.predict_proba(X_test_deepcoy_plec_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_rf_P2_hard = pd.DataFrame({"Active_Prob": prediction_test_rf_plec_prob_P2_hard[:, 1],
                                   "Inactive_Prob": prediction_test_rf_plec_prob_P2_hard[:, 0],
                                   "Predicted_Class": prediction_test_rf_plec_class_P2_hard,
                                   "Real_Class": y_test_deepcoy_plec_hard})
    #plec_result_rf_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/plec_result_rf_P2_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_rf_P2_hard['Real_Class'], plec_result_rf_P2_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_rf_P2_hard.append(pr_auc)
    #print(f'PR-AUC on test set: {pr_auc:.2f}')
print(PR_AUCs_plec_rf_P1_hard)
print(PR_AUCs_plec_rf_P2_hard)

[0.852, 0.8657, 0.8441, 0.8118, 0.8633, 0.8612, 0.8348, 0.8493, 0.8556, 0.853]
[0.9075, 0.9266, 0.8846, 0.8809, 0.8843, 0.9075, 0.8984, 0.8991, 0.8904, 0.8833]


# **XGBoost**

In [28]:
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the XGBoost classifier with the given parameters
    xgb_plec = XGBClassifier(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
        min_child_weight=int(params['min_child_weight']),
        gamma=params['gamma'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        random_state=42,  # Set random state for reproducibility
        n_jobs=40  # Use 40 CPU cores for XGBClassifier
    )

    # Perform 5-fold cross-validation and calculate PR-AUC
    cv_scores = cross_val_score(xgb_plec, X_plec_train, y_plec_train, cv=5, scoring='average_precision')
    mean_pr_auc = np.mean(cv_scores)
    
    # Return the negative mean PR-AUC as the loss
    return {'loss': -mean_pr_auc, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 15, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'gamma': hp.uniform('gamma', 0, 5),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
}

# Run the optimization
trials = Trials()
best_params_xgb_plec_P4 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)  # Set random state for reproducibility
)
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_xgb_plec_P4.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_xgb_plec_P4, file)
with open(file_path, 'rb') as file:
    best_params_xgb_plec_P4 = pickle.load(file)

# Print the best hyperparameters found
print('Best hyperparameters:', best_params_xgb_plec_P4)

100%|██████████| 50/50 [2:02:42<00:00, 147.24s/trial, best loss: -0.9841922746191948]  
Best hyperparameters: {'colsample_bytree': 0.5884426219884694, 'gamma': 0.26525014628744187, 'learning_rate': 0.18103574581049287, 'max_depth': 9.0, 'min_child_weight': 3.0, 'n_estimators': 160.0, 'subsample': 0.7592767726533658}


# **save the best hyperparameters found**

In [105]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_xgb_plec_P4.pkl")
with open(file_path, 'rb') as file:
    best_params_xgb_plec_P4 = pickle.load(file)

# **train the XGB ten times and save th results**

In [108]:
# Train the final model with the best hyperparameters
PR_AUCs_plec_xgb_P1_hard = []
PR_AUCs_plec_xgb_P2_hard = []
for i in range(1,11):
    xgb_plec = XGBClassifier(
        n_estimators=int(best_params_xgb_plec_P4['n_estimators']),
        max_depth=int(best_params_xgb_plec_P4['max_depth']),
        learning_rate=best_params_xgb_plec_P4['learning_rate'],
        min_child_weight=int(best_params_xgb_plec_P4['min_child_weight']),
        gamma=best_params_xgb_plec_P4['gamma'],
        subsample=best_params_xgb_plec_P4['subsample'],
        colsample_bytree=best_params_xgb_plec_P4['colsample_bytree'],
        random_state=i,
        n_jobs=40  # Use 40 CPU cores for XGBClassifier
)
    
    # Fit the model
    xgb_plec.fit(X_plec_train, y_plec_train)
    
    #Test the RF model on the test molecules:
    prediction_test_xgb_plec_class_P1_hard = xgb_plec.predict(X_test_randomdecoys_plec_hard)
    prediction_test_xgb_plec_prob_P1_hard = xgb_plec.predict_proba(X_test_randomdecoys_plec_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_xgb_P1_hard = pd.DataFrame({"Active_Prob": prediction_test_xgb_plec_prob_P1_hard[:, 1],
                                   "Inactive_Prob": prediction_test_xgb_plec_prob_P1_hard[:, 0],
                                   "Predicted_Class": prediction_test_xgb_plec_class_P1_hard,
                                   "Real_Class": y_test_randomdecoys_plec_hard})
    plec_result_xgb_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/plec_result_xgb_P1_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_xgb_P1_hard['Real_Class'], plec_result_xgb_P1_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_xgb_P1_hard.append(pr_auc)

    #Test the RF model on the test molecules:
    prediction_test_xgb_plec_class_P2_hard = xgb_plec.predict(X_test_deepcoy_plec_hard)
    prediction_test_xgb_plec_prob_P2_hard = xgb_plec.predict_proba(X_test_deepcoy_plec_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_xgb_P2_hard = pd.DataFrame({"Active_Prob": prediction_test_xgb_plec_prob_P2_hard[:, 1],
                                   "Inactive_Prob": prediction_test_xgb_plec_prob_P2_hard[:, 0],
                                   "Predicted_Class": prediction_test_xgb_plec_class_P2_hard,
                                   "Real_Class": y_test_deepcoy_plec_hard})
    plec_result_xgb_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/plec_result_xgb_P2_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_xgb_P2_hard['Real_Class'], plec_result_xgb_P2_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_xgb_P2_hard.append(pr_auc)
    #print(f'PR-AUC on test set: {pr_auc:.2f}')
print(PR_AUCs_plec_xgb_P1_hard)
print(PR_AUCs_plec_xgb_P2_hard)

[0.8264, 0.7668, 0.795, 0.7631, 0.761, 0.7349, 0.8279, 0.7299, 0.7789, 0.7418]
[0.9231, 0.899, 0.8861, 0.8841, 0.8789, 0.8935, 0.8998, 0.8931, 0.8979, 0.8924]


In [67]:
plec_result_xgb_P2

Unnamed: 0,Active_Prob,Inactive_Prob,Predicted_Class,Real_Class
284,0.061193,0.938807,0,1
189,0.999767,0.000233,1,1
58,0.768013,0.231987,1,1
219,0.998507,0.001493,1,1
151,0.999819,0.000181,1,1
...,...,...,...,...
27645,0.000222,0.999778,0,0
27646,0.000049,0.999951,0,0
27647,0.000037,0.999963,0,0
27648,0.000048,0.999952,0,0


# **ANN**

In [None]:
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the ANN classifier with the given parameters
    ann_plec = MLPClassifier(
        hidden_layer_sizes=params['hidden_layer_sizes'],
        activation=params['activation'],
        solver=params['solver'],
        alpha=params['alpha'],
        learning_rate=params['learning_rate'],
        random_state=42
    )

    # Perform 5-fold cross-validation and calculate accuracy
    cv_scores = cross_val_score(ann_plec, X_plec_train, y_plec_train, cv=5, scoring='average_precision', n_jobs=40)
    mean_pr_auc = np.mean(cv_scores)
    
    # Return the negative mean accuracy as the loss
    return {'loss': -mean_pr_auc, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'hidden_layer_sizes': hp.choice('hidden_layer_sizes', [(50,), (100,), (50, 50), (100, 100)]),
    'activation': hp.choice('activation', ['relu', 'tanh', 'logistic']),
    'solver': hp.choice('solver', ['adam', 'sgd', 'lbfgs']),
    'alpha': hp.loguniform('alpha', -5, -1),  # L2 penalty parameter (e.g., from 0.00001 to 0.1)
    'learning_rate': hp.choice('learning_rate', ['constant', 'adaptive', 'invscaling'])
}

# Run the optimization
trials = Trials()
best_params_ann_plec_P4 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)  # Set random state for reproducibility
)

# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_ann_plec_P4.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_ann_plec_P4, file)
with open(file_path, 'rb') as file:
    best_params_ann_plec_P4 = pickle.load(file)
    
# Print the best hyperparameters found
print('Best hyperparameters:', best_params_ann_plec_P4)

# **save the best hyperparameters found**

In [109]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_ann_plec_P4.pkl")
with open(file_path, 'rb') as file:
    best_params_ann_plec_P4 = pickle.load(file)

In [110]:
# Convert the hyperparameter choices to the proper format
if best_params_ann_plec_P4['hidden_layer_sizes'] == 0:
    best_params_ann_plec_P4['hidden_layer_sizes'] = (50,)
elif best_params_ann_plec_P4['hidden_layer_sizes'] == 1:
    best_params_ann_plec_P4['hidden_layer_sizes'] = (100,)
elif best_params_ann_plec_P4['hidden_layer_sizes'] == 2:
    best_params_ann_plec_P4['hidden_layer_sizes'] = (50,50)
else:
    best_params_ann_plec_P4['hidden_layer_sizes']=(100,100)


if best_params_ann_plec_P4['activation'] == 0:
    best_params_ann_plec_P4['activation'] = 'relu'
elif best_params_ann_plec_P4['activation'] == 1:
    best_params_ann_plec_P4['activation'] = 'tanh'
else:
    best_params_ann_plec_P4['activation'] = 'logistic'



# Convert the hyperparameter choices to the proper format
if best_params_ann_plec_P4['learning_rate'] == 0:
    best_params_ann_plec_P4['learning_rate'] = 'constant'
elif best_params_ann_plec_P4['learning_rate'] == 1:
    best_params_ann_plec_P4['learning_rate'] = 'adaptive'
else:
    best_params_ann_plec_P4['learning_rate'] = 'invscaling'


# Convert the hyperparameter choices to the proper format
if best_params_ann_plec_P4['solver'] == 0:
    best_params_ann_plec_P4['solver'] = 'adam'
elif best_params_ann_plec_P4['solver'] == 1:
    best_params_ann_plec_P4['solver'] = 'sgd'
else:
    best_params_ann_plec_P4['solver'] = 'lbfgs'

# **train the ANN ten times and save th results**

In [112]:
# Train the final model with the best hyperparameters
PR_AUCs_plec_ann_P1_hard = []
PR_AUCs_plec_ann_P2_hard = []
for i in range(1,11):
    ann_plec = MLPClassifier(
    hidden_layer_sizes=best_params_ann_plec_P4['hidden_layer_sizes'],
    activation=best_params_ann_plec_P4['activation'],
    solver=best_params_ann_plec_P4['solver'],
    alpha=best_params_ann_plec_P4['alpha'],
    learning_rate=best_params_ann_plec_P4['learning_rate'],
    random_state=i)
    
    # Fit the model
    ann_plec.fit(X_plec_train, y_plec_train)
    
    #Test the RF model on the test molecules:
    prediction_test_ann_plec_class_P1_hard = ann_plec.predict(X_test_randomdecoys_plec_hard)
    prediction_test_ann_plec_prob_P1_hard = ann_plec.predict_proba(X_test_randomdecoys_plec_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_ann_P1_hard = pd.DataFrame({"Active_Prob": prediction_test_ann_plec_prob_P1_hard[:, 1],
                                   "Inactive_Prob": prediction_test_ann_plec_prob_P1_hard[:, 0],
                                   "Predicted_Class": prediction_test_ann_plec_class_P1_hard,
                                   "Real_Class": y_test_randomdecoys_plec_hard})
    plec_result_ann_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/plec_result_ann_P1_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_ann_P1_hard['Real_Class'], plec_result_ann_P1_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_ann_P1_hard.append(pr_auc)

    #Test the RF model on the test molecules:
    prediction_test_ann_plec_class_P2_hard = ann_plec.predict(X_test_deepcoy_plec_hard)
    prediction_test_ann_plec_prob_P2_hard = ann_plec.predict_proba(X_test_deepcoy_plec_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_ann_P2_hard = pd.DataFrame({"Active_Prob": prediction_test_ann_plec_prob_P2_hard[:, 1],
                                   "Inactive_Prob": prediction_test_ann_plec_prob_P2_hard[:, 0],
                                   "Predicted_Class": prediction_test_ann_plec_class_P2_hard,
                                   "Real_Class": y_test_deepcoy_plec_hard})
    plec_result_ann_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/plec_result_ann_P2_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_ann_P2_hard['Real_Class'], plec_result_ann_P2_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_ann_P2_hard.append(pr_auc)
    #print(f'PR-AUC on test set: {pr_auc:.2f}')

print(PR_AUCs_plec_ann_P1_hard)
print(PR_AUCs_plec_ann_P2_hard)

[0.747, 0.7933, 0.6737, 0.8574, 0.8483, 0.8252, 0.8764, 0.8691, 0.8852, 0.939]
[0.9822, 0.9518, 0.9509, 0.973, 0.9962, 0.9631, 0.9577, 0.9923, 0.9633, 1.0]


# **Training with GRID features**

# **RandomForest**

In [None]:
# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the random forest classifier with the given parameters
    rf_grid = RandomForestClassifier(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_split=int(params['min_samples_split']),
        min_samples_leaf=int(params['min_samples_leaf']),
        max_features=params['max_features'],
        random_state=60, # Set random state for reproducibility
        n_jobs=40     
    )

    # Perform 5-fold cross-validation and calculate PR-AUC
    cv_scores = cross_val_score(rf_grid, X_grid_train, y_grid_train, cv=5, scoring='average_precision')
    mean_pr_auc = np.mean(cv_scores)
    
    # Return the negative mean PR-AUC as the loss
    return {'loss': -mean_pr_auc, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 20, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])
}

# Run the optimization
trials = Trials()
best_params_rf_grid_P4 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Adjust the number of evaluations as needed
    trials=trials,
    rstate=np.random.default_rng(60) # Set random state for reproducibility

)
    
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_rf_grid_P4.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_rf_grid_P4, file)
with open(file_path, 'rb') as file:
    best_params_rf_grid_P4 = pickle.load(file)
    

# Print the best hyperparameters found
print('Best hyperparameters:', best_params_rf_grid_P4)

In [113]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_rf_grid_P4.pkl")
with open(file_path, 'rb') as file:
    best_params_rf_grid_P4 = pickle.load(file)

In [114]:
# Convert the hyperparameter choices to the proper format
if best_params_rf_grid_P4['max_features'] == 0:
    best_params_rf_grid_P4['max_features'] = 'sqrt'
elif best_params_rf_grid_P4['max_features'] == 1:
    best_params_rf_grid_P4['max_features'] = 'log2'
else:
    best_params_rf_grid_P4['max_features'] = None

In [131]:
# Train the final model with the best hyperparameters
PR_AUCs_grid_rf_P1_hard = []
PR_AUCs_grid_rf_P2_hard = []
for i in range(1,11):
    rf_grid = RandomForestClassifier(
        n_estimators=int(best_params_rf_grid_P4['n_estimators']),
        max_depth=int(best_params_rf_grid_P4['max_depth']),
        min_samples_split=int(best_params_rf_grid_P4['min_samples_split']),
        min_samples_leaf=int(best_params_rf_grid_P4['min_samples_leaf']),
        max_features=best_params_rf_grid_P4['max_features'],
        random_state=i
    )
    
    # Fit the model
    rf_grid.fit(X_grid_train, y_grid_train)
    
    #Test the RF model on the test molecules:
    prediction_test_rf_grid_class_P1_hard = rf_grid.predict(X_test_randomdecoys_grid_hard)
    prediction_test_rf_grid_prob_P1_hard = rf_grid.predict_proba(X_test_randomdecoys_grid_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_rf_P1_hard = pd.DataFrame({"Active_Prob": prediction_test_rf_grid_prob_P1_hard[:, 1],
                                   "Inactive_Prob": prediction_test_rf_grid_prob_P1_hard[:, 0],
                                   "Predicted_Class": prediction_test_rf_grid_class_P1_hard,
                                   "Real_Class": y_test_randomdecoys_grid_hard})
    grid_result_rf_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/grid_result_rf_P1_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_rf_P1_hard['Real_Class'], grid_result_rf_P1_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_rf_P1_hard.append(pr_auc)

    #Test the RF model on the test molecules:
    prediction_test_rf_grid_class_P2_hard = rf_grid.predict(X_test_deepcoy_grid_hard)
    prediction_test_rf_grid_prob_P2_hard = rf_grid.predict_proba(X_test_deepcoy_grid_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_rf_P2_hard = pd.DataFrame({"Active_Prob": prediction_test_rf_grid_prob_P2_hard[:, 1],
                                   "Inactive_Prob": prediction_test_rf_grid_prob_P2_hard[:, 0],
                                   "Predicted_Class": prediction_test_rf_grid_class_P2_hard,
                                   "Real_Class": y_test_deepcoy_grid_hard})
    grid_result_rf_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/grid_result_rf_P2_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_rf_P2_hard['Real_Class'], grid_result_rf_P2_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_rf_P2_hard.append(pr_auc)
    #print(f'PR-AUC on test set: {pr_auc:.2f}')
print(PR_AUCs_grid_rf_P1_hard)
print(PR_AUCs_grid_rf_P2_hard)

[0.7499, 0.7704, 0.7837, 0.7798, 0.7226, 0.7563, 0.7513, 0.7546, 0.7715, 0.7786]
[0.8284, 0.8636, 0.8603, 0.8614, 0.7865, 0.8311, 0.8035, 0.8287, 0.8612, 0.8544]


# **XGBoost**

In [None]:
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the XGBoost classifier with the given parameters
    xgb_grid = XGBClassifier(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
        min_child_weight=int(params['min_child_weight']),
        gamma=params['gamma'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        random_state=42,  # Set random state for reproducibility
        n_jobs=40  # Use 40 CPU cores for XGBClassifier
    )

    # Perform 5-fold cross-validation and calculate PR-AUC
    cv_scores = cross_val_score(xgb_grid, X_grid_train, y_grid_train, cv=5, scoring='average_precision')
    mean_pr_auc = np.mean(cv_scores)
    
    # Return the negative mean PR-AUC as the loss
    return {'loss': -mean_pr_auc, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 15, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'gamma': hp.uniform('gamma', 0, 5),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
}

# Run the optimization
trials = Trials()
best_params_xgb_grid_P4 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)  # Set random state for reproducibility
)
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_xgb_grid_P4.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_xgb_grid_P4, file)
with open(file_path, 'rb') as file:
    best_params_xgb_grid_P4 = pickle.load(file)

# Print the best hyperparameters found
print('Best hyperparameters:', best_params_xgb_grid_P4)

In [132]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_xgb_grid_P4.pkl")
with open(file_path, 'rb') as file:
    best_params_xgb_grid_P4 = pickle.load(file)

In [133]:
# Train the final model with the best hyperparameters
PR_AUCs_grid_xgb_P1_hard = []
PR_AUCs_grid_xgb_P2_hard = []
for i in range(1,11):
    xgb_grid = XGBClassifier(
        n_estimators=int(best_params_xgb_grid_P4['n_estimators']),
        max_depth=int(best_params_xgb_grid_P4['max_depth']),
        learning_rate=best_params_xgb_grid_P4['learning_rate'],
        min_child_weight=int(best_params_xgb_grid_P4['min_child_weight']),
        gamma=best_params_xgb_grid_P4['gamma'],
        subsample=best_params_xgb_grid_P4['subsample'],
        colsample_bytree=best_params_xgb_grid_P4['colsample_bytree'],
        random_state=i,
        n_jobs=40  # Use 40 CPU cores for XGBClassifier
)
    
    # Fit the model
    xgb_grid.fit(X_grid_train, y_grid_train)
    
    #Test the RF model on the test molecules:
    prediction_test_xgb_grid_class_P1_hard = xgb_grid.predict(X_test_randomdecoys_grid_hard)
    prediction_test_xgb_grid_prob_P1_hard = xgb_grid.predict_proba(X_test_randomdecoys_grid_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_xgb_P1_hard = pd.DataFrame({"Active_Prob": prediction_test_xgb_grid_prob_P1_hard[:, 1],
                                   "Inactive_Prob": prediction_test_xgb_grid_prob_P1_hard[:, 0],
                                   "Predicted_Class": prediction_test_xgb_grid_class_P1_hard,
                                   "Real_Class": y_test_randomdecoys_grid_hard})
    grid_result_xgb_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/grid_result_xgb_P1_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_xgb_P1_hard['Real_Class'], grid_result_xgb_P1_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_xgb_P1_hard.append(pr_auc)

    #Test the RF model on the test molecules:
    prediction_test_xgb_grid_class_P2_hard = xgb_grid.predict(X_test_deepcoy_grid_hard)
    prediction_test_xgb_grid_prob_P2_hard = xgb_grid.predict_proba(X_test_deepcoy_grid_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_xgb_P2_hard = pd.DataFrame({"Active_Prob": prediction_test_xgb_grid_prob_P2_hard[:, 1],
                                   "Inactive_Prob": prediction_test_xgb_grid_prob_P2_hard[:, 0],
                                   "Predicted_Class": prediction_test_xgb_grid_class_P2_hard,
                                   "Real_Class": y_test_deepcoy_grid_hard})
    grid_result_xgb_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/grid_result_xgb_P2_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_xgb_P2_hard['Real_Class'], grid_result_xgb_P2_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_xgb_P2_hard.append(pr_auc)
    #print(f'PR-AUC on test set: {pr_auc:.2f}')
print(PR_AUCs_grid_xgb_P1_hard)
print(PR_AUCs_grid_xgb_P2_hard)

[0.8844, 0.8329, 0.8202, 0.8522, 0.8808, 0.8671, 0.8603, 0.8746, 0.8914, 0.8673]
[0.9493, 0.894, 0.8796, 0.9057, 0.9303, 0.9174, 0.9182, 0.9465, 0.914, 0.9178]


# **ANN**

In [None]:
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the ANN classifier with the given parameters
    ann_grid = MLPClassifier(
        hidden_layer_sizes=params['hidden_layer_sizes'],
        activation=params['activation'],
        solver=params['solver'],
        alpha=params['alpha'],
        learning_rate=params['learning_rate'],
        random_state=42
    )

    # Perform 5-fold cross-validation and calculate accuracy
    cv_scores = cross_val_score(ann_grid, X_grid_train, y_grid_train, cv=5, scoring='average_precision', n_jobs=40)
    mean_pr_auc = np.mean(cv_scores)
    
    # Return the negative mean accuracy as the loss
    return {'loss': -mean_pr_auc, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'hidden_layer_sizes': hp.choice('hidden_layer_sizes', [(50,), (100,), (50, 50), (100, 100)]),
    'activation': hp.choice('activation', ['relu', 'tanh', 'logistic']),
    'solver': hp.choice('solver', ['adam', 'sgd', 'lbfgs']),
    'alpha': hp.loguniform('alpha', -5, -1),  # L2 penalty parameter (e.g., from 0.00001 to 0.1)
    'learning_rate': hp.choice('learning_rate', ['constant', 'adaptive', 'invscaling'])
}

# Run the optimization
trials = Trials()
best_params_ann_grid_P4 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)  # Set random state for reproducibility
)

# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_ann_grid_P4.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_ann_grid_P4, file)
with open(file_path, 'rb') as file:
    best_params_ann_grid_P4 = pickle.load(file)
    
# Print the best hyperparameters found
print('Best hyperparameters:', best_params_ann_grid_P4)

In [134]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_classi/best_params_ann_grid_P4.pkl")
with open(file_path, 'rb') as file:
    best_params_ann_grid_P4 = pickle.load(file)

In [135]:
# Convert the hyperparameter choices to the proper format
if best_params_ann_grid_P4['hidden_layer_sizes'] == 0:
    best_params_ann_grid_P4['hidden_layer_sizes'] = (50,)
elif best_params_ann_grid_P4['hidden_layer_sizes'] == 1:
    best_params_ann_grid_P4['hidden_layer_sizes'] = (100,)
elif best_params_ann_grid_P4['hidden_layer_sizes'] == 2:
    best_params_ann_grid_P4['hidden_layer_sizes'] = (50,50)
else:
    best_params_ann_grid_P4['hidden_layer_sizes']=(100,100)


if best_params_ann_grid_P4['activation'] == 0:
    best_params_ann_grid_P4['activation'] = 'relu'
elif best_params_ann_grid_P4['activation'] == 1:
    best_params_ann_grid_P4['activation'] = 'tanh'
else:
    best_params_ann_grid_P4['activation'] = 'logistic'



# Convert the hyperparameter choices to the proper format
if best_params_ann_grid_P4['learning_rate'] == 0:
    best_params_ann_grid_P4['learning_rate'] = 'constant'
elif best_params_ann_grid_P4['learning_rate'] == 1:
    best_params_ann_grid_P4['learning_rate'] = 'adaptive'
else:
    best_params_ann_grid_P4['learning_rate'] = 'invscaling'


# Convert the hyperparameter choices to the proper format
if best_params_ann_grid_P4['solver'] == 0:
    best_params_ann_grid_P4['solver'] = 'adam'
elif best_params_ann_grid_P4['solver'] == 1:
    best_params_ann_grid_P4['solver'] = 'sgd'
else:
    best_params_ann_grid_P4['solver'] = 'lbfgs'

In [136]:
# Train the final model with the best hyperparameters
PR_AUCs_grid_ann_P1_hard = []
PR_AUCs_grid_ann_P2_hard = []
for i in range(1,11):
    ann_grid = MLPClassifier(
    hidden_layer_sizes=best_params_ann_grid_P4['hidden_layer_sizes'],
    activation=best_params_ann_grid_P4['activation'],
    solver=best_params_ann_grid_P4['solver'],
    alpha=best_params_ann_grid_P4['alpha'],
    learning_rate=best_params_ann_grid_P4['learning_rate'],
    random_state=i)
    
    # Fit the model
    ann_grid.fit(X_grid_train, y_grid_train)
    
    #Test the RF model on the test molecules:
    prediction_test_ann_grid_class_P1_hard = ann_grid.predict(X_test_randomdecoys_grid_hard)
    prediction_test_ann_grid_prob_P1_hard = ann_grid.predict_proba(X_test_randomdecoys_grid_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_ann_P1_hard = pd.DataFrame({"Active_Prob": prediction_test_ann_grid_prob_P1_hard[:, 1],
                                   "Inactive_Prob": prediction_test_ann_grid_prob_P1_hard[:, 0],
                                   "Predicted_Class": prediction_test_ann_grid_class_P1_hard,
                                   "Real_Class": y_test_randomdecoys_grid_hard})
    grid_result_ann_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/grid_result_ann_P1_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_ann_P1_hard['Real_Class'], grid_result_ann_P1_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_ann_P1_hard.append(pr_auc)

    #Test the RF model on the test molecules:
    prediction_test_ann_grid_class_P2_hard = ann_grid.predict(X_test_deepcoy_grid_hard)
    prediction_test_ann_grid_prob_P2_hard = ann_grid.predict_proba(X_test_deepcoy_grid_hard)
    
    
    #Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_ann_P2_hard = pd.DataFrame({"Active_Prob": prediction_test_ann_grid_prob_P2_hard[:, 1],
                                   "Inactive_Prob": prediction_test_ann_grid_prob_P2_hard[:, 0],
                                   "Predicted_Class": prediction_test_ann_grid_class_P2_hard,
                                   "Real_Class": y_test_deepcoy_grid_hard})
    grid_result_ann_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_classi/grid_result_ann_P2_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_ann_P2_hard['Real_Class'], grid_result_ann_P2_hard['Active_Prob'])
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_ann_P2_hard.append(pr_auc)
    #print(f'PR-AUC on test set: {pr_auc:.2f}')

print(PR_AUCs_grid_ann_P1_hard)
print(PR_AUCs_grid_ann_P2_hard)

[0.7822, 0.6906, 0.6821, 0.7293, 0.7216, 0.7071, 0.7205, 0.7083, 0.762, 0.7402]
[0.9028, 0.843, 0.825, 0.8615, 0.8674, 0.8385, 0.8826, 0.8785, 0.863, 0.868]


# Generate train set with true actives and DeepCoy Decoys

In [42]:
plec_train_deepcoy = pd.concat([plec_train_true_actives,plec_train_deepcoy_decoys])
grid_train_deepcoy = pd.concat([grid_train_true_actives,grid_train_deepcoy_decoys])

# **Train and test the RF , XGB and ANN with above codes**