# **Load Libraries**

In [None]:
import os
import numpy as np
import pandas as pd
import oddt
from oddt.fingerprints import PLEC
from scipy import stats
from sklearn import preprocessing
import pickle
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import matthews_corrcoef, precision_recall_curve, accuracy_score, auc
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.utils import parallel_backend
from xgboost.sklearn import XGBClassifier
from rdkit import Chem
from rdkit.Chem import AllChem
import deepchem as dc
from deepchem.utils import download_url, load_from_disk
from deepchem.utils.vina_utils import prepare_inputs
from deepchem.models import AtomicConvModel
from deepchem.feat import RdkitGridFeaturizer
from joblib import Parallel, delayed
from tqdm import tqdm
import glob
import tempfile

# **Load Data**

In [3]:
# training set true actives
plec_train_true_actives = pd.read_csv('Path_to_csv')
grid_train_true_actives = pd.read_csv('Path_to_csv')


# test sets true actives
plec_test_true_actives = pd.read_csv('Path_to_csv')
grid_test_true_actives = pd.read_csv('Path_to_csv')

# **Load Decoys**

In [None]:
# training set random_decoys
plec_train_random_decoys = pd.read_csv('Path_to_csv')
grid_train_random_decoys = pd.read_csv('Path_to_csv')


# test sets random_decoys
plec_test_random_decoys = pd.read_csv('Path_to_csv')
grid_test_random_decoys = pd.read_csv('Path_to_csv')



# training set deepcoy decoys
plec_train_deepcoy_decoys = pd.read_csv('Path_to_csv')
grid_train_deepcoy_decoys = pd.read_csv('Path_to_csv')


# test sets deepcoy decoys
plec_test_deepcoy_decoys = pd.read_csv('Path_to_csv')
grid_test_deepcoy_decoys = pd.read_csv('Path_to_csv')

# Generate train set with true actives and Random Decoys

In [42]:
plec_train_random = pd.concat([plec_train_true_actives,plec_train_random_decoys])
grid_train_random = pd.concat([grid_train_true_actives,grid_train_random_decoys])

# Generate test set true actives and deepcoys decoys

In [45]:
# test set with deepcoys
test_deepcoy_plec = pd.concat([plec_test_true_actives,plec_test_deepcoy_decoys])
test_deepcoy_grid = pd.concat([grid_test_true_actives,grid_test_deepcoy_decoys])

# Generate test set with true actives and random decoys

In [51]:
# test set with randomdecoyss
test_randomdecoys_plec = pd.concat([plec_test_true_actives,plec_test_randomdecoys])
test_randomdecoys_grid = pd.concat([grid_test_true_actives,grid_test_randomdecoys])

In [54]:
# train
X_plec_train, y_plec_train = plec_train_random.drop(['class', 'potency','index'], axis= 1), plec_train_random['potency']
X_grid_train, y_grid_train = grid_train_random.drop(['class', 'potency'], axis= 1), grid_train_random['potency']


# test set deepcoys
X_test_deepcoy_plec, y_test_deepcoy_plec = test_deepcoy_plec.drop(['class', 'potency','index'], axis= 1), test_deepcoy_plec['potency']
X_test_deepcoy_grid, y_test_deepcoy_grid = test_deepcoy_grid.drop(['class', 'potency'], axis= 1), test_deepcoy_grid['potency']

# test set randomdecoys
X_test_randomdecoys_plec, y_test_randomdecoys_plec = test_randomdecoys_plec.drop(['class', 'potency','index'], axis= 1), test_randomdecoys_plec['potency']
X_test_randomdecoys_grid, y_test_randomdecoys_grid = test_randomdecoys_grid.drop(['class', 'potency','index'], axis= 1), test_randomdecoys_grid['potency']

# **Training with PLEC features**

# **RandomForest**

In [68]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import pickle

# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the random forest regressor with the given parameters
    rf_plec = RandomForestRegressor(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_split=int(params['min_samples_split']),
        min_samples_leaf=int(params['min_samples_leaf']),
        max_features=params['max_features'],
        random_state=60, # Set random state for reproducibility
        n_jobs=40     
    )

    # Perform 5-fold cross-validation and calculate mean squared error (MSE)
    cv_scores = cross_val_score(rf_plec, X_plec_train, y_plec_train, cv=5, scoring='neg_mean_squared_error')
    mean_mse = np.mean(cv_scores)
    
    # Return the negative mean squared error as the loss
    return {'loss': mean_mse, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 20, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])
}

# Run the optimization
trials = Trials()
best_params_rf_plec_reg_P1 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Adjust the number of evaluations as needed
    trials=trials,
    rstate=np.random.default_rng(60) # Set random state for reproducibility
)
    
# Save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_rf_plec_reg_P1.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_rf_plec_reg_P1, file)

# Print the best hyperparameters found
print('Best hyperparameters:', best_params_rf_plec_reg_P1)

100%|██████████| 50/50 [02:46<00:00,  3.34s/trial, best loss: -3.5529274704296454]
Best hyperparameters: {'max_depth': 3.0, 'max_features': 1, 'min_samples_leaf': 1.0, 'min_samples_split': 8.0, 'n_estimators': 270.0}


# **save the best hyperparameters found**

In [81]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_rf_plec_reg_P1.pkl")
with open(file_path, 'rb') as file:
    best_params_rf_plec_reg_P1 = pickle.load(file)

In [82]:
# Convert the hyperparameter choices to the proper format
if best_params_rf_plec_reg_P1['max_features'] == 0:
    best_params_rf_plec_reg_P1['max_features'] = 'sqrt'
elif best_params_rf_plec_reg_P1['max_features'] == 1:
    best_params_rf_plec_reg_P1['max_features'] = 'log2'
else:
    best_params_rf_plec_reg_P1['max_features'] = None

# **train the RF ten times and save the results**

In [83]:
# Train the final model with the best hyperparameters
PR_AUCs_plec_rf_P1_hard = []
PR_AUCs_plec_rf_P2_hard = []
for i in range(1,11):
    rf_plec = RandomForestRegressor(
        n_estimators=int(best_params_rf_plec_reg_P1['n_estimators']),
        max_depth=int(best_params_rf_plec_reg_P1['max_depth']),
        min_samples_split=int(best_params_rf_plec_reg_P1['min_samples_split']),
        min_samples_leaf=int(best_params_rf_plec_reg_P1['min_samples_leaf']),
        max_features=best_params_rf_plec_reg_P1['max_features'],
        random_state=i
    )
    
    # Fit the model
    rf_plec.fit(X_plec_train, y_plec_train)
    
    #Test the RF model on the test molecules:
    prediction_test_rf_plec_score_P1_hard = rf_plec.predict(X_test_randomdecoys_plec_hard)
  
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_rf_P1_hard = pd.DataFrame({"Predicted_Score": prediction_test_rf_plec_score_P1_hard,
                                   "Real_Score": y_test_randomdecoys_plec_hard})
    plec_result_rf_P1_hard['Predicted_Class'] = plec_result_rf_P1_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_rf_P1_hard['Real_Class'] = plec_result_rf_P1_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_rf_P1_hard['normalized_scores'] = (plec_result_rf_P1_hard['Predicted_Score'] - plec_result_rf_P1_hard['Predicted_Score'].min())/(plec_result_rf_P1_hard['Predicted_Score'].max() - plec_result_rf_P1_hard['Predicted_Score'].min())
    plec_result_rf_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/plec_result_rf_reg_P1_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_rf_P1_hard['Real_Class'], plec_result_rf_P1_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_rf_P1_hard.append(pr_auc)
    
    
    #Test the RF model on the test molecules:
    prediction_test_rf_plec_score_P2_hard = rf_plec.predict(X_test_deepcoy_plec_hard)
 
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_rf_P2_hard = pd.DataFrame({"Predicted_Score": prediction_test_rf_plec_score_P2_hard,
                                   "Real_Score": y_test_deepcoy_plec_hard})
    plec_result_rf_P2_hard['Predicted_Class'] = plec_result_rf_P2_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_rf_P2_hard['Real_Class'] = plec_result_rf_P2_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_rf_P2_hard['normalized_scores'] = (plec_result_rf_P2_hard['Predicted_Score'] - plec_result_rf_P2_hard['Predicted_Score'].min())/(plec_result_rf_P2_hard['Predicted_Score'].max() - plec_result_rf_P2_hard['Predicted_Score'].min())
    plec_result_rf_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/plec_result_rf_reg_P2_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_rf_P2_hard['Real_Class'], plec_result_rf_P2_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_rf_P2_hard.append(pr_auc)
    
    
print(PR_AUCs_plec_rf_P1_hard)
print(PR_AUCs_plec_rf_P2_hard)

[0.5395, 0.5527, 0.5643, 0.5461, 0.5433, 0.5757, 0.5427, 0.5488, 0.5489, 0.5459]
[0.6281, 0.6267, 0.6548, 0.6656, 0.5944, 0.6516, 0.6022, 0.6164, 0.6147, 0.5953]


# **XGBoost**

In [80]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor  
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import pickle

# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the XGBoost classifier with the given parameters
    xgb_plec = XGBRegressor(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
        min_child_weight=int(params['min_child_weight']),
        gamma=params['gamma'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        random_state=42,  # Set random state for reproducibility
        n_jobs=40  # Use 40 CPU cores for XGBClassifier
    )

    # Perform 5-fold cross-validation and calculate mean squared error (MSE)
    cv_scores = cross_val_score(xgb_plec, X_plec_train, y_plec_train, cv=5, scoring='neg_mean_squared_error')
    mean_mse = np.mean(cv_scores)
    
    # Return the negative mean squared error as the loss
    return {'loss': mean_mse, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 15, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'gamma': hp.uniform('gamma', 0, 5),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
}

# Run the optimization
trials = Trials()
best_params_xgb_plec_reg_P1 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Adjust the number of evaluations as needed
    trials=trials,
    rstate=np.random.default_rng(60) # Set random state for reproducibility
)
    
# Save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_xgb_plec_reg_P1.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_xgb_plec_reg_P1, file)

# Print the best hyperparameters found
print('Best hyperparameters:', best_params_xgb_plec_reg_P1)

100%|██████████| 50/50 [16:13<00:00, 19.48s/trial, best loss: -2.6320631116598494]
Best hyperparameters: {'colsample_bytree': 0.7308229288393999, 'gamma': 1.964045543005358, 'learning_rate': 0.010393413439404125, 'max_depth': 7.0, 'min_child_weight': 8.0, 'n_estimators': 150.0, 'subsample': 0.7140723940024111}


# **save the best hyperparameters found**

In [84]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_xgb_plec_reg_P1.pkl")
with open(file_path, 'rb') as file:
    best_params_xgb_plec_reg_P1 = pickle.load(file)

# **train the XGB ten times and save th results**

In [85]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor  
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import pickle

# Train the final model with the best hyperparameters
PR_AUCs_plec_xgb_P1_hard = []
PR_AUCs_plec_xgb_P2_hard = []
for i in range(1,11):
    xgb_plec = XGBRegressor(
        n_estimators=int(best_params_xgb_plec_reg_P1['n_estimators']),
        max_depth=int(best_params_xgb_plec_reg_P1['max_depth']),
        learning_rate=best_params_xgb_plec_reg_P1['learning_rate'],
        min_child_weight=int(best_params_xgb_plec_reg_P1['min_child_weight']),
        gamma=best_params_xgb_plec_reg_P1['gamma'],
        subsample=best_params_xgb_plec_reg_P1['subsample'],
        colsample_bytree=best_params_xgb_plec_reg_P1['colsample_bytree'],
        random_state=i,
        n_jobs=40  # Use 40 CPU cores for XGBClassifier
)
    
    # Fit the model
    xgb_plec.fit(X_plec_train, y_plec_train)
    
    #Test the RF model on the test molecules:
    prediction_test_xgb_plec_score_P1_hard = xgb_plec.predict(X_test_randomdecoys_plec_hard)
  
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_xgb_P1_hard = pd.DataFrame({"Predicted_Score": prediction_test_xgb_plec_score_P1_hard,
                                   "Real_Score": y_test_randomdecoys_plec_hard})
    plec_result_xgb_P1_hard['Predicted_Class'] = plec_result_xgb_P1_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_xgb_P1_hard['Real_Class'] = plec_result_xgb_P1_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_xgb_P1_hard['normalized_scores'] = (plec_result_xgb_P1_hard['Predicted_Score'] - plec_result_xgb_P1_hard['Predicted_Score'].min())/(plec_result_xgb_P1_hard['Predicted_Score'].max() - plec_result_xgb_P1_hard['Predicted_Score'].min())
    plec_result_xgb_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/plec_result_xgb_reg_P1_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_xgb_P1_hard['Real_Class'], plec_result_xgb_P1_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_xgb_P1_hard.append(pr_auc)
    
    
    #Test the RF model on the test molecules:
    prediction_test_xgb_plec_score_P2_hard = xgb_plec.predict(X_test_deepcoy_plec_hard)
 
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_xgb_P2_hard = pd.DataFrame({"Predicted_Score": prediction_test_xgb_plec_score_P2_hard,
                                   "Real_Score": y_test_deepcoy_plec_hard})
    plec_result_xgb_P2_hard['Predicted_Class'] = plec_result_xgb_P2_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_xgb_P2_hard['Real_Class'] = plec_result_xgb_P2_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_xgb_P2_hard['normalized_scores'] = (plec_result_xgb_P2_hard['Predicted_Score'] - plec_result_xgb_P2_hard['Predicted_Score'].min())/(plec_result_xgb_P2_hard['Predicted_Score'].max() - plec_result_xgb_P2_hard['Predicted_Score'].min())
    plec_result_xgb_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/plec_result_xgb_reg_P2_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_xgb_P2_hard['Real_Class'], plec_result_xgb_P2_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_xgb_P2_hard.append(pr_auc)
    
    
print(PR_AUCs_plec_xgb_P1_hard)
print(PR_AUCs_plec_xgb_P2_hard)

[0.7323, 0.7562, 0.7583, 0.7538, 0.7589, 0.737, 0.7603, 0.7687, 0.7526, 0.7594]
[0.7156, 0.7211, 0.7311, 0.7187, 0.7287, 0.7221, 0.7229, 0.733, 0.709, 0.7327]


# **ANN**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor  
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import pickle

# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the ANN classifier with the given parameters
    ann_plec = MLPRegressor(
        hidden_layer_sizes=params['hidden_layer_sizes'],
        activation=params['activation'],
        solver=params['solver'],
        alpha=params['alpha'],
        learning_rate=params['learning_rate'],
        random_state=42
    )

    # Perform 5-fold cross-validation and calculate mean squared error (MSE)
    cv_scores = cross_val_score(ann_plec, X_plec_train, y_plec_train, cv=5, scoring='neg_mean_squared_error')
    mean_mse = np.mean(cv_scores)
    
    # Return the negative mean squared error as the loss
    return {'loss': mean_mse, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'hidden_layer_sizes': hp.choice('hidden_layer_sizes', [(50,), (100,), (50, 50), (100, 100)]),
    'activation': hp.choice('activation', ['relu', 'tanh', 'logistic']),
    'solver': hp.choice('solver', ['adam', 'sgd', 'lbfgs']),
    'alpha': hp.loguniform('alpha', -5, -1),  # L2 penalty parameter (e.g., from 0.00001 to 0.1)
    'learning_rate': hp.choice('learning_rate', ['constant', 'adaptive', 'invscaling'])
}

# Run the optimization
trials = Trials()
best_params_ann_plec_reg_P1 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Adjust the number of evaluations as needed
    trials=trials,
    rstate=np.random.default_rng(60) # Set random state for reproducibility
)
    
# Save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_ann_plec_reg_P1.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_ann_plec_reg_P1, file)

# Print the best hyperparameters found
print('Best hyperparameters:', best_params_ann_plec_reg_P1)

# **save the best hyperparameters found**

In [86]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_ann_plec_reg_P1.pkl")
with open(file_path, 'rb') as file:
    best_params_ann_plec_reg_P1 = pickle.load(file)

In [87]:
# Convert the hyperparameter choices to the proper format
if best_params_ann_plec_reg_P1['activation'] == 0:
    best_params_ann_plec_reg_P1['activation'] = 'relu'
elif best_params_ann_plec_reg_P1['activation'] == 1:
    best_params_ann_plec_reg_P1['activation'] = 'tanh'
else:
    best_params_ann_plec_reg_P1['activation'] = 'logistic'



# Convert the hyperparameter choices to the proper format
if best_params_ann_plec_reg_P1['learning_rate'] == 0:
    best_params_ann_plec_reg_P1['learning_rate'] = 'constant'
elif best_params_ann_plec_reg_P1['learning_rate'] == 1:
    best_params_ann_plec_reg_P1['learning_rate'] = 'adaptive'
else:
    best_params_ann_plec_reg_P1['learning_rate'] = 'invscaling'


# Convert the hyperparameter choices to the proper format
if best_params_ann_plec_reg_P1['solver'] == 0:
    best_params_ann_plec_reg_P1['solver'] = 'adam'
elif best_params_ann_plec_reg_P1['solver'] == 1:
    best_params_ann_plec_reg_P1['solver'] = 'sgd'
else:
    best_params_ann_plec_reg_P1['solver'] = 'lbfgs'


if best_params_ann_plec_reg_P1['hidden_layer_sizes'] == 0:
    best_params_ann_plec_reg_P1['hidden_layer_sizes'] = (50,)
elif best_params_ann_plec_reg_P1['hidden_layer_sizes'] == 1:
    best_params_ann_plec_reg_P1['hidden_layer_sizes'] = (100,)
elif best_params_ann_plec_reg_P1['hidden_layer_sizes'] == 2:
    best_params_ann_plec_reg_P1['hidden_layer_sizes'] = (50,50)
else:
    best_params_ann_plec_reg_P1['hidden_layer_sizes'] = (100,100)

# **train the ANN ten times and save th results**

In [88]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor  
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import pickle

# Train the final model with the best hyperparameters
PR_AUCs_plec_ann_P1_hard = []
PR_AUCs_plec_ann_P2_hard = []
for i in range(1,11):
    ann_plec = MLPRegressor(
    hidden_layer_sizes=best_params_ann_plec_reg_P1['hidden_layer_sizes'],
    activation=best_params_ann_plec_reg_P1['activation'],
    solver=best_params_ann_plec_reg_P1['solver'],
    alpha=best_params_ann_plec_reg_P1['alpha'],
    learning_rate=best_params_ann_plec_reg_P1['learning_rate'],
    random_state=i)
    
    
    # Fit the model
    ann_plec.fit(X_plec_train, y_plec_train)
    
    #Test the RF model on the test molecules:
    prediction_test_ann_plec_score_P1_hard = ann_plec.predict(X_test_randomdecoys_plec_hard)
  
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_ann_P1_hard = pd.DataFrame({"Predicted_Score": prediction_test_ann_plec_score_P1_hard,
                                   "Real_Score": y_test_randomdecoys_plec_hard})
    plec_result_ann_P1_hard['Predicted_Class'] = plec_result_ann_P1_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_ann_P1_hard['Real_Class'] = plec_result_ann_P1_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_ann_P1_hard['normalized_scores'] = (plec_result_ann_P1_hard['Predicted_Score'] - plec_result_ann_P1_hard['Predicted_Score'].min())/(plec_result_ann_P1_hard['Predicted_Score'].max() - plec_result_ann_P1_hard['Predicted_Score'].min())
    plec_result_ann_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/plec_result_ann_reg_P1_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_ann_P1_hard['Real_Class'], plec_result_ann_P1_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_ann_P1_hard.append(pr_auc)
    
    
    #Test the RF model on the test molecules:
    prediction_test_ann_plec_score_P2_hard = ann_plec.predict(X_test_deepcoy_plec_hard)
 
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    plec_result_ann_P2_hard = pd.DataFrame({"Predicted_Score": prediction_test_ann_plec_score_P2_hard,
                                   "Real_Score": y_test_deepcoy_plec_hard})
    plec_result_ann_P2_hard['Predicted_Class'] = plec_result_ann_P2_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_ann_P2_hard['Real_Class'] = plec_result_ann_P2_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    plec_result_ann_P2_hard['normalized_scores'] = (plec_result_ann_P2_hard['Predicted_Score'] - plec_result_ann_P2_hard['Predicted_Score'].min())/(plec_result_ann_P2_hard['Predicted_Score'].max() - plec_result_ann_P2_hard['Predicted_Score'].min())
    plec_result_ann_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/plec_result_ann_reg_P2_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(plec_result_ann_P2_hard['Real_Class'], plec_result_ann_P2_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_plec_ann_P2_hard.append(pr_auc)
    
    

print(PR_AUCs_plec_ann_P1_hard)
print(PR_AUCs_plec_ann_P2_hard)

[0.5266, 0.4528, 0.4983, 0.5, 0.4504, 0.4616, 0.4755, 0.5022, 0.5084, 0.4934]
[0.5307, 0.4416, 0.4964, 0.5363, 0.4575, 0.4683, 0.4989, 0.5075, 0.4954, 0.5103]


# **Training with GRID features**

# **RandomForest**

In [102]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import pickle

# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the random forest regressor with the given parameters
    rf_grid = RandomForestRegressor(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_split=int(params['min_samples_split']),
        min_samples_leaf=int(params['min_samples_leaf']),
        max_features=params['max_features'],
        random_state=60, # Set random state for reproducibility
        n_jobs=40     
    )

    # Perform 5-fold cross-validation and calculate mean squared error (MSE)
    cv_scores = cross_val_score(rf_grid, X_grid_train, y_grid_train, cv=5, scoring='neg_mean_squared_error')
    mean_mse = np.mean(cv_scores)
    
    # Return the negative mean squared error as the loss
    return {'loss': mean_mse, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 20, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])
}

# Run the optimization
trials = Trials()
best_params_rf_grid_reg_P1 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Adjust the number of evaluations as needed
    trials=trials,
    rstate=np.random.default_rng(60) # Set random state for reproducibility
)
    
# Save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_rf_grid_reg_P1.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_rf_grid_reg_P1, file)

# Print the best hyperparameters found
print('Best hyperparameters:', best_params_rf_grid_reg_P1)

100%|██████████| 50/50 [02:33<00:00,  3.07s/trial, best loss: -3.510822879444946] 
Best hyperparameters: {'max_depth': 3.0, 'max_features': 1, 'min_samples_leaf': 1.0, 'min_samples_split': 8.0, 'n_estimators': 270.0}


# **save the best hyperparameters found**

In [89]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_rf_grid_reg_P1.pkl")
with open(file_path, 'rb') as file:
    best_params_rf_grid_reg_P1 = pickle.load(file)

In [90]:
# Convert the hyperparameter choices to the proper format
if best_params_rf_grid_reg_P1['max_features'] == 0:
    best_params_rf_grid_reg_P1['max_features'] = 'sqrt'
elif best_params_rf_grid_reg_P1['max_features'] == 1:
    best_params_rf_grid_reg_P1['max_features'] = 'log2'
else:
    best_params_rf_grid_reg_P1['max_features'] = None

# **train the RF ten times and save th results**

In [91]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import pickle

# Train the final model with the best hyperparameters
PR_AUCs_grid_rf_P1_hard = []
PR_AUCs_grid_rf_P2_hard = []
for i in range(1,11):
    rf_grid = RandomForestRegressor(
        n_estimators=int(best_params_rf_grid_reg_P1['n_estimators']),
        max_depth=int(best_params_rf_grid_reg_P1['max_depth']),
        min_samples_split=int(best_params_rf_grid_reg_P1['min_samples_split']),
        min_samples_leaf=int(best_params_rf_grid_reg_P1['min_samples_leaf']),
        max_features=best_params_rf_grid_reg_P1['max_features'],
        random_state=i
    )
    
    # Fit the model
    rf_grid.fit(X_grid_train, y_grid_train)
    
    #Test the RF model on the test molecules:
    prediction_test_rf_grid_score_P1_hard = rf_grid.predict(X_test_randomdecoys_grid_hard)
  
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_rf_P1_hard = pd.DataFrame({"Predicted_Score": prediction_test_rf_grid_score_P1_hard,
                                   "Real_Score": y_test_randomdecoys_grid_hard})
    grid_result_rf_P1_hard['Predicted_Class'] = grid_result_rf_P1_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_rf_P1_hard['Real_Class'] = grid_result_rf_P1_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_rf_P1_hard['normalized_scores'] = (grid_result_rf_P1_hard['Predicted_Score'] - grid_result_rf_P1_hard['Predicted_Score'].min())/(grid_result_rf_P1_hard['Predicted_Score'].max() - grid_result_rf_P1_hard['Predicted_Score'].min())
    grid_result_rf_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/grid_result_rf_reg_P1_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_rf_P1_hard['Real_Class'], grid_result_rf_P1_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_rf_P1_hard.append(pr_auc)
    
    
    #Test the RF model on the test molecules:
    prediction_test_rf_grid_score_P2_hard = rf_grid.predict(X_test_deepcoy_grid_hard)
 
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_rf_P2_hard = pd.DataFrame({"Predicted_Score": prediction_test_rf_grid_score_P2_hard,
                                   "Real_Score": y_test_deepcoy_grid_hard})
    grid_result_rf_P2_hard['Predicted_Class'] = grid_result_rf_P2_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_rf_P2_hard['Real_Class'] = grid_result_rf_P2_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_rf_P2_hard['normalized_scores'] = (grid_result_rf_P2_hard['Predicted_Score'] - grid_result_rf_P2_hard['Predicted_Score'].min())/(grid_result_rf_P2_hard['Predicted_Score'].max() - grid_result_rf_P2_hard['Predicted_Score'].min())
    grid_result_rf_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/grid_result_rf_reg_P2_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_rf_P2_hard['Real_Class'], grid_result_rf_P2_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_rf_P2_hard.append(pr_auc)
    
    
print(PR_AUCs_grid_rf_P1_hard)
print(PR_AUCs_grid_rf_P2_hard)

[0.4498, 0.4457, 0.4318, 0.4409, 0.4496, 0.4337, 0.4407, 0.4478, 0.4301, 0.4445]
[0.4839, 0.4773, 0.4804, 0.4693, 0.5027, 0.4793, 0.5036, 0.5086, 0.4819, 0.4681]


# **XGBoost**

In [107]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor  
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import pickle

# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the XGBoost classifier with the given parameters
    xgb_grid = XGBRegressor(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
        min_child_weight=int(params['min_child_weight']),
        gamma=params['gamma'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        random_state=42,  # Set random state for reproducibility
        n_jobs=40  # Use 40 CPU cores for XGBClassifier
    )

    # Perform 5-fold cross-validation and calculate mean squared error (MSE)
    cv_scores = cross_val_score(xgb_grid, X_grid_train, y_grid_train, cv=5, scoring='neg_mean_squared_error')
    mean_mse = np.mean(cv_scores)
    
    # Return the negative mean squared error as the loss
    return {'loss': mean_mse, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 10),
    'max_depth': hp.quniform('max_depth', 3, 15, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'gamma': hp.uniform('gamma', 0, 5),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
}

# Run the optimization
trials = Trials()
best_params_xgb_grid_reg_P1 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Adjust the number of evaluations as needed
    trials=trials,
    rstate=np.random.default_rng(60) # Set random state for reproducibility
)
    
# Save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_xgb_grid_reg_P1.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_xgb_grid_reg_P1, file)

# Print the best hyperparameters found
print('Best hyperparameters:', best_params_xgb_grid_reg_P1)

100%|██████████| 50/50 [07:34<00:00,  9.10s/trial, best loss: -2.290092991419162] 
Best hyperparameters: {'colsample_bytree': 0.7151959713442447, 'gamma': 3.051560419817014, 'learning_rate': 0.03462652763929768, 'max_depth': 4.0, 'min_child_weight': 5.0, 'n_estimators': 60.0, 'subsample': 0.518783299183045}


# **save the best hyperparameters found**

In [92]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_xgb_grid_reg_P1.pkl")
with open(file_path, 'rb') as file:
    best_params_xgb_grid_reg_P1 = pickle.load(file)

# **train the XGB ten times and save th results**

In [93]:
# Train the final model with the best hyperparameters
PR_AUCs_grid_xgb_P1_hard = []
PR_AUCs_grid_xgb_P2_hard = []
for i in range(1,11):
    xgb_grid = XGBRegressor(
        n_estimators=int(best_params_xgb_grid_reg_P1['n_estimators']),
        max_depth=int(best_params_xgb_grid_reg_P1['max_depth']),
        learning_rate=best_params_xgb_grid_reg_P1['learning_rate'],
        min_child_weight=int(best_params_xgb_grid_reg_P1['min_child_weight']),
        gamma=best_params_xgb_grid_reg_P1['gamma'],
        subsample=best_params_xgb_grid_reg_P1['subsample'],
        colsample_bytree=best_params_xgb_grid_reg_P1['colsample_bytree'],
        random_state=i,
        n_jobs=40  # Use 40 CPU cores for XGBClassifier
)
    
    # Fit the model
    xgb_grid.fit(X_grid_train, y_grid_train)
    
    #Test the RF model on the test molecules:
    prediction_test_xgb_grid_score_P1_hard = xgb_grid.predict(X_test_randomdecoys_grid_hard)
  
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_xgb_P1_hard = pd.DataFrame({"Predicted_Score": prediction_test_xgb_grid_score_P1_hard,
                                   "Real_Score": y_test_randomdecoys_grid_hard})
    grid_result_xgb_P1_hard['Predicted_Class'] = grid_result_xgb_P1_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_xgb_P1_hard['Real_Class'] = grid_result_xgb_P1_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_xgb_P1_hard['normalized_scores'] = (grid_result_xgb_P1_hard['Predicted_Score'] - grid_result_xgb_P1_hard['Predicted_Score'].min())/(grid_result_xgb_P1_hard['Predicted_Score'].max() - grid_result_xgb_P1_hard['Predicted_Score'].min())
    grid_result_xgb_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/grid_result_xgb_reg_P1_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_xgb_P1_hard['Real_Class'], grid_result_xgb_P1_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_xgb_P1_hard.append(pr_auc)
    
    
    #Test the RF model on the test molecules:
    prediction_test_xgb_grid_score_P2_hard = xgb_grid.predict(X_test_deepcoy_grid_hard)
 
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_xgb_P2_hard = pd.DataFrame({"Predicted_Score": prediction_test_xgb_grid_score_P2_hard,
                                   "Real_Score": y_test_deepcoy_grid_hard})
    grid_result_xgb_P2_hard['Predicted_Class'] = grid_result_xgb_P2_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_xgb_P2_hard['Real_Class'] = grid_result_xgb_P2_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_xgb_P2_hard['normalized_scores'] = (grid_result_xgb_P2_hard['Predicted_Score'] - grid_result_xgb_P2_hard['Predicted_Score'].min())/(grid_result_xgb_P2_hard['Predicted_Score'].max() - grid_result_xgb_P2_hard['Predicted_Score'].min())
    grid_result_xgb_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/grid_result_xgb_reg_P2_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_xgb_P2_hard['Real_Class'], grid_result_xgb_P2_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_xgb_P2_hard.append(pr_auc)
    
    
print(PR_AUCs_grid_xgb_P1_hard)
print(PR_AUCs_grid_xgb_P2_hard)

[0.6557, 0.6792, 0.6928, 0.6606, 0.663, 0.69, 0.6555, 0.6512, 0.6646, 0.7061]
[0.6646, 0.6825, 0.6947, 0.6742, 0.6701, 0.6778, 0.6581, 0.6655, 0.6902, 0.7121]


# **ANN**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor  
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import pickle

# Define the objective function for hyperparameter tuning
def objective(params):
    # Create the ANN classifier with the given parameters
    ann_grid = MLPRegressor(
        hidden_layer_sizes=params['hidden_layer_sizes'],
        activation=params['activation'],
        solver=params['solver'],
        alpha=params['alpha'],
        learning_rate=params['learning_rate'],
        random_state=42
    )

    # Perform 5-fold cross-validation and calculate mean squared error (MSE)
    cv_scores = cross_val_score(ann_grid, X_grid_train, y_grid_train, cv=5, scoring='neg_mean_squared_error')
    mean_mse = np.mean(cv_scores)
    
    # Return the negative mean squared error as the loss
    return {'loss': mean_mse, 'status': STATUS_OK}

# Define the search space for hyperparameters
space = {
    'hidden_layer_sizes': hp.choice('hidden_layer_sizes', [(50,), (100,), (50, 50), (100, 100)]),
    'activation': hp.choice('activation', ['relu', 'tanh', 'logistic']),
    'solver': hp.choice('solver', ['adam', 'sgd', 'lbfgs']),
    'alpha': hp.loguniform('alpha', -5, -1),  # L2 penalty parameter (e.g., from 0.00001 to 0.1)
    'learning_rate': hp.choice('learning_rate', ['constant', 'adaptive', 'invscaling'])
}

# Run the optimization
trials = Trials()
best_params_ann_grid_reg_P1 = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Adjust the number of evaluations as needed
    trials=trials,
    rstate=np.random.default_rng(60) # Set random state for reproducibility
)
    
# Save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_ann_grid_reg_P1.pkl")
with open(file_path, 'wb') as file:
    pickle.dump(best_params_ann_grid_reg_P1, file)

# Print the best hyperparameters found
print('Best hyperparameters:', best_params_ann_grid_reg_P1)

# **save the best hyperparameters found**

In [94]:
# save the best hyperparameters found
file_path = ("/home/juni/working/eman/results_reg/best_params_ann_grid_reg_P1.pkl")
with open(file_path, 'rb') as file:
    best_params_ann_grid_reg_P1 = pickle.load(file)

In [95]:
# Convert the hyperparameter choices to the proper format
if best_params_ann_grid_reg_P1['activation'] == 0:
    best_params_ann_grid_reg_P1['activation'] = 'relu'
elif best_params_ann_grid_reg_P1['activation'] == 1:
    best_params_ann_grid_reg_P1['activation'] = 'tanh'
else:
    best_params_ann_grid_reg_P1['activation'] = 'logistic'



# Convert the hyperparameter choices to the proper format
if best_params_ann_grid_reg_P1['learning_rate'] == 0:
    best_params_ann_grid_reg_P1['learning_rate'] = 'constant'
elif best_params_ann_grid_reg_P1['learning_rate'] == 1:
    best_params_ann_grid_reg_P1['learning_rate'] = 'adaptive'
else:
    best_params_ann_grid_reg_P1['learning_rate'] = 'invscaling'


# Convert the hyperparameter choices to the proper format
if best_params_ann_grid_reg_P1['solver'] == 0:
    best_params_ann_grid_reg_P1['solver'] = 'adam'
elif best_params_ann_grid_reg_P1['solver'] == 1:
    best_params_ann_grid_reg_P1['solver'] = 'sgd'
else:
    best_params_ann_grid_reg_P1['solver'] = 'lbfgs'


if best_params_ann_grid_reg_P1['hidden_layer_sizes'] == 0:
    best_params_ann_grid_reg_P1['hidden_layer_sizes'] = (50,)
elif best_params_ann_grid_reg_P1['hidden_layer_sizes'] == 1:
    best_params_ann_grid_reg_P1['hidden_layer_sizes'] = (100,)
elif best_params_ann_grid_reg_P1['hidden_layer_sizes'] == 2:
    best_params_ann_grid_reg_P1['hidden_layer_sizes'] = (50,50)
else:
    best_params_ann_grid_reg_P1['hidden_layer_sizes'] = (100,100)

# **train the ANN ten times and save the results**

In [96]:
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor  
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import pickle

# Train the final model with the best hyperparameters
PR_AUCs_grid_ann_P1_hard = []
PR_AUCs_grid_ann_P2_hard = []
for i in range(1,11):
    ann_grid = MLPRegressor(
    hidden_layer_sizes=best_params_ann_grid_reg_P1['hidden_layer_sizes'],
    activation=best_params_ann_grid_reg_P1['activation'],
    solver=best_params_ann_grid_reg_P1['solver'],
    alpha=best_params_ann_grid_reg_P1['alpha'],
    learning_rate=best_params_ann_grid_reg_P1['learning_rate'],
    random_state=i)
    
    
    # Fit the model
    ann_grid.fit(X_grid_train, y_grid_train)
    
    #Test the RF model on the test molecules:
    prediction_test_ann_grid_score_P1_hard = ann_grid.predict(X_test_randomdecoys_grid_hard)
  
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_ann_P1_hard = pd.DataFrame({"Predicted_Score": prediction_test_ann_grid_score_P1_hard,
                                   "Real_Score": y_test_randomdecoys_grid_hard})
    grid_result_ann_P1_hard['Predicted_Class'] = grid_result_ann_P1_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_ann_P1_hard['Real_Class'] = grid_result_ann_P1_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_ann_P1_hard['normalized_scores'] = (grid_result_ann_P1_hard['Predicted_Score'] - grid_result_ann_P1_hard['Predicted_Score'].min())/(grid_result_ann_P1_hard['Predicted_Score'].max() - grid_result_ann_P1_hard['Predicted_Score'].min())
    grid_result_ann_P1_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/grid_result_ann_reg_P1_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_ann_P1_hard['Real_Class'], grid_result_ann_P1_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_ann_P1_hard.append(pr_auc)
    
    
    #Test the RF model on the test molecules:
    prediction_test_ann_grid_score_P2_hard = ann_grid.predict(X_test_deepcoy_grid_hard)
 
    
    
    # Get virtual screening results on the test molecules and export results to a csv file:
    grid_result_ann_P2_hard = pd.DataFrame({"Predicted_Score": prediction_test_ann_grid_score_P2_hard,
                                   "Real_Score": y_test_deepcoy_grid_hard})
    grid_result_ann_P2_hard['Predicted_Class'] = grid_result_ann_P2_hard['Predicted_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_ann_P2_hard['Real_Class'] = grid_result_ann_P2_hard['Real_Score'].apply(lambda x: 'active' if x > 2 else 'inactive')
    grid_result_ann_P2_hard['normalized_scores'] = (grid_result_ann_P2_hard['Predicted_Score'] - grid_result_ann_P2_hard['Predicted_Score'].min())/(grid_result_ann_P2_hard['Predicted_Score'].max() - grid_result_ann_P2_hard['Predicted_Score'].min())
    grid_result_ann_P2_hard.to_csv("/home/juni/working/eman/revised_paper/hard_test_results/results_reg/grid_result_ann_reg_P2_hard_"+str(i)+".csv")
    
    precision, recall, _ = precision_recall_curve(grid_result_ann_P2_hard['Real_Class'], grid_result_ann_P2_hard['Predicted_Score'], pos_label='active')
    pr_auc = round(auc(recall, precision),4)
    PR_AUCs_grid_ann_P2_hard.append(pr_auc)
    
    
print(PR_AUCs_grid_ann_P1_hard)
print(PR_AUCs_grid_ann_P2_hard)

[0.3779, 0.3911, 0.4235, 0.4112, 0.4246, 0.4763, 0.4338, 0.3726, 0.4297, 0.3959]
[0.3692, 0.4259, 0.3742, 0.3676, 0.4236, 0.3889, 0.3778, 0.3519, 0.4111, 0.3934]


# Generate train set with true actives and DeepCoy Decoys

In [42]:
plec_train_deepcoy = pd.concat([plec_train_true_actives,plec_train_deepcoy_decoys])
grid_train_deepcoy = pd.concat([grid_train_true_actives,grid_train_deepcoy_decoys])

# **Train and test the RF , XGB and ANN with above codes**