# **Load Libraries**

In [None]:
import os
import numpy as np
import pandas as pd
import oddt
from oddt.fingerprints import PLEC
from scipy import stats
from sklearn import preprocessing
import pickle
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import matthews_corrcoef, precision_recall_curve, accuracy_score, auc
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.utils import parallel_backend
from xgboost.sklearn import XGBClassifier
from rdkit import Chem
from rdkit.Chem import AllChem
import deepchem as dc
from deepchem.utils import download_url, load_from_disk
from deepchem.utils.vina_utils import prepare_inputs
from deepchem.models import AtomicConvModel
from deepchem.feat import RdkitGridFeaturizer
from joblib import Parallel, delayed
from tqdm import tqdm
import glob
import tempfile

# **Load Data**

In [3]:
# training set true actives
plec_train_true_actives = pd.read_csv('Path_to_csv')
grid_train_true_actives = pd.read_csv('Path_to_csv')


# test sets true actives
plec_test_true_actives = pd.read_csv('Path_to_csv')
grid_test_true_actives = pd.read_csv('Path_to_csv')

# **Load Decoys**

In [None]:
# training set random_decoys
plec_train_random_decoys = pd.read_csv('Path_to_csv')
grid_train_random_decoys = pd.read_csv('Path_to_csv')


# test sets random_decoys
plec_test_random_decoys = pd.read_csv('Path_to_csv')
grid_test_random_decoys = pd.read_csv('Path_to_csv')



# training set deepcoy decoys
plec_train_deepcoy_decoys = pd.read_csv('Path_to_csv')
grid_train_deepcoy_decoys = pd.read_csv('Path_to_csv')


# test sets deepcoy decoys
plec_test_deepcoy_decoys = pd.read_csv('Path_to_csv')
grid_test_deepcoy_decoys = pd.read_csv('Path_to_csv')

# **Cross validation with Randoom deocys in the training data**

In [104]:
plec_train = pd.concat([plec_train_true_actives,plec_train_random_decoys])
grid_train = pd.concat([grid_train_true_actives,grid_train_random_decoys])

In [105]:
# train
X_plec_train, y_plec_train = plec_train.drop(['class', 'potency','index'], axis= 1), plec_train['class']
X_grid_train, y_grid_train = grid_train.drop(['class', 'potency'], axis= 1), grid_train['class']

In [106]:
plec_train_reset = plec_train.reset_index(drop=True)
grid_train_reset = grid_train.reset_index(drop=True)

In [107]:
X_plec_train, y_plec_train = plec_train_reset.drop(['class', 'potency','index'], axis= 1), plec_train_reset['class']
X_grid_train, y_grid_train = grid_train_reset.drop(['class', 'potency'], axis= 1), grid_train_reset['class']

In [108]:
y_plec_train = y_plec_train.map({'active': 1, 'inactive': 0})
y_grid_train = y_grid_train.map({'active': 1, 'inactive': 0})

In [109]:
y_plec_train

0        1
1        1
2        1
3        1
4        1
        ..
22684    0
22685    0
22686    0
22687    0
22688    0
Name: class, Length: 22689, dtype: int64

In [55]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    average_precision_score, roc_auc_score, precision_score,
    recall_score, matthews_corrcoef, roc_curve, precision_recall_curve,
    f1_score
)
import numpy as np
import pandas as pd

def find_optimal_threshold(y_true, y_pred_prob):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_prob)
    optimal_idx = np.argmax(tpr - fpr)
    return thresholds[optimal_idx]

# Initialize the StratifiedKFold object
skf = StratifiedKFold(n_splits=5)

# Define the models in a dictionary
models = {
    'Random Forest': RandomForestClassifier(max_depth=3, max_features='log2', min_samples_leaf=1, min_samples_split=8, n_estimators=270, n_jobs=40),
    'XGBoost': XGBClassifier(learning_rate=0.01, max_depth=7, colsample_bytree=0.73, gamma=1.96, min_child_weight=8, subsample=0.71, n_estimators=150),
    'ANN': MLPClassifier(hidden_layer_sizes=(50,), activation='tanh', alpha=0.0070, learning_rate='invscaling', solver='sgd')
}

# Perform stratified 5-fold cross-validation for each model
results = {}

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    
    metrics = {
        'Optimal Threshold': [],
        'Average Precision': [],
        'ROC-AUC': [],
        'PR-AUC': [],
        'Precision': [],
        'Recall': [],
        'MCC': [],
        'F1 Score': []
    }
    
    fold = 1
    for train_index, test_index in skf.split(X_plec_train, y_plec_train):
        # Split the data into training and validation sets
        X_train, X_test = X_plec_train.iloc[train_index], X_plec_train.iloc[test_index]
        y_train, y_test = y_plec_train[train_index], y_plec_train[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict probabilities on the validation set
        y_pred_prob = model.predict_proba(X_test)[:, 1]
        
        # Find optimal threshold
        optimal_threshold = find_optimal_threshold(y_test, y_pred_prob)
        
        # Make predictions using the optimal threshold
        y_pred = (y_pred_prob >= optimal_threshold).astype(int)
        
        # Calculate metrics
        avg_precision = average_precision_score(y_test, y_pred_prob)
        roc_auc = roc_auc_score(y_test, y_pred_prob)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # Calculate PR-AUC
        precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_prob)
        pr_auc = auc(recall_curve, precision_curve)
        
        # Store metrics
        metrics['Optimal Threshold'].append(optimal_threshold)
        metrics['Average Precision'].append(avg_precision)
        metrics['ROC-AUC'].append(roc_auc)
        metrics['PR-AUC'].append(pr_auc)
        metrics['Precision'].append(precision)
        metrics['Recall'].append(recall)
        metrics['MCC'].append(mcc)
        metrics['F1 Score'].append(f1)
        
        print(f"Fold {fold} - Optimal Threshold: {optimal_threshold:.3f}, Avg Precision: {avg_precision:.3f}, ROC-AUC: {roc_auc:.3f}, PR-AUC: {pr_auc:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, MCC: {mcc:.3f}, F1: {f1:.3f}")
        fold += 1
    
    # Calculate mean and standard deviation of each metric
    results[model_name] = {metric: (np.mean(values), np.std(values)) for metric, values in metrics.items()}

# Print overall results
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, (mean, std) in metrics.items():
        print(f"Mean {metric}: {mean:.3f}, Std Dev: {std:.3f}")

# Print mean metrics for each model
print("\nMean Metrics for Each Model (rounded to three decimal places):")
mean_metrics = {metric: [] for metric in metrics.keys()}
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, (mean, std) in metrics.items():
        mean_rounded = round(mean, 3)
        mean_metrics[metric].append(mean_rounded)
        print(f"Mean {metric}: {mean_rounded}, Std Dev: {std:.3f}")

print("\nMean Metric Scores for Each Model:")
for metric, scores in mean_metrics.items():
    print(f"{metric}: {scores}")

Evaluating Random Forest...
Fold 1 - Optimal Threshold: 0.024, Avg Precision: 0.872, ROC-AUC: 0.990, PR-AUC: 0.871, Precision: 0.330, Recall: 0.944, MCC: 0.543, F1: 0.489
Fold 2 - Optimal Threshold: 0.025, Avg Precision: 0.919, ROC-AUC: 0.995, PR-AUC: 0.919, Precision: 0.433, Recall: 0.963, MCC: 0.635, F1: 0.598
Fold 3 - Optimal Threshold: 0.026, Avg Precision: 0.829, ROC-AUC: 0.977, PR-AUC: 0.829, Precision: 0.411, Recall: 0.880, MCC: 0.589, F1: 0.560
Fold 4 - Optimal Threshold: 0.026, Avg Precision: 0.833, ROC-AUC: 0.975, PR-AUC: 0.832, Precision: 0.254, Recall: 0.935, MCC: 0.468, F1: 0.399
Fold 5 - Optimal Threshold: 0.028, Avg Precision: 0.803, ROC-AUC: 0.957, PR-AUC: 0.803, Precision: 0.247, Recall: 0.916, MCC: 0.456, F1: 0.389
Evaluating XGBoost...
Fold 1 - Optimal Threshold: 0.138, Avg Precision: 0.932, ROC-AUC: 0.992, PR-AUC: 0.932, Precision: 0.457, Recall: 0.972, MCC: 0.656, F1: 0.621
Fold 2 - Optimal Threshold: 0.183, Avg Precision: 0.983, ROC-AUC: 0.999, PR-AUC: 0.983, Prec

In [56]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    average_precision_score, roc_auc_score, precision_score,
    recall_score, matthews_corrcoef, roc_curve, precision_recall_curve,
    f1_score
)
import numpy as np
import pandas as pd

def find_optimal_threshold(y_true, y_pred_prob):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_prob)
    optimal_idx = np.argmax(tpr - fpr)
    return thresholds[optimal_idx]

# Initialize the StratifiedKFold object
skf = StratifiedKFold(n_splits=5)

# Define the models in a dictionary
models = {
    'Random Forest': RandomForestClassifier(max_depth=3, max_features='log2', min_samples_leaf=1, min_samples_split=8, n_estimators=270, n_jobs=40),
    'XGBoost': XGBClassifier(learning_rate=0.01, max_depth=7, colsample_bytree=0.73, gamma=1.96, min_child_weight=8, subsample=0.71, n_estimators=150),
    'ANN': MLPClassifier(hidden_layer_sizes=(50,), activation='tanh', alpha=0.0070, learning_rate='invscaling', solver='sgd')
}

# Perform stratified 5-fold cross-validation for each model
results = {}

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    
    metrics = {
        'Optimal Threshold': [],
        'Average Precision': [],
        'ROC-AUC': [],
        'PR-AUC': [],
        'Precision': [],
        'Recall': [],
        'MCC': [],
        'F1 Score': []
    }
    
    fold = 1
    for train_index, test_index in skf.split(X_grid_train, y_grid_train):
        # Split the data into training and validation sets
        X_train, X_test = X_grid_train.iloc[train_index], X_grid_train.iloc[test_index]
        y_train, y_test = y_grid_train[train_index], y_grid_train[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict probabilities on the validation set
        y_pred_prob = model.predict_proba(X_test)[:, 1]
        
        # Find optimal threshold
        optimal_threshold = find_optimal_threshold(y_test, y_pred_prob)
        
        # Make predictions using the optimal threshold
        y_pred = (y_pred_prob >= optimal_threshold).astype(int)
        
        # Calculate metrics
        avg_precision = average_precision_score(y_test, y_pred_prob)
        roc_auc = roc_auc_score(y_test, y_pred_prob)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # Calculate PR-AUC
        precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_prob)
        pr_auc = auc(recall_curve, precision_curve)
        
        # Store metrics
        metrics['Optimal Threshold'].append(optimal_threshold)
        metrics['Average Precision'].append(avg_precision)
        metrics['ROC-AUC'].append(roc_auc)
        metrics['PR-AUC'].append(pr_auc)
        metrics['Precision'].append(precision)
        metrics['Recall'].append(recall)
        metrics['MCC'].append(mcc)
        metrics['F1 Score'].append(f1)
        
        print(f"Fold {fold} - Optimal Threshold: {optimal_threshold:.3f}, Avg Precision: {avg_precision:.3f}, ROC-AUC: {roc_auc:.3f}, PR-AUC: {pr_auc:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, MCC: {mcc:.3f}, F1: {f1:.3f}")
        fold += 1
    
    # Calculate mean and standard deviation of each metric
    results[model_name] = {metric: (np.mean(values), np.std(values)) for metric, values in metrics.items()}

# Print overall results
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, (mean, std) in metrics.items():
        print(f"Mean {metric}: {mean:.3f}, Std Dev: {std:.3f}")

# Print mean metrics for each model
print("\nMean Metrics for Each Model (rounded to three decimal places):")
mean_metrics = {metric: [] for metric in metrics.keys()}
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, (mean, std) in metrics.items():
        mean_rounded = round(mean, 3)
        mean_metrics[metric].append(mean_rounded)
        print(f"Mean {metric}: {mean_rounded}, Std Dev: {std:.3f}")

print("\nMean Metric Scores for Each Model:")
for metric, scores in mean_metrics.items():
    print(f"{metric}: {scores}")

Evaluating Random Forest...
Fold 1 - Optimal Threshold: 0.028, Avg Precision: 0.785, ROC-AUC: 0.955, PR-AUC: 0.785, Precision: 0.271, Recall: 0.861, MCC: 0.464, F1: 0.412
Fold 2 - Optimal Threshold: 0.027, Avg Precision: 0.859, ROC-AUC: 0.986, PR-AUC: 0.858, Precision: 0.240, Recall: 0.954, MCC: 0.459, F1: 0.384
Fold 3 - Optimal Threshold: 0.028, Avg Precision: 0.769, ROC-AUC: 0.949, PR-AUC: 0.769, Precision: 0.271, Recall: 0.824, MCC: 0.453, F1: 0.407
Fold 4 - Optimal Threshold: 0.032, Avg Precision: 0.777, ROC-AUC: 0.927, PR-AUC: 0.777, Precision: 0.318, Recall: 0.824, MCC: 0.495, F1: 0.459
Fold 5 - Optimal Threshold: 0.031, Avg Precision: 0.663, ROC-AUC: 0.884, PR-AUC: 0.663, Precision: 0.132, Recall: 0.813, MCC: 0.294, F1: 0.227
Evaluating XGBoost...
Fold 1 - Optimal Threshold: 0.192, Avg Precision: 0.930, ROC-AUC: 0.978, PR-AUC: 0.930, Precision: 0.789, Recall: 0.935, MCC: 0.855, F1: 0.856
Fold 2 - Optimal Threshold: 0.153, Avg Precision: 0.969, ROC-AUC: 0.999, PR-AUC: 0.969, Prec

# **Cross validation with DeepCoy decoys in the training data**

In [None]:
plec_train = pd.concat([plec_train_true_actives,plec_train_deepcoys_decoys])
grid_train = pd.concat([grid_train_true_actives,grid_train_deepcoys_decoys])

In [95]:
# train
X_plec_train, y_plec_train = plec_train.drop(['class', 'potency','index'], axis= 1), plec_train['class']
X_grid_train, y_grid_train = grid_train.drop(['class', 'potency','index'], axis= 1), grid_train['class']

In [96]:
plec_train_reset = plec_train.reset_index(drop=True)
grid_train_reset = grid_train.reset_index(drop=True)

In [106]:
X_plec_train, y_plec_train = plec_train_reset.drop(['class', 'potency','index'], axis= 1), plec_train_reset['class']
X_grid_train, y_grid_train = grid_train_reset.drop(['class', 'potency','index'], axis= 1), grid_train_reset['class']

In [107]:
y_plec_train = y_plec_train.map({'active': 1, 'inactive': 0})
y_grid_train = y_grid_train.map({'active': 1, 'inactive': 0})

In [108]:
X_grid_train

Unnamed: 0,GRID_0,GRID_1,GRID_2,GRID_3,GRID_4,GRID_5,GRID_6,GRID_7,GRID_8,GRID_9,...,GRID_2042,GRID_2043,GRID_2044,GRID_2045,GRID_2046,GRID_2047,GRID_2048,GRID_2049,GRID_2050,GRID_2051
0,0,1,0,1,1,0,0,0,0,0,...,0,0,2,8,0,0,0,0,0,0
1,0,0,0,0,1,0,0,1,0,0,...,2,16,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,...,0,10,0,0,2,4,0,0,2,0
3,0,0,0,0,1,0,0,1,0,0,...,0,6,2,4,0,0,0,1,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,2,0,0,0,0,0,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22684,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,2,0
22685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,2,0
22686,0,0,0,0,1,0,0,0,0,0,...,0,0,0,2,0,0,0,2,0,0
22687,0,0,0,0,1,0,0,0,0,0,...,0,0,4,0,4,0,0,0,0,0


In [93]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    average_precision_score, roc_auc_score, precision_score,
    recall_score, matthews_corrcoef, roc_curve, precision_recall_curve,
    f1_score
)
import numpy as np
import pandas as pd

def find_optimal_threshold(y_true, y_pred_prob):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_prob)
    optimal_idx = np.argmax(tpr - fpr)
    return thresholds[optimal_idx]

# Initialize the StratifiedKFold object
skf = StratifiedKFold(n_splits=5)

# Define the models in a dictionary
models = {
    'Random Forest': RandomForestClassifier(max_depth=3, max_features='log2', min_samples_leaf=1, min_samples_split=8, n_estimators=270, n_jobs=40),
    'XGBoost': XGBClassifier(learning_rate=0.01, max_depth=7, colsample_bytree=0.73, gamma=1.96, min_child_weight=8, subsample=0.71, n_estimators=150),
    'ANN': MLPClassifier(hidden_layer_sizes=(50,), activation='tanh', alpha=0.0070, learning_rate='invscaling', solver='sgd')
}

# Perform stratified 5-fold cross-validation for each model
results = {}

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    
    metrics = {
        'Optimal Threshold': [],
        'Average Precision': [],
        'ROC-AUC': [],
        'PR-AUC': [],
        'Precision': [],
        'Recall': [],
        'MCC': [],
        'F1 Score': []
    }
    
    fold = 1
    for train_index, test_index in skf.split(X_plec_train, y_plec_train):
        # Split the data into training and validation sets
        X_train, X_test = X_plec_train.iloc[train_index], X_plec_train.iloc[test_index]
        y_train, y_test = y_plec_train[train_index], y_plec_train[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict probabilities on the validation set
        y_pred_prob = model.predict_proba(X_test)[:, 1]
        
        # Find optimal threshold
        optimal_threshold = find_optimal_threshold(y_test, y_pred_prob)
        
        # Make predictions using the optimal threshold
        y_pred = (y_pred_prob >= optimal_threshold).astype(int)
        
        # Calculate metrics
        avg_precision = average_precision_score(y_test, y_pred_prob)
        roc_auc = roc_auc_score(y_test, y_pred_prob)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # Calculate PR-AUC
        precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_prob)
        pr_auc = auc(recall_curve, precision_curve)
        
        # Store metrics
        metrics['Optimal Threshold'].append(optimal_threshold)
        metrics['Average Precision'].append(avg_precision)
        metrics['ROC-AUC'].append(roc_auc)
        metrics['PR-AUC'].append(pr_auc)
        metrics['Precision'].append(precision)
        metrics['Recall'].append(recall)
        metrics['MCC'].append(mcc)
        metrics['F1 Score'].append(f1)
        
        print(f"Fold {fold} - Optimal Threshold: {optimal_threshold:.3f}, Avg Precision: {avg_precision:.3f}, ROC-AUC: {roc_auc:.3f}, PR-AUC: {pr_auc:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, MCC: {mcc:.3f}, F1: {f1:.3f}")
        fold += 1
    
    # Calculate mean and standard deviation of each metric
    results[model_name] = {metric: (np.mean(values), np.std(values)) for metric, values in metrics.items()}

# Print overall results
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, (mean, std) in metrics.items():
        print(f"Mean {metric}: {mean:.3f}, Std Dev: {std:.3f}")

# Print mean metrics for each model
print("\nMean Metrics for Each Model (rounded to three decimal places):")
mean_metrics = {metric: [] for metric in metrics.keys()}
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, (mean, std) in metrics.items():
        mean_rounded = round(mean, 3)
        mean_metrics[metric].append(mean_rounded)
        print(f"Mean {metric}: {mean_rounded}, Std Dev: {std:.3f}")

print("\nMean Metric Scores for Each Model:")
for metric, scores in mean_metrics.items():
    print(f"{metric}: {scores}")

Evaluating Random Forest...
Fold 1 - Optimal Threshold: 0.030, Avg Precision: 0.907, ROC-AUC: 0.997, PR-AUC: 0.907, Precision: 0.566, Recall: 0.991, MCC: 0.742, F1: 0.721
Fold 2 - Optimal Threshold: 0.035, Avg Precision: 0.952, ROC-AUC: 0.998, PR-AUC: 0.952, Precision: 0.682, Recall: 0.991, MCC: 0.817, F1: 0.808
Fold 3 - Optimal Threshold: 0.032, Avg Precision: 0.884, ROC-AUC: 0.996, PR-AUC: 0.883, Precision: 0.482, Recall: 0.981, MCC: 0.678, F1: 0.646
Fold 4 - Optimal Threshold: 0.029, Avg Precision: 0.870, ROC-AUC: 0.996, PR-AUC: 0.870, Precision: 0.461, Recall: 0.991, MCC: 0.666, F1: 0.629
Fold 5 - Optimal Threshold: 0.030, Avg Precision: 0.929, ROC-AUC: 0.998, PR-AUC: 0.928, Precision: 0.473, Recall: 1.000, MCC: 0.679, F1: 0.643
Evaluating XGBoost...
Fold 1 - Optimal Threshold: 0.139, Avg Precision: 0.964, ROC-AUC: 0.999, PR-AUC: 0.964, Precision: 0.470, Recall: 1.000, MCC: 0.676, F1: 0.639
Fold 2 - Optimal Threshold: 0.207, Avg Precision: 0.988, ROC-AUC: 0.999, PR-AUC: 0.988, Prec

In [109]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    average_precision_score, roc_auc_score, precision_score,
    recall_score, matthews_corrcoef, roc_curve, precision_recall_curve,
    f1_score
)
import numpy as np
import pandas as pd

def find_optimal_threshold(y_true, y_pred_prob):
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_prob)
    optimal_idx = np.argmax(tpr - fpr)
    return thresholds[optimal_idx]

# Initialize the StratifiedKFold object
skf = StratifiedKFold(n_splits=5)

# Define the models in a dictionary
models = {
    'Random Forest': RandomForestClassifier(max_depth=3, max_features='log2', min_samples_leaf=1, min_samples_split=8, n_estimators=270, n_jobs=40),
    'XGBoost': XGBClassifier(learning_rate=0.01, max_depth=7, colsample_bytree=0.73, gamma=1.96, min_child_weight=8, subsample=0.71, n_estimators=150),
    'ANN': MLPClassifier(hidden_layer_sizes=(50,), activation='tanh', alpha=0.0070, learning_rate='invscaling', solver='sgd')
}

# Perform stratified 5-fold cross-validation for each model
results = {}

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    
    metrics = {
        'Optimal Threshold': [],
        'Average Precision': [],
        'ROC-AUC': [],
        'PR-AUC': [],
        'Precision': [],
        'Recall': [],
        'MCC': [],
        'F1 Score': []
    }
    
    fold = 1
    for train_index, test_index in skf.split(X_grid_train, y_grid_train):
        # Split the data into training and validation sets
        X_train, X_test = X_grid_train.iloc[train_index], X_grid_train.iloc[test_index]
        y_train, y_test = y_grid_train[train_index], y_grid_train[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict probabilities on the validation set
        y_pred_prob = model.predict_proba(X_test)[:, 1]
        
        # Find optimal threshold
        optimal_threshold = find_optimal_threshold(y_test, y_pred_prob)
        
        # Make predictions using the optimal threshold
        y_pred = (y_pred_prob >= optimal_threshold).astype(int)
        
        # Calculate metrics
        avg_precision = average_precision_score(y_test, y_pred_prob)
        roc_auc = roc_auc_score(y_test, y_pred_prob)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # Calculate PR-AUC
        precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_prob)
        pr_auc = auc(recall_curve, precision_curve)
        
        # Store metrics
        metrics['Optimal Threshold'].append(optimal_threshold)
        metrics['Average Precision'].append(avg_precision)
        metrics['ROC-AUC'].append(roc_auc)
        metrics['PR-AUC'].append(pr_auc)
        metrics['Precision'].append(precision)
        metrics['Recall'].append(recall)
        metrics['MCC'].append(mcc)
        metrics['F1 Score'].append(f1)
        
        print(f"Fold {fold} - Optimal Threshold: {optimal_threshold:.3f}, Avg Precision: {avg_precision:.3f}, ROC-AUC: {roc_auc:.3f}, PR-AUC: {pr_auc:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, MCC: {mcc:.3f}, F1: {f1:.3f}")
        fold += 1
    
    # Calculate mean and standard deviation of each metric
    results[model_name] = {metric: (np.mean(values), np.std(values)) for metric, values in metrics.items()}

# Print overall results
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, (mean, std) in metrics.items():
        print(f"Mean {metric}: {mean:.3f}, Std Dev: {std:.3f}")

# Print mean metrics for each model
print("\nMean Metrics for Each Model (rounded to three decimal places):")
mean_metrics = {metric: [] for metric in metrics.keys()}
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, (mean, std) in metrics.items():
        mean_rounded = round(mean, 3)
        mean_metrics[metric].append(mean_rounded)
        print(f"Mean {metric}: {mean_rounded}, Std Dev: {std:.3f}")

print("\nMean Metric Scores for Each Model:")
for metric, scores in mean_metrics.items():
    print(f"{metric}: {scores}")

Evaluating Random Forest...
Fold 1 - Optimal Threshold: 0.042, Avg Precision: 0.943, ROC-AUC: 0.995, PR-AUC: 0.943, Precision: 0.658, Recall: 0.963, MCC: 0.791, F1: 0.782
Fold 2 - Optimal Threshold: 0.051, Avg Precision: 0.982, ROC-AUC: 1.000, PR-AUC: 0.982, Precision: 0.794, Recall: 1.000, MCC: 0.888, F1: 0.885
Fold 3 - Optimal Threshold: 0.034, Avg Precision: 0.918, ROC-AUC: 0.994, PR-AUC: 0.918, Precision: 0.405, Recall: 0.972, MCC: 0.616, F1: 0.572
Fold 4 - Optimal Threshold: 0.039, Avg Precision: 0.907, ROC-AUC: 0.995, PR-AUC: 0.907, Precision: 0.484, Recall: 0.954, MCC: 0.669, F1: 0.642
Fold 5 - Optimal Threshold: 0.034, Avg Precision: 0.921, ROC-AUC: 0.995, PR-AUC: 0.921, Precision: 0.387, Recall: 0.972, MCC: 0.601, F1: 0.553
Evaluating XGBoost...
Fold 1 - Optimal Threshold: 0.190, Avg Precision: 0.974, ROC-AUC: 0.993, PR-AUC: 0.974, Precision: 0.847, Recall: 0.972, MCC: 0.905, F1: 0.905
Fold 2 - Optimal Threshold: 0.188, Avg Precision: 0.985, ROC-AUC: 0.999, PR-AUC: 0.985, Prec