In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mutual_info_score,  roc_curve, auc, confusion_matrix, classification_report, roc_auc_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score
import matplotlib.pyplot as plt
from mrmr import mrmr_classif
from sklearn.feature_selection import mutual_info_classif
from scipy import stats
from scipy.stats import norm

# Read data
#file path: CT --- RFs_CT_bin50\6-4ADASYN-1; PET (original) --- RFs_PET1_bin0.25\6-4ADASYN-1\original; 
#           PET (pre-Combat or Limma) --- RFs_PET1_bin0.25\6-4ADASYN-1\pre-Combat or Limma; PET (standardized) --- RFs_PET2_bin0.05\6-4ADASYN-1; 
train_data = pd.read_csv(r'C:\Users\37427\Desktop\github\FS-ML\RFs_PET1_bin0.25\6-4ADASYN-1\pre-Combat\train.csv')
test_data = pd.read_csv(r'C:\Users\37427\Desktop\github\FS-ML\RFs_PET1_bin0.25\6-4ADASYN-1\pre-Combat\test.csv')

X_train = train_data.iloc[:, 1:] 
y_train = train_data.iloc[:, 0]  
X_test = test_data.iloc[:, 1:]   
y_test = test_data.iloc[:, 0]   

# Feature Selector ---- MRMR
try:
    selected_features = mrmr_classif(X=X_train, y=y_train, K=10, n_jobs=1)
    print("Selected features names:", selected_features)
    X_train_selected_top_ten = X_train[selected_features]
    X_test_selected_top_ten = X_test[selected_features]
except Exception as e:
    print("An error occurred:", e)

print("Selected features names:", selected_features)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 83.65it/s]

Selected features names: ['log.sigma.1.mm.3D_glcm_Correlation', 'wavelet.HHL_firstorder_Skewness', 'wavelet.HHH_glszm_GrayLevelVariance', 'wavelet.LLH_firstorder_Kurtosis', 'log.sigma.1.mm.3D_glcm_Imc1', 'log.sigma.2.mm.3D_glrlm_LowGrayLevelRunEmphasis', 'wavelet.LLH_glcm_Correlation', 'wavelet.HLL_glrlm_LongRunEmphasis', 'wavelet.LLH_glszm_GrayLevelVariance', 'original_glszm_LowGrayLevelZoneEmphasis']
Selected features names: ['log.sigma.1.mm.3D_glcm_Correlation', 'wavelet.HHL_firstorder_Skewness', 'wavelet.HHH_glszm_GrayLevelVariance', 'wavelet.LLH_firstorder_Kurtosis', 'log.sigma.1.mm.3D_glcm_Imc1', 'log.sigma.2.mm.3D_glrlm_LowGrayLevelRunEmphasis', 'wavelet.LLH_glcm_Correlation', 'wavelet.HLL_glrlm_LongRunEmphasis', 'wavelet.LLH_glszm_GrayLevelVariance', 'original_glszm_LowGrayLevelZoneEmphasis']





In [2]:
#Classifier --- categorical boosting
np.random.seed(0)
class WrappedCatBoost(BaseEstimator, ClassifierMixin):
    def __init__(self, iterations=100, learning_rate=0.1, depth=6, l2_leaf_reg=1, border_count=1, random_seed=0):
        self.iterations = iterations
        self.learning_rate = learning_rate
        self.depth = depth
        self.l2_leaf_reg = l2_leaf_reg
        self.border_count = border_count
        self.random_seed = random_seed
        self.model_ = None

    def _prepare_params(self, params):
        return {**self.get_params(), **params}

    def fit(self, X, y, **fit_params):
        self.model_ = CatBoostClassifier(
            iterations=self.iterations,
            learning_rate=self.learning_rate,
            depth=self.depth,
            l2_leaf_reg=self.l2_leaf_reg,
            border_count=self.border_count,
            random_seed=self.random_seed,
            verbose=False
        )
        self.model_.fit(X, y, **fit_params)
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def predict_proba(self, X):
        return self.model_.predict_proba(X)
    
param_grid = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 6, 10],
    'l2_leaf_reg': [1, 3, 5],
    'border_count': [16, 64, 128]
}
catboost = WrappedCatBoost()
grid_search = GridSearchCV(estimator=catboost, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected_top_ten, y_train)
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

y_train_pred_cv = cross_val_predict(best_model, X_train_selected_top_ten, y_train, cv=5)
y_train_pred_proba_cv = cross_val_predict(best_model, X_train_selected_top_ten, y_train, cv=5, method='predict_proba')[:, 1]
train_precision_cv, train_recall_cv, train_f1_cv, _ = precision_recall_fscore_support(y_train, y_train_pred_cv, average='binary')
y_test_pred = best_model.predict(X_test_selected_top_ten)
y_test_pred_proba = best_model.predict_proba(X_test_selected_top_ten)[:, 1]
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(y_test, y_test_pred, average='binary')

train_accuracy_cv = cross_val_score(best_model, X_train_selected_top_ten, y_train, cv=5, scoring='accuracy').mean()
test_accuracy = accuracy_score(y_test, y_test_pred)
conf_mat_train_cv = confusion_matrix(y_train, y_train_pred_cv)
conf_mat_test = confusion_matrix(y_test, y_test_pred)
sensitivity_train = conf_mat_train_cv[1, 1] / (conf_mat_train_cv[1, 1] + conf_mat_train_cv[1, 0])  # TP / (TP + FN)
sensitivity_test = conf_mat_test[1, 1] / (conf_mat_test[1, 1] + conf_mat_test[1, 0])  # TP / (TP + FN)
specificity_train = conf_mat_train_cv[0, 0] / (conf_mat_train_cv[0, 0] + conf_mat_train_cv[1, 0])
specificity_test = conf_mat_test[0, 0] / (conf_mat_test[0, 0] + conf_mat_test[1, 0])
ppv_train = conf_mat_train_cv[1, 1] / (conf_mat_train_cv[1, 1] + conf_mat_train_cv[0, 1])
npv_train = conf_mat_train_cv[0, 0] / (conf_mat_train_cv[0, 0] + conf_mat_train_cv[1, 0])
ppv_test = conf_mat_test[1, 1] / (conf_mat_test[1, 1] + conf_mat_test[0, 1])
npv_test = conf_mat_test[0, 0] / (conf_mat_test[0, 0] + conf_mat_test[1, 0])
train_precision = precision_score(y_train, y_train_pred_cv)
test_precision = precision_score(y_test, y_test_pred)
train_recall = recall_score(y_train, y_train_pred_cv)
test_recall = recall_score(y_test, y_test_pred)
train_f1 = f1_score(y_train, y_train_pred_cv)
test_f1 = f1_score(y_test, y_test_pred)
print(f"Sensitivity (Cross-Validation): {sensitivity_train:.4f}")
print(f"Specificity (Cross-Validation): {specificity_train:.4f}")
print(f"PPV (Cross-Validation): {ppv_train:.4f}")
print(f"NPV (Cross-Validation): {npv_train:.4f}")
print(f"Precision (Cross-Validation): {train_precision:.4f}")
print(f"Recall (Cross-Validation): {train_recall:.4f}")
print(f"F1 Score (Cross-Validation): {train_f1:.4f}")
print(f"Train Accuracy (Cross-Validation): {train_accuracy_cv:.4f}")
print(f"Sensitivity (Test): {sensitivity_test:.4f}")
print(f"Specificity (Test): {specificity_test:.4f}")
print(f"PPV (Test): {ppv_test:.4f}")
print(f"NPV (Test): {npv_test:.4f}")
print(f"Precision (Test): {test_precision:.4f}")
print(f"Recall (Test): {test_recall:.4f}")
print(f"F1 Score (Test): {test_f1:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred_proba_cv)
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred_proba)
np.random.seed(0)
def bootstrap_roc_auc(y_true, y_scores, n_bootstraps=1000):
    auc_scores = np.zeros(n_bootstraps)
    for i in range(n_bootstraps):
        indices = np.random.choice(len(y_true), size=len(y_true))
        auc_scores[i] = roc_auc_score(y_true[indices], y_scores[indices])
    return auc_scores

auc_scores_train = bootstrap_roc_auc(y_train, y_train_pred_proba_cv)
auc_train_mean = np.mean(auc_scores_train)
auc_train_std = np.std(auc_scores_train)
auc_train_lower = norm.ppf(0.025, loc=auc_train_mean, scale=auc_train_std)
auc_train_upper = norm.ppf(0.975, loc=auc_train_mean, scale=auc_train_std)
print(f"Cross-Validation AUC: {auc_train_mean:.4f} (95% CI: {auc_train_lower:.4f}, {auc_train_upper:.4f})")

auc_scores_test = bootstrap_roc_auc(y_test, y_test_pred_proba)
auc_test_mean = np.mean(auc_scores_test)
auc_test_std = np.std(auc_scores_test)
auc_test_lower = norm.ppf(0.025, loc=auc_test_mean, scale=auc_test_std)
auc_test_upper = norm.ppf(0.975, loc=auc_test_mean, scale=auc_test_std)
print(f"Test AUC: {auc_test_mean:.4f} (95% CI: {auc_test_lower:.4f}, {auc_test_upper:.4f})")

Best parameters: {'border_count': 16, 'depth': 10, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.1}
Sensitivity (Cross-Validation): 0.9072
Specificity (Cross-Validation): 0.9062
PPV (Cross-Validation): 0.7928
NPV (Cross-Validation): 0.9062
Precision (Cross-Validation): 0.7928
Recall (Cross-Validation): 0.9072
F1 Score (Cross-Validation): 0.8462
Train Accuracy (Cross-Validation): 0.8452
Sensitivity (Test): 0.6316
Specificity (Test): 0.8082
PPV (Test): 0.6486
NPV (Test): 0.8082
Precision (Test): 0.6486
Recall (Test): 0.6316
F1 Score (Test): 0.6400
Test Accuracy: 0.7545
Cross-Validation AUC: 0.9313 (95% CI: 0.8994, 0.9632)
Test AUC: 0.8404 (95% CI: 0.7656, 0.9151)
