In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mutual_info_score,  roc_curve, auc, confusion_matrix, classification_report, roc_auc_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score
import matplotlib.pyplot as plt
from mrmr import mrmr_classif
from sklearn.feature_selection import mutual_info_classif
from scipy import stats
from scipy.stats import norm

# Read data
#file path: CT --- RFs_CT_bin50\6-4ADASYN-1; PET (original) --- RFs_PET1_bin0.25\6-4ADASYN-1\original; 
#           PET (pre-Combat or Limma) --- RFs_PET1_bin0.25\6-4ADASYN-1\pre-Combat or Limma; PET (standardized) --- RFs_PET2_bin0.05\6-4ADASYN-1; 
train_data = pd.read_csv(r'C:\Users\37427\Desktop\github\FS-ML\RFs_PET1_bin0.25\6-4ADASYN-1\pre-Combat\train.csv')
test_data = pd.read_csv(r'C:\Users\37427\Desktop\github\FS-ML\RFs_PET1_bin0.25\6-4ADASYN-1\pre-Combat\test.csv')

X_train = train_data.iloc[:, 1:] 
y_train = train_data.iloc[:, 0]  
X_test = test_data.iloc[:, 1:]   
y_test = test_data.iloc[:, 0]   

# Feature Selector ---- LASSO
alphas = np.logspace(-4, 0, 50)

lasso = Lasso(max_iter=10000, tol=1e-5, random_state=42)

param_grid = {
    'alpha': alphas,
    'tol': [1e-4, 1e-3]
}
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score (neg MSE):", -grid_search.best_score_)

best_lasso = grid_search.best_estimator_
coef_abs = np.abs(best_lasso.coef_)
coef_abs_with_index = list(zip(coef_abs, np.arange(len(coef_abs))))
coef_abs_with_index.sort(key=lambda x: x[0], reverse=True)
top_ten_indices = [index for _, index in coef_abs_with_index[:10]]

X_train_selected_top_ten = X_train.iloc[:, top_ten_indices]
X_test_selected_top_ten = X_test.iloc[:, top_ten_indices]
selected_features_top_ten = X_train.columns[top_ten_indices]
print("Selected Top Ten Features:", selected_features_top_ten)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END ...........................alpha=0.0001, tol=0.0001; total time=   0.0s
[CV] END ...........................alpha=0.0001, tol=0.0001; total time=   0.0s
[CV] END ...........................alpha=0.0001, tol=0.0001; total time=   0.0s
[CV] END ...........................alpha=0.0001, tol=0.0001; total time=   0.0s
[CV] END ...........................alpha=0.0001, tol=0.0001; total time=   0.0s
[CV] END ............................alpha=0.0001, tol=0.001; total time=   0.0s
[CV] END ............................alpha=0.0001, tol=0.001; total time=   0.0s
[CV] END ............................alpha=0.0001, tol=0.001; total time=   0.0s
[CV] END ............................alpha=0.0001, tol=0.001; total time=   0.0s
[CV] END ............................alpha=0.0001, tol=0.001; total time=   0.0s
[CV] END ...........alpha=0.00012067926406393288, tol=0.0001; total time=   0.0s
[CV] END ...........alpha=0.000120679264063932

[CV] END ............alpha=0.0006551285568595509, tol=0.0001; total time=   0.0s
[CV] END ............alpha=0.0006551285568595509, tol=0.0001; total time=   0.0s
[CV] END ............alpha=0.0006551285568595509, tol=0.0001; total time=   0.0s
[CV] END ............alpha=0.0006551285568595509, tol=0.0001; total time=   0.0s
[CV] END .............alpha=0.0006551285568595509, tol=0.001; total time=   0.0s
[CV] END .............alpha=0.0006551285568595509, tol=0.001; total time=   0.0s
[CV] END .............alpha=0.0006551285568595509, tol=0.001; total time=   0.0s
[CV] END .............alpha=0.0006551285568595509, tol=0.001; total time=   0.0s
[CV] END .............alpha=0.0006551285568595509, tol=0.001; total time=   0.0s
[CV] END ............alpha=0.0007906043210907702, tol=0.0001; total time=   0.0s
[CV] END ............alpha=0.0007906043210907702, tol=0.0001; total time=   0.0s
[CV] END ............alpha=0.0007906043210907702, tol=0.0001; total time=   0.0s
[CV] END ............alpha=0

[CV] END .............alpha=0.007543120063354615, tol=0.0001; total time=   0.0s
[CV] END ..............alpha=0.007543120063354615, tol=0.001; total time=   0.0s
[CV] END ..............alpha=0.007543120063354615, tol=0.001; total time=   0.0s
[CV] END ..............alpha=0.007543120063354615, tol=0.001; total time=   0.0s
[CV] END ..............alpha=0.007543120063354615, tol=0.001; total time=   0.0s
[CV] END ..............alpha=0.007543120063354615, tol=0.001; total time=   0.0s
[CV] END .............alpha=0.009102981779915217, tol=0.0001; total time=   0.0s
[CV] END .............alpha=0.009102981779915217, tol=0.0001; total time=   0.0s
[CV] END .............alpha=0.009102981779915217, tol=0.0001; total time=   0.0s
[CV] END .............alpha=0.009102981779915217, tol=0.0001; total time=   0.0s
[CV] END .............alpha=0.009102981779915217, tol=0.0001; total time=   0.0s
[CV] END ..............alpha=0.009102981779915217, tol=0.001; total time=   0.0s
[CV] END ..............alpha

[CV] END ...............alpha=0.05963623316594643, tol=0.001; total time=   0.0s
[CV] END ...............alpha=0.05963623316594643, tol=0.001; total time=   0.0s
[CV] END ..............alpha=0.07196856730011514, tol=0.0001; total time=   0.0s
[CV] END ..............alpha=0.07196856730011514, tol=0.0001; total time=   0.0s
[CV] END ..............alpha=0.07196856730011514, tol=0.0001; total time=   0.0s
[CV] END ..............alpha=0.07196856730011514, tol=0.0001; total time=   0.0s
[CV] END ..............alpha=0.07196856730011514, tol=0.0001; total time=   0.0s
[CV] END ...............alpha=0.07196856730011514, tol=0.001; total time=   0.0s
[CV] END ...............alpha=0.07196856730011514, tol=0.001; total time=   0.0s
[CV] END ...............alpha=0.07196856730011514, tol=0.001; total time=   0.0s
[CV] END ...............alpha=0.07196856730011514, tol=0.001; total time=   0.0s
[CV] END ...............alpha=0.07196856730011514, tol=0.001; total time=   0.0s
[CV] END ..............alpha

[CV] END ...............alpha=0.47148663634573895, tol=0.001; total time=   0.0s
[CV] END ...............alpha=0.47148663634573895, tol=0.001; total time=   0.0s
[CV] END ...............alpha=0.47148663634573895, tol=0.001; total time=   0.0s
[CV] END ...............alpha=0.5689866029018293, tol=0.0001; total time=   0.0s
[CV] END ...............alpha=0.5689866029018293, tol=0.0001; total time=   0.0s
[CV] END ...............alpha=0.5689866029018293, tol=0.0001; total time=   0.0s
[CV] END ...............alpha=0.5689866029018293, tol=0.0001; total time=   0.0s
[CV] END ...............alpha=0.5689866029018293, tol=0.0001; total time=   0.0s
[CV] END ................alpha=0.5689866029018293, tol=0.001; total time=   0.0s
[CV] END ................alpha=0.5689866029018293, tol=0.001; total time=   0.0s
[CV] END ................alpha=0.5689866029018293, tol=0.001; total time=   0.0s
[CV] END ................alpha=0.5689866029018293, tol=0.001; total time=   0.0s
[CV] END ................alp

In [2]:
#Classifier --- Logistic Regression
np.random.seed(0)
param_grid = {
    'C': [0.1, 1.0, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'max_iter': [1000],
    'tol': [1e-4]
}
logistic = LogisticRegression()
grid_search = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected_top_ten, y_train)
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

y_train_pred_cv = cross_val_predict(best_model, X_train_selected_top_ten, y_train, cv=5)
y_train_pred_proba_cv = cross_val_predict(best_model, X_train_selected_top_ten, y_train, cv=5, method='predict_proba')[:, 1]
train_precision_cv, train_recall_cv, train_f1_cv, _ = precision_recall_fscore_support(y_train, y_train_pred_cv, average='binary')
y_test_pred = best_model.predict(X_test_selected_top_ten)
y_test_pred_proba = best_model.predict_proba(X_test_selected_top_ten)[:, 1]
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(y_test, y_test_pred, average='binary')

train_accuracy_cv = cross_val_score(best_model, X_train_selected_top_ten, y_train, cv=5, scoring='accuracy').mean()
test_accuracy = accuracy_score(y_test, y_test_pred)
conf_mat_train_cv = confusion_matrix(y_train, y_train_pred_cv)
conf_mat_test = confusion_matrix(y_test, y_test_pred)
sensitivity_train = conf_mat_train_cv[1, 1] / (conf_mat_train_cv[1, 1] + conf_mat_train_cv[1, 0])  # TP / (TP + FN)
sensitivity_test = conf_mat_test[1, 1] / (conf_mat_test[1, 1] + conf_mat_test[1, 0])  # TP / (TP + FN)
specificity_train = conf_mat_train_cv[0, 0] / (conf_mat_train_cv[0, 0] + conf_mat_train_cv[1, 0])
specificity_test = conf_mat_test[0, 0] / (conf_mat_test[0, 0] + conf_mat_test[1, 0])
ppv_train = conf_mat_train_cv[1, 1] / (conf_mat_train_cv[1, 1] + conf_mat_train_cv[0, 1])
npv_train = conf_mat_train_cv[0, 0] / (conf_mat_train_cv[0, 0] + conf_mat_train_cv[1, 0])
ppv_test = conf_mat_test[1, 1] / (conf_mat_test[1, 1] + conf_mat_test[0, 1])
npv_test = conf_mat_test[0, 0] / (conf_mat_test[0, 0] + conf_mat_test[1, 0])
train_precision = precision_score(y_train, y_train_pred_cv)
test_precision = precision_score(y_test, y_test_pred)
train_recall = recall_score(y_train, y_train_pred_cv)
test_recall = recall_score(y_test, y_test_pred)
train_f1 = f1_score(y_train, y_train_pred_cv)
test_f1 = f1_score(y_test, y_test_pred)
print(f"Sensitivity (Cross-Validation): {sensitivity_train:.4f}")
print(f"Specificity (Cross-Validation): {specificity_train:.4f}")
print(f"PPV (Cross-Validation): {ppv_train:.4f}")
print(f"NPV (Cross-Validation): {npv_train:.4f}")
print(f"Precision (Cross-Validation): {train_precision:.4f}")
print(f"Recall (Cross-Validation): {train_recall:.4f}")
print(f"F1 Score (Cross-Validation): {train_f1:.4f}")
print(f"Train Accuracy (Cross-Validation): {train_accuracy_cv:.4f}")
print(f"Sensitivity (Test): {sensitivity_test:.4f}")
print(f"Specificity (Test): {specificity_test:.4f}")
print(f"PPV (Test): {ppv_test:.4f}")
print(f"NPV (Test): {npv_test:.4f}")
print(f"Precision (Test): {test_precision:.4f}")
print(f"Recall (Test): {test_recall:.4f}")
print(f"F1 Score (Test): {test_f1:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred_proba_cv)
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred_proba)
np.random.seed(0)
def bootstrap_roc_auc(y_true, y_scores, n_bootstraps=1000):
    auc_scores = np.zeros(n_bootstraps)
    for i in range(n_bootstraps):
        indices = np.random.choice(len(y_true), size=len(y_true))
        auc_scores[i] = roc_auc_score(y_true[indices], y_scores[indices])
    return auc_scores

auc_scores_train = bootstrap_roc_auc(y_train, y_train_pred_proba_cv)
auc_train_mean = np.mean(auc_scores_train)
auc_train_std = np.std(auc_scores_train)
auc_train_lower = norm.ppf(0.025, loc=auc_train_mean, scale=auc_train_std)
auc_train_upper = norm.ppf(0.975, loc=auc_train_mean, scale=auc_train_std)
print(f"Cross-Validation AUC: {auc_train_mean:.4f} (95% CI: {auc_train_lower:.4f}, {auc_train_upper:.4f})")

auc_scores_test = bootstrap_roc_auc(y_test, y_test_pred_proba)
auc_test_mean = np.mean(auc_scores_test)
auc_test_std = np.std(auc_scores_test)
auc_test_lower = norm.ppf(0.025, loc=auc_test_mean, scale=auc_test_std)
auc_test_upper = norm.ppf(0.975, loc=auc_test_mean, scale=auc_test_std)
print(f"Test AUC: {auc_test_mean:.4f} (95% CI: {auc_test_lower:.4f}, {auc_test_upper:.4f})")

Best parameters: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear', 'tol': 0.0001}
Sensitivity (Cross-Validation): 0.7423
Specificity (Cross-Validation): 0.7748
PPV (Cross-Validation): 0.7500
NPV (Cross-Validation): 0.7748
Precision (Cross-Validation): 0.7500
Recall (Cross-Validation): 0.7423
F1 Score (Cross-Validation): 0.7461
Train Accuracy (Cross-Validation): 0.7632
Sensitivity (Test): 0.5526
Specificity (Test): 0.7500
PPV (Test): 0.5000
NPV (Test): 0.7500
Precision (Test): 0.5000
Recall (Test): 0.5526
F1 Score (Test): 0.5250
Test Accuracy: 0.6545
Cross-Validation AUC: 0.8121 (95% CI: 0.7545, 0.8697)
Test AUC: 0.7077 (95% CI: 0.6033, 0.8121)
