In [1]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mutual_info_score, roc_curve, auc, confusion_matrix, classification_report, roc_auc_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm

# read data
#file path: Baseline (original) --- Baseline\6-4ADASYN-1\original; Baseline (standardized) --- Baseline\6-4ADASYN-1\standardized
train_df = pd.read_csv(r'C:\Users\37427\Desktop\github\FS-ML\Baseline\6-4ADASYN-1\standardized\train.csv')
test_df = pd.read_csv(r'C:\Users\37427\Desktop\github\FS-ML\Baseline\6-4ADASYN-1\standardized\test.csv')

# feature selection ---univariate logistic regression
np.random.seed(0)
def predict_proba(model, X):
    probabilities = model.predict(X)
    return np.column_stack((1 - probabilities, probabilities))

def perform_single_factor_analyses_and_select_features(train_df):
    significant_features = []
    num_features = len(train_df.columns) - 1  
    for i in range(1, num_features + 1): 
        feature_name = train_df.columns[i]
        X = train_df[[feature_name]]  
        y = train_df['Label']  
        X = sm.add_constant(X)  
        model = sm.Logit(y, X).fit()
        print(f"{feature_name} - univariate logistic regression:")
        print(model.summary())
        
        if model.pvalues[1] < 0.05:
            significant_features.append(feature_name)
    return significant_features

selected_features = perform_single_factor_analyses_and_select_features(train_df)
missing_features = [feature for feature in selected_features if feature not in test_df.columns]
if missing_features:
    raise ValueError(f"Missing features in test dataset: {missing_features}")
X_train_selected_top_ten = train_df[selected_features]
y_train = train_df['Label']
X_test_selected_top_ten = test_df[selected_features]
y_test = test_df['Label']

Optimization terminated successfully.
         Current function value: 0.567424
         Iterations 7
TLR-SUVmax - univariate logistic regression:
                           Logit Regression Results                           
Dep. Variable:                  Label   No. Observations:                  207
Model:                          Logit   Df Residuals:                      205
Method:                           MLE   Df Model:                            1
Date:                Mon, 02 Dec 2024   Pseudo R-squ.:                  0.1790
Time:                        16:56:11   Log-Likelihood:                -117.46
converged:                       True   LL-Null:                       -143.07
Covariance Type:            nonrobust   LLR p-value:                 8.205e-13
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0477      0.172      0.278      0.781      -0.

In [2]:
#Classifier --- Logistic Regression
np.random.seed(0)
param_grid = {
    'C': [0.1, 1.0, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'max_iter': [1000],
    'tol': [1e-4]
}
logistic = LogisticRegression()
grid_search = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected_top_ten, y_train)
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

y_train_pred_cv = cross_val_predict(best_model, X_train_selected_top_ten, y_train, cv=5)
y_train_pred_proba_cv = cross_val_predict(best_model, X_train_selected_top_ten, y_train, cv=5, method='predict_proba')[:, 1]
train_precision_cv, train_recall_cv, train_f1_cv, _ = precision_recall_fscore_support(y_train, y_train_pred_cv, average='binary')
y_test_pred = best_model.predict(X_test_selected_top_ten)
y_test_pred_proba = best_model.predict_proba(X_test_selected_top_ten)[:, 1]
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(y_test, y_test_pred, average='binary')

train_accuracy_cv = cross_val_score(best_model, X_train_selected_top_ten, y_train, cv=5, scoring='accuracy').mean()
test_accuracy = accuracy_score(y_test, y_test_pred)
conf_mat_train_cv = confusion_matrix(y_train, y_train_pred_cv)
conf_mat_test = confusion_matrix(y_test, y_test_pred)
sensitivity_train = conf_mat_train_cv[1, 1] / (conf_mat_train_cv[1, 1] + conf_mat_train_cv[1, 0])  # TP / (TP + FN)
sensitivity_test = conf_mat_test[1, 1] / (conf_mat_test[1, 1] + conf_mat_test[1, 0])  # TP / (TP + FN)
specificity_train = conf_mat_train_cv[0, 0] / (conf_mat_train_cv[0, 0] + conf_mat_train_cv[1, 0])
specificity_test = conf_mat_test[0, 0] / (conf_mat_test[0, 0] + conf_mat_test[1, 0])
ppv_train = conf_mat_train_cv[1, 1] / (conf_mat_train_cv[1, 1] + conf_mat_train_cv[0, 1])
npv_train = conf_mat_train_cv[0, 0] / (conf_mat_train_cv[0, 0] + conf_mat_train_cv[1, 0])
ppv_test = conf_mat_test[1, 1] / (conf_mat_test[1, 1] + conf_mat_test[0, 1])
npv_test = conf_mat_test[0, 0] / (conf_mat_test[0, 0] + conf_mat_test[1, 0])
train_precision = precision_score(y_train, y_train_pred_cv)
test_precision = precision_score(y_test, y_test_pred)
train_recall = recall_score(y_train, y_train_pred_cv)
test_recall = recall_score(y_test, y_test_pred)
train_f1 = f1_score(y_train, y_train_pred_cv)
test_f1 = f1_score(y_test, y_test_pred)
print(f"Sensitivity (Cross-Validation): {sensitivity_train:.4f}")
print(f"Specificity (Cross-Validation): {specificity_train:.4f}")
print(f"PPV (Cross-Validation): {ppv_train:.4f}")
print(f"NPV (Cross-Validation): {npv_train:.4f}")
print(f"Precision (Cross-Validation): {train_precision:.4f}")
print(f"Recall (Cross-Validation): {train_recall:.4f}")
print(f"F1 Score (Cross-Validation): {train_f1:.4f}")
print(f"Train Accuracy (Cross-Validation): {train_accuracy_cv:.4f}")
print(f"Sensitivity (Test): {sensitivity_test:.4f}")
print(f"Specificity (Test): {specificity_test:.4f}")
print(f"PPV (Test): {ppv_test:.4f}")
print(f"NPV (Test): {npv_test:.4f}")
print(f"Precision (Test): {test_precision:.4f}")
print(f"Recall (Test): {test_recall:.4f}")
print(f"F1 Score (Test): {test_f1:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred_proba_cv)
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred_proba)
np.random.seed(0)
def bootstrap_roc_auc(y_true, y_scores, n_bootstraps=1000):
    auc_scores = np.zeros(n_bootstraps)
    for i in range(n_bootstraps):
        indices = np.random.choice(len(y_true), size=len(y_true))
        auc_scores[i] = roc_auc_score(y_true[indices], y_scores[indices])
    return auc_scores

auc_scores_train = bootstrap_roc_auc(y_train, y_train_pred_proba_cv)
auc_train_mean = np.mean(auc_scores_train)
auc_train_std = np.std(auc_scores_train)
auc_train_lower = norm.ppf(0.025, loc=auc_train_mean, scale=auc_train_std)
auc_train_upper = norm.ppf(0.975, loc=auc_train_mean, scale=auc_train_std)
print(f"Cross-Validation AUC: {auc_train_mean:.4f} (95% CI: {auc_train_lower:.4f}, {auc_train_upper:.4f})")

auc_scores_test = bootstrap_roc_auc(y_test, y_test_pred_proba)
auc_test_mean = np.mean(auc_scores_test)
auc_test_std = np.std(auc_scores_test)
auc_test_lower = norm.ppf(0.025, loc=auc_test_mean, scale=auc_test_std)
auc_test_upper = norm.ppf(0.975, loc=auc_test_mean, scale=auc_test_std)
print(f"Test AUC: {auc_test_mean:.4f} (95% CI: {auc_test_lower:.4f}, {auc_test_upper:.4f})")

Best parameters: {'C': 1.0, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.0001}
Sensitivity (Cross-Validation): 0.5567
Specificity (Cross-Validation): 0.6972
PPV (Cross-Validation): 0.8308
NPV (Cross-Validation): 0.6972
Precision (Cross-Validation): 0.8308
Recall (Cross-Validation): 0.5567
F1 Score (Cross-Validation): 0.6667
Train Accuracy (Cross-Validation): 0.7381
Sensitivity (Test): 0.5263
Specificity (Test): 0.7831
PPV (Test): 0.7407
NPV (Test): 0.7831
Precision (Test): 0.7407
Recall (Test): 0.5263
F1 Score (Test): 0.6154
Test Accuracy: 0.7727
Cross-Validation AUC: 0.7785 (95% CI: 0.7145, 0.8426)
Test AUC: 0.8271 (95% CI: 0.7483, 0.9059)
