In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import (SelectKBest, mutual_info_classif, f_classif,
                                       SelectFromModel, VarianceThreshold)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import (classification_report, roc_auc_score, roc_curve)
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import os
import openpyxl  # Ensure this is installed for Excel writing

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Helpers and entry point for processing a single input workbook
def _build_feature_selectors(n_features, k_features=20):
    """Create the feature-selection strategies, keyed by display name.

    Parameters
    ----------
    n_features : int
        Number of columns in the feature matrix the selectors will be fit on.
    k_features : int, default 20
        Requested number of features for the top-k selectors. It is clamped
        to ``n_features`` so that small feature matrices are not rejected by
        ``SelectKBest(k=...)`` / ``SelectFromModel(max_features=...)``.

    Returns
    -------
    dict
        Mapping of selector name -> unfitted selector instance.
    """
    # Local import keeps this block self-contained without touching the
    # notebook's import cell.
    from functools import partial

    k = min(k_features, n_features)
    return {
        'Variance Threshold': VarianceThreshold(threshold=0.01),
        # mutual_info_classif is randomized; pin its seed so the selected
        # feature set is reproducible between runs.
        'Mutual Information': SelectKBest(
            score_func=partial(mutual_info_classif, random_state=42), k=k),
        'ANOVA F-test': SelectKBest(score_func=f_classif, k=k),
        'L1-based': SelectFromModel(
            LogisticRegression(C=1, penalty='l1', solver='liblinear',
                               max_iter=1000, random_state=42)),
        # max_features is only an upper bound here: SelectFromModel's default
        # importance threshold is still applied as well.
        'Tree-based': SelectFromModel(
            estimator=RandomForestClassifier(class_weight={0: 1, 1: 5}, random_state=42),
            max_features=k),
    }


def _build_models():
    """Create the classifiers to benchmark, keyed by display name.

    NOTE(review): the classifiers carry a 1:5 class weighting while the
    training data is additionally balanced with SMOTE before fitting --
    confirm that applying both corrections is intended.
    """
    return {
        'Logistic Regression': LogisticRegression(class_weight={0: 1, 1: 5}, max_iter=1000, random_state=42),
        'Random Forest': RandomForestClassifier(class_weight={0: 1, 1: 5}, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'CatBoost': CatBoostClassifier(verbose=0, class_weights=[1, 5], random_state=42),
        'XGBoost': XGBClassifier(scale_pos_weight=5, use_label_encoder=False, eval_metric='logloss', random_state=42),
        'LightGBM': lgb.LGBMClassifier(class_weight={0: 1, 1: 5}, random_state=42),
        'SVC': SVC(class_weight={0: 1, 1: 5}, probability=True, random_state=42),
        'K-Nearest Neighbors': KNeighborsClassifier()
    }


def _save_roc_curve(y_true, y_score, auc_value, title, out_path):
    """Plot the ROC curve for ``y_score`` against ``y_true`` and save it to ``out_path``."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    fig, ax = plt.subplots()
    try:
        ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {auc_value:.2f})')
        # Diagonal reference line (chance level).
        ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(title)
        ax.legend(loc="lower right")
        fig.savefig(out_path)
    finally:
        # Always release the figure; many figures are created per input file.
        plt.close(fig)


def process_file(file_path, output_dir, k_features=20):
    """Benchmark every feature-selector / classifier pair on one Excel dataset.

    The workbook is read with ``pd.read_excel``; it must contain an ``ID``
    column (dropped) and a ``label`` column (target). Remaining columns are
    one-hot encoded, split 70/30 with stratification, and the training part is
    oversampled with SMOTE. Each selector is fit on the balanced training data
    and each model is then trained on the selected features and scored on the
    untouched test split.

    Files written under ``output_dir``:
      * ``<basename>_<selector>_features.xlsx`` -- selected feature names
      * ``roc_curves/<basename>_<selector>_<model>_roc_curve.png``
      * ``<basename>_metrics.xlsx`` -- macro precision/recall/F1, accuracy, AUC

    Parameters
    ----------
    file_path : str
        Path of the Excel workbook to process.
    output_dir : str
        Directory that receives all outputs (created if missing).
    k_features : int, default 20
        Number of features requested from the top-k selectors; clamped to the
        number of available columns.

    Raises
    ------
    KeyError
        If the workbook lacks the ``label`` or ``ID`` column.
    """
    data = pd.read_excel(file_path)

    # Check label distribution
    label_distribution = data['label'].value_counts()
    print(f"Processing file: {file_path}")
    print("Label distribution:\n", label_distribution)

    # Drop the identifier column, then one-hot encode non-numeric features
    data_cleaned = data.drop(columns=['ID'])
    X = pd.get_dummies(data_cleaned.drop(columns=['label']), drop_first=True)
    y = data_cleaned['label']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=72, stratify=y)

    # Handle class imbalance using SMOTE (training split only)
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    feature_selection_methods = _build_feature_selectors(X.shape[1], k_features)
    models = _build_models()

    all_results = []
    base_name = os.path.basename(file_path)

    # Directory to save ROC curves (also creates output_dir itself)
    roc_output_dir = os.path.join(output_dir, 'roc_curves')
    os.makedirs(roc_output_dir, exist_ok=True)

    # Loop over feature selection methods and models
    for fs_name, selector in feature_selection_methods.items():
        print(f"\nUsing Feature Selection Method: {fs_name}")

        # Fit on the balanced training data only, then apply to both splits
        selector.fit(X_train_balanced, y_train_balanced)
        X_train_selected = selector.transform(X_train_balanced)
        X_test_selected = selector.transform(X_test)
        selected_features = X.columns[selector.get_support()]

        # Save selected features
        features_df = pd.DataFrame(selected_features, columns=["Selected Features"])
        features_output_path = os.path.join(output_dir, f"{base_name}_{fs_name}_features.xlsx")
        features_df.to_excel(features_output_path, index=False)
        print(f"Selected features saved to {features_output_path}")

        # A selector may keep nothing; no model can be trained on an empty
        # matrix, so report it and move on instead of crashing.
        if X_train_selected.shape[1] == 0:
            print(f"No features selected by {fs_name}; skipping model training.")
            continue

        for model_name, model in models.items():
            print(f"Training Model: {model_name}")

            # fit() retrains from scratch, so reusing the instance is safe
            model.fit(X_train_selected, y_train_balanced)
            y_pred = model.predict(X_test_selected)

            # Compute probabilities if supported
            if hasattr(model, 'predict_proba'):
                y_proba = model.predict_proba(X_test_selected)[:, 1]
                roc_auc = roc_auc_score(y_test, y_proba)
            else:
                y_proba = None
                roc_auc = None

            # Evaluate the model
            report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
            macro_avg = report['macro avg']
            all_results.append({
                'Model': model_name,
                'Feature Selection': fs_name,
                'Precision (Macro Avg)': macro_avg['precision'],
                'Recall (Macro Avg)': macro_avg['recall'],
                'F1-Score (Macro Avg)': macro_avg['f1-score'],
                'Accuracy': report['accuracy'],
                'AUC': roc_auc
            })

            # Plot ROC curve if probabilities are available
            if y_proba is not None:
                roc_curve_path = os.path.join(
                    roc_output_dir,
                    f"{base_name}_{fs_name}_{model_name}_roc_curve.png"
                )
                _save_roc_curve(
                    y_test, y_proba, roc_auc,
                    f'ROC Curve - {model_name} using {fs_name}',
                    roc_curve_path,
                )
                print(f"ROC curve saved to {roc_curve_path}")
            else:
                print(f"{model_name} does not support probability estimates, skipping ROC curve.")

    # Convert the list of results to a DataFrame
    all_results_df = pd.DataFrame(all_results)

    # Save all results to Excel
    results_file = os.path.join(output_dir, f"{base_name}_metrics.xlsx")
    all_results_df.to_excel(results_file, index=False)
    print(f"\nAll results saved to {results_file}")

    print(f"Processing for {file_path} completed.")

# Excel workbooks to run through process_file, in order
files = [
    r'e:\Label_2.xlsx',
    r'e:\Label_3.xlsx',
    r'e:\Label_4.xlsx',
    r'e:\Label_5.xlsx',
    r'e:\Label_6.xlsx',
]

# Destination folder for every generated artifact.
# Update this path to your desired output directory.
output_directory = r'e:\result step 1 label 2 radiomics with ROC and AUC'

# Run the pipeline once per workbook; all runs share the same output folder
for file in files:
    process_file(file_path=file, output_dir=output_directory)

Processing file: e:\Label_2.xlsx
Label distribution:
 label
0    1096
1     167
Name: count, dtype: int64

Using Feature Selection Method: Variance Threshold
Selected features saved to e:\result step 1 label 2 radiomics with ROC and AUC\Label_2.xlsx_Variance Threshold_features.xlsx
Training Model: Logistic Regression
ROC curve saved to e:\result step 1 label 2 radiomics with ROC and AUC\roc_curves\Label_2.xlsx_Variance Threshold_Logistic Regression_roc_curve.png
Training Model: Random Forest
ROC curve saved to e:\result step 1 label 2 radiomics with ROC and AUC\roc_curves\Label_2.xlsx_Variance Threshold_Random Forest_roc_curve.png
Training Model: Gradient Boosting
ROC curve saved to e:\result step 1 label 2 radiomics with ROC and AUC\roc_curves\Label_2.xlsx_Variance Threshold_Gradient Boosting_roc_curve.png
Training Model: CatBoost
ROC curve saved to e:\result step 1 label 2 radiomics with ROC and AUC\roc_curves\Label_2.xlsx_Variance Threshold_CatBoost_roc_curve.png
Training Model: XGB