In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, roc_curve, precision_recall_fscore_support, accuracy_score
)
from sklearn.feature_selection import (
    SelectKBest, mutual_info_classif, f_classif,
    SelectFromModel, VarianceThreshold
)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Function to add Hard Voting and Stacking classifiers
def add_hard_voting_and_stacking(X_train, y_train, X_test, voting='hard'):
    """Fit a voting ensemble and a stacking ensemble, then predict on X_test.

    Both ensembles are built from the same three base estimators: a random
    forest, a logistic regression and an SVC.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Feature matrix to predict on.
        voting: Voting rule handed to ``VotingClassifier``. Defaults to
            ``'hard'`` (majority vote over predicted labels), matching this
            function's name. Pass ``'soft'`` to average the base estimators'
            predicted probabilities instead.

    Returns:
        Tuple ``(voting_clf, voting_preds, stacking_clf, stacking_preds)``:
        the two fitted ensembles and their predictions for ``X_test``.
    """
    # Base estimators shared by both ensembles. Each ensemble clones them
    # internally, so fitting one does not affect the other.
    base_estimators = [
        ('rf', RandomForestClassifier(random_state=42)),
        ('lr', LogisticRegression(random_state=42, max_iter=1000)),
        # probability=True keeps predict_proba available for soft voting and
        # for the stacking meta-features.
        ('svc', SVC(probability=True, random_state=42)),
    ]

    # Voting classifier. With voting='hard' the fitted ensemble does not
    # expose predict_proba, so probability-based metrics (ROC/AUC) cannot be
    # computed from it.
    voting_clf = VotingClassifier(estimators=base_estimators, voting=voting)
    voting_clf.fit(X_train, y_train)
    voting_preds = voting_clf.predict(X_test)

    # Stacking classifier: a logistic regression is trained on the
    # cross-validated (cv=5) outputs of the base estimators.
    meta_clf = LogisticRegression(random_state=42)
    stacking_clf = StackingClassifier(
        estimators=base_estimators, final_estimator=meta_clf, cv=5
    )
    stacking_clf.fit(X_train, y_train)
    stacking_preds = stacking_clf.predict(X_test)

    return voting_clf, voting_preds, stacking_clf, stacking_preds

# Function to save results and draw ROC plot
def save_results_and_draw_roc(clf, X_test, y_test, preds, file_name, output_dir):
    """Write the evaluation artefacts of one fitted classifier to ``output_dir``.

    Three files are produced, each prefixed with ``file_name``:

    * ``<file_name>_classification_report.xlsx`` - the scikit-learn
      classification report plus two extra rows, ``ROC_AUC`` and
      ``Overall_Accuracy``.
    * ``<file_name>_confusion_matrix.xlsx`` - the raw confusion matrix.
    * ``<file_name>_roc_curve.png`` - the ROC curve, written only when the
      classifier can score the test set with ``predict_proba``.

    Args:
        clf: Fitted classifier. Column 1 of ``predict_proba`` is used as the
            score, so the ROC/AUC part assumes a binary problem.
        X_test: Test feature matrix, in the form ``clf`` was trained on.
        y_test: True test labels.
        preds: Labels predicted by ``clf`` for ``X_test``.
        file_name: Prefix for every file written.
        output_dir: Destination directory; created if missing.

    Returns:
        None. Problems with the AUC or the ROC plot are printed, not raised.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Score the test set once; both the AUC and the ROC curve reuse the result.
    y_score = None
    score_error = None
    try:
        y_score = clf.predict_proba(X_test)[:, 1]
    except Exception as e:
        score_error = e

    # Compute ROC AUC if possible
    roc_auc = np.nan
    if y_score is None:
        print(f"Could not compute ROC AUC for {file_name}: {score_error}")
    else:
        try:
            roc_auc = roc_auc_score(y_test, y_score)
        except Exception as e:
            print(f"Could not compute ROC AUC for {file_name}: {e}")

    # Compute accuracy
    accuracy = accuracy_score(y_test, preds)

    # Save classification report with added AUC and Accuracy rows. The scalar
    # is repeated across the precision/recall/f1-score columns so the extra
    # rows fit the report's tabular layout.
    report = classification_report(y_test, preds, output_dict=True)
    report["ROC_AUC"] = {
        "precision": roc_auc, "recall": roc_auc,
        "f1-score": roc_auc, "support": ""
    }
    report["Overall_Accuracy"] = {
        "precision": accuracy, "recall": accuracy,
        "f1-score": accuracy, "support": ""
    }
    df_report = pd.DataFrame(report).transpose()
    df_report.to_excel(
        os.path.join(output_dir, f'{file_name}_classification_report.xlsx'),
        index=True
    )

    # Save confusion matrix
    df_conf_matrix = pd.DataFrame(confusion_matrix(y_test, preds))
    df_conf_matrix.to_excel(
        os.path.join(output_dir, f'{file_name}_confusion_matrix.xlsx'),
        index=False
    )

    # Without scores there is nothing to plot.
    if y_score is None:
        print(f"Could not generate ROC curve for {file_name}: {score_error}")
        return

    try:
        fpr, tpr, _ = roc_curve(y_test, y_score)
    except Exception as e:
        print(f"Could not generate ROC curve for {file_name}: {e}")
        return

    fig = plt.figure()
    try:
        plt.plot(
            fpr, tpr, color='blue', lw=2,
            label='ROC curve (area = %0.2f)' % roc_auc
        )
        plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{file_name} ROC Curve')
        plt.legend(loc='lower right')
        plt.savefig(
            os.path.join(output_dir, f'{file_name}_roc_curve.png')
        )
    except Exception as e:
        print(f"Could not generate ROC curve for {file_name}: {e}")
    finally:
        # Always release the figure, even when plotting or saving failed;
        # this function is called once per model/selector pair.
        plt.close(fig)

# Function to generate charts
def generate_charts(df_results, output_dir):
    """Save one grouped bar chart per metric comparing models and selectors.

    For each of Recall, Precision, F1-Score, AUC and Accuracy the results are
    pivoted so that feature-selection methods sit on the x-axis and every
    model becomes one bar series. Each chart is written to
    ``<output_dir>/<metric>_comparison_chart.png``.

    Args:
        df_results: DataFrame with the columns ``Model``,
            ``Feature_Selection`` and one column per metric listed above.
            Each (Feature_Selection, Model) pair must appear at most once,
            otherwise ``DataFrame.pivot`` raises.
        output_dir: Directory the PNG files are written to; created if
            missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    metrics = ['Recall', 'Precision', 'F1-Score', 'AUC', 'Accuracy']
    for metric in metrics:
        # Pivot before opening a figure so a bad table cannot leak one.
        pivot_table = df_results.pivot(
            index='Feature_Selection', columns='Model', values=metric
        )

        fig, ax = plt.subplots(figsize=(12, 8))
        try:
            pivot_table.plot(kind='bar', ax=ax)
            ax.set_title(
                f'{metric} Scores for Different Models and Feature Selection Methods'
            )
            ax.set_xlabel('Feature Selection Method')
            ax.set_ylabel(metric)
            # The legend sits outside the axes, so it must exist before the
            # layout is computed ...
            ax.legend(
                title='Model', bbox_to_anchor=(1.05, 1), loc='upper left'
            )
            fig.tight_layout()
            # ... and bbox_inches='tight' grows the saved canvas to include
            # it; otherwise the legend is cut off at the figure edge.
            fig.savefig(
                os.path.join(output_dir, f'{metric}_comparison_chart.png'),
                bbox_inches='tight'
            )
        finally:
            plt.close(fig)

# Function to process file with multiple models and feature selection methods
def process_file(file_path, output_dir, test_size=0.3, max_selected_features=20):
    """Train and evaluate every model under every feature-selection method.

    Pipeline: load the Excel sheet, one-hot encode the predictors, make a
    stratified train/test split, standardize using training statistics only,
    balance the training set with SMOTE, then for each feature selector fit
    each model and record macro precision/recall/F1, ROC AUC and accuracy on
    the untouched test set. Per-model artefacts, a combined results workbook
    (``combined_results.xlsx``) and comparison charts are written to
    ``output_dir``.

    Args:
        file_path: Path to the Excel workbook. It must contain a
            ``Cancer Status`` column (the target); a ``File Name`` column, if
            present, is dropped.
        output_dir: Directory receiving every output file; created if missing.
        test_size: Fraction of rows held out for testing.
        max_selected_features: Upper bound on the number of features kept by
            the SelectKBest / SelectFromModel selectors. It is clamped to the
            number of available columns so narrow datasets do not raise.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Load data
    data = pd.read_excel(file_path)
    data_cleaned = data.drop(columns=['File Name'], errors='ignore')
    X = pd.get_dummies(data_cleaned.drop(columns=['Cancer Status']), drop_first=True)
    y = data_cleaned['Cancer Status']

    # Split first, then scale: fitting the scaler on the full dataset would
    # leak the test rows' mean/variance into training.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Apply SMOTE for balancing (training data only; the test set keeps its
    # original class distribution).
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(
        X_train, y_train
    )

    # Never ask a selector for more features than exist.
    n_select = min(max_selected_features, X_train_balanced.shape[1])

    # Feature selection methods
    feature_selection_methods = {
        'SelectKBest_MI': SelectKBest(score_func=mutual_info_classif, k=n_select),
        'SelectKBest_ANOVA': SelectKBest(score_func=f_classif, k=n_select),
        # NOTE(review): inputs are standardized, so non-constant columns have
        # variance close to 1 and this threshold mostly drops constant
        # columns - confirm that is the intent.
        'VarianceThreshold': VarianceThreshold(threshold=0.1),
        'SelectFromModel_RF': SelectFromModel(
            RandomForestClassifier(random_state=42), max_features=n_select
        ),
        'SelectFromModel_ET': SelectFromModel(
            ExtraTreesClassifier(random_state=42), max_features=n_select
        )
    }

    # List of models to evaluate. Each estimator is refitted from scratch for
    # every feature-selection method.
    models = {
        "LDA_Classifier": LinearDiscriminantAnalysis(),
        # NOTE(review): scale_pos_weight=5 is applied on top of SMOTE-balanced
        # data - confirm the extra positive-class weighting is intended.
        "XGBoost": XGBClassifier(
            scale_pos_weight=5, random_state=42, eval_metric='logloss'
        ),
        "LightGBM": LGBMClassifier(random_state=42),
        "CatBoost": CatBoostClassifier(verbose=0, random_state=42),
        "GradientBoosting": GradientBoostingClassifier(random_state=42),
        "AdaBoost": AdaBoostClassifier(random_state=42),
        "MLP": MLPClassifier(
            hidden_layer_sizes=(100,), max_iter=1000, random_state=42
        )
    }

    # One row per (model, feature-selection) pair.
    result_columns = [
        'Model', 'Feature_Selection', 'Precision', 'Recall',
        'F1-Score', 'AUC', 'Accuracy'
    ]
    rows = []

    # Loop over feature selection methods
    for fs_name, selector in feature_selection_methods.items():
        print(f"\nUsing feature selection method: {fs_name}")

        # Fit the selector on training data only, then apply it to both sets
        X_train_selected = selector.fit(
            X_train_balanced, y_train_balanced
        ).transform(X_train_balanced)
        X_test_selected = selector.transform(X_test)

        # Train and evaluate each model
        for model_name, model in models.items():
            print(f"Training {model_name} with {fs_name}...")

            model.fit(X_train_selected, y_train_balanced)
            y_pred = model.predict(X_test_selected)

            # Calculate metrics using macro average
            precision, recall, f1, _ = precision_recall_fscore_support(
                y_test, y_pred, average='macro', zero_division=0
            )

            # Compute ROC AUC; best-effort in case predict_proba isn't available
            try:
                auc = roc_auc_score(y_test, model.predict_proba(X_test_selected)[:, 1])
            except Exception as e:
                auc = np.nan
                print(f"Could not compute AUC for {model_name} with {fs_name}: {e}")

            rows.append({
                'Model': model_name,
                'Feature_Selection': fs_name,
                'Precision': precision,
                'Recall': recall,
                'F1-Score': f1,
                'AUC': auc,
                'Accuracy': accuracy_score(y_test, y_pred),
            })

            # Save individual results
            model_fs_name = f"{model_name}_{fs_name}"
            save_results_and_draw_roc(
                model, X_test_selected, y_test, y_pred,
                model_fs_name, output_dir
            )

    # Convert results to DataFrame (explicit columns keep the sheet layout
    # stable even if no row was produced)
    df_results = pd.DataFrame(rows, columns=result_columns)
    df_results.to_excel(
        os.path.join(output_dir, 'combined_results.xlsx'), index=False
    )

    # Generate comparison charts
    generate_charts(df_results, output_dir)

    print("\nAll models trained and results saved!")

# Main function
def main():
    """Run the whole pipeline on the configured workbook.

    The two paths below are machine-specific and must be edited to match the
    local drive layout before running.
    """
    # Location of the input workbook - update with your file path.
    source_workbook = r"c:\Users\alire\Downloads\combinedoutput.xlsx"
    # Folder that will receive every report, chart and plot - update as needed.
    results_root = r"E:\step 2 deep feature_with ROC and AUC(0.3)"

    # Make sure the destination folder is there before any file is written.
    folder_missing = not os.path.exists(results_root)
    if folder_missing:
        os.makedirs(results_root)

    process_file(source_workbook, results_root)

# Entry point: run the pipeline when this code is executed directly (as a
# script or as a notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


Using feature selection method: SelectKBest_MI
Training LDA_Classifier with SelectKBest_MI...
Training XGBoost with SelectKBest_MI...
Training LightGBM with SelectKBest_MI...
[LightGBM] [Info] Number of positive: 3063, number of negative: 3063
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000817 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 6126, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training CatBoost with SelectKBest_MI...
Training GradientBoosting with SelectKBest_MI...
Training AdaBoost with SelectKBest_MI...




Training MLP with SelectKBest_MI...

Using feature selection method: SelectKBest_ANOVA
Training LDA_Classifier with SelectKBest_ANOVA...


  f = msb / msw


Training XGBoost with SelectKBest_ANOVA...
Training LightGBM with SelectKBest_ANOVA...
[LightGBM] [Info] Number of positive: 3063, number of negative: 3063
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001453 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 6126, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training CatBoost with SelectKBest_ANOVA...
Training GradientBoosting with SelectKBest_ANOVA...
Training AdaBoost with SelectKBest_ANOVA...




Training MLP with SelectKBest_ANOVA...

Using feature selection method: VarianceThreshold
Training LDA_Classifier with VarianceThreshold...
Training XGBoost with VarianceThreshold...
Training LightGBM with VarianceThreshold...
[LightGBM] [Info] Number of positive: 3063, number of negative: 3063
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 60162
[LightGBM] [Info] Number of data points in the train set: 6126, number of used features: 244
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training CatBoost with VarianceThreshold...
Training GradientBoosting with VarianceThreshold...
Training AdaBoost with VarianceThreshold...




Training MLP with VarianceThreshold...

Using feature selection method: SelectFromModel_RF
Training LDA_Classifier with SelectFromModel_RF...
Training XGBoost with SelectFromModel_RF...
Training LightGBM with SelectFromModel_RF...
[LightGBM] [Info] Number of positive: 3063, number of negative: 3063
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000975 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 6126, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training CatBoost with SelectFromModel_RF...
Training GradientBoosting with SelectFromModel_RF...
Training AdaBoost with SelectFromModel_RF...




Training MLP with SelectFromModel_RF...

Using feature selection method: SelectFromModel_ET
Training LDA_Classifier with SelectFromModel_ET...
Training XGBoost with SelectFromModel_ET...
Training LightGBM with SelectFromModel_ET...
[LightGBM] [Info] Number of positive: 3063, number of negative: 3063
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000732 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 6126, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training CatBoost with SelectFromModel_ET...
Training GradientBoosting with SelectFromModel_ET...
Training AdaBoost with SelectFromModel_ET...




Training MLP with SelectFromModel_ET...

All models trained and results saved!
