In [1]:
# Data handling
import numpy as np
import pandas as pd
import re

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import graphviz
import optuna
import optuna.visualization as vis
%matplotlib inline

# Stats
from statsmodels.graphics.gofplots import qqplot
import statsmodels.api as sm
import scipy.stats as st
from scipy.stats import shapiro, norm, chi2_contingency, kstest, boxcox

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder,PowerTransformer
from sklearn.compose import ColumnTransformer

# Models
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeRegressor, export_graphviz

from sklearn.svm import SVC
# from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Metrics
from sklearn.metrics import  accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, classification_report #Classifier
from sklearn.inspection import permutation_importance
# from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error # Regressor

#lib

#
# from wordcloud import WordCloud,STOPWORDS
from ast import literal_eval
from collections import Counter

# os
import os

import pickle

# time
import time

import warnings
# warnings.filterwarnings("ignore")    # (Optional)

print("Project has been created with Pandas: " ,pd. __version__," And with Numpy: ",np. __version__)

ModuleNotFoundError: No module named 'graphviz'

### Loading

In [5]:
import yaml

# Load the project configuration; later cells read every data/model path from `config`.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only a missing file is reported with this message. Other failures (e.g. a
    # malformed YAML document) propagate with their own traceback instead of
    # being mislabelled as "not found".
    print("Yaml configuration file not found!")
    # Re-raise: continuing would only fail later with a NameError on `config`.
    raise

: 

In [None]:
config

: 

In [None]:
df = pd.read_csv(config["data"]["clean"]["file_eda_cleaned"])
# df = df.sort_values(by = ["ext_install_count", "ext_rating"], ascending= False)
df.head()

: 

### 5. Preprocessing 

In [None]:
df.info()

: 

- Handle duplicates

In [None]:
df.duplicated().sum()

: 

In [None]:
df = df.drop_duplicates()
df

: 

- Handle missing values

In [None]:
df.isna().sum()

: 

- Feature transformation/Transform values

Handle high-cardinality categorical features by grouping rare categories

In [None]:
df["ext_categories"].value_counts(normalize=True)

: 

In [None]:
df["repo_languages"].value_counts(normalize=True)

: 

In [14]:
threshold_ext_categories = 0.1  # categories at or below 10% relative frequency are treated as rare
value_counts_ext_categories = df["ext_categories"].value_counts(normalize=True)
# Use a dedicated name for this rare set: the repo-language cell builds its own,
# and sharing one global would change what this function treats as rare once
# that cell has run.
rare_ext_categories = value_counts_ext_categories[
    value_counts_ext_categories <= threshold_ext_categories
].index


def transform_ext_categories(x):
    """Map one raw ``ext_categories`` value to its grouped label.

    The value is compared through its string form:
    - ``"Other"`` becomes ``"Unknown"``;
    - any label in ``rare_ext_categories`` becomes ``"Others"``;
    - everything else is returned unchanged (the original object, not the string).
    """
    text = str(x)
    if text == "Other":
        return "Unknown"
    if text in rare_ext_categories:
        return "Others"
    return x


df["ext_categories_grouped"] = df["ext_categories"].apply(transform_ext_categories)

: 

In [15]:
threshold_repo_languages = 0.05  # languages at or below 5% relative frequency are treated as rare
value_counts_repo_languages = df["repo_languages"].value_counts(normalize=True)
# Use a dedicated name for this rare set so it never overwrites (or is
# overwritten by) the rare set computed for another column.
rare_repo_languages = value_counts_repo_languages[
    value_counts_repo_languages <= threshold_repo_languages
].index


def transform_repo_languages(x):
    """Map one raw ``repo_languages`` value to its grouped label.

    The value is compared through its string form:
    - ``"other"`` and ``"unknown"`` both become ``"unknown"``;
    - any label in ``rare_repo_languages`` becomes ``"others"``;
    - everything else is returned unchanged (the original object, not the string).
    """
    text = str(x)
    if text in ["other", "unknown"]:
        return "unknown"
    if text in rare_repo_languages:
        return "others"
    return x


df["repo_languages_grouped"] = df["repo_languages"].apply(transform_repo_languages)

: 

In [16]:
# threshold_ext_categories = 0.1  # e.g., categories below 10% frequency
# value_counts_ext_categories = df['ext_categories'].value_counts(normalize=True)
# rare_categories = value_counts_ext_categories[value_counts_ext_categories.values <= threshold_ext_categories].index
# df['ext_categories_grouped'] = df['ext_categories'].apply(lambda x: 'Rest' if x in rare_categories else x)


: 

In [17]:
# threshold_repo_languages = 0.03  # e.g., categories below 3% frequency
# value_counts_repo_languages = df["repo_languages"].value_counts(normalize=True)
# rare_categories = value_counts_repo_languages[value_counts_repo_languages.values <= threshold_repo_languages].index
# df["repo_languages_grouped"] = df["repo_languages"].apply(lambda x: 'rest' if x in rare_categories else x)

: 

In [None]:
df["ext_categories_grouped"].value_counts(normalize=True)

: 

In [None]:
df["repo_languages_grouped"].value_counts(normalize=True)

: 

In [None]:
df = df.drop(columns=["repo_languages", "ext_categories"])
df

: 

- Convert target to number

In [21]:
# Encode the boolean target as integers (True -> 1, False -> 0).
# NOTE(review): Series.map with a dict turns any value that is not a key of the
# mapping into NaN — confirm "verified" holds only booleans at this point.
df["verified"] = df["verified"].map({True: 1, False:0})

: 

- Get number and category columns

In [None]:
# Numeric columns with fewer than 10 distinct values are candidates for being
# treated as categorical; the target column is excluded from that set.
numeric_df = df.select_dtypes("number")
low_cardinality_mask = numeric_df.nunique() < 10
potential_categorical_from_numerical = numeric_df.loc[:, low_cardinality_mask].drop(columns="verified")
potential_categorical_from_numerical

: 

In [23]:

# Categorical view: object columns plus the low-cardinality numeric columns.
object_df = df.select_dtypes("object")
df_categorical = pd.concat([object_df, potential_categorical_from_numerical], axis=1)

# Numerical view: every numeric column that was not reclassified as categorical.
reclassified_cols = potential_categorical_from_numerical.columns
df_numerical = df.select_dtypes("number").drop(columns=reclassified_cols)

: 

In [None]:
df.info()

: 

In [25]:
cols_num = df_numerical.columns.to_list()

: 

In [None]:
# Numeric feature names: the target and 'total_vulners' are left out.
excluded_numeric = ["verified", "total_vulners"]
cols_num = df_numerical.drop(columns=excluded_numeric).columns.to_list()
# Categorical feature names.
cols_cat = df_categorical.columns.to_list()
cols_num, cols_cat

: 

#### Splitting train/test data

In [27]:
features = df.drop(columns=["verified"])
target = df["verified"]
# Split before fitting any transformer so nothing is learned from the test rows.
# Stratify on the target: the notebook treats the classes as imbalanced (see the
# SMOTE section), so an unstratified 20% sample could misrepresent the minority
# class in the evaluation set.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=0,
    stratify=target,
)

: 

In [None]:
X_train.shape, X_test.shape

: 

In [None]:
df

: 

### Feature Engineering

OHE: for nominal categorical features

In [30]:
nominal_cols = ["repo_languages_grouped", "ext_categories_grouped"]

# The fitted encoder is pickled for reuse, so it must tolerate categories that
# were absent from the training split: with handle_unknown="ignore" such a value
# encodes to an all-zero row instead of raising at transform time. Output for
# categories seen during fit is unchanged.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
ohe.fit(X_train[nominal_cols])  # fit on the training split only
X_train_trans_nom_np = ohe.transform(X_train[nominal_cols])
X_test_trans_nom_np = ohe.transform(X_test[nominal_cols])

# Wrap the arrays back into frames that keep the original row index so they can
# be concatenated with the other feature groups later.
ohe_feature_names = ohe.get_feature_names_out()
X_train_nom_trans_df = pd.DataFrame(X_train_trans_nom_np, columns=ohe_feature_names, index=X_train.index)
X_test_nom_trans_df = pd.DataFrame(X_test_trans_nom_np, columns=ohe_feature_names, index=X_test.index)

: 

In [None]:
X_train_nom_trans_df

: 

In [32]:
# Persist the fitted one-hot encoder at the path given in the configuration.
ohe_path = config["model"]["preprocessing"]["file_ohe"]
with open(ohe_path, "wb") as file:
    pickle.dump(ohe, file)

: 

Transform the numeric columns (`cols_num`) towards a normal distribution

In [None]:
# One QQ-plot per numeric feature, laid out on a two-column grid.
n_rows = int(np.ceil(len(cols_num) / 2))
fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(8, 10), squeeze=False)
axes = axes.ravel()  # flat array so it can be indexed and sliced

for i, col in enumerate(cols_num):
    sm.qqplot(X_test[col],
              line="s",
              ax=axes[i])

    axes[i].set_title(col, fontsize=10, fontweight="bold", color="black")

# Remove whatever grid cells were not used (odd number of features) instead of
# deleting a hard-coded axis index.
for unused_ax in axes[len(cols_num):]:
    fig.delaxes(unused_ax)

fig.suptitle("QQ-Plots before Transforming", fontsize=12, fontweight="bold", color="darkblue")
fig.tight_layout()
# plt.show() rather than fig.show(): the latter targets GUI backends and can
# emit a non-GUI-backend warning under the inline backend.
plt.show()

: 

Power transform

In [34]:
# Yeo-Johnson power transform of the numeric features.
X_train_num = X_train[cols_num]
X_test_num = X_test[cols_num]

# The transformer is fitted on the training split only and then applied to both splits.
pt = PowerTransformer(method="yeo-johnson")
pt.fit(X_train_num)

X_train_num_trans = pt.transform(X_train_num)
X_test_num_trans = pt.transform(X_test_num)

# Rebuild frames with the original column names and row index.
X_train_num_trans_df = pd.DataFrame(
    X_train_num_trans,
    columns=X_train_num.columns,
    index=X_train_num.index,
)
X_test_num_trans_df = pd.DataFrame(
    X_test_num_trans,
    columns=X_test_num.columns,
    index=X_test_num.index,
)

: 

Normalize transform

In [35]:
# #Normalizer
# normalizer = MinMaxScaler()
# X_train_num = X_train[cols_num]
# normalizer.fit(X_train_num)
# X_test_num  = X_test[cols_num]

# X_train_trans = normalizer.transform(X_train_num)
# X_test_trans = normalizer.transform(X_test_num)

# X_train_trans = pd.DataFrame(X_train_trans, columns=X_train_num.columns, index=X_train_num.index)
# X_test_trans = pd.DataFrame(X_test_trans, columns=X_test_num.columns, index=X_test_num.index)

: 

Log Transform

In [36]:
# #Log-transform
# for col in cols_num:
#     df[col] = np.log1p(df[col])
    # X_train_trans = pt.transform(X_train_num)
    # X_test_trans = pt.transform(X_test_num)

: 

In [None]:
# One histogram (with KDE overlay) per transformed numeric feature, two per row.
n_rows = int(np.ceil(len(cols_num) / 2))
fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(8, 10), squeeze=False)
axes = axes.ravel()  # flat array so it can be indexed and sliced

for i, col in enumerate(cols_num):
    sns.histplot(X_test_num_trans_df[col],
                 kde=True,
                 bins=20,
                 color="orange",
                 ax=axes[i])

    # Alternative view (QQ-plot of the transformed feature):
    # sm.qqplot(X_test_num_trans_df[col],
    #        line = "s",
    #        ax = axes[i]);

    axes[i].set_title(col, fontsize=10, fontweight="bold", color="black")

# Remove whatever grid cells were not used (odd number of features).
for unused_ax in axes[len(cols_num):]:
    fig.delaxes(unused_ax)

# The panels are histograms, so the title says so (it used to read "QQ-Plots").
fig.suptitle("Histograms after Transforming", fontsize=12, fontweight="bold", color="darkblue")
fig.tight_layout()
# plt.show() rather than fig.show(): the latter targets GUI backends and can
# emit a non-GUI-backend warning under the inline backend.
plt.show()


: 

Combining

In [38]:
# These two columns are carried over unchanged; copies are taken so later edits
# do not touch X_train / X_test.
ordinal_cols = ["ext_rating_category", "ext_version_category"]
X_train_ord_trans_df = X_train[ordinal_cols].copy()
X_test_ord_trans_df = X_test[ordinal_cols].copy()

: 

In [39]:
# Column-wise concatenation of the three feature groups (power-transformed
# numeric, one-hot encoded, carried-over category columns), aligned on the row index.
train_parts = [X_train_num_trans_df, X_train_nom_trans_df, X_train_ord_trans_df]
test_parts = [X_test_num_trans_df, X_test_nom_trans_df, X_test_ord_trans_df]

X_train_trans = pd.concat(train_parts, axis=1)
X_test_trans = pd.concat(test_parts, axis=1)


: 

In [40]:
# Stack the train rows on top of the test rows, for features and target alike.
X_trans = pd.concat([X_train_trans, X_test_trans])
y_trans = pd.concat([y_train, y_test])

: 

In [41]:
# df_trans = pd.concat([X_trans,y_trans], axis = 1).reset_index(drop= True)
# df_trans

: 

In [42]:
# X_train_trans.shape[0] == X_train.shape[0]
# X_test_trans.shape[0] == X_test.shape[0]

: 

In [43]:
# X_train_corr = pd.concat([X_train_trans, y_train], axis=1)

: 

Feature Selection

In [None]:
# Absolute Pearson correlation between the transformed training features.
corr = np.abs(X_train_trans.corr(method="pearson"))

# Mask the upper triangle (diagonal included): the matrix is symmetric, so only
# the lower triangle is drawn.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Draw on the axes created here rather than on whichever axes is current.
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr,
    mask=mask,
    vmax=1,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,  # annotate each cell with its own value
    ax=ax,
)
ax.set_title("Absolute Pearson correlation (training features)")

plt.show()

: 

There are high correlations between **_repo stars vs repo forks, and total_vulners vs low vulners / medium vulners_**

In [45]:
# df_trans= df_trans.drop("total_vulners",axis=1)

: 

In [46]:
# features_ = df_trans.drop(columns = ["verified"])
# target_ = df_trans["verified"]
# Xtrans_train, Xtrans_test, ytrans_train, ytrans_test = train_test_split(features,target, test_size = 0.20, random_state=0)

: 

#### Imbalanced

In [47]:
# Oversample the transformed training split only; sampling_strategy=1.0 asks
# for a 1:1 minority-to-majority ratio after resampling.
smote = SMOTE(sampling_strategy=1.0, random_state=1)
resampled = smote.fit_resample(X_train_trans, y_train)
X_train_smote, y_train_smote = resampled

: 

In [None]:
# def bar_labels(axes, rotation=0, location="edge", xlabel=None, ylabel=None):
#     """Add labels to bars in a bar plot and configure axes."""
#     for container in axes.containers:
#         axes.bar_label(container, label_type=location, rotation=rotation)
#     if xlabel:
#         axes.set_xlabel(xlabel)
#     if ylabel:
#         axes.set_ylabel(ylabel)
#     else:
#         axes.set_ylabel("Accuracy (%)")

# def training_classification_optimized(x_train: pd.DataFrame | np.ndarray , x_test: pd.DataFrame | np.ndarray, y_train: pd.DataFrame | np.ndarray, y_test: pd.DataFrame | np.ndarray):
#     """Train and evaluate multiple classifiers, plot accuracies and confusion matrices."""
    
#     # Define models dictionary
#     models = {
#         "Random Forest": RandomForestClassifier(random_state=0),
#         "Ada Boost": AdaBoostClassifier(random_state=0),
#         "Gradient Boosting": GradientBoostingClassifier(random_state=0),
#         "Extra Trees": ExtraTreesClassifier(random_state=0),
#         "Logistic Regression": LogisticRegression(random_state=0, max_iter=1000),
#         "SVC": SVC(random_state=0),
#         "XGBoost": XGBClassifier(random_state=0),
#         "LightGBM": LGBMClassifier(verbose=-100, random_state=0),
#         "Cat Boost": CatBoostClassifier(verbose=False, random_state=0)
#     }

#     # Initialize storage for metrics
#     scores = {}
#     cms = {}
#     reports = {}

#     # Train and evaluate each model
#     for name, model in models.items():
#         model.fit(x_train, y_train)
#         with open(name + ".pkl", "wb") as file:
#             pickle.dump(model, file)
#         pred = model.predict(x_test)
#         scores[name] = accuracy_score(y_test, pred) * 100  # Convert to percentage
#         cms[name] = confusion_matrix(y_test, pred)
#         # f1_score(y_test, y_pred_rf, average='binary')
#         # roc_auc_score(y_test, y_pred_proba_rf[:,1])]
#         reports[name] = classification_report(y_test, pred, output_dict=True)

#     # Create DataFrame for scores
#     dt = pd.DataFrame.from_dict(scores, orient='index', columns=['scores'])
#     dt = dt.sort_values('scores', ascending=False)
#     dt['scores'] = dt['scores'].round(2)

#     # Plot accuracy bar chart
#     fig, ax = plt.subplots(figsize=(15, 6))
#     dt['scores'].plot(kind='bar', ax=ax)
#     bar_labels(ax, xlabel='Model', ylabel='Accuracy (%)')
#     plt.tight_layout()
#     plt.show()

#     print(f"{'*'*30}\n")
    
#     # Plot confusion matrices in rows of up to 5
#     index = 0
#     n_models = len(models)
#     while index < n_models:
#         n_cols = min(5, n_models - index)
#         fig, axes = plt.subplots(ncols=n_cols, figsize=(3 * n_cols, 4))
#         if n_cols == 1:
#             axes = [axes]  # Ensure axes is iterable for single plot
#         for i in range(n_cols):
#             model_name = dt.index[index]
#             sns.heatmap(cms[model_name], annot=True, fmt='d', ax=axes[i], cbar=False)
#             axes[i].set_title(f"{model_name}: {dt.loc[model_name, 'scores']}%")
#             axes[i].set_xlabel('Predicted')
#             axes[i].set_ylabel('True')
#             index += 1
#         plt.tight_layout()
#         plt.show()

#     # Print classification reports
#     for name in dt.index:
#         print(f"{'*'*30}\n{name}\n")
#         print(pd.DataFrame(reports[name]).transpose().round(2))

# training_classification_optimized(X_train_trans, X_test_trans, y_train, y_test)

: 

In [None]:
from typing import Any, Union, Optional




def get_feature_importance(
    model: Any,
    x_train: Union[pd.DataFrame, np.ndarray],
    y_train: Union[pd.Series, np.ndarray],
    x_test: Optional[Union[pd.DataFrame, np.ndarray]] = None,
    y_test: Optional[Union[pd.Series, np.ndarray]] = None,
    method: str = 'auto',
    feature_names: Optional[list] = None,
    top_n: int = 20,
    plot: bool = False,  # Changed default to False for batch processing
    figsize: tuple = (15, 10),
    random_state: int = 0
) -> pd.DataFrame:
    """Rank the features of an already fitted model by importance.

    Parameters
    ----------
    model : fitted estimator.
    x_train, y_train : training data; ``x_train`` supplies the feature names
        (its columns, or ``feature_<i>`` when it has none) and is the fallback
        evaluation set for permutation importance.
    x_test, y_test : optional evaluation data for permutation importance. They
        are used only when BOTH are given; otherwise the training data is used
        for both X and y and a warning is issued.
    method : ``'auto'``, ``'builtin'``, ``'coefficients'`` or ``'permutation'``.
        ``'auto'`` picks ``'builtin'`` when the model has ``feature_importances_``,
        else ``'coefficients'`` when it has ``coef_``, else ``'permutation'``.
    feature_names : optional explicit feature names.
    top_n : number of features drawn when ``plot`` is true.
    plot : draw a horizontal bar chart of the top features.
    figsize : figure size of that chart.
    random_state : seed passed to ``permutation_importance``.

    Returns
    -------
    pd.DataFrame with columns ``feature`` and ``importance`` (plus
    ``importance_std`` for the permutation method), sorted by ``importance``
    in descending order. All features are returned, not only ``top_n``.

    Raises
    ------
    ValueError
        For an unknown ``method``, or ``method='coefficients'`` on a model
        without ``coef_``.
    """
    # Get feature names
    if feature_names is None:
        if hasattr(x_train, 'columns'):
            feature_names = x_train.columns.tolist()
        else:
            feature_names = [f'feature_{i}' for i in range(x_train.shape[1])]

    # Determine method automatically if 'auto'
    if method == 'auto':
        if hasattr(model, 'feature_importances_'):
            method = 'builtin'
        elif hasattr(model, 'coef_'):
            method = 'coefficients'
        else:
            method = 'permutation'

    importance_scores = None
    importance_std = None
    method_name = ""

    # Calculate importance based on method
    if method == 'builtin':
        # Models exposing feature_importances_ (tree ensembles, boosting, ...)
        importance_scores = model.feature_importances_
        method_name = "Built-in Feature Importance"

    elif method == 'coefficients':
        # Models exposing coef_ (linear models)
        if not hasattr(model, 'coef_'):
            raise ValueError("Model does not have coefficients. Try 'permutation' method.")
        coef = np.asarray(model.coef_)
        if coef.ndim > 1:
            # One row of coefficients per class/output: average the magnitudes
            importance_scores = np.mean(np.abs(coef), axis=0)
        else:
            importance_scores = np.abs(coef)
        method_name = "Coefficient-based Importance"

    elif method == 'permutation':
        # Permutation importance - works with any fitted model.
        # X and y must come from the same split, so the test data is used only
        # when both halves are available; mixing x_test with y_train (or the
        # reverse) would pair unrelated rows.
        if x_test is None or y_test is None:
            warnings.warn("No test data provided. Using training data for permutation importance (may overfit).")
            x_eval, y_eval = x_train, y_train
        else:
            x_eval, y_eval = x_test, y_test

        perm_importance = permutation_importance(
            model, x_eval, y_eval,
            n_repeats=5,  # Reduced for faster processing
            random_state=random_state,
            n_jobs=-1
        )
        importance_scores = perm_importance.importances_mean
        importance_std = perm_importance.importances_std
        method_name = "Permutation Importance"

    else:
        raise ValueError(f"Unknown method: {method}. Use 'auto', 'builtin', 'permutation', or 'coefficients'")

    # Create results DataFrame
    results_data = {
        'feature': feature_names,
        'importance': importance_scores
    }

    if importance_std is not None:
        results_data['importance_std'] = importance_std

    importance_df = pd.DataFrame(results_data).sort_values('importance', ascending=False)

    # Create visualization
    if plot:
        plt.figure(figsize=figsize)

        # Get top N features
        top_features = importance_df.head(top_n)

        # Create horizontal bar plot
        y_pos = np.arange(len(top_features))
        bars = plt.barh(y_pos, top_features['importance'], alpha=0.8)

        # Add error bars if available
        if 'importance_std' in top_features.columns:
            plt.errorbar(top_features['importance'], y_pos,
                        xerr=top_features['importance_std'],
                        fmt='none', color='black', alpha=0.6)

        # Customize plot
        plt.yticks(y_pos, top_features['feature'])
        plt.xlabel('Importance Score')
        plt.title(f'{method_name} - Top {min(top_n, len(top_features))} Features')
        plt.gca().invert_yaxis()

        # Add value labels just right of each bar; the offset (1% of the largest
        # importance) is the same for every bar, so compute it once.
        label_offset = top_features['importance'].max() * 0.01
        for bar, importance in zip(bars, top_features['importance']):
            plt.text(bar.get_width() + label_offset,
                    bar.get_y() + bar.get_height()/2,
                    f'{importance:.4f}',
                    va='center', fontsize=8)

        plt.tight_layout()
        plt.grid(axis='x', alpha=0.3)
        plt.show()

    return importance_df



# Example usage:
# results1 = training_classification_optimized(X_train_trans, X_test_trans, y_train, y_test)
# results2 = training_classification_optimized(X_train_smote, X_test_trans, y_train_smote, y_test)
# comparison = compare_experiments([results1, results2], ['Original', 'SMOTE'], 'Accuracy')

In [None]:
def bar_labels(axes, rotation=0, location="edge", xlabel=None, ylabel=None):
    """Annotate every bar container on ``axes`` and optionally set axis labels.

    ``location`` is forwarded to ``bar_label`` as ``label_type`` and
    ``rotation`` as the label rotation. An axis label is applied only when the
    corresponding argument is truthy.
    """
    for bar_group in axes.containers:
        axes.bar_label(bar_group, label_type=location, rotation=rotation)

    # (setter name, text) pairs; the setter is looked up only when it is needed.
    axis_labels = (("set_xlabel", xlabel), ("set_ylabel", ylabel))
    for setter_name, text in axis_labels:
        if text:
            getattr(axes, setter_name)(text)

In [None]:
       
def plot_metrics_comparison(metrics_dict, metric_names, figsize=(20, 12)):
    """Draw one bar chart per metric on a three-column grid.

    ``metrics_dict[metric_name]`` maps model name to score. Within each panel
    the models are sorted by that score in descending order and the values are
    rounded to two decimals. Grid cells without a metric are hidden.
    """
    n_cols = 3
    n_rows = (len(metric_names) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always yields a 2-D array, so a single ravel() covers both
    # the one-row and the multi-row layout.
    fig, grid = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    axes = grid.ravel()

    for ax, metric_name in zip(axes, metric_names):
        pretty_name = metric_name.replace("_", " ").title()

        df_metric = pd.DataFrame.from_dict(
            metrics_dict[metric_name], orient='index', columns=[metric_name]
        ).sort_values(metric_name, ascending=False)
        df_metric[metric_name] = df_metric[metric_name].round(2)

        df_metric[metric_name].plot(kind='bar', ax=ax, color='skyblue')
        bar_labels(ax, xlabel='Model', ylabel=f'{pretty_name} (%)')
        ax.set_title(f'{pretty_name} Comparison')
        ax.tick_params(axis='x', rotation=45)

    # Hide unused subplots
    for spare_ax in axes[len(metric_names):]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

def plot_confusion_matrices(cms, df_metrics, metric_for_title='Accuracy'):
    """Plot confusion matrices as heatmaps, at most five per figure row.

    Parameters
    ----------
    cms : dict mapping model name to its confusion matrix; models are drawn in
        the dict's iteration order.
    df_metrics : DataFrame indexed by model name, used to look up the value
        shown in each panel title.
    metric_for_title : column of ``df_metrics`` shown in the title.

    A model missing from ``df_metrics.index`` is titled ``"<name>: N/A"``
    (without a percent sign).
    """
    max_per_row = 5
    model_names = list(cms.keys())

    # One figure per chunk of up to `max_per_row` models.
    for start in range(0, len(model_names), max_per_row):
        row_names = model_names[start:start + max_per_row]
        n_cols = len(row_names)

        # squeeze=False keeps `axes` an array even for a single panel.
        fig, axes = plt.subplots(ncols=n_cols, figsize=(3 * n_cols, 4), squeeze=False)

        for ax, model_name in zip(axes.ravel(), row_names):
            sns.heatmap(cms[model_name], annot=True, fmt='d', ax=ax, cbar=False)

            # Get metric value for title; append "%" only to a real value.
            if model_name in df_metrics.index:
                title = f"{model_name}: {df_metrics.loc[model_name, metric_for_title]}%"
            else:
                title = f"{model_name}: N/A"
            ax.set_title(title)
            ax.set_xlabel('Predicted')
            ax.set_ylabel('True')

        plt.tight_layout()
        plt.show()

In [None]:
def plot_feature_importance_comparison(importances_dict, top_n=10, figsize=(20, 15)):
    """Plot each model's leading features as horizontal bars on a three-column grid.

    Parameters
    ----------
    importances_dict : dict mapping model name to a DataFrame with ``feature``
        and ``importance`` columns; the first ``top_n`` rows of each frame are
        drawn in the order given (the frame is not re-sorted here).
    top_n : maximum number of features per panel.
    figsize : size of the whole figure.

    The panel title reports the number of features actually drawn, which can
    be smaller than ``top_n``.
    """
    n_models = len(importances_dict)
    n_cols = 3
    n_rows = (n_models + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always yields a 2-D array, so ravel() works for any row count.
    fig, grid = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    axes = grid.ravel()

    for ax, (model_name, importance_df) in zip(axes, importances_dict.items()):
        top_features = importance_df.head(top_n)

        y_pos = np.arange(len(top_features))
        bars = ax.barh(y_pos, top_features['importance'], alpha=0.8)

        ax.set_yticks(y_pos)
        ax.set_yticklabels(top_features['feature'], fontsize=8)
        ax.set_xlabel('Importance Score')
        ax.set_title(f'{model_name} - Top {len(top_features)} Features')
        ax.invert_yaxis()  # most important feature at the top

        # Add value labels just right of each bar; the offset (1% of the largest
        # importance) is the same for every bar, so compute it once.
        label_offset = top_features['importance'].max() * 0.01
        for bar, importance in zip(bars, top_features['importance']):
            ax.text(bar.get_width() + label_offset,
                    bar.get_y() + bar.get_height()/2,
                    f'{importance:.3f}',
                    va='center', fontsize=6)

        ax.grid(axis='x', alpha=0.3)

    # Hide unused subplots
    for spare_ax in axes[n_models:]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
def training_classification_optimized(x_train, x_test, y_train, y_test,
                                    save_models=True, plot_feature_importance=True,
                                    feature_importance_top_n=10):
    """Fit a fixed set of classifiers and report their test-set performance.

    Every model is fitted on ``(x_train, y_train)`` and evaluated on
    ``(x_test, y_test)``. Accuracy, precision, recall, F1 and ROC-AUC are
    stored as percentages (score * 100). When ``save_models`` is true each
    fitted model is pickled to ``<name with spaces replaced by _>.pkl`` in the
    current working directory. Metric charts, confusion matrices and (when
    ``plot_feature_importance`` is true) feature-importance charts are drawn,
    and the classification reports are printed.

    Returns a dict with the keys ``metrics``, ``results_df``,
    ``confusion_matrices``, ``classification_reports``,
    ``feature_importances`` and ``trained_models``.
    """
    rule = "=" * 60

    def _section(title):
        # Print a blank line, a rule, the title and a closing rule.
        print("\n" + rule)
        print(title)
        print(rule)

    # Candidate estimators, all with fixed seeds where the estimator takes one
    models = {
        "Random Forest": RandomForestClassifier(random_state=0),
        "Ada Boost": AdaBoostClassifier(random_state=0),
        "Gradient Boosting": GradientBoostingClassifier(random_state=0),
        "Extra Trees": ExtraTreesClassifier(random_state=0),
        "Logistic Regression": LogisticRegression(random_state=0, max_iter=1000),
        "SVC": SVC(random_state=0, probability=True),
        "KNN": KNeighborsClassifier(),
        "XGBoost": XGBClassifier(random_state=0, eval_metric='logloss'),
        "LightGBM": LGBMClassifier(verbose=-1, random_state=0),
        "CatBoost": CatBoostClassifier(verbose=False, random_state=0)
    }

    # metric name -> {model name -> score in percent}
    metrics = {
        metric_name: {}
        for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1_Score', 'ROC_AUC')
    }

    # Scorers that share the same call signature (binary averaging)
    binary_scorers = {
        'Precision': precision_score,
        'Recall': recall_score,
        'F1_Score': f1_score,
    }

    # Importance method per model; any model not listed uses its built-in importances
    importance_method_by_model = {
        "Logistic Regression": 'coefficients',
        "SVC": 'permutation',
        "KNN": 'permutation',
    }

    # Additional storage
    cms = {}
    reports = {}
    importances = {}
    trained_models = {}

    print("Training models and calculating feature importance...")

    for name, estimator in models.items():
        print(f"Training {name}...")

        # Fit model
        estimator.fit(x_train, y_train)
        trained_models[name] = estimator

        # Save model if requested
        if save_models:
            pickle_path = f"{name.replace(' ', '_')}.pkl"
            with open(pickle_path, "wb") as fh:
                pickle.dump(estimator, fh)

        # Hard predictions
        y_pred = estimator.predict(x_test)

        # Positive-class probabilities for ROC-AUC; an estimator without
        # predict_proba falls back to its hard predictions
        try:
            y_proba = estimator.predict_proba(x_test)[:, 1]
        except AttributeError:
            y_proba = y_pred

        # Calculate metrics
        metrics['Accuracy'][name] = accuracy_score(y_test, y_pred) * 100
        for metric_name, scorer in binary_scorers.items():
            metrics[metric_name][name] = scorer(y_test, y_pred, average='binary') * 100

        try:
            metrics['ROC_AUC'][name] = roc_auc_score(y_test, y_proba) * 100
        except (ValueError, TypeError):
            metrics['ROC_AUC'][name] = 0

        # Store confusion matrix and classification report
        cms[name] = confusion_matrix(y_test, y_pred)
        reports[name] = classification_report(y_test, y_pred, output_dict=True)

        # Feature importance via the method mapped to this model
        try:
            importances[name] = get_feature_importance(
                estimator, x_train, y_train, x_test, y_test,
                method=importance_method_by_model.get(name, 'builtin'),
                top_n=feature_importance_top_n
            )
        except Exception as exc:
            print(f"Warning: Could not calculate feature importance for {name}: {exc}")
            # Zero-valued placeholder so every model still has an entry
            n_features = x_train.shape[1]
            if hasattr(x_train, 'columns'):
                feature_names = x_train.columns.tolist()
            else:
                feature_names = [f'feature_{i}' for i in range(n_features)]
            importances[name] = pd.DataFrame({
                'feature': feature_names[:feature_importance_top_n],
                'importance': np.zeros(min(feature_importance_top_n, n_features))
            })

    # One row per model, one column per metric, best accuracy first
    results_df = pd.DataFrame(metrics).round(2).sort_values('Accuracy', ascending=False)

    _section("OVERALL RESULTS SUMMARY")
    print(results_df)

    # Plot individual metrics
    _section("PLOTTING INDIVIDUAL METRICS")
    plot_metrics_comparison(metrics, list(metrics.keys()))

    # Plot confusion matrices
    _section("CONFUSION MATRICES")
    plot_confusion_matrices(cms, results_df, metric_for_title='Accuracy')

    # Plot feature importance comparison
    if plot_feature_importance and importances:
        _section("FEATURE IMPORTANCE COMPARISON")
        plot_feature_importance_comparison(importances, top_n=feature_importance_top_n)

    # Print detailed classification reports, in results_df order
    _section("DETAILED CLASSIFICATION REPORTS")
    for name in results_df.index:
        print(f"\n{'*'*40}\n{name}\n{'*'*40}")
        print(pd.DataFrame(reports[name]).transpose().round(2))

    # Print the leading rows of each model's importance table
    if importances:
        _section("TOP FEATURE IMPORTANCE BY MODEL")
        for name, importance_df in importances.items():
            print(f"\n{'-'*30}\n{name} - Top {min(5, len(importance_df))} Features\n{'-'*30}")
            print(importance_df.head().to_string(index=False))

    # Return results for further analysis
    return {
        'metrics': metrics,
        'results_df': results_df,
        'confusion_matrices': cms,
        'classification_reports': reports,
        'feature_importances': importances,
        'trained_models': trained_models
    }

In [None]:
# Helper for putting the same metric from several experiments side by side
def compare_experiments(results_list, experiment_names, metric='Accuracy'):
    """Build and plot a per-model comparison of one metric across experiments.

    ``results_list`` holds the dicts returned by the training function (only
    their ``'results_df'`` entry is read) and ``experiment_names`` the matching
    column labels. Returns a DataFrame with one column per experiment.
    """
    comparison_df = pd.DataFrame()

    # Columns are added one at a time, in experiment order.
    for exp_name, results in zip(experiment_names, results_list):
        metric_column = results['results_df'][metric]
        comparison_df[exp_name] = metric_column

    # Grouped bar chart: one group per model, one bar per experiment
    fig, ax = plt.subplots(figsize=(15, 8))
    comparison_df.plot(kind='bar', ax=ax)
    bar_labels(ax, xlabel='Model', ylabel=f'{metric} (%)')
    ax.set_title(f'{metric} Comparison Across Experiments')
    ax.legend(title='Experiments')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    return comparison_df

In [719]:
# def plot_histograms_by_diagnosis(df, target='diagnosis', exclude_cols=None, bins=30, cols_per_row=3):
#     import matplotlib.pyplot as plt
#     import seaborn as sns

#     if exclude_cols is None:
#         exclude_cols = []

#     # Select only numerical columns except the target and excluded ones
#     num_cols = df.select_dtypes(include='number').columns.difference(exclude_cols + [target])
#     n = len(num_cols)
#     nrows = (n + cols_per_row - 1) // cols_per_row

#     fig, axes = plt.subplots(nrows, cols_per_row, figsize=(6 * cols_per_row, 4 * nrows))
#     axes = axes.flatten()

#     for i, col in enumerate(num_cols):
#         for diagnosis in df[target].unique():
#             sns.histplot(df[df[target] == diagnosis][col],
#                          label=str(diagnosis),
#                          bins=bins,
#                          kde=False,
#                          ax=axes[i],
#                          element='step',
#                          stat='density')

#         axes[i].set_title(f'{col} distribution by {target}', fontsize=10)
#         axes[i].legend(title=target)

#     for j in range(i + 1, len(axes)):
#         fig.delaxes(axes[j])

#     plt.tight_layout()
#     plt.show()

: 

In [None]:
# models = {
#     "DT":  DecisionTreeClassifier(criterion= 'entropy', max_depth= 10, max_features= None, min_samples_leaf= 4, min_samples_split= 2, splitter= 'best'),
#     "KNN": KNeighborsClassifier(n_neighbors=3,p= 1, weights= 'distance'),
#     "RF":  RandomForestClassifier( max_depth=20,max_features=7),
#     "SGD": SGDClassifier(early_stopping=True ,loss='log' ,eta0=0.001, random_state=42),
#     "XGB": XGBClassifier(learning_rate= 0.1, max_depth= 7, n_estimators=200)
# }

# MODELS_RESULTS = pd.DataFrame(
#     columns=['Model', 'Sampling By ','Train Score', 'Test Score', 'Recall', 'Precision', 'f1-score', 'classification_report'])


# def test_models(X_train, X_test, y_train, y_test, Sampling_tech):

#     for model in models:
#         # fit
#         fit = models[model].fit(X_train, y_train)
#         # sep before output
#         print('-'*40)
#         print(f' -------------------{model}-------------------')
#         print('-'*40)

#         # output
#         y_pred= fit.predict(X_test)
#         report = metrics.classification_report(y_test,y_pred)

#         print(report)
#         fig, ax = plt.subplots(figsize=(6, 4))
#         ax.set_title(model)
#         sns.heatmap(metrics.confusion_matrix(y_test, y_pred),
#                     annot=True, cmap='Blues', fmt='g', cbar=False, ax=ax)
#         plt.xlabel('Predicted labels')
#         plt.ylabel('True labels')
#         plt.show()

#         # data frame of MODELS_RESULTS and save results
#         train_score = round(fit.score(X_train, y_train), 3)
#         test_score = round(fit.score(X_test, y_test), 3)
#         recall = round(metrics.recall_score(y_test, y_pred) * 100, 2)
#         precision = round(metrics.precision_score(y_test,y_pred) * 100, 2)
#         f1 = round(metrics.f1_score(y_test, y_pred) * 100, 2)

#         MODELS_RESULTS.loc[len(MODELS_RESULTS.index)] = [model, Sampling_tech,
#                                            train_score, test_score, recall, precision, f1, report]

: 

- 🌲 Random Forest

Ensemble of decision trees (bagging).
Uses random subsets of data and features.
Robust to overfitting and outliers.
Good baseline model for tabular data.
- ⚡ AdaBoost

Sequential boosting of weak learners.
Focuses on previous misclassified samples.
Sensitive to noise/outliers.
Good for clean data with subtle patterns.
- 🚀 XGBoost

Optimized gradient boosting algorithm.
Fast, accurate, and regularized.
Best for performance with tuning effort.

- 📊 Logistic Regression

Linear model for binary classification.
Estimates probabilities using a sigmoid function.
Assumes a linear relationship between features and the log-odds of the target.
Simple, fast, and interpretable — great baseline for linearly separable data.

- 🎯 Support Vector Machine (SVM)

Finds the optimal hyperplane that maximizes the margin between classes.
Works well in high-dimensional spaces.
Can use different kernels (linear, RBF, polynomial) to capture nonlinear patterns.
Sensitive to scaling; may be slower on large datasets.

- 👟 K-Nearest Neighbors (KNN)

Instance-based learning — no training, just storing.
Classifies based on the majority label among k-nearest neighbors.
Simple and intuitive, but slow with large datasets.
Sensitive to feature scaling and irrelevant features.