In [None]:
# Data handling
import numpy as np
import pandas as pd
import re

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import graphviz
import optuna
import optuna.visualization as vis
%matplotlib inline

# Stats
from statsmodels.graphics.gofplots import qqplot
import statsmodels.api as sm
import scipy.stats as st
from scipy.stats import shapiro, norm, chi2_contingency, kstest, boxcox

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder,PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.inspection import permutation_importance
# Models
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import export_graphviz, DecisionTreeClassifier

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Metrics
from sklearn.metrics import  make_scorer, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix, classification_report #Classifier
# from sklearn.inspection import 


from typing import Any, Optional

#lib
from lib.ml_functions_experiment import bar_labels, get_feature_importance, plot_metrics_comparison, plot_confusion_matrices
# from wordcloud import WordCloud,STOPWORDS
from ast import literal_eval
from collections import Counter

# os
import os

import pickle

# time
import time

import warnings
# warnings.filterwarnings("ignore")    # (Optional)

print("Project has been created with Pandas: " ,pd. __version__," And with Numpy: ",np. __version__)

### Loading

In [3]:
import yaml

# Load the project configuration; later cells read data and model paths from `config`.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError as exc:
    # Stop here instead of printing and carrying on: without `config` the
    # following cells fail with a confusing NameError. Any other error
    # (e.g. malformed YAML) propagates unchanged instead of being
    # misreported as a missing file.
    raise FileNotFoundError("Yaml configuration file not found!") from exc

In [None]:
config

In [None]:
# df = pd.read_csv("./extensions_eda_cleaned.csv")
df = pd.read_csv(config["data"]["clean"]["file_eda_cleaned"])
df.head()

### 5. Preprocessing

In [None]:
df.info()

- Handle duplicates

In [None]:
df.duplicated().sum()

In [None]:
df = df.drop_duplicates()
df

- Handle missing values

In [None]:
df.isna().sum()

- Feature transformation/Transform values

Handle categories with high cardinality -> Grouping rare

In [None]:
df["ext_categories"].value_counts(normalize=True)

In [None]:
df["repo_languages"].value_counts(normalize=True)

In [12]:
threshold_ext_categories = 0.1  # categories at or below 10% relative frequency count as rare
value_counts_ext_categories = df["ext_categories"].value_counts(normalize=True)
# Keep the rare set under its own name: the repo-language cell builds a rare set
# too, and sharing one global between both cells would make the function below
# depend on which cell ran last.
rare_ext_categories = frozenset(
    value_counts_ext_categories[value_counts_ext_categories <= threshold_ext_categories].index
)

def transform_ext_categories(x, rare=rare_ext_categories):
    """Group one extension category value.

    Parameters
    ----------
    x : the raw category value (compared through its ``str()`` form).
    rare : collection of category names to collapse; bound at definition time
        to the rare set computed above.

    Returns
    -------
    "Unknown" when the value is "Other", "Others" when it belongs to ``rare``,
    otherwise ``x`` unchanged.
    """
    text = str(x)
    if text == "Other":
        return "Unknown"
    if text in rare:
        return "Others"
    return x

df["ext_categories_grouped"] = df["ext_categories"].apply(transform_ext_categories)

In [13]:
threshold_repo_languages = 0.05  # languages at or below 5% relative frequency count as rare
value_counts_repo_languages = df["repo_languages"].value_counts(normalize=True)
# Keep the rare set under its own name: the extension-category cell builds a rare
# set too, and sharing one global between both cells would make the function
# below depend on which cell ran last.
rare_repo_languages = frozenset(
    value_counts_repo_languages[value_counts_repo_languages <= threshold_repo_languages].index
)

def transform_repo_languages(x, rare=rare_repo_languages):
    """Group one repository language value.

    Parameters
    ----------
    x : the raw language value (compared through its ``str()`` form).
    rare : collection of language names to collapse; bound at definition time
        to the rare set computed above.

    Returns
    -------
    "unknown" when the value is "other" or "unknown", "others" when it belongs
    to ``rare``, otherwise ``x`` unchanged.
    """
    text = str(x)
    if text in ("other", "unknown"):
        return "unknown"
    if text in rare:
        return "others"
    return x

df["repo_languages_grouped"] = df["repo_languages"].apply(transform_repo_languages)

In [14]:
# threshold_ext_categories = 0.1  # e.g., categories below 10% frequency
# value_counts_ext_categories = df['ext_categories'].value_counts(normalize=True)
# rare_categories = value_counts_ext_categories[value_counts_ext_categories.values <= threshold_ext_categories].index
# df['ext_categories_grouped'] = df['ext_categories'].apply(lambda x: 'Rest' if x in rare_categories else x)


In [15]:
# threshold_repo_languages = 0.03  # e.g., categories below 3% frequency
# value_counts_repo_languages = df["repo_languages"].value_counts(normalize=True)
# rare_categories = value_counts_repo_languages[value_counts_repo_languages.values <= threshold_repo_languages].index
# df["repo_languages_grouped"] = df["repo_languages"].apply(lambda x: 'rest' if x in rare_categories else x)

In [None]:
df["ext_categories_grouped"].value_counts(normalize=True)

In [None]:
df["repo_languages_grouped"].value_counts(normalize=True)

In [None]:
df = df.drop(columns=["repo_languages", "ext_categories"])
df

- Convert target to number

In [19]:
df["verified"] = df["verified"].map({True: 1, False:0})

- Get number and category columns

In [None]:
# Numeric columns with fewer than 10 distinct values are treated as candidate
# categoricals; the target column "verified" is removed from that selection.
numeric_columns_df = df.select_dtypes("number")
is_low_cardinality = numeric_columns_df.nunique() < 10
potential_categorical_from_numerical = (
    numeric_columns_df
    .loc[:, is_low_cardinality]
    .drop(columns="verified")
)
potential_categorical_from_numerical

In [21]:

df_categorical = pd.concat([df.select_dtypes("object"), potential_categorical_from_numerical], axis=1)
df_numerical = df.select_dtypes("number").drop(columns=potential_categorical_from_numerical.columns)

In [None]:
df.info()

In [23]:
cols_num = df_numerical.columns.to_list()

In [None]:
cols_num = df_numerical.drop(["verified","total_vulners"],axis=1).columns.to_list() #Drop 'total_vulners'
# cols_num = df_numerical.drop(["verified","total_vulners","repo_stars"],axis=1).columns.to_list() #Drop 'total_vulners','repo_stars'
cols_cat = df_categorical.columns.to_list()
cols_num, cols_cat

In [25]:
df_corr = pd.concat([df[cols_num], df["verified"]], axis=1)

In [None]:
# Absolute Pearson correlation between the selected numeric features and the target.
corr = df_corr.corr(method="pearson").abs()

# The matrix is symmetric, so hide the upper triangle (diagonal included).
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Draw on an explicit axes object and label the figure so it stands alone.
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr,
    mask=mask,
    vmax=1,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,  # annotate each cell with its own correlation value
    ax=ax,
)
ax.set_title("Absolute Pearson correlation: numeric features and target")

plt.show()

#### Splitting Test data

In [27]:
# Features are every column except the target "verified" and "total_vulners".
features = df.drop(columns = ["verified","total_vulners"])
# features = df.drop(["verified","total_vulners","repo_stars"], axis = 1)
target = df["verified"]
# Hold out 20% of the rows for testing; the split is made on the untransformed
# data so the encoders/transformers below are fitted on the training part only.
# NOTE(review): the split is not stratified; since the notebook later rebalances
# the classes with SMOTE, consider stratify=target — confirm first, as it
# changes every downstream score.
X_train, X_test, y_train, y_test = train_test_split(features,target, test_size = 0.20, random_state=0) #before transforming

In [None]:
X_train.shape, X_test.shape

In [None]:
df

### Feature Engineering

OHE: for nominal categorical features

In [30]:
# One-hot encode the two grouped nominal columns. The encoder is fitted on the
# training split only and then applied to both splits.
nominal_columns = ["repo_languages_grouped", "ext_categories_grouped"]

ohe = OneHotEncoder(sparse_output=False)
ohe.fit(X_train[nominal_columns])

X_train_trans_nom_np = ohe.transform(X_train[nominal_columns])
X_test_trans_nom_np = ohe.transform(X_test[nominal_columns])

# Wrap the dense arrays back into DataFrames that keep the original row index,
# so they can be concatenated with the other feature blocks later.
X_train_nom_trans_df = pd.DataFrame(
    X_train_trans_nom_np,
    index=X_train.index,
    columns=ohe.get_feature_names_out(),
)
X_test_nom_trans_df = pd.DataFrame(
    X_test_trans_nom_np,
    index=X_test.index,
    columns=ohe.get_feature_names_out(),
)

In [None]:
X_train_nom_trans_df

In [32]:
# Persist the fitted one-hot encoder next to the other preprocessing artefacts.
# NOTE(review): id() values change on every kernel run, so each run writes a
# differently named file — confirm this is intended.
base_name = f"X_train_{id(X_train)}_Y_train_{id(y_train)}"
# Look the directory up outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
preprocessing_path = config["model"]["preprocessing_path"]
file_path = f"{preprocessing_path}{base_name}_onehot_encoding.pkl"

with open(file_path, "wb") as file:
    pickle.dump(ohe, file)

Transform cols_num to normal distribution

In [None]:
# QQ-plot of every numeric column of the test split before the power transform.
fig, axes = plt.subplots(nrows = int(np.ceil(len(cols_num)/2)), ncols = 2, figsize = (8,10))
axes = axes.ravel()

for i,col in enumerate(cols_num):
    sm.qqplot(X_test[col],
           line = "s",
           ax = axes[i])

    axes[i].set_title(col, fontsize = 10, fontweight = "bold", color = "black")

# The grid always has an even number of panels; remove whatever is left empty
# instead of relying on a hard-coded panel index.
for unused_ax in axes[len(cols_num):]:
    fig.delaxes(unused_ax)

fig.suptitle("QQ-Plots before Transforming", fontsize = 12, fontweight = "bold", color = "darkblue")
fig.tight_layout()
# plt.show() instead of fig.show(): the latter emits a "non-GUI backend"
# warning under the inline notebook backend.
plt.show()

Powertransform

In [34]:
# Yeo-Johnson power transform of the numeric columns: the transformer is fitted
# on the training split only and then applied to both splits.
pt = PowerTransformer(method="yeo-johnson")

X_train_num, X_test_num = X_train[cols_num], X_test[cols_num]
pt.fit(X_train_num)

X_train_num_trans = pt.transform(X_train_num)
X_test_num_trans = pt.transform(X_test_num)

# Rebuild DataFrames with the original column names and row index.
X_train_num_trans_df = pd.DataFrame(
    X_train_num_trans,
    index=X_train_num.index,
    columns=X_train_num.columns,
)
X_test_num_trans_df = pd.DataFrame(
    X_test_num_trans,
    index=X_test_num.index,
    columns=X_test_num.columns,
)

In [35]:
# Persist the fitted power transformer next to the other preprocessing artefacts.
# NOTE(review): id() values change on every kernel run, so each run writes a
# differently named file — confirm this is intended.
base_name = f"X_train_{id(X_train)}_Y_train_{id(y_train)}"
# Look the directory up outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
preprocessing_path = config["model"]["preprocessing_path"]
file_path = f"{preprocessing_path}{base_name}_power_transformer.pkl"

with open(file_path, "wb") as file:
    pickle.dump(pt, file)

Normalize transform

In [36]:
# #Normalizer
# normalizer = MinMaxScaler()
# X_train_num = X_train[cols_num]
# normalizer.fit(X_train_num)
# X_test_num  = X_test[cols_num]

# X_train_trans = normalizer.transform(X_train_num)
# X_test_trans = normalizer.transform(X_test_num)

# X_train_trans = pd.DataFrame(X_train_trans, columns=X_train_num.columns, index=X_train_num.index)
# X_test_trans = pd.DataFrame(X_test_trans, columns=X_test_num.columns, index=X_test_num.index)

Log Transform

In [37]:
# #Log-transform
# for col in cols_num:
#     df[col] = np.log1p(df[col])
    # X_train_trans = pt.transform(X_train_num)
    # X_test_trans = pt.transform(X_test_num)

In [None]:
# QQ-plot of every numeric column of the test split after the power transform.
fig, axes = plt.subplots(nrows = int(np.ceil(len(cols_num)/2)), ncols = 2, figsize = (8,10))
axes = axes.ravel()

for i,col in enumerate(cols_num):
    sm.qqplot(X_test_num_trans_df[col],
           line = "s",
           ax = axes[i])

    axes[i].set_title(col, fontsize = 10, fontweight = "bold", color = "black")

# The grid always has an even number of panels; remove whatever is left empty
# instead of relying on a hard-coded panel index.
for unused_ax in axes[len(cols_num):]:
    fig.delaxes(unused_ax)

fig.suptitle("QQ-Plots after Transforming", fontsize = 12, fontweight = "bold", color = "darkblue")
fig.tight_layout()
# plt.show() instead of fig.show(): the latter emits a "non-GUI backend"
# warning under the inline notebook backend.
plt.show()


Combining

In [39]:
X_train_ord_trans_df = X_train[["ext_rating_category","ext_version_category"]].copy()
X_test_ord_trans_df = X_test[["ext_rating_category","ext_version_category"]].copy()

In [40]:
X_train_trans = pd.concat([X_train_num_trans_df, X_train_nom_trans_df, X_train_ord_trans_df], axis=1)
X_test_trans = pd.concat([X_test_num_trans_df, X_test_nom_trans_df, X_test_ord_trans_df], axis=1)


In [41]:
X_trans =  pd.concat([X_train_trans, X_test_trans], axis = 0)
y_trans = pd.concat([y_train, y_test], axis = 0)

In [None]:
df_trans = pd.concat([X_trans,y_trans], axis = 1)
df_trans

In [43]:
# X_train_trans.shape[0] == X_train.shape[0]
# X_test_trans.shape[0] == X_test.shape[0]

In [44]:
# X_train_corr = pd.concat([X_train_trans, y_train], axis=1)

Feature Selection

In [None]:
# Absolute Pearson correlation between all transformed features and the target.
corr = df_trans.corr(method="pearson").abs()

# The matrix is symmetric, so hide the upper triangle (diagonal included).
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Draw on an explicit axes object and label the figure so it stands alone.
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr,
    mask=mask,
    vmax=1,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,  # annotate each cell with its own correlation value
    ax=ax,
)
ax.set_title("Absolute Pearson correlation: transformed features and target")

plt.show()

**_We want low correlation between features, but high correlation between features and our target._**

There is a high correlation between **_repo stars_** and **_repo forks_**.

In [46]:
# features_ = df_trans.drop(columns = ["verified"])
# target_ = df_trans["verified"]
# Xtrans_train, Xtrans_test, ytrans_train, ytrans_test = train_test_split(features,target, test_size = 0.20, random_state=0)

#### Imbalanced

In [47]:
smote = SMOTE(random_state = 1,sampling_strategy=1.0)

In [48]:
X_train_smote, y_train_smote = smote.fit_resample(X_train_trans, y_train)

In [None]:
X_train_smote

#### Define Metrics

|Metric|Definition|	Meaning in Attrition Context|
| ----------- | ----------- | ----------- |
| Recall|  TP/(TP+FN) | 	Most important – how many true leavers you can catch|
| Precision	 |TP/(TP+FP) |	Among predicted leavers, how many are actually correct |
| Accuracy |(TP+TN)/Total |	Can be misleading with imbalanced data (e.g., <20% attrition) |
| F1-score |2⋅Precision⋅Recall/(Precision+Recall) |Balanced trade-off between Precision and Recall|
| AUC-ROC	|Area under ROC Curve |	Measures ability to distinguish leavers vs. stayers at all thresholds|

### Model Training + Evaluation

In [50]:
# Code optimized metrics, plots
def training_classification_optimized(x_train, x_test, y_train, y_test, save_models=True):
    """Train a fixed set of classifiers and report their test-set metrics.

    Each model is fitted on ``(x_train, y_train)``, optionally pickled, and
    evaluated on ``(x_test, y_test)``. Accuracy, precision, recall, F1 and
    ROC-AUC are collected (as percentages), then a summary table sorted by
    recall, the metric comparison plots, the confusion matrices and the
    per-model classification reports are printed/plotted.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to ``model.fit``.
    x_test, y_test : evaluation features and labels.
    save_models : bool, default True
        When true, every fitted model is pickled under the directory given by
        the notebook-level ``config["model"]["training_path"]``.

    Returns
    -------
    None. Results are only printed and plotted.
    """

    # Define models dictionary
    models = {
        #Ensemble
        "Random Forest": RandomForestClassifier(random_state=0),
        "Gradient Boosting": GradientBoostingClassifier(random_state=0),

        #Ensemble, should Not be used as standalone?!!
        "Bagging Classifier": BaggingClassifier(random_state=0), #default use with DecisionTreeClassifier
        "Ada Boost": AdaBoostClassifier(random_state=0), #default use with DecisionTreeClassifier

        #Trees
        "Extra Trees": ExtraTreesClassifier(random_state=0),
        "Decision Trees": DecisionTreeClassifier(random_state=0),
        "XGBoost": XGBClassifier(random_state=0, eval_metric="logloss"),
        "LightGBM": LGBMClassifier(verbose=-1, random_state=0),
        "CatBoost": CatBoostClassifier(verbose=False, random_state=0),

        #Linear
        "Logistic Regression": LogisticRegression(random_state=0, max_iter=1000),

        #Probabilistic Models
        # "ComplementNB": ComplementNB(),#Negative values
        # "MultinomialNB": MultinomialNB(),#Negative values
        "BernoulliNB" : BernoulliNB(),
        "GaussianNB": GaussianNB(),

        #Kernel-Based Models
        "SVC": SVC(random_state=0, probability=True),  # Added probability=True for ROC-AUC , slow execution

        #Instance-Based
        "KNN": KNeighborsClassifier(),

    }

    # Define metrics to calculate
    metrics = {
        "Accuracy": {},
        "Precision": {},
        "Recall": {},
        "F1_Score": {},
        "ROC_AUC": {}
    }

    cms = {}
    reports = {}
    importances = {}

    # The file-name prefix does not depend on the model, so build it once.
    # NOTE(review): id() values change on every kernel run, so each run writes
    # differently named files — confirm this is intended.
    base_name = f"X_train_{id(x_train)}_Y_train_{id(y_train)}"

    print("Training models...")

    # Train and evaluate each model
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(x_train, y_train)

        # Save model if requested
        if save_models:
            # Look the directory up outside the f-string: reusing double quotes
            # inside a double-quoted f-string is a SyntaxError before Python 3.12.
            training_path = config["model"]["training_path"]
            file_path = f"{training_path}{base_name}_{name.replace(' ', '_')}.pkl"
            with open(file_path, "wb") as file:
                pickle.dump(model, file)

        y_pred = model.predict(x_test)

        # Get probabilities for ROC-AUC (handle models without predict_proba)
        try:
            y_proba = model.predict_proba(x_test)[:, 1]
        except AttributeError:
            y_proba = y_pred  # Fallback for models without probability prediction

        # Calculate metrics
        metrics["Accuracy"][name] = accuracy_score(y_test, y_pred) * 100
        metrics["Precision"][name] = precision_score(y_test, y_pred, average="binary") * 100
        metrics["Recall"][name] = recall_score(y_test, y_pred, average="binary") * 100
        metrics["F1_Score"][name] = f1_score(y_test, y_pred, average="binary") * 100

        try:
            metrics["ROC_AUC"][name] = roc_auc_score(y_test, y_proba) * 100
        except ValueError:
            metrics["ROC_AUC"][name] = 0  # Handle cases where ROC-AUC can't be calculated

        # Store confusion matrix and classification report
        cms[name] = confusion_matrix(y_test, y_pred)
        reports[name] = classification_report(y_test, y_pred, output_dict=True)

        # Get feature importance (collected per model; currently not reported)
        importances[name] = get_feature_importance(model, x_test, y_test, method="auto", top_n=5)

    # Create comprehensive results DataFrame, best recall first
    selected_metric = "Recall"
    results_df = pd.DataFrame(metrics).round(2)
    results_df = results_df.sort_values(selected_metric, ascending=False)

    print("\n" + "="*50)
    print("OVERALL RESULTS SUMMARY")
    print("="*50)
    print(results_df)

    print("\n" + "="*50)
    print("INDIVIDUAL METRICS")
    print("="*50)
    metric_names = list(metrics.keys())
    plot_metrics_comparison(metrics, metric_names)

    print("\n" + "="*50)
    print(f"CONFUSION MATRICES OF '{selected_metric.upper()}'")
    print("="*50)
    plot_confusion_matrices(cms, results_df, metric_for_title=selected_metric)

    print("\n" + "="*50)
    print("CLASSIFICATION REPORTS")
    print("="*50)
    for name in results_df.index:
        print(f"\n{'*'*30}\n{name}\n{'*'*30}")
        report_df = pd.DataFrame(reports[name]).transpose().round(2)
        print(report_df)

    # Nothing is returned on purpose: the calling cells end with this call, and
    # a returned dict would be echoed as cell output. To analyse the results
    # further, return metrics / results_df / cms / reports / importances / models.

In [None]:
training_classification_optimized(X_train_trans, X_test_trans, y_train, y_test)

In [None]:
training_classification_optimized(X_train_smote, X_test_trans, y_train_smote, y_test)

Insight:
- Bagging (with DecisionTree) gives the best prediction in terms of Recall and F1 on both the imbalanced and the balanced dataset
- The SVC model takes a lot of computation time -> less efficient
- SMOTE balancing significantly improves the performance of weak models on imbalanced data
- Extension\_install\_count is an important feature which affects the prediction of breached extension. In combination with high repo_stars and/or repo_forks, the extension has a high probability of being breached

### Model Tuning

The best model was Bagging; we are going to tune it with a hyperparameter search combined with cross-validation.


- GridSearchCV

In [None]:
# Exhaustive grid search over BaggingClassifier hyperparameters, scored on
# recall with 10-fold cross-validation.

# Search space
param_grid_bag = {
    "bootstrap": [True, False],
    "bootstrap_features": [True, False],
    "n_estimators": [5, 10],
    "max_samples": [0.6, 0.8, 1.0],
}

# Estimator, CV settings and scorer
bag = BaggingClassifier(random_state=123)
confidence_level = 0.95
folds = 10
recall_scoring = make_scorer(recall_score)

gs = GridSearchCV(
    estimator=bag,
    param_grid=param_grid_bag,
    scoring=recall_scoring,
    cv=folds,
    verbose=10,
)

# Time the search
start_time = time.time()
gs.fit(X_train_trans, y_train)
end_time = time.time()

print(f"\nTime taken to find the best combination of hyperparameters: {end_time - start_time:.4f} seconds\n")

# All candidates, best mean CV score first
results_gs_df = pd.DataFrame(gs.cv_results_).sort_values(by="mean_test_score", ascending=False)

# t-based confidence interval around the top candidate's mean recall
top_candidate = results_gs_df.iloc[0]
gs_mean_score = top_candidate["mean_test_score"]
gs_sem = top_candidate["std_test_score"] / np.sqrt(folds)
gs_tc = st.t.ppf(1 - ((1 - confidence_level) / 2), df=folds - 1)
gs_lower_bound = gs_mean_score - (gs_tc * gs_sem)
gs_upper_bound = gs_mean_score + (gs_tc * gs_sem)

print(f"Best recall score for training data: {gs.best_score_:.4f}")
print(f"Recall confidence interval for best hyperparameters: ({gs_lower_bound:.4f}, {gs_upper_bound:.4f})\n")

# Refitted best estimator and its predictions on both splits
best_model = gs.best_estimator_
y_pred_train = best_model.predict(X_train_trans)
y_pred_test = best_model.predict(X_test_trans)

# Test-set performance
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
):
    print(f"Test {metric_label}: {metric_fn(y_test, y_pred_test):.4f}")
print(f"Test F1: {f1_score(y_test, y_pred_test):.4f}\n")

print(f"Best combination of hyperparameters: {gs.best_params_}")
print(f"Recall score for test data: {recall_score(y_test, y_pred_test):.4f}")

- RandomizedSearchCV

In [None]:
# Randomized search over BaggingClassifier hyperparameters, scored on recall
# with 10-fold cross-validation.

param_grid_bag = {
 'bootstrap': [True, False],
 'bootstrap_features': [True, False],
 'n_estimators': [5, 10],
 'max_samples' : [0.6, 0.8, 1.0]
}

# Defined here as well so this cell does not depend on the grid-search cell
# having been run first.
confidence_level = 0.95
folds= 10

bag = BaggingClassifier(random_state = 123)

recall_scoring = make_scorer(recall_score)

rs = RandomizedSearchCV(bag, param_distributions = param_grid_bag, scoring=recall_scoring, n_iter = 16, cv = folds, verbose=10, random_state=123)

start_time = time.time()
rs.fit(X_train_trans, y_train)
end_time = time.time()

print("\n")
print(f"Time taken to find the best combination of hyperparameters among the given ones: {end_time - start_time: .4f} seconds")
print("\n")

results_rs_df = pd.DataFrame(rs.cv_results_).sort_values(by="mean_test_score", ascending=False)

# Read the scores by column name rather than by position, and derive the
# standard error from `folds` instead of a hard-coded 10.
rs_mean_score = results_rs_df.iloc[0]["mean_test_score"]
rs_sem = results_rs_df.iloc[0]["std_test_score"] / np.sqrt(folds)

# Confidence interval built from this search's own standard error
# (the grid search's gs_sem was used here before).
rs_tc = st.t.ppf(1-((1-confidence_level)/2), df=folds-1)
rs_lower_bound = rs_mean_score - ( rs_tc * rs_sem )
rs_upper_bound = rs_mean_score + ( rs_tc * rs_sem )

# Report the randomized search's results, not the grid search's.
print(f"The best score 'Recall' for Train_data is: {rs.best_score_: .4f}")
print(f"The confidence interval for the best combination of hyperparameters is: \
    ({rs_lower_bound: .4f}, {rs_mean_score: .4f}, {rs_upper_bound: .4f}) ")


best_model = rs.best_estimator_

y_pred_train_df = best_model.predict(X_train_trans)
# y_pred_train_df = best_model.predict(X_train_smote)
y_pred_test_df  = best_model.predict(X_test_trans)

print("\n")
# sklearn metrics take (y_true, y_pred); with the arguments reversed the
# reported precision and recall are swapped.
print(f"Best combi's  score:  {best_model.score(X_test_trans, y_test): .4f}")
print(f"Test Prec: {precision_score(y_test, y_pred_test_df): .4f}")
print(f"Test Recall: {recall_score(y_test, y_pred_test_df): .4f}")
print(f"Test F1: {f1_score(y_test, y_pred_test_df): .4f}")

print("\n")

print(f"Test Recall: {recall_score(y_test, y_pred_test_df): .4f}")
print(f"The best combination of hyperparameters has been: {rs.best_params_}")

- Bayesian Search + Cross Validation

In [None]:
def objective(trial, confidence_level, folds):
    """Optuna objective: mean cross-validated recall of a decision tree.

    Optuna proposes one value per hyperparameter, a DecisionTreeClassifier is
    cross-validated on the notebook-level ``X_train_trans`` / ``y_train`` and
    the mean recall over the folds is returned for Optuna to maximise.

    Parameters
    ----------
    trial : the Optuna trial used to sample hyperparameters.
    confidence_level : confidence level of the interval stored with the trial.
    folds : number of cross-validation folds.

    Returns
    -------
    The mean recall across the folds.
    """
    # trial.suggest_int("hyperparameter_name", min_value, maximum_value)
    tree_params = {
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "min_samples_split": trial.suggest_int("min_samples_split", 4, 16),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 250, 1000),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
    }
    recall_scoring = make_scorer(recall_score, response_method='predict')

    dt = DecisionTreeClassifier(random_state=123, **tree_params)

    # One recall score per held-out fold; "cv" is the number of folds K.
    scores = cross_val_score(dt, X_train_trans, y_train, scoring=recall_scoring, cv=folds)
    mean_score = np.mean(scores)
    sem = np.std(scores, ddof=1) / np.sqrt(folds)

    tc = st.t.ppf(1-((1-confidence_level)/2), df=folds-1)
    lower_bound = mean_score - ( tc * sem )
    upper_bound = mean_score + ( tc * sem )

    # The objective can only return a single value, so the confidence interval
    # is attached to the trial as a user attribute instead.
    cv_summary = [round(lower_bound,4), round(mean_score,4), round(upper_bound,4)]
    trial.set_user_attr("CV_score_summary", cv_summary)

    return mean_score

confidence_level = 0.95
folds = 10

start_time = time.time()
# Seed the sampler (TPE is Optuna's default) so the search is reproducible.
study = optuna.create_study(
    direction="maximize",  # We want to have the maximum values for the scores
    sampler=optuna.samplers.TPESampler(seed=123),
)
# n_trials is the number of combinations of hyperparameters to test.
study.optimize(lambda trial: objective(trial, confidence_level, folds), n_trials=45)
end_time = time.time()

print("\n")
print(f"Time taken to find the best combination of hyperparameters among the given ones: {end_time - start_time: .4f} seconds")
print(f"The best score found was: {study.best_value: .4f}")


# Refit a tree with the best hyperparameters on the full training split.
best_model = DecisionTreeClassifier(random_state=123, **study.best_params)
best_model.fit(X_train_trans, y_train)
# best_model.fit(X_train_smote, y_train_smote)
y_pred_test_df = best_model.predict(X_test_trans)

# sklearn metrics take (y_true, y_pred); with the arguments reversed the
# reported precision and recall are swapped.
print(f"Test Acc: {accuracy_score(y_test, y_pred_test_df): .3f}")
print(f"Test Pre: {precision_score(y_test, y_pred_test_df): .3f}")
print(f"Test Recall: {recall_score(y_test, y_pred_test_df): .3f}")
print(f"Test F1: {f1_score(y_test, y_pred_test_df): .3f}")


print("\n")
print(f"Best combi's  score:  {best_model.score(X_test_trans, y_test): .3f}")
print(f"The best combination of hyperparameters found was: {best_model}")



GridSearchCV
    Time taken to find the best combination of hyperparameters: 760.6650 seconds

    Best recall score for training data: 0.7562
    Recall confidence interval for best hyperparameters: (0.7357, 0.7767)

    Test Accuracy: 0.9688
    Test Precision: 0.9398
    Test Recall: 0.7545
    Test F1: 0.8370

    Best combination of hyperparameters: {'bootstrap': False, 'bootstrap_features': True, 'max_samples': 1.0, 'n_estimators': 10}
    Recall score for test data: 0.7545

- 🌲 Random Forest

Ensemble of decision trees (bagging).
Uses random subsets of data and features.
Robust to overfitting and outliers.
Good baseline model for tabular data.
- ⚡ AdaBoost

Sequential boosting of weak learners.
Focuses on previous misclassified samples.
Sensitive to noise/outliers.
Good for clean data with subtle patterns.
- 🚀 XGBoost

Optimized gradient boosting algorithm.
Fast, accurate, and regularized.
Best for performance with tuning effort.

- 📊 Logistic Regression

Linear model for binary classification.
Estimates probabilities using a sigmoid function.
Assumes a linear relationship between features and the log-odds of the target.
Simple, fast, and interpretable — great baseline for linearly separable data.

- 🎯 Support Vector Machine (SVM)

Finds the optimal hyperplane that maximizes the margin between classes.
Works well in high-dimensional spaces.
Can use different kernels (linear, RBF, polynomial) to capture nonlinear patterns.
Sensitive to scaling; may be slower on large datasets.

- 👟 K-Nearest Neighbors (KNN)

Instance-based learning — no training, just storing.
Classifies based on the majority label among k-nearest neighbors.
Simple and intuitive, but slow with large datasets.
Sensitive to feature scaling and irrelevant features.