In [None]:
!pip install scikit-learn numpy pandas imbalanced-learn optuna lightgbm --quiet

## Data preprocessing 

In [None]:
import pandas as pd

# Load the raw attrition table from disk.
data = pd.read_csv("datasets/IBM Dataset 1.csv")

# Columns excluded from modelling; errors='ignore' tolerates any that are absent.
columns_to_drop = ['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18']
data = data.drop(columns=columns_to_drop, errors='ignore')


def _ratio(numerator, denominator):
    """Return a callable computing df[numerator] / (df[denominator] + 1)."""
    # The +1 offset on the denominator is presumably there to avoid division by zero.
    return lambda df: df[numerator] / (df[denominator] + 1)


# (new column name, function of the DataFrame) pairs for the engineered ratio features.
engineered_features = [
    ('IncomePerJobLevel', _ratio('MonthlyIncome', 'JobLevel')),
    ('TotalWorkingYearsToJobLevelRatio', _ratio('TotalWorkingYears', 'JobLevel')),
    ('YearsAtCompanyToAgeRatio', _ratio('YearsAtCompany', 'Age')),
    ('YearsAtCompanyToYearsInCurrentRoleRatio', _ratio('YearsAtCompany', 'YearsInCurrentRole')),
]

for feature_name, compute_feature in engineered_features:
    data[feature_name] = compute_feature(data)

# Column lists are taken from `data` before any encoding happens, so the numeric
# list holds the int64/float64 columns (engineered ratios included) as loaded.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
numerical_columns = [column for column in numeric_frame.columns if column != "Attrition"]

categorical_columns = data.select_dtypes(include=['object']).columns

# One-hot encode the nominal columns on a copy, dropping the first level of each.
nominal_columns = ['Department', 'EducationField', 'JobRole', 'MaritalStatus']
processed_data = pd.get_dummies(data.copy(), columns=nominal_columns, drop_first=True)

# Integer codes for the target and the remaining string-valued columns.
value_encodings = {
    'Attrition': {'No': 0, 'Yes': 1},
    'OverTime': {'No': 0, 'Yes': 1},
    'Gender': {'Male': 0, 'Female': 1},
    'BusinessTravel': {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2},
}
for column, encoding in value_encodings.items():
    processed_data[column] = processed_data[column].map(encoding)

# Split into target vector and feature matrix.
y_processed = processed_data["Attrition"]
X_processed = processed_data.drop(columns=["Attrition"], errors='ignore')


## Run all ML models 

In [None]:
# Number of Optuna trials run for each (sampling technique, model) combination;
# passed as n_trials to study.optimize in the loop below.
N_TRIALS = 2
import optuna
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.combine import SMOTETomek
from sklearn.model_selection import StratifiedKFold, train_test_split
from optuna.importance import get_param_importances
import lightgbm as lgb

# Scaler used for the numeric columns in every ColumnTransformer built below.
numeric_transformer = StandardScaler()

# Stratified hold-out split: 20% of the rows are reserved for the final test metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y_processed,
    test_size=0.2,
    stratify=y_processed,
    random_state=42,
)

# Resampling strategies to compare; the "None" entry means no resampling.
sampling_techniques = dict([
    ("None", None),
    ("SMOTE", SMOTE(random_state=42)),
    ("BorderlineSMOTE", BorderlineSMOTE(random_state=42)),
    ("SMOTETomek", SMOTETomek(random_state=42)),
    ("ADASYN", ADASYN(random_state=42)),
])

# Display name -> estimator class. The class is used to rebuild the best model
# from the tuned parameters after each study.
models_and_params = dict([
    ("Logistic Regression", LogisticRegression),
    ("KNN", KNeighborsClassifier),
    # ("SVM", SVC),  # currently disabled
    ("Decision Tree", DecisionTreeClassifier),
    ("Random Forest", RandomForestClassifier),
    ("AdaBoost", AdaBoostClassifier),
    ("XGBoost", XGBClassifier),
    ("Gaussian Naive Bayes", GaussianNB),
    ("Gradient Boosting", GradientBoostingClassifier),
    ("LightGBM", lgb.LGBMClassifier),
])

# One result record (dict) is appended per (sampling technique, model) pair.
combined_results = list()

# For every (sampling technique, model) pair: tune hyperparameters with Optuna
# using 5-fold CV on the training split, refit the best configuration on the
# whole training split, and record hold-out test metrics in combined_results.
for sampling_name, sampler_type in sampling_techniques.items():
    for model_name, model_class in models_and_params.items():
        print(f"Optimizing {model_name}")

        def objective(trial, sampler):
            """Return the mean weighted F1 over 5 stratified CV folds for one trial.

            The estimator for the enclosing loop's ``model_name`` is built from
            hyperparameters suggested by ``trial``.

            Parameters
            ----------
            trial : optuna trial used to suggest hyperparameters and report
                per-fold scores.
            sampler : resampler exposing ``fit_resample`` or None. When given,
                only the training part of each fold is resampled.

            Raises
            ------
            ValueError
                If ``model_name`` has no branch below.
            optuna.exceptions.TrialPruned
                If the study's pruner decides to stop the trial early.
            """
            # Single-choice categoricals pin a fixed value while still recording
            # it in trial.params, so the best model can later be rebuilt from
            # best_params alone.
            if model_name == "Logistic Regression":
                model = LogisticRegression(
                    C=trial.suggest_float("C", 1e-3, 1e3, log=True),
                    penalty=trial.suggest_categorical("penalty", ["l2"]),
                    class_weight=trial.suggest_categorical("class_weight", [None, "balanced"]),
                    max_iter=trial.suggest_categorical("max_iter", [1000]),
                    random_state=42
                )
            elif model_name == "KNN":
                model = KNeighborsClassifier(
                    n_neighbors=trial.suggest_int("n_neighbors", 3, 15),
                    weights=trial.suggest_categorical("weights", ["uniform", "distance"]),
                    metric=trial.suggest_categorical("metric", ["euclidean", "manhattan", "minkowski"])
                )
            elif model_name == "SVM":
                model = SVC(
                    C=trial.suggest_categorical("C", [0.001, 0.01, 0.1, 1, 10, 100, 1000]),
                    gamma=trial.suggest_categorical("gamma", ["scale", 0.01, 0.1, 1]),
                    kernel=trial.suggest_categorical("kernel", ["linear", "rbf"]),
                    probability=trial.suggest_categorical("probability", [True]),
                    random_state=42
                )
            elif model_name == "Decision Tree":
                model = DecisionTreeClassifier(
                    max_depth=trial.suggest_int("max_depth", 3, 20),
                    min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
                    class_weight=trial.suggest_categorical("class_weight", [None, "balanced"]),
                    criterion=trial.suggest_categorical("criterion", ["gini", "entropy"]),
                    random_state=42
                )
            elif model_name == "Random Forest":
                model = RandomForestClassifier(
                    n_estimators=trial.suggest_int("n_estimators", 50, 200),
                    max_depth=trial.suggest_int("max_depth", 5, 15),
                    min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
                    min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 4),
                    class_weight=trial.suggest_categorical("class_weight", [None, "balanced"]),
                    random_state=42
                )
            elif model_name == "AdaBoost":
                model = AdaBoostClassifier(
                    n_estimators=trial.suggest_int("n_estimators", 50, 200),
                    learning_rate=trial.suggest_float("learning_rate", 0.01, 1, log=True),
                    algorithm=trial.suggest_categorical("algorithm", ["SAMME"]),
                    random_state=42
                )
            elif model_name == "XGBoost":
                model = XGBClassifier(
                    n_estimators=trial.suggest_int("n_estimators", 50, 200),
                    max_depth=trial.suggest_int("max_depth", 3, 10),
                    learning_rate=trial.suggest_float("learning_rate", 0.01, 1, log=True),
                    scale_pos_weight=trial.suggest_categorical("scale_pos_weight", [1, 10, 50]),
                    eval_metric=trial.suggest_categorical("eval_metric", ["logloss"]),
                    random_state=42
                )
            elif model_name == "Gaussian Naive Bayes":
                # No hyperparameters are tuned for this model.
                model = GaussianNB()
            elif model_name == "Gradient Boosting":
                model = GradientBoostingClassifier(
                    n_estimators=trial.suggest_int("n_estimators", 50, 200),
                    learning_rate=trial.suggest_float("learning_rate", 0.01, 1, log=True),
                    max_depth=trial.suggest_int("max_depth", 3, 10),
                    subsample=trial.suggest_categorical("subsample", [0.8, 1.0]),
                    random_state=42
                )
            elif model_name == "LightGBM":
                model = lgb.LGBMClassifier(
                    n_estimators=trial.suggest_int("n_estimators", 50, 500),
                    learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
                    max_depth=trial.suggest_int("max_depth", -1, 15),
                    num_leaves=trial.suggest_int("num_leaves", 2, 64),
                    feature_fraction=trial.suggest_float("feature_fraction", 0.6, 1.0),
                    bagging_fraction=trial.suggest_float("bagging_fraction", 0.6, 1.0),
                    bagging_freq=trial.suggest_int("bagging_freq", 1, 10),
                    min_child_samples=trial.suggest_int("min_child_samples", 5, 50),
                    lambda_l1=trial.suggest_float("lambda_l1", 0.0, 10.0),
                    lambda_l2=trial.suggest_float("lambda_l2", 0.0, 10.0),
                    random_state=42
                )
            else:
                raise ValueError(f"Unknown model: {model_name}")

            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            f1_scores = []

            for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
                X_train_cv, X_val_cv = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]

                preprocessor = ColumnTransformer(
                    transformers=[('num', numeric_transformer, numerical_columns)],
                    remainder='passthrough')

                # Fit the scaler on the real training rows of this fold only, then
                # resample the scaled data. This is the same order the final fit
                # below uses, so the tuned hyperparameters match the pipeline that
                # is actually evaluated on the test split.
                X_train_cv = preprocessor.fit_transform(X_train_cv)
                X_val_cv = preprocessor.transform(X_val_cv)

                if sampler is not None:
                    # The validation fold is never resampled.
                    X_train_cv, y_train_cv = sampler.fit_resample(X_train_cv, y_train_cv)

                model.fit(X_train_cv, y_train_cv)
                y_pred_cv = model.predict(X_val_cv)

                f1 = f1_score(y_val_cv, y_pred_cv, average="weighted")
                f1_scores.append(f1)

                # Report the per-fold score so the pruner can stop weak trials early.
                trial.report(f1, fold)

                if trial.should_prune():
                    raise optuna.exceptions.TrialPruned()

            return np.mean(f1_scores)

        # Seed the Optuna sampler so the search is reproducible, like every other
        # random component in this notebook.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=42),
        )
        study.optimize(lambda trial: objective(trial, sampler_type), n_trials=N_TRIALS)

        # Best-effort: importances may not be computable (presumably e.g. for a
        # model with no tuned parameters). Catch Exception rather than everything
        # so that KeyboardInterrupt can still stop the run.
        try:
            importance = get_param_importances(study)
        except Exception:
            importance = None

        best_trial = study.best_trial
        best_params = best_trial.params

        # Rebuild the winning estimator from the recorded parameters. KNN and
        # Gaussian Naive Bayes are constructed without random_state.
        model_params = best_params.copy()
        if model_name in ["KNN", "Gaussian Naive Bayes"]:
            best_model = model_class(**model_params)
        else:
            best_model = model_class(**model_params, random_state=42)

        # Final fit on the full training split: scale, then (optionally) resample.
        preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numerical_columns)],remainder='passthrough')
        X_train_transformed = preprocessor.fit_transform(X_train)
        if sampler_type is not None:
            X_train_resampled, y_train_resampled = sampler_type.fit_resample(X_train_transformed, y_train)
        else:
            X_train_resampled, y_train_resampled = X_train_transformed, y_train

        best_model.fit(X_train_resampled, y_train_resampled)

        # The test split is only transformed with the scaler fitted on training data.
        X_test_transformed = preprocessor.transform(X_test)

        y_pred_test = best_model.predict(X_test_transformed)
        y_prob_test = best_model.predict_proba(X_test_transformed)[:, 1]

        f1_weighted = f1_score(y_test, y_pred_test, average="weighted")
        precision_weighted = precision_score(y_test, y_pred_test, average="weighted")
        recall_weighted = recall_score(y_test, y_pred_test, average="weighted")

        roc_auc = roc_auc_score(y_test, y_prob_test)

        conf_matrix = confusion_matrix(y_test, y_pred_test)

        # Area under the precision-recall curve of the positive class.
        precision, recall, _ = precision_recall_curve(y_test, y_prob_test)
        pr_auc = auc(recall, precision)

        combined_results.append({
            "Model": model_name,
            "Sampling": sampling_name,
            "Validation F1-Score (CV)": best_trial.value,
            "Test F1-Score (Weighted)": f1_weighted,
            "Test Precision (Weighted)": precision_weighted,
            "Test Recall (Weighted)": recall_weighted,
            "Test ROC-AUC": roc_auc,
            "Test PR-AUC": pr_auc,
            "Best Parameters": best_params,
            "hyperparameter_importance": importance,
            "confusion_matrix": conf_matrix
        })
       




In [None]:
# Turn the collected result records into a table (one row per record)
# and write it to disk as CSV.
results_df = pd.DataFrame(data=combined_results)
results_df.to_csv(path_or_buf="ml_results.csv")