In [None]:
# Install necessary libraries
!pip install pytorch-tabnet scikit-learn numpy pandas imbalanced-learn optuna --quiet

In [2]:
import pandas as pd

data = pd.read_csv("datasets/IBM Dataset 1.csv")

data = data.drop(columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18'], errors='ignore')

engineered_features = [
    ('IncomePerJobLevel', lambda df: df['MonthlyIncome'] / (df['JobLevel'] + 1)),
    ('TotalWorkingYearsToJobLevelRatio', lambda df: df['TotalWorkingYears'] / (df['JobLevel'] + 1)),
    ('YearsAtCompanyToAgeRatio', lambda df: df['YearsAtCompany'] / (df['Age'] + 1)),
    ('YearsAtCompanyToYearsInCurrentRoleRatio', lambda df: df['YearsAtCompany'] / (df['YearsInCurrentRole'] + 1))
]

for name, func in engineered_features:
    data[name] = func(data)

numerical_columns = [x for x in data.select_dtypes(include=['int64', 'float64']).columns if x!= "Attrition"]

categorical_columns = data.select_dtypes(include=['object']).columns

processed_data = data.copy()

nominal_columns = [ 'Department', 'EducationField', 'JobRole', 'MaritalStatus']
processed_data = pd.get_dummies(processed_data, columns=nominal_columns, drop_first=True)

processed_data['Attrition'] = processed_data['Attrition'].map({'No': 0, 'Yes': 1})
processed_data['OverTime'] = processed_data['OverTime'].map({'No': 0, 'Yes': 1})
processed_data['Gender'] = processed_data['Gender'].map({'Male': 0, 'Female': 1})
processed_data['BusinessTravel'] = processed_data['BusinessTravel'].map({'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2})

X_processed = processed_data.drop(columns=["Attrition"], errors='ignore').values
y_processed = processed_data["Attrition"].values


In [3]:
N_TRIALS=1

from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.metrics import Metric
import optuna
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.combine import SMOTETomek
from sklearn.model_selection import StratifiedKFold, train_test_split
from optuna.importance import get_param_importances
import torch 

numeric_transformer = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.2, random_state=42, stratify=y_processed)

sampling_techniques = {
    "None": None,
    "SMOTE": SMOTE(random_state=42),
    "BorderlineSMOTE": BorderlineSMOTE(random_state=42),
    "SMOTETomek": SMOTETomek(random_state=42),
    "ADASYN": ADASYN(random_state=42),
}

class F1WeightedMetric(Metric):
    def __init__(self):
        self._name = "f1_score"
        self._maximize = True

    def __call__(self, y_true, y_score):
        y_score = np.where(y_score > 0.5, 1, 0)
        y_score = y_score[:,1]
        return f1_score(y_true, y_score, average="weighted")



def objective(trial, sampling_technique):
    params = {
        "n_d": trial.suggest_int("n_d", 8, 64),
        "n_a": trial.suggest_int("n_a", 8, 64),
        "n_steps": trial.suggest_int("n_steps", 2, 10),
        "n_independent": trial.suggest_int("n_independent", 1, 4),
        "n_shared": trial.suggest_int("n_shared", 1, 4),
        "gamma": trial.suggest_float("gamma", 1.0, 2.0),
        "lambda_sparse": trial.suggest_float("lambda_sparse", 1e-6, 1e-2),
        "optimizer": trial.suggest_categorical("optimizer", ["Adadelta", "Adam", "Adagrad", "Adamax", "RMSprop", "SGD"]),
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_train_cv, X_val_cv = X_train[train_idx], X_train[val_idx]
        y_train_cv, y_val_cv = y_train[train_idx], y_train[val_idx]

        if sampling_technique is not None:
            X_train_cv_resampled, y_train_cv_resampled = sampling_technique.fit_resample(X_train_cv, y_train_cv)
        else:
            X_train_cv_resampled, y_train_cv_resampled = X_train_cv, y_train_cv

        scaler = StandardScaler()
        X_train_cv_resampled = scaler.fit_transform(X_train_cv_resampled)
        X_val_cv = scaler.transform(X_val_cv)

        tabnet_model = TabNetClassifier(
            n_d=params["n_d"],
            n_a=params["n_a"],
            n_steps=params["n_steps"],
            gamma=params["gamma"],
            lambda_sparse=params["lambda_sparse"],
            n_independent=params["n_independent"],
            n_shared=params["n_shared"],
            scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15,
            scheduler_params = {"gamma": params["gamma"],
                     "step_size": 20},
            optimizer_params=dict(lr=2e-2),
             momentum=0.3, clip_value=2., cat_emb_dim=1,
            seed=42
        )

        tabnet_model.fit(
            X_train_cv_resampled, y_train_cv_resampled,
            eval_set=[(X_val_cv, y_val_cv)],
            eval_name=["validation"],
            eval_metric=[F1WeightedMetric],
            max_epochs=1,
            batch_size=2,
            patience=10,

        )

        y_val_pred = tabnet_model.predict(X_val_cv)
        f1_weighted = f1_score(y_val_cv, y_val_pred, average="weighted")
        f1_scores.append(f1_weighted)

    return np.mean(f1_scores)

combined_results = []

for sampling_name, sampling_technique in sampling_techniques.items():
    print(f"Optimizing {sampling_name}")

    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, sampling_technique), n_trials=N_TRIALS)

    best_params = study.best_params
    try:
        importance = get_param_importances(study)
    except:
        importance = None

    if sampling_technique is not None:
        X_train_resampled, y_train_resampled = sampling_technique.fit_resample(X_train, y_train)
    else:
        X_train_resampled, y_train_resampled = X_train, y_train

    scaler = StandardScaler()
    X_train_resampled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)  
    
    best_model = TabNetClassifier(
        n_d=best_params["n_d"],
        n_a=best_params["n_a"],
        n_steps=best_params["n_steps"],
        gamma=best_params["gamma"],
        lambda_sparse=best_params["lambda_sparse"],
        n_independent=best_params["n_independent"],
        n_shared=best_params["n_shared"],
        optimizer_fn=getattr(torch.optim, best_params["optimizer"]),
        seed=42
    )

    best_model.fit(
        X_train_resampled, y_train_resampled,
        eval_set=[(X_test_scaled, y_test)],
        eval_name=["test"],
        eval_metric=[F1WeightedMetric],
        max_epochs=1,
        batch_size=2,
        patience=10,
    )

    y_test_pred = best_model.predict(X_test_scaled)
    y_test_pred_prob = best_model.predict_proba(X_test_scaled)[:, 1]

    f1_weighted = f1_score(y_test, y_test_pred, average="weighted")
    precision_weighted = precision_score(y_test, y_test_pred, average="weighted")
    recall_weighted = recall_score(y_test, y_test_pred, average="weighted")
    roc_auc = roc_auc_score(y_test, y_test_pred_prob)

    precision, recall, _ = precision_recall_curve(y_test, y_test_pred_prob, pos_label=1)
    pr_auc = auc(recall, precision)

    conf_matrix = confusion_matrix(y_test, y_test_pred)
    combined_results.append({
            "Model": "TabNet",
            "Sampling": sampling_name,
            "Validation F1-Score (CV)": study.best_trial.value,
            "Test F1-Score (Weighted)": f1_weighted,
            "Test Precision (Weighted)": precision_weighted,
            "Test Recall (Weighted)": recall_weighted,
            "Test ROC-AUC": roc_auc,
            "Test PR-AUC": pr_auc,
            "Best Parameters": best_params,
            "hyperparameter_importance": importance,
            "confusion_matrix": conf_matrix
        })
        


[I 2024-12-06 22:41:59,061] A new study created in memory with name: no-name-ef104633-acf1-496e-9bb8-baf702bc88c0


Optimizing None




epoch 0  | loss: 2.02189 | validation_f1_score: 0.74294 |  0:00:16s
Stop training because you reached max_epochs = 1 with best_epoch = 0 and best_validation_f1_score = 0.74294




epoch 0  | loss: 1.5653  | validation_f1_score: 0.76533 |  0:00:17s
Stop training because you reached max_epochs = 1 with best_epoch = 0 and best_validation_f1_score = 0.76533




epoch 0  | loss: 1.7966  | validation_f1_score: 0.76456 |  0:00:20s
Stop training because you reached max_epochs = 1 with best_epoch = 0 and best_validation_f1_score = 0.76456




epoch 0  | loss: 1.75552 | validation_f1_score: 0.66689 |  0:00:18s
Stop training because you reached max_epochs = 1 with best_epoch = 0 and best_validation_f1_score = 0.66689




epoch 0  | loss: 2.05711 | validation_f1_score: 0.7466  |  0:00:18s
Stop training because you reached max_epochs = 1 with best_epoch = 0 and best_validation_f1_score = 0.7466


[I 2024-12-06 22:44:08,651] Trial 0 finished with value: 0.737263078674314 and parameters: {'n_d': 49, 'n_a': 61, 'n_steps': 9, 'n_independent': 2, 'n_shared': 4, 'gamma': 1.46820430309878, 'lambda_sparse': 0.007277558878295638, 'optimizer': 'Adam'}. Best is trial 0 with value: 0.737263078674314.


epoch 0  | loss: 1.74798 | test_f1_score: 0.76376 |  0:00:23s
Stop training because you reached max_epochs = 1 with best_epoch = 0 and best_test_f1_score = 0.76376




In [4]:
results_df = pd.DataFrame(combined_results)
results_df.to_csv("tabnet_results.csv")