In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, mannwhitneyu
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import shap

# Directory prefix for the CSV files read and written below. File names are
# appended with `+`, so the trailing slash is required.
DATA_PATH = '../data/'

In [2]:
# Columns removed immediately after loading. Their suffixed counterparts
# (e.g. BusinessTravel_ord, JobRole_grp, OverTime_bin) stay in the frame --
# presumably the encoded versions produced during EDA.
raw_cols_to_drop = [
    'BusinessTravel', 'JobLevel', 'StockOptionLevel',
    'RelationshipSatisfaction', 'TrainingTimesLastYear', 'NumCompaniesWorked',
    'Department', 'EducationField', 'JobRole',
    'MaritalStatus', 'OverTime',
]

# Read the EDA output and discard the raw columns in one chained step.
df = pd.read_csv(DATA_PATH + 'eda_data.csv').drop(columns=raw_cols_to_drop)
df

Unnamed: 0,MonthlyIncome,Age,DistanceFromHome,DailyRate,Attrition,BusinessTravel_ord,JobLevel_ord,StockOptionLevel_grp,RelationshipSatisfaction_bin,TrainingTimesLastYear_grp,NumCompaniesWorked_grp,Department_bin,EducationField_bin,JobRole_grp,MaritalStatus_bin,OverTime_bin
0,5993,41,1,1102,Yes,1,2,SOL_0,0,high,NCW_84023,1,0,Medium,1,1
1,5130,49,8,279,No,2,2,SOL_12,1,low,NCW_1,0,0,Low,0,0
2,2090,37,2,1373,Yes,1,0,SOL_0,1,low,NCW_5679,0,0,High,1,1
3,2909,33,3,1392,No,2,0,SOL_0,1,low,NCW_1,0,0,Low,0,1
4,3468,27,2,591,No,1,0,SOL_12,1,low,NCW_5679,0,0,High,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,2571,36,23,884,No,2,2,SOL_12,1,low,NCW_84023,0,0,High,0,0
1466,9991,39,6,613,No,1,1,SOL_12,0,low,NCW_84023,0,0,Low,0,0
1467,6142,27,4,155,No,1,2,SOL_12,1,high,NCW_1,0,0,Low,0,1
1468,5390,49,2,1023,No,2,2,SOL_0,1,low,NCW_84023,1,0,Medium,0,0


In [3]:
# Binary target: "Yes" -> 1, "No" -> 0.
y = df["Attrition"].map({"Yes": 1, "No": 0})

# Series.map() turns every label that is missing from the mapping into NaN.
# Only X is null-checked further down, so fail here rather than carry a
# partially-missing target into model fitting and the exported CSV.
assert y.notna().all(), (
    "Unexpected Attrition labels: "
    f"{df.loc[y.isna(), 'Attrition'].unique().tolist()}"
)

# Feature matrix: everything except the target column.
X = df.drop(columns=["Attrition"])
X.shape

(1470, 15)

In [4]:
# Share of each target class (normalize=True returns proportions, not counts).
y.value_counts(normalize=True)

Attrition
0    0.838776
1    0.161224
Name: proportion, dtype: float64

In [5]:
# One-hot encode the remaining string-valued columns; every level gets its own
# indicator column (no level is dropped here).
X = pd.get_dummies(X)

# Turn any boolean columns into 0/1 integers so the matrix is purely numeric.
bool_cols = X.select_dtypes(include='bool').columns
X = X.astype(dict.fromkeys(bool_cols, int))

(X.columns, X.shape)

(Index(['MonthlyIncome', 'Age', 'DistanceFromHome', 'DailyRate',
        'BusinessTravel_ord', 'JobLevel_ord', 'RelationshipSatisfaction_bin',
        'Department_bin', 'EducationField_bin', 'MaritalStatus_bin',
        'OverTime_bin', 'StockOptionLevel_grp_SOL_0',
        'StockOptionLevel_grp_SOL_12', 'StockOptionLevel_grp_drop',
        'TrainingTimesLastYear_grp_drop', 'TrainingTimesLastYear_grp_high',
        'TrainingTimesLastYear_grp_low', 'NumCompaniesWorked_grp_NCW_1',
        'NumCompaniesWorked_grp_NCW_5679', 'NumCompaniesWorked_grp_NCW_84023',
        'JobRole_grp_High', 'JobRole_grp_Low', 'JobRole_grp_Medium'],
       dtype='object'),
 (1470, 23))

In [6]:
# Total number of missing cells in the encoded feature matrix.
print(X.isna().to_numpy().sum())

0


In [7]:
# Indicator columns for the levels literally named "drop" -- presumably flagged
# for removal during the upstream grouping step; confirm against the EDA notebook.
drop_cols = ['TrainingTimesLastYear_grp_drop', 'StockOptionLevel_grp_drop']

X = X.drop(labels=drop_cols, axis="columns")

In [8]:
# Re-check for missing cells after removing the "drop" indicator columns.
print(X.isna().to_numpy().sum())

0


In [9]:
# True only when the feature matrix has full column rank. False means at least
# one column is (numerically) a linear combination of the others -- likely the
# complete one-hot groups, whose indicator columns each sum to 1 per row.
rank_ok = np.linalg.matrix_rank(X.to_numpy()) == len(X.columns)
rank_ok

np.False_

In [10]:
# Standardize every feature using statistics estimated on all rows of X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Independent copy of the scaled matrix (not referenced again in the cells
# shown in this part of the notebook).
X_scaled_copy = X_scaled.copy()

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# ---------------------------------------------------------------------------
# Settings (previously repeated as literals throughout the cell)
# ---------------------------------------------------------------------------
SEED = 42            # shared by every estimator and every permutation run
N_PERM_REPEATS = 10  # shuffles per feature inside permutation_importance
# Scorer handed to permutation_importance. None keeps the estimator's own
# .score(), i.e. plain accuracy for these classifiers.
PERM_SCORING = None
threshold = 0.1      # a model "votes" for a feature whose normalized score exceeds this

# NOTE(review): three caveats apply to the numbers produced below.
#   1. Permutation importance is measured on the same rows the models were
#      fitted on, so it reflects in-sample reliance rather than generalization.
#      The scikit-learn user guide recommends a held-out set for the latter.
#   2. Only ~16% of the target is positive (see the class-balance cell), so
#      accuracy is dominated by the majority class. PERM_SCORING="roc_auc" or
#      "balanced_accuracy" are alternatives; switching will change the votes
#      and therefore the written summary at the end of the notebook.
#   3. The rank check above returned False, i.e. X still contains linearly
#      dependent columns here; importance may be shared among them.


def fit_and_permute(estimator, features, target):
    """Fit ``estimator`` and compute permutation importance on the same data.

    Parameters
    ----------
    estimator : scikit-learn estimator or Pipeline
        Fitted in place on ``features`` / ``target``.
    features, target : array-like
        Data used both for fitting and for the permutation runs.

    Returns
    -------
    The object returned by ``permutation_importance`` (exposes
    ``importances_mean`` among other fields).
    """
    estimator.fit(features, target)
    return permutation_importance(
        estimator,
        features,
        target,
        scoring=PERM_SCORING,
        n_repeats=N_PERM_REPEATS,
        random_state=SEED,
        n_jobs=-1,
    )


def scale_by_max(scores):
    """Divide a score column by its largest value so the top feature equals 1.

    When the largest value is not strictly positive the column carries no
    usable ranking; dividing by it would yield NaN/inf (max == 0) or flip
    signs (max < 0) and hand out spurious votes. In that case an all-zero
    column is returned, so the model casts no votes.
    """
    peak = scores.max()
    if not peak > 0:  # also catches NaN
        return pd.Series(0.0, index=scores.index)
    return scores / peak


# --- L1 logistic regression: importance = |coefficient| on standardized inputs
logit = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(
        penalty="l1",
        solver="liblinear",
        class_weight="balanced",
        random_state=SEED
    ))
])
logit.fit(X, y)
logit_importance = abs(logit.named_steps["model"].coef_[0])

# --- Random forest
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    class_weight="balanced",
    random_state=SEED
)
rf_perm = fit_and_permute(rf, X, y)
rf_importance = rf_perm.importances_mean

# --- Gradient boosting
gb = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=3,
    random_state=SEED
)
gb_perm = fit_and_permute(gb, X, y)
gb_importance = gb_perm.importances_mean

# --- RBF-kernel SVM (scaled inside the pipeline)
svm = Pipeline([
    ("scaler", StandardScaler()),
    ("model", SVC(
        kernel="rbf",
        C=1.0,
        gamma="scale",
        class_weight="balanced",
        probability=True,
        random_state=SEED
    ))
])
svm_perm = fit_and_permute(svm, X, y)
svm_importance = svm_perm.importances_mean

# --- Combine, normalize per model, and count votes
model_cols = ["logit", "rf", "gb", "svm"]

importance_df = pd.DataFrame({
    "feature": X.columns,
    "logit": logit_importance,
    "rf": rf_importance,
    "gb": gb_importance,
    "svm": svm_importance
})

for c in model_cols:
    importance_df[c] = scale_by_max(importance_df[c])

# One vote per model whose normalized score clears the threshold.
importance_df["votes"] = (importance_df[model_cols] > threshold).sum(axis=1)

# Stable sort: features with equal votes keep their X.columns order, so the
# displayed table is reproducible from run to run.
importance_df = importance_df.sort_values("votes", ascending=False, kind="stable")
importance_df

Unnamed: 0,feature,logit,rf,gb,svm,votes
1,Age,0.201474,0.132605,0.59448,0.206019,4
3,DailyRate,0.25469,0.179407,0.494692,0.171296,4
2,DistanceFromHome,0.487737,0.226209,0.363057,0.234954,4
4,BusinessTravel_ord,0.51516,0.112324,0.266454,0.165509,4
8,EducationField_bin,0.332898,0.121685,0.164544,0.189815,4
16,NumCompaniesWorked_grp_NCW_5679,0.544353,0.121685,0.36518,0.212963,4
10,OverTime_bin,1.0,1.0,1.0,1.0,4
5,JobLevel_ord,0.921299,-0.093604,0.288747,0.358796,3
6,RelationshipSatisfaction_bin,0.479047,0.063963,0.170913,0.3125,3
14,TrainingTimesLastYear_grp_low,0.227926,0.087363,0.026539,0.134259,2


In [19]:
# Indicator columns removed before exporting the model-selection dataset.
drop_features = ["NumCompaniesWorked_grp_NCW_1", "JobRole_grp_Medium"]

X_clean = X.drop(labels=drop_features, axis="columns")

# Report the column count before and after the drop.
print("Old shape:", X.shape)
print("New shape:", X_clean.shape)

# Persist features and target side by side; row order is shared, and the
# index is not written to either file.
X_clean.to_csv(f"{DATA_PATH}X_model_selection.csv", index=False)
y.to_csv(f"{DATA_PATH}y_model_selection.csv", index=False)

Old shape: (1470, 21)
New shape: (1470, 19)


## Feature Importance Summary

Feature importance was evaluated across four model families with different inductive biases:

* L1-regularized Logistic Regression
* Random Forest
* Gradient Boosting
* RBF-kernel SVM

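Importance was taken from absolute coefficients (on standardized inputs) for the logistic regression and from permutation importance (10 repeats, evaluated on the training data) for the three non-linear models. Each model's scores were divided by that model's maximum, and a feature received one vote from every model in which its normalized score exceeded 0.1.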

A small subset of features showed **consistent importance across all models (4 votes)**:

* `Age`
* `DailyRate`
* `DistanceFromHome`
* `BusinessTravel_ord`
* `EducationField_bin`
* `NumCompaniesWorked_grp_NCW_5679`
* `OverTime_bin`

These features are treated as **mandatory core features**, representing robust, model-agnostic drivers of attrition.

Most remaining features showed **model-dependent or conditional importance**, contributing meaningfully only within specific model families (e.g. boosting or tree-based models). A small number of features received **0 votes** and were dropped due to lack of signal across all models.

---

## Next Step: Nested Feature Subset Evaluation

Given the strong separation between core and non-core features, feature selection is transitioned from heuristic importance analysis to a **nested cross-validation framework**.

* All models will include the **7 core features**
* Subsets of the remaining candidate features will be evaluated
* Feature subset selection will occur **inside the inner CV loop**
* Model performance will be estimated using the **outer CV loop**

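This approach allows systematic evaluation of feature combinations while guarding against selection bias and data leakage in the choice among the non-core features. Because the core features themselves were identified using the full dataset, the outer-loop estimate does not account for that initial selection step and should be read with that caveat.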
