In [3]:
import pandas as pd

# Load the raw table from the CSV on disk.
data = pd.read_csv("datasets/IBM Dataset 1.csv")

# Remove the listed columns when present; errors='ignore' tolerates any that are missing.
data = data.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18'],
    errors='ignore',
)


def _ratio_plus_one(numerator, denominator):
    """Return a callable computing frame[numerator] / (frame[denominator] + 1)."""
    # The +1 offset is kept exactly as in the feature definitions; presumably it
    # guards against a zero denominator -- confirm against the data.
    return lambda frame: frame[numerator] / (frame[denominator] + 1)


# (column name, builder) pairs; each builder takes the frame and returns the new column.
engineered_features = [
    ('IncomePerJobLevel', _ratio_plus_one('MonthlyIncome', 'JobLevel')),
    ('TotalWorkingYearsToJobLevelRatio', _ratio_plus_one('TotalWorkingYears', 'JobLevel')),
    ('YearsAtCompanyToAgeRatio', _ratio_plus_one('YearsAtCompany', 'Age')),
    ('YearsAtCompanyToYearsInCurrentRoleRatio', _ratio_plus_one('YearsAtCompany', 'YearsInCurrentRole')),
]

# Materialise every engineered column on the working frame, in declaration order.
for feature_name, build_column in engineered_features:
    data[feature_name] = build_column(data)

# Column inventories by dtype; "Attrition" is excluded from the numeric list.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
numerical_columns = [column for column in numeric_frame.columns if column != "Attrition"]

categorical_columns = data.select_dtypes(include=['object']).columns

# One-hot encode the nominal columns on a copy so `data` itself stays untouched;
# drop_first=True omits the first level of each encoded column.
nominal_columns = ['Department', 'EducationField', 'JobRole', 'MaritalStatus']
processed_data = pd.get_dummies(data.copy(), columns=nominal_columns, drop_first=True)

# Integer codes for the remaining string columns, applied with Series.map.
value_encodings = {
    'Attrition': {'No': 0, 'Yes': 1},
    'OverTime': {'No': 0, 'Yes': 1},
    'Gender': {'Male': 0, 'Female': 1},
    'BusinessTravel': {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2},
}
for column, codes in value_encodings.items():
    processed_data[column] = processed_data[column].map(codes)

# Target vector and feature matrix (everything except the target column).
y = processed_data["Attrition"]
X = processed_data.drop(columns=["Attrition"], errors='ignore')


In [4]:
import numpy as np
from sklearn.metrics import (
    f1_score, recall_score, precision_score, roc_auc_score, precision_recall_curve, auc, confusion_matrix
)
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# Two-stage stratified split. Stage one keeps 70% for training and holds out 30%.
# Stage two sends 2/3 of the hold-out to the test set (20% overall) and the
# remaining 1/3 to the validation set (10% overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=2/3,
    stratify=y_temp,
    random_state=42,
)

# Resample only the training split with SMOTE to address class imbalance;
# the validation and test splits are left as drawn.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

def train_and_evaluate(X_train, y_train, X_val, y_val, model_params=None):
    """Fit an XGBoost classifier and score it on a validation split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed straight to ``model.fit``.
    X_val, y_val
        Validation features and labels; every metric is computed on these.
    model_params : dict, optional
        Keyword arguments for ``xgb.XGBClassifier``. Entries are merged over
        the default configuration (n_estimators=200, learning_rate=0.429,
        max_depth=4, eval_metric='logloss', random_state=42), so callers can
        override individual hyperparameters without editing this function.
        ``None`` (the default) reproduces the original fixed configuration.

    Returns
    -------
    dict
        Keys "Weighted F1", "Weighted Recall", "Weighted Precision",
        "ROC-AUC", "PR-AUC" (floats) and "Confusion Matrix" (nested list).
    """
    # Defaults match the previously hard-coded hyperparameters.
    params = {
        "n_estimators": 200,
        "learning_rate": 0.429,
        "max_depth": 4,
        "eval_metric": "logloss",
        "random_state": 42,
    }
    if model_params:
        params.update(model_params)

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Hard labels feed the thresholded metrics; the second predict_proba column
    # is used as the score for the ranking metrics.
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]

    # Metrics
    weighted_f1 = f1_score(y_val, y_pred, average='weighted')
    weighted_recall = recall_score(y_val, y_pred, average='weighted')
    weighted_precision = precision_score(y_val, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_val, y_proba)

    # PR-AUC is the area under the precision-recall curve as computed by auc().
    precision, recall, _ = precision_recall_curve(y_val, y_proba)
    pr_auc = auc(recall, precision)
    cm = confusion_matrix(y_val, y_pred)

    return {
        "Weighted F1": weighted_f1,
        "Weighted Recall": weighted_recall,
        "Weighted Precision": weighted_precision,
        "ROC-AUC": roc_auc,
        "PR-AUC": pr_auc,
        # tolist() keeps the matrix as plain nested lists inside the result dict.
        "Confusion Matrix": cm.tolist()
    }


def evaluate_models():
    """Compare validation metrics with and without the engineered features.

    Reads the module-level ``engineered_features``, ``X_train_smote``,
    ``y_train_smote``, ``X_val`` and ``y_val``. Three kinds of runs are
    scored with ``train_and_evaluate``: a baseline with every engineered
    column removed, one run per engineered column added back on its own,
    and a final run with all columns present.

    Returns
    -------
    pandas.DataFrame
        One row per run: the metrics from ``train_and_evaluate`` plus a
        "Feature" column labelling the run.
    """
    engineered_names = [name for name, _ in engineered_features]

    # Baseline matrices: engineered columns stripped from both splits.
    train_base = X_train_smote.drop(columns=engineered_names, errors='ignore')
    val_base = X_val.drop(columns=engineered_names, errors='ignore')

    def scored(label, train_frame, val_frame):
        """Train/evaluate on the given frames and tag the metrics with a label."""
        metrics = train_and_evaluate(train_frame, y_train_smote, val_frame, y_val)
        metrics["Feature"] = label
        return metrics

    rows = [scored("Baseline (No Engineered Features)", train_base, val_base)]

    # Add each engineered column back individually, appended after the baseline columns.
    rows.extend(
        scored(
            name,
            train_base.assign(**{name: X_train_smote[name]}),
            val_base.assign(**{name: X_val[name]}),
        )
        for name in engineered_names
    )

    rows.append(scored("All Engineered Features", X_train_smote, X_val))

    return pd.DataFrame(rows)

# Run the baseline, per-feature and all-features comparisons and keep the result table.
impact_df = evaluate_models()


In [5]:
# Display the comparison table returned by evaluate_models().
impact_df

Unnamed: 0,Weighted F1,Weighted Recall,Weighted Precision,ROC-AUC,PR-AUC,Confusion Matrix,Feature
0,0.788797,0.829932,0.781195,0.685976,0.37751,"[[119, 4], [21, 3]]",Baseline (No Engineered Features)
1,0.82878,0.857143,0.836439,0.676829,0.407299,"[[120, 3], [18, 6]]",IncomePerJobLevel
2,0.817495,0.85034,0.824402,0.693428,0.411914,"[[120, 3], [19, 5]]",TotalWorkingYearsToJobLevelRatio
3,0.800904,0.836735,0.797974,0.683604,0.36367,"[[119, 4], [20, 4]]",YearsAtCompanyToAgeRatio
4,0.800904,0.836735,0.797974,0.691734,0.389736,"[[119, 4], [20, 4]]",YearsAtCompanyToYearsInCurrentRoleRatio
5,0.812474,0.843537,0.812235,0.695799,0.377207,"[[119, 4], [19, 5]]",All Engineered Features
