In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report, roc_curve, auc, precision_recall_curve)

import shap
import lime
import lime.lime_tabular

# Set style for plots: apply the 'fivethirtyeight' matplotlib style sheet
# globally, then switch seaborn's default palette to "Set2".
# NOTE(review): the order looks significant - the palette is set after the
# style sheet, presumably so the style's own color cycle does not replace
# it; confirm before reordering these two calls.
plt.style.use('fivethirtyeight')
sns.set_palette("Set2")

In [2]:
# Load data: read the HR attrition CSV and print a quick overview
# (shape, head, dtypes, target balance, missing values).
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

print("Data Shape:", df.shape)
print("\nFirst few rows of data:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() emitted a stray "None" line after the report.
df.info()

print("\nTarget Variable Distribution:")
attrition_counts = df['Attrition'].value_counts()
print(attrition_counts)
# .get() keeps the summary working (rate 0.00%) if the file has no 'Yes' rows,
# instead of failing with a KeyError.
print(f"Attrition Rate: {attrition_counts.get('Yes', 0) / len(df) * 100:.2f}%")

# Per-column missing-value counts and percentages; only columns that actually
# have missing values are kept, worst first.
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100
missing_df = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percent})
missing_df = missing_df[missing_df['Missing Values'] > 0].sort_values('Percentage', ascending=False)

print("\nColumns with missing values:")
print(missing_df)

Data Shape: (1470, 35)

First few rows of data:
   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisf

In [3]:
# Preprocessing: encode the target and the categorical features, drop constant
# columns, and split the frame into the feature matrix X and the target y.
# Works on a copy of df, so the raw frame stays untouched and the cell can be
# re-run safely.
data = df.copy()

# Binary-encode the target. Series.map() turns every label that is not a key
# of the dict into NaN without complaint, so fail loudly here rather than
# handing a NaN-containing target to the models later.
data['Attrition'] = data['Attrition'].map({'Yes': 1, 'No': 0})
if data['Attrition'].isnull().any():
    raise ValueError("Attrition contains values other than 'Yes'/'No'")

# Feature columns by dtype; the target is excluded from both lists.
categorical_cols = [
    col for col in data.select_dtypes(include=['object']).columns
    if col != 'Attrition'
]
numerical_cols = [
    col for col in data.select_dtypes(include=[np.number]).columns
    if col != 'Attrition'
]

print(f"Categorical columns: {len(categorical_cols)}")
print(f"Numerical columns: {len(numerical_cols)}")

# Integer-encode every categorical feature and keep the fitted encoders so the
# codes can be mapped back to the original labels later.
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which can matter for linear models - confirm this is acceptable downstream.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

print("Categorical variables encoded.")

# Columns with a single distinct value carry no information. The target is
# skipped so that a degenerate single-class target is not dropped here (which
# would only surface as an unrelated KeyError when building X and y).
constant_columns = [
    col for col in data.columns
    if col != 'Attrition' and data[col].nunique() == 1
]
print(f"Constant columns: {constant_columns}")

if constant_columns:
    data = data.drop(columns=constant_columns)
    print(f"Removed constant columns: {constant_columns}")
    # Keep the column bookkeeping in sync with the frame: without this the
    # lists still name dropped columns and X[numerical_cols] raises KeyError.
    categorical_cols = [col for col in categorical_cols if col not in constant_columns]
    numerical_cols = [col for col in numerical_cols if col not in constant_columns]
    label_encoders = {
        col: enc for col, enc in label_encoders.items()
        if col not in constant_columns
    }

# NOTE(review): EmployeeNumber looks like a row identifier and is still part
# of X - verify whether it should be used as a feature.
X = data.drop('Attrition', axis=1)
y = data['Attrition']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Categorical columns: 8
Numerical columns: 26
Categorical variables encoded.
Constant columns: ['EmployeeCount', 'Over18', 'StandardHours']
Removed constant columns: ['EmployeeCount', 'Over18', 'StandardHours']
Features shape: (1470, 31)
Target shape: (1470,)
