# Employee Attrition Prediction

This notebook demonstrates an end-to-end workflow to predict employee attrition using the **IBM HR Analytics Employee Attrition & Performance** dataset.

**What this notebook includes**
- Exploratory Data Analysis (EDA)
- Feature engineering and preprocessing
- Handling class imbalance with SMOTE
- Training and evaluation of **Random Forest** and **XGBoost**
- Feature importance and saving the best model

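> Before running, place the dataset CSV at `data/WA_Fn-UseC_-HR-Employee-Attrition.csv` in the project root. The notebook reads it through the relative path `../data/`, so it must be run from a folder one level below the project root (for example `notebooks/`).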


In [None]:
# Imports and settings
import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from imblearn.over_sampling import SMOTE
import pickle

print("Imports loaded.")

In [None]:
# Load dataset
# The CSV is looked up relative to the notebook's working directory: ../data/<file>.
data_path = os.path.join('..', 'data', 'WA_Fn-UseC_-HR-Employee-Attrition.csv')
if not os.path.exists(data_path):
    # Fail fast: every later cell needs `df`, so merely printing a message would
    # lead to a confusing NameError (or to silently reusing a stale `df` on re-run).
    raise FileNotFoundError(
        f"Dataset not found at {data_path}. Please download the dataset from Kaggle and place it in the data/ folder."
    )

df = pd.read_csv(data_path)
print("Loaded dataset shape:", df.shape)
display(df.head(5))
# Transposed summary: one row per column, which stays readable for wide frames.
display(df.describe(include='all').T)

In [None]:
# Basic EDA
# Target balance: raw counts first, then the same information as a bar chart.
print("Attrition value counts:")
attrition_counts = df['Attrition'].value_counts()
print(attrition_counts, '\n')

fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Attrition', ax=ax)
ax.set_title('Attrition Distribution')
plt.show()

# Numeric correlation heatmap (pairwise correlations of the numeric columns only)
num_cols = list(df.select_dtypes(include=[np.number]).columns)
corr_matrix = df[num_cols].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation matrix (numeric features)')
plt.show()

# Attrition broken down by OverTime and by JobSatisfaction, one chart each
breakdowns = [
    ('OverTime', 'OverTime vs Attrition'),
    ('JobSatisfaction', 'Job Satisfaction vs Attrition'),
]
for feature, chart_title in breakdowns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=feature, hue='Attrition', ax=ax)
    ax.set_title(chart_title)
    plt.show()

In [None]:
# Preprocessing
# Work on a separate frame so the raw `df` stays untouched. Mutating `df` in place
# made this cell non-idempotent: on a second run the already-numeric target would
# be mapped through {'Yes', 'No'} again and become all-NaN.
df_encoded = df.copy()

# Map target to binary
df_encoded['Attrition'] = df_encoded['Attrition'].map({'Yes': 1, 'No': 0})
if df_encoded['Attrition'].isnull().any():
    # .map() yields NaN for any label outside the dictionary; surface that here
    # instead of letting NaN targets reach the train/test split.
    raise ValueError("Attrition contains values other than 'Yes'/'No'; cannot map target to binary.")
df_encoded['Attrition'] = df_encoded['Attrition'].astype(int)

# Check missing values
missing = df_encoded.isnull().sum().sort_values(ascending=False)
print("Top missing (if any):")
print(missing[missing>0] if missing.sum()>0 else "No missing values found")

# Encode categorical variables using LabelEncoder (simple and reproducible).
# One fitted encoder is kept per column so the same string -> integer mapping
# can be re-applied to new records later; a single shared encoder would only
# remember the classes of the last column it was fitted on.
cat_cols = df_encoded.select_dtypes(include='object').columns.tolist()
print("Categorical columns:", cat_cols)
label_encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    df_encoded[col] = encoder.fit_transform(df_encoded[col].astype(str))
    label_encoders[col] = encoder

# Features and target
X = df_encoded.drop(columns='Attrition')
y = df_encoded['Attrition']

# Feature scaling
# NOTE: the scaling statistics here are computed over every row of X, including
# rows that a later train/test split will assign to the test set.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X_scaled.head()

In [None]:
# Train-test split (stratified)
# Split the *unscaled* features first, then fit the scaler on the training rows
# only. Standardizing with statistics from the whole dataset lets information
# about the test rows leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    scaler.transform(X_train_raw), columns=X_train_raw.columns, index=X_train_raw.index
)
# The test split is transformed with the training-set mean/std, never refitted.
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X_test_raw.columns, index=X_test_raw.index
)

print("Train class distribution (before SMOTE):")
print(y_train.value_counts())

# Apply SMOTE to training data only, so the test split keeps its original class balance
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print("Train class distribution (after SMOTE):")
print(pd.Series(y_train_res).value_counts())

In [None]:
# Random Forest training and evaluation
# Fit on the SMOTE-resampled training data, then score on the held-out test split.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_res, y_train_res)

y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)
print("Random Forest Accuracy: {:.4f}".format(acc_rf))
print("Classification Report:\n", report_rf)

# Confusion matrix as an annotated heatmap (rows: actual class, columns: predicted class)
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Random Forest - Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# RF ROC AUC, computed from the predicted probability of class 1 (second column)
y_proba_rf = rf.predict_proba(X_test)[:, 1]
auc_rf = roc_auc_score(y_test, y_proba_rf)
print("Random Forest ROC AUC: {:.4f}".format(auc_rf))

In [None]:
# XGBoost training and evaluation
# `use_label_encoder` is intentionally not passed: the option is deprecated and
# newer XGBoost releases ignore it with a warning. The target is already encoded
# as integers 0/1, so no label encoding is needed.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric='logloss'
)

# Fit on the SMOTE-resampled training data, then score on the held-out test split.
xgb_model.fit(X_train_res, y_train_res)
y_pred_xgb = xgb_model.predict(X_test)

acc_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy: {:.4f}".format(acc_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

# Confusion matrix as an annotated heatmap (rows: actual class, columns: predicted class)
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
plt.figure(figsize=(5,4))
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Greens')
plt.title('XGBoost - Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# XGBoost ROC AUC, computed from the predicted probability of class 1 (second column)
y_proba_xgb = xgb_model.predict_proba(X_test)[:,1]
print("XGBoost ROC AUC: {:.4f}".format(roc_auc_score(y_test, y_proba_xgb)))

In [None]:
# Feature Importance - XGBoost
# Best-effort plot: if XGBoost cannot draw it, report the reason and carry on
# with the Random Forest chart below.
try:
    fig, ax = plt.subplots(figsize=(8, 6))
    xgb.plot_importance(xgb_model, ax=ax, importance_type='gain', max_num_features=15)
    ax.set_title('XGBoost - Feature Importance (gain)')
    plt.show()
except Exception as e:
    print("Could not plot XGBoost importance:", e)

# Random Forest feature importance
# Label each importance with its column name and keep the 15 largest.
rf_importances = pd.Series(rf.feature_importances_, index=X.columns)
rf_imp = rf_importances.sort_values(ascending=False).head(15)

fig, ax = plt.subplots(figsize=(8, 6))
rf_imp.plot(kind='barh', ax=ax)
ax.invert_yaxis()  # barh draws the first entry at the bottom; flip so the top feature is on top
ax.set_title('Random Forest - Top 15 Feature Importances')
ax.set_xlabel('Importance')
plt.show()

In [None]:
# Save the best model
models_dir = os.path.join('..', 'models')
os.makedirs(models_dir, exist_ok=True)

# Keep whichever classifier reached the higher test accuracy (ties go to XGBoost).
if acc_xgb >= acc_rf:
    best_model = xgb_model
    best_name = 'xgboost'
    best_acc = acc_xgb
else:
    best_model = rf
    best_name = 'random_forest'
    best_acc = acc_rf

model_path = os.path.join(models_dir, f'best_model_{best_name}.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(best_model, f)

# Both classifiers were trained on standardized features, so the pickled model is
# only usable together with the fitted scaler. Persist it next to the model.
# New records must also have their categorical columns encoded the same way as in
# the preprocessing cell before scaling.
scaler_path = os.path.join(models_dir, 'scaler.pkl')
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)

print(f"Saved best model ({best_name}) with accuracy {best_acc:.4f} to {model_path}")
print(f"Saved fitted scaler to {scaler_path}")

In [None]:
# Example: Predict on a sample from the test set
# Score the first three test rows with the model selected in the previous cell.
sample = X_test.iloc[:3]
preds = best_model.predict(sample)
class_probabilities = best_model.predict_proba(sample)
probs = class_probabilities[:, 1]  # second column = probability of class 1 (attrition)
print("Predicted probabilities (attrition):", probs)
print("Predicted classes:", preds)

---

## Next steps / Notes

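- If you want to expose this as an API, you can create a small Flask app that loads `models/best_model_*.pkl` and returns predictions for input employee records. Incoming records must be label-encoded and scaled exactly as in the preprocessing step, because the model was trained on encoded, standardized features.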
- To reproduce results, ensure you use the same `data/` CSV and the required packages in `requirements.txt`.
- A natural follow-up is to add an `app.py` (Flask) and a `Procfile` for Heroku deployment.

