# HR Analytics: Employee Attrition Prediction

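This notebook performs exploratory data analysis (EDA), builds classification models (Logistic Regression and Decision Tree), evaluates their performance, and reports permutation-based feature importance for HR attrition data. The fitted pipelines and an evaluation summary are saved to disk at the end.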

In [ ]:
# Import Libraries
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

## Load Dataset

In [ ]:
# Read the HR attrition CSV into a DataFrame and preview the first rows
DATA_PATH = 'hr_attrition_synthetic.csv'
df = pd.read_csv(DATA_PATH)
df.head()

## Exploratory Data Analysis (EDA)

In [ ]:
# Count of rows per Attrition value
ax = sns.countplot(data=df, x='Attrition')
ax.set_title('Attrition Distribution')
plt.show()

# Attrition counts within each Department
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Department', hue='Attrition', ax=ax)
ax.set_title('Attrition by Department')
plt.show()

# Attrition counts within each OverTime value
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='OverTime', hue='Attrition', ax=ax)
ax.set_title('Attrition vs OverTime')
plt.show()

# Mean MonthlyIncome for each Attrition value, drawn as a bar chart
mean_income = df.groupby('Attrition')['MonthlyIncome'].mean()
ax = mean_income.plot.bar()
ax.set_title('Average Monthly Income by Attrition')
plt.show()

## Preprocessing & Feature Selection

In [ ]:
# Separate the label from the features; the positive class ('Yes') is encoded as 1.
target = 'Attrition'
X = df.drop(columns=[target])
y = (df[target] == 'Yes').astype(int)

# Numeric columns are imputed and scaled. Every other column is treated as
# categorical: selecting only dtype 'object' would let columns of category,
# string or bool dtype fall through to the ColumnTransformer's default
# remainder='drop' and be discarded without any warning.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Preprocessing pipelines
# (SimpleImputer comes from sklearn.impute and must be imported in the imports cell.)
# Numeric features: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform() from failing on categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

## Train-Test Split

In [ ]:
# Hold out 25% of the rows for testing; stratifying on y preserves the class
# proportions in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

## Train Models

In [ ]:
# Each pipeline gets its own unfitted copy of the preprocessor via clone().
# Pipeline does not copy its steps, so handing the same ColumnTransformer
# object to both pipelines would make them share one instance, and fitting the
# second pipeline would refit the transformer used inside the first.

# Logistic Regression Pipeline
log_pipe = Pipeline(steps=[('pre', clone(preprocessor)), ('clf', LogisticRegression(max_iter=1000))])
log_pipe.fit(X_train, y_train)
y_pred_log = log_pipe.predict(X_test)
acc_log = accuracy_score(y_test, y_pred_log)
print('Logistic Regression Accuracy:', acc_log)

# Decision Tree Pipeline (depth capped at 6, fixed seed for repeatable results)
tree_pipe = Pipeline(steps=[('pre', clone(preprocessor)), ('clf', DecisionTreeClassifier(max_depth=6, random_state=42))])
tree_pipe.fit(X_train, y_train)
y_pred_tree = tree_pipe.predict(X_test)
acc_tree = accuracy_score(y_test, y_pred_tree)
print('Decision Tree Accuracy:', acc_tree)

## Model Evaluation

In [ ]:
# Confusion matrix of test-set predictions for each model
cm_log = confusion_matrix(y_test, y_pred_log)
cm_tree = confusion_matrix(y_test, y_pred_tree)

for model_label, matrix in (('Logistic Regression', cm_log), ('Decision Tree', cm_tree)):
    print(f'{model_label} Confusion Matrix:\n', matrix)

# Per-class precision / recall / F1 for the decision tree
tree_report = classification_report(y_test, y_pred_tree)
print('Decision Tree Classification Report:\n', tree_report)

## Feature Importance (Permutation Importance)

In [ ]:
# Permutation importance of the fitted tree pipeline on the held-out split.
# The pipeline is scored on the raw feature frame, so one value is returned per
# input column and X.columns lines up with importances_mean.
r = permutation_importance(
    tree_pipe,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
)
mean_importance = pd.Series(r.importances_mean, index=X.columns)
importances = mean_importance.sort_values(ascending=False)
print('Top Features:')
print(importances.head(10))

## Save Models

In [ ]:
# Persist both fitted pipelines (preprocessing + classifier) to the working directory
joblib.dump(value=log_pipe, filename='logistic_pipeline.joblib')
joblib.dump(value=tree_pipe, filename='tree_pipeline.joblib')

## Save Evaluation Summary

In [ ]:
# Assemble the summary text first, then write it to disk in a single call
summary_parts = [
    f'Logistic Regression Accuracy: {acc_log}\n',
    f'Decision Tree Accuracy: {acc_tree}\n',
    'Decision Tree Classification Report:\n',
    classification_report(y_test, y_pred_tree),
]
with open('evaluation_summary.txt', 'w') as f:
    f.writelines(summary_parts)