In [1]:
# HR Attrition Prediction – Machine Learning Model
# Author: Likhitha P
# Dataset: 5,000 Employee Records (HR Analytics)

from pathlib import Path

import numpy as np
import pandas as pd


In [4]:
# Load dataset
# The CSV is resolved relative to the notebook's working directory instead of a
# user-specific absolute path, so the notebook runs on any machine that keeps
# the file under data/ (the same folder the trained model is written to).
DATA_DIR = Path("data")
df = pd.read_csv(DATA_DIR / "HR_Analytics_5000.csv")

df.head()


Unnamed: 0,EmployeeID,Age,Gender,Department,JobRole,MonthlyIncome,Salary,Bonus,EducationLevel,MaritalStatus,...,JobSatisfaction,EnvironmentSatisfaction,WorkLifeBalance,Overtime,TrainingHours,ManagerID,TeamSize,RemoteWork,TravelFrequency,HireDate
0,1,50,Male,IT,Senior Analyst,128694,1992743,110268,2,Single,...,1,4,3,Yes,79,174,10,Yes,Frequently,2011-02-01
1,2,45,Female,IT,Engineer,66090,629365,64820,4,Single,...,4,4,2,No,34,137,4,Yes,Frequently,2008-09-19
2,3,33,Male,HR,Analyst,110305,1770485,258795,1,Divorced,...,4,2,3,Yes,53,190,13,No,Rarely,2005-08-30
3,4,49,Male,IT,Coordinator,27747,948143,65725,4,Single,...,1,2,2,Yes,59,163,11,Yes,Frequently,2014-11-24
4,5,58,Female,R&D,Senior Analyst,28890,1605416,141699,3,Married,...,2,2,2,No,64,113,4,No,Occasionally,2007-01-31


In [5]:
# Print the frame's summary (row count, per-column non-null counts and dtypes)
# as a quick check before any modelling.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   EmployeeID               5000 non-null   int64 
 1   Age                      5000 non-null   int64 
 2   Gender                   5000 non-null   object
 3   Department               5000 non-null   object
 4   JobRole                  5000 non-null   object
 5   MonthlyIncome            5000 non-null   int64 
 6   Salary                   5000 non-null   int64 
 7   Bonus                    5000 non-null   int64 
 8   EducationLevel           5000 non-null   int64 
 9   MaritalStatus            5000 non-null   object
 10  Attrition                5000 non-null   object
 11  PerformanceRating        5000 non-null   int64 
 12  YearsAtCompany           5000 non-null   int64 
 13  YearsSinceLastPromotion  5000 non-null   int64 
 14  JobSatisfaction          5000 non-null  

In [6]:
# Count missing values per column; the result is the cell's displayed output.
missing_per_column = df.isnull().sum()
missing_per_column


EmployeeID                 0
Age                        0
Gender                     0
Department                 0
JobRole                    0
MonthlyIncome              0
Salary                     0
Bonus                      0
EducationLevel             0
MaritalStatus              0
Attrition                  0
PerformanceRating          0
YearsAtCompany             0
YearsSinceLastPromotion    0
JobSatisfaction            0
EnvironmentSatisfaction    0
WorkLifeBalance            0
Overtime                   0
TrainingHours              0
ManagerID                  0
TeamSize                   0
RemoteWork                 0
TravelFrequency            0
HireDate                   0
dtype: int64

In [4]:
# Encode the target as 1 ("Yes") / 0 ("No").
attrition_flag = df["Attrition"].map({"Yes": 1, "No": 0})

# Series.map yields NaN for any value missing from the dict (e.g. "yes", " No",
# or a missing entry). Fail here with a clear message rather than letting NaN
# labels reach the split / model-fitting cells.
unmapped_labels = df.loc[attrition_flag.isna(), "Attrition"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected Attrition labels (expected 'Yes'/'No'): {unmapped_labels.tolist()}"
    )

df["Attrition_Flag"] = attrition_flag

y = df["Attrition_Flag"]

# Drop unnecessary columns: the row identifier and both forms of the target,
# so the label cannot leak into the feature matrix.
X = df.drop(columns=["EmployeeID", "Attrition", "Attrition_Flag"])


In [5]:
# Columns routed to the numeric branch of the preprocessing step.
numeric_features = [
    "Age",
    "MonthlyIncome",
    "Salary",
    "Bonus",
    "EducationLevel",
    "PerformanceRating",
    "YearsAtCompany",
    "YearsSinceLastPromotion",
    "JobSatisfaction",
    "EnvironmentSatisfaction",
    "WorkLifeBalance",
    "TrainingHours",
    "TeamSize",
]

# Columns routed to the categorical (one-hot) branch of the preprocessing step.
# Note: ManagerID and HireDate appear in neither list.
categorical_features = [
    "Gender",
    "Department",
    "JobRole",
    "MaritalStatus",
    "Overtime",
    "RemoteWork",
    "TravelFrequency",
]


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split
# repeatable across runs.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric columns are standardised.
numeric_transformer = StandardScaler()
# Categorical columns are one-hot encoded; a category not seen during fit is
# encoded as all zeros instead of raising at predict time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# (name, transformer, columns) triples, applied in this order. With
# ColumnTransformer's default remainder="drop", any column of X that is not
# listed in either feature list is discarded.
column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Baseline linear classifier with an explicit iteration cap of 1000.
log_reg = LogisticRegression(max_iter=1000)

# Preprocessing and model are chained so the scaler/encoder are fitted on the
# training rows only and re-applied unchanged to the test rows.
log_reg_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", log_reg)]
)
log_reg_pipeline.fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of class 1
# (second column of predict_proba) for the ROC-AUC score.
y_pred_lr = log_reg_pipeline.predict(X_test)
y_prob_lr = log_reg_pipeline.predict_proba(X_test)[:, 1]

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_roc_auc = roc_auc_score(y_test, y_prob_lr)
lr_report = classification_report(y_test, y_pred_lr)

print("=== Logistic Regression Performance ===")
print("Accuracy:", round(lr_accuracy, 4))
print("ROC-AUC:", round(lr_roc_auc, 4))
print("\nClassification Report:\n", lr_report)


=== Logistic Regression Performance ===
Accuracy: 0.486
ROC-AUC: 0.4885

Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.50      0.49       498
           1       0.49      0.47      0.48       502

    accuracy                           0.49      1000
   macro avg       0.49      0.49      0.49      1000
weighted avg       0.49      0.49      0.49      1000



In [9]:
from sklearn.ensemble import RandomForestClassifier

# 200-tree forest with a fixed seed for repeatable results;
# class_weight="balanced" reweights samples inversely to class frequency.
rf_settings = {
    "n_estimators": 200,
    "random_state": 42,
    "class_weight": "balanced",
}
rf_clf = RandomForestClassifier(**rf_settings)

# NOTE(review): this reuses the same `preprocessor` object as the
# logistic-regression pipeline, so fitting here refits that shared instance
# (on the same X_train).
rf_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", rf_clf)]
)
rf_pipeline.fit(X_train, y_train)

# Hard class predictions, plus the predicted probability of class 1
# (second column of predict_proba) for the ROC-AUC score.
y_pred_rf = rf_pipeline.predict(X_test)
y_prob_rf = rf_pipeline.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_roc_auc = roc_auc_score(y_test, y_prob_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("=== Random Forest Performance ===")
print("Accuracy:", round(rf_accuracy, 4))
print("ROC-AUC:", round(rf_roc_auc, 4))
print("\nClassification Report:\n", rf_report)


=== Random Forest Performance ===
Accuracy: 0.475
ROC-AUC: 0.4816

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.43      0.45       498
           1       0.48      0.52      0.50       502

    accuracy                           0.48      1000
   macro avg       0.47      0.47      0.47      1000
weighted avg       0.47      0.47      0.47      1000



In [12]:
import joblib

# Compare the two fitted pipelines by ROC-AUC on the held-out test set.
auc_lr = roc_auc_score(y_test, y_prob_lr)
auc_rf = roc_auc_score(y_test, y_prob_rf)

# Ties go to the random forest.
if auc_rf >= auc_lr:
    best_model, model_name = rf_pipeline, "RandomForest"
else:
    best_model, model_name = log_reg_pipeline, "LogisticRegression"

print("Best Model Selected:", model_name)

# NOTE(review): the recorded runs show ROC-AUC below 0.5 for both models, i.e.
# no better than random guessing; confirm the features carry signal before
# relying on the saved artifact. The winner is also picked on the test set, so
# the reported score is not an unbiased estimate for the selected model.

# Save the model
MODEL_PATH = Path("data") / "attrition_model.pkl"
# joblib.dump does not create missing folders; make sure data/ exists so the
# cell also works on a fresh checkout.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, str(MODEL_PATH))
# as_posix() keeps the message identical on Windows and POSIX systems.
print("Model saved to", MODEL_PATH.as_posix())



Best Model Selected: LogisticRegression
Model saved to data/attrition_model.pkl


In [13]:
# Only works if RF is chosen
if model_name == "RandomForest":
    rf = best_model.named_steps["model"]

    # Extract OHE feature names. The fitted encoder is looked up by its
    # transformer name ("cat") rather than by position, so reordering the
    # ColumnTransformer steps cannot silently pick the wrong transformer.
    ohe = best_model.named_steps["preprocessor"].named_transformers_["cat"]
    cat_cols_expanded = ohe.get_feature_names_out(categorical_features)

    # The ColumnTransformer emits the numeric block first and the one-hot block
    # second, so the names are concatenated in that same order.
    all_features = list(numeric_features) + list(cat_cols_expanded)

    importances = rf.feature_importances_

    fi = pd.DataFrame({
        "Feature": all_features,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False)

    # A bare expression inside an `if` body is not rendered by the notebook
    # (only the cell's last top-level expression is), so display explicitly.
    # `display` is provided by the IPython kernel without an import.
    display(fi.head(20))
else:
    print("Feature importance available only for Random Forest")


Feature importance available only for Random Forest


In [15]:
import matplotlib.pyplot as plt

# Gate on the model that was actually selected rather than on whether a
# variable named `fi` happens to exist in the kernel: a stale `fi` left over
# from an earlier run would otherwise be plotted even when logistic regression
# is the current best model.
if model_name == "RandomForest":
    fi_top = fi.head(15)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(fi_top["Feature"], fi_top["Importance"])
    ax.invert_yaxis()  # largest importance at the top
    ax.set_xlabel("Feature importance")
    ax.set_title("Top 15 Important Features for Attrition Prediction")
    plt.show()
else:
    print("Feature importance is only available when Random Forest is selected as the best model.")


Feature importance is only available when Random Forest is selected as the best model.


In [16]:
# Score a single held-out row with the selected pipeline as a smoke test.
# The double brackets keep the row as a one-row DataFrame (column names intact)
# instead of collapsing it to a Series.
sample = X_test.iloc[[0]]

class_probabilities = best_model.predict_proba(sample)
prediction = best_model.predict(sample)[0]
# Row 0 is the only sample; column 1 is the probability of class 1.
probability = class_probabilities[0][1]

print("Prediction (1 = will leave):", prediction)
print("Probability:", round(probability, 3))


Prediction (1 = will leave): 1
Probability: 0.507
