In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# ───────────────────── Load Data ─────────────────────
# Read the HR attrition dataset from the notebook's working directory.
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

# ───────────────────── Select Features ─────────────────────
# Predictor columns kept for modelling; everything else in the CSV is dropped.
selected_features = [
    "Age", "Gender", "JobRole", "MaritalStatus", "BusinessTravel",
    "Department", "Education", "YearsAtCompany", "MonthlyIncome", "OverTime"
]
# Take an explicit copy of the column subset: later steps assign new columns
# into `df`, and assigning into a slice of the loaded frame makes pandas emit
# SettingWithCopyWarning. The copy makes `df` an independent frame.
df = df[selected_features + ["Attrition"]].copy()

# ───────────────────── Encode Categoricals ─────────────────────
# Two strategies: two-valued columns are label encoded in place, while
# multi-valued nominal columns are expanded into dummy indicator columns.

# Two-valued columns (including the target): one LabelEncoder per column,
# kept in a dict keyed by column name so the fitted encoders can be saved.
binary_cols = ["Gender", "OverTime", "Attrition"]
label_encoders = {col: LabelEncoder() for col in binary_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Nominal columns: one-hot encode, dropping the first level of each column.
categorical_cols = ["JobRole", "MaritalStatus", "BusinessTravel", "Department"]
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# "Education" is already an ordinal code (1-5), so it is left untouched.

# ───────────────────── Feature Engineering ─────────────────────
# Derived numeric features. The "+ 1" in the denominators is kept from the
# original formulation of the ratios.
df["YearlyIncome"] = df["MonthlyIncome"] * 12
df["Tenure_Age_Ratio"] = df["YearsAtCompany"] / (df["Age"] + 1)
df["Income_Age_Ratio"] = df["MonthlyIncome"] / (df["Age"] + 1)

# Age buckets (right-closed): 0 = (.., 25], 1 = (25, 35], 2 = (35, 45],
# 3 = (45, 60], 4 = (60, ..).
# The outer edges are open-ended on purpose. With finite edges [18, ..., 100]
# an age of exactly 18 falls outside the first right-closed bin and is only
# rescued by the fillna below, and any age above 100 would be silently placed
# in the *youngest* group by that same fallback.
age_bin_edges = [float("-inf"), 25, 35, 45, 60, float("inf")]
# labels=False makes pd.cut return the integer bin index directly.
df["AgeGroup"] = pd.cut(df["Age"], bins=age_bin_edges, labels=False)
# A missing Age yields NaN from pd.cut; such rows fall back to group 0.
df["AgeGroup"] = df["AgeGroup"].fillna(0).astype(int)

# ───────────────────── Prepare Features and Target ─────────────────────
# The encoded "Attrition" column is the label; every other column is a predictor.
y = df["Attrition"]
X = df.drop(columns=["Attrition"])

# ───────────────────── Train-Test Split First ─────────────────────
# Hold out 20% of the rows, stratified on the label, before any resampling.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ───────────────────── Apply SMOTE on Training Data Only ─────────────────────
# Only the training split is oversampled; the test split is left as drawn.
# NOTE(review): SMOTE is applied here to label-encoded and one-hot columns as
# if they were continuous; imbalanced-learn also offers SMOTENC for mixed
# data. Worth evaluating, not changed here.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# ───────────────────── Train Model ─────────────────────
# fit() returns the estimator itself, so `model` is the fitted forest.
model = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
).fit(X_train_res, y_train_res)

# Score the untouched test split and print per-class metrics.
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

# ───────────────────── Save Model and Encoders ─────────────────────
# Pickle each artifact to its own file in the working directory:
# the fitted classifier first, then the dict of fitted label encoders.
artifacts = {
    "employee_turnover_hr.pkl": model,
    "encoders.pkl": label_encoders,
}
for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)

print("✅ Model and encoders saved successfully.")

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88       247
           1       0.29      0.21      0.25        47

    accuracy                           0.79       294
   macro avg       0.58      0.56      0.56       294
weighted avg       0.77      0.79      0.78       294

✅ Model and encoders saved successfully.
