ðŸ§© STEP 1: Import Libraries

In [16]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


ðŸ§© STEP 2: Load Engineered Data

In [17]:

df = pd.read_csv("../data/processed/cleaned_burnout_data.csv")
df.head()

Unnamed: 0,Age,OverTime,WorkLifeBalance,JobSatisfaction,EnvironmentSatisfaction,MonthlyIncome,YearsAtCompany,PerformanceRating,Attrition,Burnout_Risk
0,41,1,1,4,2,5993,6,3,1,High
1,49,0,3,2,3,5130,10,4,0,Low
2,37,1,3,3,4,2090,0,3,1,Medium
3,33,1,3,3,4,2909,8,3,0,Low
4,27,0,3,2,1,3468,2,3,0,Low




ðŸ§© STEP 3: Encode Target Variable

In [18]:
le = LabelEncoder()
df['Burnout_Label'] = le.fit_transform(df['Burnout_Risk'])

X = df.drop(['Burnout_Risk', 'Burnout_Label'], axis=1)
y = df['Burnout_Label']



ðŸ§© STEP 5: Trainâ€“Test Split


In [19]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


ðŸ§ª MODEL 1: Logistic Regression (Baseline)

In [20]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.8435374149659864
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.88      0.93      0.91       184
           2       0.77      0.77      0.77        99

    accuracy                           0.84       294
   macro avg       0.55      0.57      0.56       294
weighted avg       0.81      0.84      0.83       294



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


ðŸŒ² MODEL 2: Random Forest



In [21]:
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9829931972789115
              precision    recall  f1-score   support

           0       1.00      0.64      0.78        11
           1       1.00      0.99      1.00       184
           2       0.95      1.00      0.98        99

    accuracy                           0.98       294
   macro avg       0.98      0.88      0.92       294
weighted avg       0.98      0.98      0.98       294





ðŸš€ MODEL 3: XGBoost (FINAL OPTIMIZED MODEL)

In [22]:


xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42
)

xgb.fit(X_train, y_train)

import joblib

# Save trained XGBoost model
joblib.dump(xgb, "../models/burnout_xgboost_model.pkl")

# Save label encoder
joblib.dump(le, "../models/burnout_label_encoder.pkl")


y_pred_xgb = xgb.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

XGBoost Accuracy: 0.9863945578231292
              precision    recall  f1-score   support

           0       1.00      0.64      0.78        11
           1       1.00      1.00      1.00       184
           2       0.96      1.00      0.98        99

    accuracy                           0.99       294
   macro avg       0.99      0.88      0.92       294
weighted avg       0.99      0.99      0.99       294



ðŸ§© STEP 6: Compare All Models



In [23]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))

Logistic Regression Accuracy: 0.8435374149659864
Random Forest Accuracy: 0.9829931972789115
XGBoost Accuracy: 0.9863945578231292
