In [1]:
!pip install pandas numpy scikit-learn shap textblob vaderSentiment joblib


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so every draw below is repeatable.
np.random.seed(42)

n = 5000

departments = ['Sales', 'HR', 'Engineering', 'Operations', 'Finance', 'Marketing', 'Legal']
roles = ['Junior', 'Mid', 'Senior', 'Lead']

# Build the columns one at a time. Insertion order fixes both the column order
# of the frame and the order in which random numbers are consumed.
columns = {}
columns['employee_id'] = np.arange(1, n + 1)
columns['department'] = np.random.choice(departments, n)
columns['role_level'] = np.random.choice(roles, n)
columns['avg_weekly_hours'] = np.clip(np.random.normal(42, 8, n), 20, 70)
columns['monthly_overtime_hours'] = np.clip(np.random.normal(12, 6, n), 0, 60)
columns['meetings_per_week'] = np.random.randint(5, 22, n)
columns['email_sentiment_score'] = np.random.uniform(-1, 1, n)
columns['productivity_index'] = np.clip(np.random.normal(70, 10, n), 30, 100)

df = pd.DataFrame(columns)

# Burnout score: a weighted sum of scaled workload signals plus Gaussian noise,
# clipped to [0, 1]. The sentiment term uses (1 - score), so more negative
# sentiment raises the risk.
hours_term = 0.4 * (df['avg_weekly_hours'] / 70)
overtime_term = 0.3 * (df['monthly_overtime_hours'] / 60)
meetings_term = 0.2 * (df['meetings_per_week'] / 20)
sentiment_term = 0.2 * (1 - df['email_sentiment_score'])
noise = np.random.normal(0, 0.03, n)

raw_score = hours_term + overtime_term + meetings_term + sentiment_term + noise
df['burnout_risk_score'] = raw_score.clip(0, 1)

# Persist the synthetic dataset for the modelling cells, then preview it.
df.to_csv("employee_data.csv", index=False)
df.head()


Unnamed: 0,employee_id,department,role_level,avg_weekly_hours,monthly_overtime_hours,meetings_per_week,email_sentiment_score,productivity_index,burnout_risk_score
0,1,Legal,Senior,46.718252,1.952231,16,-0.07431,66.121062,0.711818
1,2,Operations,Senior,43.17533,16.860013,14,-0.557994,73.693949,0.783428
2,3,Finance,Mid,57.426989,21.237264,20,0.477462,76.819812,0.758112
3,4,Legal,Mid,44.796417,9.434332,12,-0.944864,80.917635,0.812311
4,5,Engineering,Junior,35.65642,16.073309,12,0.109493,75.242026,0.583609


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Load the dataset written by the data-generation cell.
df = pd.read_csv("employee_data.csv")

# Binary target: 1 when the burnout score is above 0.6, else 0.
df['label'] = (df['burnout_risk_score'] > 0.6).astype(int)

num_features = [
    'avg_weekly_hours',
    'monthly_overtime_hours',
    'meetings_per_week',
    'email_sentiment_score',
    'productivity_index'
]

cat_features = ['department', 'role_level']

X = df[num_features + cat_features]
y = df['label']

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One-hot encode the categorical columns and pass the numeric ones through
# unchanged. Categories unseen during fit are encoded as all zeros.
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough'
)

model = Pipeline([
    ('preprocess', preprocess),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42))
])

model.fit(X_train, y_train)

print("Model training complete!")

# The held-out split was previously created but never used. Report accuracy on
# it so the notebook shows how well the model generalizes beyond training rows.
test_accuracy = model.score(X_test, y_test)
print(f"Held-out accuracy: {test_accuracy:.3f}")


Model training complete!


In [5]:
# Score every row (training and held-out alike) and keep the probability of
# the positive class, i.e. the second column of predict_proba.
positive_class_prob = model.predict_proba(X)[:, 1]
df['model_prob'] = positive_class_prob

# Flag a row as predicted burnout when its probability is above 0.6.
df['model_label'] = df['model_prob'].gt(0.6).astype(int)

# Write the enriched frame to CSV, then preview it.
df.to_csv("powerbi_burnout_predictions.csv", index=False)

df.head()


Unnamed: 0,employee_id,department,role_level,avg_weekly_hours,monthly_overtime_hours,meetings_per_week,email_sentiment_score,productivity_index,burnout_risk_score,label,model_prob,model_label
0,1,Legal,Senior,46.718252,1.952231,16,-0.07431,66.121062,0.711818,1,0.975,1
1,2,Operations,Senior,43.17533,16.860013,14,-0.557994,73.693949,0.783428,1,1.0,1
2,3,Finance,Mid,57.426989,21.237264,20,0.477462,76.819812,0.758112,1,0.95,1
3,4,Legal,Mid,44.796417,9.434332,12,-0.944864,80.917635,0.812311,1,0.985,1
4,5,Engineering,Junior,35.65642,16.073309,12,0.109493,75.242026,0.583609,0,0.1,0


In [6]:
import shap
import numpy as np

# Explain the fitted forest itself; the preprocessing step is applied
# separately below so the explainer sees the same matrix the forest was fit on.
explainer = shap.TreeExplainer(model.named_steps['clf'])

X_trans = model.named_steps['preprocess'].transform(X)

# Column names of the transformed matrix: the one-hot columns come first,
# followed by the passthrough numeric columns in their original order.
ohe = model.named_steps['preprocess'].named_transformers_['cat']
ohe_features = list(ohe.get_feature_names_out(cat_features))

final_feature_names = ohe_features + num_features

# Explain only the first 20 rows. ColumnTransformer may return a scipy sparse
# matrix, which pandas cannot expand into columns, so densify the slice first.
sample_matrix = X_trans[:20]
if hasattr(sample_matrix, 'toarray'):
    sample_matrix = sample_matrix.toarray()
X_sample = pd.DataFrame(sample_matrix, columns=final_feature_names)

# SHAP's return shape for classifiers depends on the installed version:
# older releases return a list with one (rows, features) array per class,
# newer ones a single (rows, features, classes) array. Select the positive
# class (index 1) in either layout.
raw_shap = explainer.shap_values(X_sample)
if isinstance(raw_shap, list):
    shap_values = np.asarray(raw_shap[1])
else:
    raw_shap = np.asarray(raw_shap)
    shap_values = raw_shap[:, :, 1] if raw_shap.ndim == 3 else raw_shap

shap_df = pd.DataFrame(shap_values, columns=final_feature_names)

# Attach ids by position so the alignment does not depend on df's index labels.
shap_df['employee_id'] = df['employee_id'].iloc[:len(X_sample)].to_numpy()

shap_df.to_csv("shap_values.csv", index=False)

shap_df.head()


Unnamed: 0,department_Engineering,department_Finance,department_HR,department_Legal,department_Marketing,department_Operations,department_Sales,role_level_Junior,role_level_Lead,role_level_Mid,role_level_Senior,avg_weekly_hours,monthly_overtime_hours,meetings_per_week,email_sentiment_score,productivity_index,employee_id
0,-0.000509,0.00022,0.004944,0.018408,0.00179,0.006799,0.000462,0.000395,0.000148,0.004225,0.003035,0.0955,-0.075395,0.116833,0.204501,0.018719,1
1,0.000877,0.001014,0.001125,-0.000454,0.000707,7.6e-05,0.000145,-0.000271,0.001542,0.000826,0.001615,0.016512,0.028555,0.009014,0.360494,0.003297,2
2,0.004574,-0.001783,0.004005,0.000657,0.005494,0.001118,2.7e-05,0.002242,0.001259,-0.007415,0.001611,0.218177,0.110358,0.195414,-0.163449,0.002787,3
3,9.5e-05,0.001029,0.000306,0.003316,-1.2e-05,0.001154,0.00057,-0.001007,0.002304,-0.003446,7.4e-05,0.028077,-0.007982,-0.004453,0.392076,-0.002027,4
4,-0.019458,0.001611,-0.001226,-0.002609,-0.000573,-0.005657,-0.005049,0.00148,-0.001518,-0.000797,-0.007433,-0.203455,0.029538,-0.095755,-0.14073,-0.023291,5
