🔹 Cell 1: Import Libraries & Load Data

In [1]:
import pandas as pd
import numpy as np

# Read the raw HR employee-attrition CSV. The path is relative, so it resolves
# against the directory the notebook kernel is running in.
df = pd.read_csv("../data/raw/WA_Fn-UseC_-HR-Employee-Attrition.csv")


🔹 Cell 2: Select Burnout-Relevant Columns

In [2]:
# Columns carried forward into the burnout analysis. The order here is the
# column order of the resulting frame (and of the CSV written at the end).
selected_cols = [
    'Age', 'OverTime', 'WorkLifeBalance',
    'JobSatisfaction', 'EnvironmentSatisfaction',
    'MonthlyIncome', 'YearsAtCompany', 'PerformanceRating',
    'Attrition',
]

# Keep every row, but only the listed columns (KeyError if one is missing).
df = df.loc[:, selected_cols]
df.head()


Unnamed: 0,Age,OverTime,WorkLifeBalance,JobSatisfaction,EnvironmentSatisfaction,MonthlyIncome,YearsAtCompany,PerformanceRating,Attrition
0,41,Yes,1,4,2,5993,6,3,Yes
1,49,No,3,2,3,5130,10,4,No
2,37,Yes,3,3,4,2090,0,3,Yes
3,33,Yes,3,3,4,2909,8,3,No
4,27,No,3,2,1,3468,2,3,No


🔹 Cell 3: Encode Categorical Columns

In [3]:
# Convert Yes/No to 1/0.
# Series.map turns any value missing from the dict into NaN, so a plain
# {'Yes': 1, 'No': 0} mapping would wipe both columns to NaN if this cell were
# run a second time. Mapping 1 -> 1 and 0 -> 0 as well makes a re-run a no-op.
YES_NO_CODES = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
df['OverTime'] = df['OverTime'].map(YES_NO_CODES)
df['Attrition'] = df['Attrition'].map(YES_NO_CODES)


🔹 Cell 4: Create Burnout Risk

In [4]:
def calculate_burnout(row):
    """Classify one employee record as "Low", "Medium" or "High" burnout risk.

    Args:
        row: Any mapping-like record (e.g. a DataFrame row) that can be indexed
            by 'OverTime', 'WorkLifeBalance', 'JobSatisfaction',
            'EnvironmentSatisfaction' and 'Attrition'. 'OverTime' and
            'Attrition' are compared against 1; the other three against 2.

    Returns:
        "High" when the summed signal weights reach 6, "Medium" when they
        reach 3, otherwise "Low". The maximum possible total is 8.
    """
    # Each entry is (did the signal fire?, weight it contributes).
    signals = (
        (row['OverTime'] == 1, 2),                 # workload pressure (strong)
        (row['WorkLifeBalance'] <= 2, 2),          # work-life imbalance (strong)
        (row['JobSatisfaction'] <= 2, 1),          # satisfaction stress (moderate)
        (row['EnvironmentSatisfaction'] <= 2, 1),  # satisfaction stress (moderate)
        (row['Attrition'] == 1, 2),                # attrition as delayed outcome (very strong)
    )
    total = sum(weight for fired, weight in signals if fired)

    # Early-warning thresholds, checked from the most severe class downwards.
    if total >= 6:
        return "High"
    if total >= 3:
        return "Medium"
    return "Low"

# Label every employee: axis="columns" passes calculate_burnout one row at a time.
df['Burnout_Risk'] = df.apply(calculate_burnout, axis="columns")


🔹 Cell 5: Check Burnout Distribution


In [5]:
# Count how many rows received each burnout label. Left as the bare last
# expression so the notebook renders the resulting Series.
df['Burnout_Risk'].value_counts()

Burnout_Risk
Low       878
Medium    515
High       77
Name: count, dtype: int64

🔹 Cell 6: Save Clean Data


In [6]:
# Write the encoded, labelled frame to disk. index=False keeps the row index
# out of the file, so the CSV contains only the data columns.
df.to_csv("../data/processed/cleaned_burnout_data.csv", index=False)