In [None]:
"""
train_productivity_and_burnout_models.py

Trains:
  1. Regression model -> productivity_index
  2. Classification model -> burnout_risk

Requirements:
    pip install pandas scikit-learn xgboost
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, classification_report
from xgboost import XGBRegressor, XGBClassifier

# -----------------------------
# LOAD DATA
# -----------------------------
# Read the employee-week records from a CSV located relative to the
# notebook's current working directory.
df = pd.read_csv("synthetic_employee_weeks_20k.csv")

# Sanity check: report (rows, columns) and preview the first few records.
n_preview_rows = 3
print("✅ Data loaded:", df.shape)
print(df.head(n_preview_rows))

# -----------------------------
# BASIC CLEANING
# -----------------------------
# Identifier and date columns are not used as features.
# errors='ignore' makes the drop a no-op for any column that is absent.
drop_cols = ["emp_id", "manager_id", "week_start"]
df = df.drop(drop_cols, axis=1, errors='ignore')

# Turn the burnout_risk labels into integer codes for the classifier.
# LabelEncoder numbers the classes in sorted order of the unique labels
# (e.g. High=0, Low=1, Medium=2 if those are the labels present), so the
# mapping does not depend on the order in which rows appear.
le = LabelEncoder()
encoded_risk = le.fit_transform(df["burnout_risk"])
df["burnout_risk_encoded"] = encoded_risk

# Show the label -> code mapping that was actually learned.
class_codes = le.transform(le.classes_)
class_mapping = dict(zip(le.classes_, class_codes))
print("\nEncoded burnout_risk classes:", class_mapping)

# -----------------------------
# FEATURE SELECTION
# -----------------------------
# Explicit list of raw input columns. X is built from this list (rather than
# from "whatever is left after dropping the targets") so that a column added
# to the CSV later cannot silently become a model input.
features = [
    'week', 'department', 'role_level', 'base_skill', 'baseline_stress', 'workload',
    'tasks_completed', 'avg_task_difficulty', 'hours_worked', 'overtime_hours',
    'idle_time_ratio', 'msg_count', 'avg_sentiment', 'meetings', 'leave_days',
    'task_on_time_ratio'
]
categorical_features = ['department', 'role_level']

# One-hot encode categorical features. drop_first=True drops one level per
# category. The columns created by get_dummies are identified by comparing
# the column set before and after the call.
columns_before_encoding = set(df.columns)
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
dummy_columns = [col for col in df.columns if col not in columns_before_encoding]

# Define targets
y_reg = df["productivity_index"]
y_clf = df["burnout_risk_encoded"]

# Define X: the listed numeric features followed by the one-hot columns.
# Neither target (nor burnout_score / burnout_risk) is in the list, so they
# are excluded by construction. A listed column missing from the data raises
# KeyError here instead of being silently skipped.
numeric_features = [col for col in features if col not in categorical_features]
X = df[numeric_features + dummy_columns]

# -----------------------------
# SPLIT DATA
# -----------------------------
# One shared 80/20 split for both tasks so the regression and classification
# models see the same train/test rows. The split is stratified on the burnout
# class so every class keeps its proportion in both parts; this also
# guarantees each class appears in the test set, which the classification
# report (called with target_names=le.classes_) relies on.
# NOTE(review): the rows are employee-weeks (the same emp_id appears in several
# weeks), so a row-wise split can place one employee in both train and test.
# If the goal is generalising to unseen employees, consider a group-aware
# split on emp_id (it is dropped during cleaning) -- TODO confirm intent.
X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
    X, y_reg, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

# -----------------------------
# SCALE NUMERICAL FEATURES
# -----------------------------
# The scaler learns its statistics from the training rows only; the same
# fitted transform is then applied to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------
# MODEL 1: PRODUCTIVITY REGRESSION
# -----------------------------
print("\n🔧 Training Productivity Regression Model...")

# Gradient-boosted regressor; hyperparameters gathered in one place.
reg_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
reg_model = XGBRegressor(**reg_params)
reg_model.fit(X_train_scaled, y_reg_train)

# Evaluate on the held-out rows: RMSE is the square root of the MSE.
y_pred_reg = reg_model.predict(X_test_scaled)
mse = mean_squared_error(y_reg_test, y_pred_reg)
rmse = np.sqrt(mse)
r2 = r2_score(y_reg_test, y_pred_reg)

print("\n📊 Regression Results:")
for metric_label, metric_value in (("RMSE", rmse), ("R²", r2)):
    print(f"{metric_label}: {metric_value:.3f}")

# -----------------------------
# MODEL 2: BURNOUT CLASSIFICATION
# -----------------------------
print("\n🔥 Training Burnout Classification Model...")

# Gradient-boosted classifier trained on the integer-encoded burnout labels;
# multi-class log loss is used as the evaluation metric.
clf_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": 'mlogloss',
}
clf_model = XGBClassifier(**clf_params)
clf_model.fit(X_train_scaled, y_clf_train)

# Evaluate on the held-out rows.
y_pred_clf = clf_model.predict(X_test_scaled)
acc = accuracy_score(y_clf_test, y_pred_clf)
f1 = f1_score(y_clf_test, y_pred_clf, average='weighted')

print("\n📊 Classification Results:")
print(f"Accuracy: {acc:.3f}")
print(f"Weighted F1-score: {f1:.3f}")

# Per-class precision/recall/F1, labelled with the original class names.
print("\nDetailed classification report:")
detailed_report = classification_report(y_clf_test, y_pred_clf, target_names=le.classes_)
print(detailed_report)

# -----------------------------
# FEATURE IMPORTANCES
# -----------------------------
# The models were fitted on scaled arrays, so their importances are positional;
# X.columns is used to attach the feature names back to them.
feature_names = X.columns
reg_importances = pd.Series(reg_model.feature_importances_, index=feature_names)
reg_importances = reg_importances.sort_values(ascending=False)
clf_importances = pd.Series(clf_model.feature_importances_, index=feature_names)
clf_importances = clf_importances.sort_values(ascending=False)

# Print the ten highest-ranked features for each model.
top_n = 10
importance_reports = (
    ("\n🏆 Top 10 Features (Regression - Productivity):", reg_importances),
    ("\n🏆 Top 10 Features (Classification - Burnout):", clf_importances),
)
for heading, ranked in importance_reports:
    print(heading)
    print(ranked.head(top_n))

# -----------------------------
# SAVE MODELS (optional)
# -----------------------------
# Left disabled here; uncomment to persist the fitted objects with joblib.
# import joblib
# joblib.dump(reg_model, "productivity_model.pkl")
# joblib.dump(clf_model, "burnout_model.pkl")
# joblib.dump(scaler, "scaler.pkl")
# joblib.dump(le, "label_encoder.pkl")

print("\n✅ Training completed successfully!")


✅ Data loaded: (20000, 22)
   emp_id  week_start  week   department role_level  manager_id  base_skill  \
0       1  2024-01-01     1  engineering       lead           1        75.0   
1       1  2024-01-08     2  engineering       lead           1        75.0   
2       1  2024-01-15     3  engineering       lead           1        75.0   

   baseline_stress  workload  tasks_completed  ...  overtime_hours  \
0            0.305     1.182                2  ...            0.77   
1            0.305     0.686                7  ...            0.00   
2            0.305     0.976                6  ...            2.18   

   idle_time_ratio  msg_count  avg_sentiment  meetings  leave_days  \
0            0.139         28         -0.113         5           0   
1            0.303         32          0.275         2           0   
2            0.174          9          0.152         1           0   

   task_on_time_ratio  productivity_index  burnout_score  burnout_risk  
0               0.819

In [None]:
import joblib

# Save the trained models together with the preprocessing objects they
# depend on. Both models were fitted on StandardScaler-transformed features,
# and the classifier predicts LabelEncoder integer codes, so the scaler and
# the encoder are required to feed new data in and to turn predictions back
# into burnout_risk labels. Saving only the two models would leave them
# unusable outside this kernel session.
artifacts = {
    "productivity_model.pkl": reg_model,
    "burnout_model.pkl": clf_model,
    "scaler.pkl": scaler,
    "label_encoder.pkl": le,
}
for artifact_path, fitted_object in artifacts.items():
    joblib.dump(fitted_object, artifact_path)

print("✅ Models saved successfully!")


✅ Models saved successfully!


In [5]:
# =========================================
# 🔧 SETUP GIT + CLONE YOUR REPO IN COLAB
# =========================================

# (1) Clone your repo (public)
!git clone https://github.com/Jeevith-Devs/Dynamic-Employee-Productivity-and-Burnout-Prediction-System-using-Machine-Learning.git
%cd Dynamic-Employee-Productivity-and-Burnout-Prediction-System-using-Machine-Learning

# (2) Configure Git with your info
!git config --global user.email "jeeviththunderjoe@gmail.com"
!git config --global user.name "Jeevith Devs"

# =========================================
# 💾 MOVE TRAINED MODELS INTO THE REPO
# =========================================
!mv /content/productivity_model.pkl /content/Dynamic-Employee-Productivity-and-Burnout-Prediction-System-using-Machine-Learning/
!mv /content/burnout_model.pkl /content/Dynamic-Employee-Productivity-and-Burnout-Prediction-System-using-Machine-Learning/

# =========================================
# 📤 COMMIT & PUSH CHANGES TO GITHUB
# =========================================
!git add .
!git commit -m "Add trained productivity & burnout models"
!git push


Cloning into 'Dynamic-Employee-Productivity-and-Burnout-Prediction-System-using-Machine-Learning'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 17 (delta 5), reused 4 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (17/17), 598.81 KiB | 17.11 MiB/s, done.
Resolving deltas: 100% (5/5), done.
/content/Dynamic-Employee-Productivity-and-Burnout-Prediction-System-using-Machine-Learning
[main dbfa7c1] Add trained productivity & burnout models
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 burnout_model.pkl
 create mode 100644 productivity_model.pkl
fatal: could not read Username for 'https://github.com': No such device or address
