# In [None]:
"""
train_productivity_and_burnout_models.py

Trains:
  1. Regression model -> productivity_index
  2. Classification model -> burnout_risk

Requirements:
    pip install pandas numpy scikit-learn xgboost
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, classification_report
from xgboost import XGBRegressor, XGBClassifier

# -----------------------------
# LOAD DATA
# -----------------------------
# Relative path: resolved against the current working directory at run time.
DATA_PATH = "synthetic_employee_weeks_20k.csv"
df = pd.read_csv(DATA_PATH)

# Quick sanity check: (rows, columns) followed by the first three records.
loaded_shape = df.shape
print("✅ Data loaded:", loaded_shape)
print(df.head(3))

# -----------------------------
# BASIC CLEANING
# -----------------------------
# Drop identifiers and date columns (not used as features).
# errors='ignore' makes this a no-op for any of these columns that are absent.
drop_cols = ["emp_id", "manager_id", "week_start"]
df = df.drop(columns=drop_cols, errors='ignore')

# Encode burnout_risk as integers for the classification model.
# LabelEncoder numbers the distinct labels in *sorted* order, independent of
# the order in which they appear in the data (e.g. labels High/Low/Medium
# would become High=0, Low=1, Medium=2).
le = LabelEncoder()
df["burnout_risk_encoded"] = le.fit_transform(df["burnout_risk"])

# A class's code is simply its position in le.classes_, so the mapping can be
# read off directly. tolist() yields plain Python values, which keeps the
# printed dict free of NumPy scalar wrappers such as np.int64(0).
class_mapping = {label: code for code, label in enumerate(le.classes_.tolist())}
print("\nEncoded burnout_risk classes:", class_mapping)

# -----------------------------
# FEATURE SELECTION
# -----------------------------
# Raw input columns the models are allowed to see. Any other column in the
# data (apart from the one-hot columns derived from the categorical features
# below) is kept out of X.
features = [
    'week', 'department', 'role_level', 'base_skill', 'baseline_stress', 'workload',
    'tasks_completed', 'avg_task_difficulty', 'hours_worked', 'overtime_hours',
    'idle_time_ratio', 'msg_count', 'avg_sentiment', 'meetings', 'leave_days',
    'task_on_time_ratio'
]
categorical_features = ['department', 'role_level']

# Report listed columns that are absent instead of silently ignoring them, so
# a stale list or a changed CSV schema is easy to spot.
missing_features = [col for col in features if col not in df.columns]
if missing_features:
    print("⚠️ Listed features not found in data (skipped):", missing_features)

# One-hot encode categorical features. Columns that exist only after the
# encoding step are exactly the generated dummy columns.
columns_before_encoding = set(df.columns)
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
dummy_cols = [col for col in df.columns if col not in columns_before_encoding]

# Define targets
y_reg = df["productivity_index"]
y_clf = df["burnout_risk_encoded"]

# Define X: listed numeric features plus the dummy columns, never a target or
# the burnout score the risk label is derived from. Columns are taken in the
# DataFrame's own order so the feature layout is stable.
target_cols = ["productivity_index", "burnout_risk", "burnout_risk_encoded", "burnout_score"]
allowed_cols = ((set(features) - set(categorical_features)) | set(dummy_cols)) - set(target_cols)
feature_cols = [col for col in df.columns if col in allowed_cols]

# Make exclusions visible: these columns are present but were not selected.
excluded_cols = [
    col for col in df.columns
    if col not in allowed_cols and col not in target_cols
]
if excluded_cols:
    print("⚠️ Columns not in the feature list (excluded from X):", excluded_cols)

X = df[feature_cols]

# -----------------------------
# SPLIT DATA
# -----------------------------
# A single call splits the features and both targets together, so each row
# lands on the same side (train or test) for the regression and the
# classification task.
TEST_FRACTION = 0.2  # share of rows held out for evaluation
SPLIT_SEED = 42      # fixed seed -> reproducible partition
split_parts = train_test_split(
    X, y_reg, y_clf, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)
X_train, X_test = split_parts[0], split_parts[1]
y_reg_train, y_reg_test = split_parts[2], split_parts[3]
y_clf_train, y_clf_test = split_parts[4], split_parts[5]

# -----------------------------
# SCALE NUMERICAL FEATURES
# -----------------------------
# The scaler learns its statistics from the training rows only; the same
# fitted transform is then applied to both splits, so nothing from the test
# set influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------
# MODEL 1: PRODUCTIVITY REGRESSION
# -----------------------------
print("\n🔧 Training Productivity Regression Model...")

# Hyperparameters for the gradient-boosted regressor, kept in one place.
reg_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
reg_model = XGBRegressor(**reg_params)
reg_model.fit(X_train_scaled, y_reg_train)

# Evaluate on the held-out split: RMSE is the square root of the MSE.
y_pred_reg = reg_model.predict(X_test_scaled)
test_mse = mean_squared_error(y_reg_test, y_pred_reg)
rmse = np.sqrt(test_mse)
r2 = r2_score(y_reg_test, y_pred_reg)

print("\n📊 Regression Results:")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

# -----------------------------
# MODEL 2: BURNOUT CLASSIFICATION
# -----------------------------
print("\n🔥 Training Burnout Classification Model...")

# Multi-class gradient-boosted classifier trained on the integer-encoded
# burnout labels; mlogloss is the multi-class log-loss evaluation metric.
clf_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='mlogloss'
)
clf_model.fit(X_train_scaled, y_clf_train)

# Evaluate on the held-out split
y_pred_clf = clf_model.predict(X_test_scaled)
acc = accuracy_score(y_clf_test, y_pred_clf)
f1 = f1_score(y_clf_test, y_pred_clf, average='weighted')

print("\n📊 Classification Results:")
print(f"Accuracy: {acc:.3f}")
print(f"Weighted F1-score: {f1:.3f}")
print("\nDetailed classification report:")

# Pass the full set of encoded labels explicitly. Without `labels`, the report
# infers the classes from y_true/y_pred and raises a ValueError when a class is
# missing from the test split, because target_names would no longer match in
# length. Codes are 0..n-1 by construction of the LabelEncoder.
class_codes = np.arange(len(le.classes_))
class_names = [str(name) for name in le.classes_]  # report expects strings
print(classification_report(
    y_clf_test, y_pred_clf, labels=class_codes, target_names=class_names
))

# -----------------------------
# FEATURE IMPORTANCES
# -----------------------------
def _ranked_importances(model):
    """Return the model's feature importances labelled by column, highest first."""
    # The models were fit on unlabeled scaled arrays, so the names are
    # re-attached here from X's columns.
    scores = pd.Series(model.feature_importances_, index=X.columns)
    return scores.sort_values(ascending=False)


reg_importances = _ranked_importances(reg_model)
clf_importances = _ranked_importances(clf_model)

print("\n🏆 Top 10 Features (Regression - Productivity):")
top_reg = reg_importances.head(10)
print(top_reg)
print("\n🏆 Top 10 Features (Classification - Burnout):")
top_clf = clf_importances.head(10)
print(top_clf)

# -----------------------------
# SAVE MODELS (optional)
# -----------------------------
# Disabled by default: uncomment the lines below to persist the fitted objects.
# The scaler and the label encoder are saved together with the models because
# both models were trained on scaler-transformed features, and the classifier
# predicts the integer codes produced by `le` (needed to map predictions back
# to the original burnout_risk labels).
# import joblib
# joblib.dump(reg_model, "productivity_model.pkl")
# joblib.dump(clf_model, "burnout_model.pkl")
# joblib.dump(scaler, "scaler.pkl")
# joblib.dump(le, "label_encoder.pkl")

# Final status line: reached only if every step above ran without raising.
print("\n✅ Training completed successfully!")
