In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

# Load the dataset
data = pd.read_csv("Attrition.csv")

# Display original distribution
print("Original PerformanceRating Distribution:")
print(data["PerformanceRating"].value_counts(normalize=True))

# Adjusted synthetic rating function
def assign_performance(row):
    score = (
        row["JobSatisfaction"] * 0.3 +
        row["WorkLifeBalance"] * 0.3 +
        row["EnvironmentSatisfaction"] * 0.2 +
        row["JobInvolvement"] * 0.2
    )  # Max 4
    income_factor = min(row["MonthlyIncome"] / 10000, 1.5)  # Allow boost beyond 1
    training_boost = row["TrainingTimesLastYear"] / 6  # Max 1 from 0-6
    final_score = score * income_factor + training_boost  # Range: ~0-6
    if final_score < 1.5:
        return 1
    elif final_score < 2.5:
        return 2
    elif final_score < 3.5:
        return 3
    elif final_score < 4.5:
        return 4
    else:
        return 5

# Apply synthetic ratings
data["PerformanceRating"] = data.apply(assign_performance, axis=1)

# Verify new distribution
print("\nSynthetic PerformanceRating Distribution:")
print(data["PerformanceRating"].value_counts(normalize=True))

# Define features
features = [
    "Age", "Gender", "Department", "JobRole", "MonthlyIncome", "YearsAtCompany",
    "OverTime", "JobSatisfaction", "WorkLifeBalance", "TotalWorkingYears",
    "TrainingTimesLastYear", "JobInvolvement", "EnvironmentSatisfaction",
    "RelationshipSatisfaction"
]
X = data[features]
y = data["PerformanceRating"] - 1  # Shift to 0-4 for XGBClassifier

# Numerical and categorical features
numerical_features = [
    "Age", "MonthlyIncome", "YearsAtCompany", "TotalWorkingYears",
    "TrainingTimesLastYear", "JobSatisfaction", "WorkLifeBalance",
    "JobInvolvement", "EnvironmentSatisfaction", "RelationshipSatisfaction"
]
categorical_features = ["Gender", "Department", "JobRole", "OverTime"]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features)
    ]
)

# Pipeline with SMOTE
model_pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42, k_neighbors=3)),
    ("model", XGBClassifier(
        learning_rate=0.1,
        max_depth=6,
        n_estimators=200,
        eval_metric="mlogloss",
        random_state=42,
        use_label_encoder=False
    ))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline
model_pipeline.fit(X_train, y_train)

# Evaluate on test set
y_pred = model_pipeline.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["1", "2", "3", "4", "5"]))

# Save the full pipeline
joblib.dump(model_pipeline, "performance_model.pkl")
print("Full pipeline saved as performance_model.pkl")

Original PerformanceRating Distribution:
PerformanceRating
3    0.846259
4    0.153741
Name: proportion, dtype: float64

Synthetic PerformanceRating Distribution:
PerformanceRating
1    0.355782
2    0.354422
3    0.142857
4    0.082313
5    0.064626
Name: proportion, dtype: float64


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification Report:
              precision    recall  f1-score   support

           1       0.90      0.89      0.90        72
           2       0.87      0.89      0.88       127
           3       0.80      0.76      0.78        54
           4       0.75      0.75      0.75        20
           5       0.91      0.95      0.93        21

    accuracy                           0.86       294
   macro avg       0.85      0.85      0.85       294
weighted avg       0.86      0.86      0.86       294

Full pipeline saved as performance_model.pkl
