In [23]:
import pymongo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# MongoDB URI - replace with your actual connection string
uri = "mongodb+srv://jerry10102002:dRNiFapLiSq5kCBB@cluster0.cvuzdzh.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
client = pymongo.MongoClient(uri)

# Select the database and collection
db = client["HR_Attrition"]
collection = db["EmployeeAnalytics"]

# Fetch all records from the collection
data = collection.find()

# Convert the cursor to a list and then to a pandas DataFrame
df = pd.DataFrame(list(data))

# Data Preprocessing
# Handle missing values (if any)
df['YearsWithCurrManager'].fillna(df['YearsWithCurrManager'].median(), inplace=True)

# Drop non-informative columns
df.drop(['EmpID', 'EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis=1, inplace=True)

# Encode target
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

# Identify categorical columns
cat_cols = df.select_dtypes(include=['object']).columns

# Initialize a dictionary to store label encoders for each categorical column
label_encoders = {}

# Encode categorical columns and store each encoder in the dictionary
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Save label encoders as a dictionary
joblib.dump(label_encoders, "label_encoders.pkl")
print("✅ Label Encoders Saved: label_encoders.pkl")

# Define the top features for the model
personal_info = [
    'Age',
    'Department'
]
work_related_features = [
    'OverTime',
    'YearsWithCurrManager',
    'YearsSinceLastPromotion',
    'YearsInCurrentRole',
    'YearsAtCompany',
    'TotalWorkingYears',
    'NumCompaniesWorked'
]
income_benefits = [
    'MonthlyIncome',
    'SalarySlab',
    'StockOptionLevel'
]
satisfaction_involvement = [
    'EnvironmentSatisfaction',
    'JobSatisfaction',
    'JobInvolvement'
]
top_features = (
    personal_info +
    work_related_features +
    income_benefits +
    satisfaction_involvement
)

X = df[top_features]
y = df['Attrition']

# Apply SMOTE for balancing the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Train & evaluate models
best_model = None
best_score = 0
best_model_name = ""
model_scores = {}

for name, model in models.items():
    if name == "Logistic Regression":
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    model_scores[name] = acc
    print(f"\n{name} Accuracy: {acc:.4f}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    if acc > best_score:
        best_score = acc
        best_model = model
        best_model_name = name

# Save the best model
joblib.dump(best_model, f"best_model_smote_{best_model_name.replace(' ', '_')}.pkl")
print(f"\n Best SMOTE Model Saved: {best_model_name} with Accuracy {best_score:.4f}")

# Save best model, scaler, and encoder
joblib.dump(best_model, f"best_model_smote_{best_model_name.replace(' ', '_')}.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(top_features, "top_features.pkl")

print(f"\n Best SMOTE Model Saved: {best_model_name} with Accuracy {best_score:.4f}")
print(" Scaler Saved: scaler.pkl")
print(" Label Encoder Saved: label_encoder.pkl")


✅ Label Encoders Saved: label_encoders.pkl

Logistic Regression Accuracy: 0.7948
[[194  55]
 [ 47 201]]
              precision    recall  f1-score   support

           0       0.80      0.78      0.79       249
           1       0.79      0.81      0.80       248

    accuracy                           0.79       497
   macro avg       0.80      0.79      0.79       497
weighted avg       0.80      0.79      0.79       497


Decision Tree Accuracy: 0.7988
[[197  52]
 [ 48 200]]
              precision    recall  f1-score   support

           0       0.80      0.79      0.80       249
           1       0.79      0.81      0.80       248

    accuracy                           0.80       497
   macro avg       0.80      0.80      0.80       497
weighted avg       0.80      0.80      0.80       497


XGBoost Accuracy: 0.8974
[[226  23]
 [ 28 220]]
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       249
           1       0.91      