In [31]:
import pandas as pd
from pathlib import Path
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import mlflow
import mlflow.sklearn

# ---------- UPDATED MLFLOW CONFIG ----------
# This new code points to the MLflow server you are running with Docker Compose.
# It replaces the old TRACKING_FOLDER and os.makedirs logic.
MLFLOW_SERVER_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "employee-attrition-prod"

mlflow.set_tracking_uri(MLFLOW_SERVER_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

print(f"✅ MLflow is now configured to use the server at: {MLFLOW_SERVER_URI}")
# ------------------------------------------

# Using a relative path for your data is also a best practice.
DATA_PATH = "C:/Users/shani/VS Code/MLOPs/EmployeeAttrition/data/employee_attrition_clean.csv"
TARGET = "Attrition"

2025/09/28 05:38:02 INFO mlflow.tracking.fluent: Experiment with name 'employee-attrition-prod' does not exist. Creating a new experiment.


✅ MLflow is now configured to use the server at: http://127.0.0.1:5000


In [32]:
df = pd.read_csv(DATA_PATH)

# y = 0/1
y = df[TARGET].map({"No":0, "Yes":1})
X = df.drop(columns=[TARGET])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)
print("Class balance in train:", y_train.value_counts(normalize=True))


Train size: (1176, 30) Test size: (294, 30)
Class balance in train: Attrition
0    0.838435
1    0.161565
Name: proportion, dtype: float64


In [33]:
num_features = X.select_dtypes(include=["int64","float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), num_features),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ]), cat_features),
])


In [34]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

with mlflow.start_run(run_name="logreg_balanced") as run:
    logreg.fit(X_train, y_train)
    y_pred  = logreg.predict(X_test)
    y_proba = logreg.predict_proba(X_test)[:,1]

    metrics = {
        "accuracy":  accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall":    recall_score(y_test, y_pred, zero_division=0),
        "f1":        f1_score(y_test, y_pred, zero_division=0),
        "roc_auc":   roc_auc_score(y_test, y_proba)
    }

    mlflow.log_params({"model": "LogisticRegression", "class_weight":"balanced"})
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(logreg, "model", input_example=X_train.iloc[:1])

print("Logistic Regression:", metrics)




🏃 View run logreg_balanced at: http://127.0.0.1:5000/#/experiments/321935713767798698/runs/a32fd70d4c914c3a9e2ac0b32e63d86c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/321935713767798698
Logistic Regression: {'accuracy': 0.7517006802721088, 'precision': 0.3488372093023256, 'recall': 0.6382978723404256, 'f1': 0.45112781954887216, 'roc_auc': 0.8031699543457662}


In [35]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

rf = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", RandomForestClassifier(
        n_estimators=300, random_state=42, n_jobs=-1,
        min_samples_split=5, class_weight="balanced_subsample"
    ))
])

with mlflow.start_run(run_name="rf_balanced") as run:
    rf.fit(X_train, y_train)
    y_pred  = rf.predict(X_test)
    y_proba = rf.predict_proba(X_test)[:,1]

    metrics_rf = {
        "accuracy":  accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall":    recall_score(y_test, y_pred, zero_division=0),
        "f1":        f1_score(y_test, y_pred, zero_division=0),
        "roc_auc":   roc_auc_score(y_test, y_proba)
    }

    mlflow.log_params({"model":"RandomForest","n_estimators":300,"min_samples_split":5})
    mlflow.log_metrics(metrics_rf)
    mlflow.sklearn.log_model(rf, "model", input_example=X_train.iloc[:1])

print("Random Forest:", metrics_rf)




🏃 View run rf_balanced at: http://127.0.0.1:5000/#/experiments/321935713767798698/runs/d0a47ad39bc6427686d0a0bf841a7bfe
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/321935713767798698
Random Forest: {'accuracy': 0.8401360544217688, 'precision': 0.5, 'recall': 0.10638297872340426, 'f1': 0.17543859649122806, 'roc_auc': 0.7926608665690412}


In [36]:
#XGBoost
import xgboost as xgb

pos = y_train.sum(); neg = len(y_train)-pos
scale = neg/pos if pos > 0 else 1.0

xgb_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", xgb.XGBClassifier(
        n_estimators=400, max_depth=4, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
        objective="binary:logistic", eval_metric="logloss",
        scale_pos_weight=scale, random_state=42, n_jobs=-1
    ))
])

with mlflow.start_run(run_name="xgb_spw") as run:
    xgb_pipe.fit(X_train, y_train)
    y_pred  = xgb_pipe.predict(X_test)
    y_proba = xgb_pipe.predict_proba(X_test)[:,1]

    metrics_xgb = {
        "accuracy":  accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall":    recall_score(y_test, y_pred, zero_division=0),
        "f1":        f1_score(y_test, y_pred, zero_division=0),
        "roc_auc":   roc_auc_score(y_test, y_proba)
    }

    mlflow.log_params({"model":"XGBoost","scale_pos_weight":round(scale,2)})
    mlflow.log_metrics(metrics_xgb)
    mlflow.sklearn.log_model(xgb_pipe, "model", input_example=X_train.iloc[:1])

print("XGBoost:", metrics_xgb)




🏃 View run xgb_spw at: http://127.0.0.1:5000/#/experiments/321935713767798698/runs/c98bfcf7f2f649aa9c15568d3f644c4b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/321935713767798698
XGBoost: {'accuracy': 0.8503401360544217, 'precision': 0.5517241379310345, 'recall': 0.3404255319148936, 'f1': 0.42105263157894735, 'roc_auc': 0.767077267637178}


In [37]:
print("LogReg:", metrics)
print("RandomForest:", metrics_rf)
print("XGBoost:", metrics_xgb)

LogReg: {'accuracy': 0.7517006802721088, 'precision': 0.3488372093023256, 'recall': 0.6382978723404256, 'f1': 0.45112781954887216, 'roc_auc': 0.8031699543457662}
RandomForest: {'accuracy': 0.8401360544217688, 'precision': 0.5, 'recall': 0.10638297872340426, 'f1': 0.17543859649122806, 'roc_auc': 0.7926608665690412}
XGBoost: {'accuracy': 0.8503401360544217, 'precision': 0.5517241379310345, 'recall': 0.3404255319148936, 'f1': 0.42105263157894735, 'roc_auc': 0.767077267637178}


In [38]:
from mlflow.tracking import MlflowClient

# Use the NEW experiment name here
EXPERIMENT_NAME = "employee-attrition-prod" 

client = MlflowClient()
exp = client.get_experiment_by_name(EXPERIMENT_NAME)
runs = mlflow.search_runs([exp.experiment_id])


# pick best by F1
best = runs.sort_values("metrics.f1", ascending=False).iloc[0]
best_run_id = best.run_id
best_f1 = best["metrics.f1"]

print("Best run:", best_run_id, "F1:", best_f1)

MODEL_NAME = "employee-attrition-model"
result = mlflow.register_model(model_uri=f"runs:/{best_run_id}/model", name=MODEL_NAME)
version = result.version

client.set_registered_model_alias(MODEL_NAME, "production", version=version)
client.set_model_version_tag(MODEL_NAME, version, "f1", f"{best_f1:.4f}")

print(f"✅ Registered {MODEL_NAME} v{version} as Production")


Best run: a32fd70d4c914c3a9e2ac0b32e63d86c F1: 0.45112781954887216


Successfully registered model 'employee-attrition-model'.
2025/09/28 05:38:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: employee-attrition-model, version 1
Created version '1' of model 'employee-attrition-model'.


✅ Registered employee-attrition-model v1 as Production


In [39]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

MODEL_NAME = "employee-attrition-model"

# Get the latest registered version
latest_version = client.get_latest_versions(MODEL_NAME, ["None"])[0].version

features = [
    "Age", "DailyRate", "DistanceFromHome", "Education", "EnvironmentSatisfaction",
    "HourlyRate", "JobInvolvement", "JobLevel", "JobSatisfaction", "MonthlyIncome",
    "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating",
    "RelationshipSatisfaction", "StockOptionLevel", "TotalWorkingYears",
    "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole",
    "YearsSinceLastPromotion", "YearsWithCurrManager",
    "BusinessTravel", "Department", "EducationField", "Gender", "JobRole", "MaritalStatus", "OverTime"
]

client.set_model_version_tag(MODEL_NAME, latest_version, "features_used", ",".join(features))


  latest_version = client.get_latest_versions(MODEL_NAME, ["None"])[0].version


In [40]:
# Add this to a new, final cell and run it
import mlflow

print(f"MLflow is currently saving data to: {mlflow.get_tracking_uri()}")

MLflow is currently saving data to: http://127.0.0.1:5000
