<h2 align='center'>Codebasics ML Course: ML Flow Tutorial</h2>

In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Step 1: Create an imbalanced binary classification dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_redundant=8, 
                           weights=[0.9, 0.1], flip_y=0, random_state=42)

np.unique(y, return_counts=True)

(array([0, 1]), array([900, 100]))

In [3]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

### Experiment 1: Train Logistic Regression Classifier

In [4]:
log_reg = LogisticRegression(C=1, solver='liblinear')
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print(classification_report(y_test, y_pred_log_reg))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       270
           1       0.60      0.50      0.55        30

    accuracy                           0.92       300
   macro avg       0.77      0.73      0.75       300
weighted avg       0.91      0.92      0.91       300



### Experiment 2: Train Random Forest Classifier

In [5]:
rf_clf = RandomForestClassifier(n_estimators=30, max_depth=3)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       270
           1       0.95      0.67      0.78        30

    accuracy                           0.96       300
   macro avg       0.96      0.83      0.88       300
weighted avg       0.96      0.96      0.96       300



### Experiment 3: Train XGBoost

In [6]:
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       270
           1       0.96      0.80      0.87        30

    accuracy                           0.98       300
   macro avg       0.97      0.90      0.93       300
weighted avg       0.98      0.98      0.98       300



### Experiment 4: Handle class imbalance using SMOTETomek and then Train XGBoost

In [7]:
import numpy as np
from sklearn.neighbors import KDTree

class SMOTETomek_like:
    def __init__(self, random_state=42, k_neighbors=5, remove_tomek=True, target_count=None):
        """
        SMOTE + Tomek links combination, mimicking imbalanced-learn SMOTETomek.

        Parameters
        ----------
        random_state : int
            Seed for reproducibility.
        k_neighbors : int
            Number of nearest neighbors for SMOTE.
        remove_tomek : bool
            Whether to remove Tomek links after SMOTE.
        target_count : int or None
            If int, upsample all classes to exactly this number of samples.
            If None, upsample to the original majority class.
        """
        self.random_state = random_state
        self.k_neighbors = k_neighbors
        self.remove_tomek = remove_tomek
        self.target_count = target_count

    def fit_resample(self, X, y):
        np.random.seed(self.random_state)
        X = np.asarray(X)
        y = np.asarray(y)

        # --- Step 1: SMOTE ---
        classes, counts = np.unique(y, return_counts=True)

        if self.target_count is None:
            majority_count = max(counts)
        else:
            majority_count = self.target_count

        X_res_list = []
        y_res_list = []

        for cls in classes:
            X_cls = X[y == cls]
            n_cls = len(X_cls)
            X_res_list.append(X_cls)
            y_res_list.append(np.full(n_cls, cls))

            if n_cls < majority_count:
                n_to_generate = majority_count - n_cls
                k = min(self.k_neighbors, n_cls)
                tree = KDTree(X_cls)
                synthetic_samples = []

                for _ in range(n_to_generate):
                    idx = np.random.randint(0, n_cls)
                    x = X_cls[idx]
                    nn_idx = tree.query([x], k=k, return_distance=False)[0]
                    neighbor = X_cls[np.random.choice(nn_idx)]
                    synthetic = x + np.random.rand() * (neighbor - x)
                    synthetic_samples.append(synthetic)

                X_res_list.append(np.array(synthetic_samples))
                y_res_list.append(np.full(n_to_generate, cls))

        X_res = np.vstack(X_res_list)
        y_res = np.hstack(y_res_list)

        # --- Step 2: Optional Tomek links removal ---
        if self.remove_tomek:
            X_res, y_res = self._remove_tomek_links(X_res, y_res, target_count=majority_count)

        return X_res, y_res

    def _remove_tomek_links(self, X, y, target_count):
        """
        Remove majority samples forming Tomek links, but never reduce
        any class below the target_count.
        """
        classes = np.unique(y)
        X_clean_list = []
        y_clean_list = []

        # Start with all samples
        X_remaining = X.copy()
        y_remaining = y.copy()

        for cls in classes:
            X_cls = X_remaining[y_remaining == cls]
            y_cls = y_remaining[y_remaining == cls]

            for other_cls in classes:
                if cls == other_cls:
                    continue
                X_other = X_remaining[y_remaining == other_cls]
                y_other = y_remaining[y_remaining == other_cls]

                if len(X_other) == 0 or len(X_cls) == 0:
                    continue

                tree_cls = KDTree(X_cls)
                tree_other = KDTree(X_other)

                dist_other, idx_other = tree_cls.query(X_other, k=1)
                dist_cls, idx_cls = tree_other.query(X_cls, k=1)

                mask_other = np.ones(len(X_other), dtype=bool)
                for i, j in enumerate(idx_other[:, 0]):
                    if idx_cls[j, 0] == i:
                        # Remove sample only if class count remains >= target_count
                        if len(X_other[mask_other]) > target_count:
                            mask_other[i] = False

                X_other_clean = X_other[mask_other]
                y_other_clean = y_other[mask_other]

                # Add cleaned samples
                X_clean_list.append(X_cls)
                y_clean_list.append(y_cls)
                X_clean_list.append(X_other_clean)
                y_clean_list.append(y_other_clean)

                # Update remaining samples
                X_remaining = np.vstack([X_cls, X_other_clean])
                y_remaining = np.hstack([y_cls, y_other_clean])

        X_final = np.vstack(X_clean_list)
        y_final = np.hstack(y_clean_list)

        # --- Step 3: Trim any excess if necessary to match exact target_count ---
        X_trimmed_list = []
        y_trimmed_list = []
        for cls in np.unique(y_final):
            X_cls = X_final[y_final == cls]
            y_cls = y_final[y_final == cls]
            if len(X_cls) > target_count:
                idx_keep = np.random.choice(len(X_cls), target_count, replace=False)
                X_cls = X_cls[idx_keep]
                y_cls = y_cls[idx_keep]
            X_trimmed_list.append(X_cls)
            y_trimmed_list.append(y_cls)

        X_final = np.vstack(X_trimmed_list)
        y_final = np.hstack(y_trimmed_list)

        return X_final, y_final


In [8]:
# from imblearn.combine import SMOTETomek

# smt = SMOTETomek(random_state=42)
# X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

# np.unique(y_train_res, return_counts=True)

# --- Step 1: Create instance ---
desired_count = 619  # exact number of samples per class
smt_like = SMOTETomek_like(
    random_state=42,
    k_neighbors=5,
    remove_tomek=True,
    target_count=desired_count
)

# --- Step 2: Resample training data ---
X_train_res, y_train_res = smt_like.fit_resample(X_train, y_train)

# --- Step 3: Check the class distribution ---
np.unique(y_train_res, return_counts=True)
# Expected output: (array([0, 1]), array([619, 619]))


(array([0, 1]), array([619, 619]))

In [9]:
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train_res, y_train_res)
y_pred_xgb = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96       270
           1       0.62      0.83      0.71        30

    accuracy                           0.93       300
   macro avg       0.80      0.89      0.84       300
weighted avg       0.95      0.93      0.94       300



<h2 align="center" style="color:blue">Track Experiments Using MLFlow</h2>

In [10]:
models = [
    (
        "Logistic Regression", 
        {"C": 1, "solver": 'liblinear'},
        LogisticRegression(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "Random Forest", 
        {"n_estimators": 30, "max_depth": 3},
        RandomForestClassifier(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBClassifier",
        {"use_label_encoder": False, "eval_metric": 'logloss'},
        XGBClassifier(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBClassifier With SMOTE",
        {"use_label_encoder": False, "eval_metric": 'logloss'},
        XGBClassifier(), 
        (X_train_res, y_train_res),
        (X_test, y_test)
    )
]

In [11]:
reports = []

for model_name, params, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]
    
    model.set_params(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

In [12]:
reports

[{'0': {'precision': 0.9454545454545454,
   'recall': 0.9629629629629629,
   'f1-score': 0.9541284403669725,
   'support': 270.0},
  '1': {'precision': 0.6,
   'recall': 0.5,
   'f1-score': 0.5454545454545454,
   'support': 30.0},
  'accuracy': 0.9166666666666666,
  'macro avg': {'precision': 0.7727272727272727,
   'recall': 0.7314814814814814,
   'f1-score': 0.749791492910759,
   'support': 300.0},
  'weighted avg': {'precision': 0.9109090909090909,
   'recall': 0.9166666666666666,
   'f1-score': 0.91326105087573,
   'support': 300.0}},
 {'0': {'precision': 0.9676258992805755,
   'recall': 0.9962962962962963,
   'f1-score': 0.9817518248175182,
   'support': 270.0},
  '1': {'precision': 0.9545454545454546,
   'recall': 0.7,
   'f1-score': 0.8076923076923077,
   'support': 30.0},
  'accuracy': 0.9666666666666667,
  'macro avg': {'precision': 0.961085676913015,
   'recall': 0.8481481481481481,
   'f1-score': 0.8947220662549129,
   'support': 300.0},
  'weighted avg': {'precision': 0.9663

In [13]:
# =========================
# Imports
# =========================
import mlflow
import mlflow.sklearn
import mlflow.xgboost

# from mlflow.tracking import MlflowClient
from xgboost import XGBClassifier, XGBRegressor

In [14]:
# =========================
# Helper function to log models
# =========================
def log_model_with_mlflow(model, model_name: str):
    """
    Safely log models to MLflow on Python 3.13.
    - Uses mlflow.xgboost for XGBoost models
    - Uses mlflow.sklearn for all other models
    - Uses the new MLflow `name` parameter (no deprecation warnings)
    """

    if "XGB" in model_name:
        # Defensive estimator type patching
        if not hasattr(model, "_estimator_type"):
            if isinstance(model, XGBClassifier):
                model._estimator_type = "classifier"
            elif isinstance(model, XGBRegressor):
                model._estimator_type = "regressor"
            else:
                model._estimator_type = "classifier"

        mlflow.xgboost.log_model(
            model,
            name="model",
            pip_requirements=[
                "xgboost==3.1.2",
                "scikit-learn",
                "numpy"
            ]
        )

    else:
        mlflow.sklearn.log_model(
            model,
            name="model",
            pip_requirements=[
                "scikit-learn",
                "numpy"
            ]
        )

In [15]:
# =========================
# MLflow Initialization (DEFENSIVE)
# =========================
# mlflow.set_tracking_uri("http://localhost:5000")

# client = MlflowClient()
# experiment_name = "Anomaly Detection"

# exp = client.get_experiment_by_name(experiment_name)

# if exp and exp.lifecycle_stage == "deleted":
    # client.restore_experiment(exp.experiment_id)

# mlflow.set_experiment(experiment_name)

# 1. Set tracking server FIRST
mlflow.set_tracking_uri("http://localhost:5000")

# 2. Then set / create experiment
mlflow.set_experiment("Anomaly Detection")


# =========================
# Training / Logging Loop
# =========================
for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]
    report = reports[i]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(params)

        mlflow.log_metrics({
            "accuracy": report["accuracy"],
            "recall_class_1": report["1"]["recall"],
            "recall_class_0": report["0"]["recall"],
            "f1_score_macro": report["macro avg"]["f1-score"]
        })

        log_model_with_mlflow(model, model_name)

# =========================
# End of MLflow logging
# =========================

2025/12/25 15:59:52 INFO mlflow.tracking.fluent: Experiment with name 'Anomaly Detection' does not exist. Creating a new experiment.


üèÉ View run Logistic Regression at: http://localhost:5000/#/experiments/1/runs/e139d36b0e764b249328dd575f4e93eb
üß™ View experiment at: http://localhost:5000/#/experiments/1
üèÉ View run Random Forest at: http://localhost:5000/#/experiments/1/runs/db0c943a4bdd458eab596e2ea9fb0ad0
üß™ View experiment at: http://localhost:5000/#/experiments/1
üèÉ View run XGBClassifier at: http://localhost:5000/#/experiments/1/runs/29e5335215cb4b618419af136a17fc23
üß™ View experiment at: http://localhost:5000/#/experiments/1
üèÉ View run XGBClassifier With SMOTE at: http://localhost:5000/#/experiments/1/runs/640aa0b1aea4450f84eff29b8e1ad6cd
üß™ View experiment at: http://localhost:5000/#/experiments/1


### Register the Model

In [16]:
import mlflow

model_name = "XGB-Smote"
run_id = input("Please type RunID: ").strip()

# IMPORTANT: this must match the artifact path used when the model was logged
artifact_path = "model"

model_uri = f"runs:/{run_id}/{artifact_path}"

# You do NOT need to reopen the run
mlflow.register_model(
    model_uri=model_uri,
    name=model_name
)

Please type RunID:  640aa0b1aea4450f84eff29b8e1ad6cd


Successfully registered model 'XGB-Smote'.
2025/12/25 16:02:22 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGB-Smote, version 1
Created version '1' of model 'XGB-Smote'.


<ModelVersion: aliases=[], creation_timestamp=1766671342474, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1766671342474, metrics=None, model_id=None, name='XGB-Smote', params=None, run_id='640aa0b1aea4450f84eff29b8e1ad6cd', run_link='', source='models:/m-298ec5bf10fc41458a31cf34d1f4abe1', status='READY', status_message=None, tags={}, user_id='', version='1'>

### Load the Model

In [20]:
import os

print("Local MLflow model path:", local_path)

for root, dirs, files in os.walk(local_path):
    for name in files:
        print(os.path.join(root, name))


Local MLflow model path: C:\Users\GGL\AppData\Local\Temp\tmp7eedxrid\
C:\Users\GGL\AppData\Local\Temp\tmp7eedxrid\conda.yaml
C:\Users\GGL\AppData\Local\Temp\tmp7eedxrid\MLmodel
C:\Users\GGL\AppData\Local\Temp\tmp7eedxrid\model.ubj
C:\Users\GGL\AppData\Local\Temp\tmp7eedxrid\python_env.yaml
C:\Users\GGL\AppData\Local\Temp\tmp7eedxrid\registered_model_meta
C:\Users\GGL\AppData\Local\Temp\tmp7eedxrid\requirements.txt


In [24]:
import mlflow
import xgboost as xgb
import os

# Download model artifacts
local_path = mlflow.artifacts.download_artifacts(
    artifact_uri=f"models:/{model_name}@challenger" # /{model_version}
)

# Correct model file (from inspection)
model_file = os.path.join(local_path, "model.ubj")

# Load raw XGBoost booster
booster = xgb.Booster()
booster.load_model(model_file)

# Convert test data to DMatrix
dtest = xgb.DMatrix(X_test)

# Predict
y_pred = booster.predict(dtest)

y_pred[:4]



Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([0.00236887, 0.0019822 , 0.01328653, 0.000823  ], dtype=float32)

### Transition the Model to Production

In [25]:
current_model_uri = f"models:/{model_name}@challenger"
production_model_name = "anomaly-detection-prod"

client = mlflow.MlflowClient()
client.copy_model_version(src_model_uri=current_model_uri, dst_name=production_model_name)

Successfully registered model 'anomaly-detection-prod'.
Copied version '1' of model 'XGB-Smote' to version '1' of model 'anomaly-detection-prod'.


<ModelVersion: aliases=[], creation_timestamp=1766674468455, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1766674468455, metrics=None, model_id=None, name='anomaly-detection-prod', params=None, run_id='640aa0b1aea4450f84eff29b8e1ad6cd', run_link='', source='models:/XGB-Smote/1', status='READY', status_message=None, tags={}, user_id='', version='1'>

In [26]:
import mlflow
import xgboost as xgb
import os

# Download model artifacts
local_path = mlflow.artifacts.download_artifacts(
    artifact_uri=f"models:/{production_model_name}@champion" # /{model_version}
)

# Correct model file (from inspection)
model_file = os.path.join(local_path, "model.ubj")

# Load raw XGBoost booster
booster = xgb.Booster()
booster.load_model(model_file)

# Convert test data to DMatrix
dtest = xgb.DMatrix(X_test)

# Predict
y_pred = booster.predict(dtest)

y_pred[:4]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([0.00236887, 0.0019822 , 0.01328653, 0.000823  ], dtype=float32)

### Please use to the following link to learn more about model registry

https://mlflow.org/docs/latest/model-registry.html#model-registry-workflows 

### to learn from the docs