In [26]:
# --- Import ---
from sklearn.ensemble import RandomForestRegressor
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

# --- Set experiment ---
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("used-car-price-prediction")

# --- Utility function ---
def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def train_and_register_model(model, param_grid, model_name, registered_name, data_path):
    # Load data
    data = np.load(data_path, allow_pickle=True)
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']
    feature_names = data['feature_names']

    # MLflow run
    with mlflow.start_run(run_name=model_name) as run:
        scorer = make_scorer(rmse_score, greater_is_better=False)
        grid = GridSearchCV(model, param_grid, scoring=scorer, cv=5, n_jobs=-1)
        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test)

        rmse = rmse_score(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        mlflow.log_params(grid.best_params_)
        mlflow.log_metrics({"rmse": rmse, "r2": r2})
        mlflow.set_tag("model_name", model_name)

        # Log model
        mlflow.sklearn.log_model(best_model, artifact_path="model")
        model_uri = f"runs:/{run.info.run_id}/model"
        mlflow.register_model(model_uri=model_uri, name=registered_name)

        # Log feature importance
        if hasattr(best_model, "feature_importances_"):
            fi = pd.DataFrame({
                "feature": feature_names,
                "importance": best_model.feature_importances_
            }).sort_values("importance", ascending=False)

            plt.figure(figsize=(10, 5))
            plt.bar(fi["feature"][:10], fi["importance"][:10])
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig("feature_importance.png")
            mlflow.log_artifact("feature_importance.png")
            plt.close()

        print(f"🏆 Registered: {registered_name} | RMSE: {rmse:.2f}")
        print(f"🔗 View run at: http://localhost:5000/#/experiments/{run.info.experiment_id}/runs/{run.info.run_id}")


In [27]:
# --- Set up and call ---
param_grid = {"n_estimators": [50, 100], "max_depth": [None, 10]}
model = RandomForestRegressor()

train_and_register_model(
    model=model,
    param_grid=param_grid,
    model_name="random_forest",
    registered_name="used-car-random-forest",
    data_path="../airflow/data/preprocessed_data.npz")


Successfully registered model 'used-car-random-forest'.
2025/05/28 23:36:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: used-car-random-forest, version 1
Created version '1' of model 'used-car-random-forest'.


🏆 Registered: used-car-random-forest | RMSE: 3478.88
🔗 View run at: http://localhost:5000/#/experiments/3/runs/5aba8fa12bf242beb63261b1fb8fb77f
🏃 View run random_forest at: http://localhost:5000/#/experiments/3/runs/5aba8fa12bf242beb63261b1fb8fb77f
🧪 View experiment at: http://localhost:5000/#/experiments/3


In [28]:
from xgboost import XGBRegressor

param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [3, 5]
}
model = XGBRegressor()

train_and_register_model(
    model=model,
    param_grid=param_grid,
    model_name="xgboost",
    registered_name="used-car-xgboost",
    data_path="../airflow/data/preprocessed_data.npz"
)


Successfully registered model 'used-car-xgboost'.
2025/05/28 23:37:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: used-car-xgboost, version 1
Created version '1' of model 'used-car-xgboost'.


🏆 Registered: used-car-xgboost | RMSE: 4666.20
🔗 View run at: http://localhost:5000/#/experiments/3/runs/0b882e804d5b4e4b92c78b587115dd58
🏃 View run xgboost at: http://localhost:5000/#/experiments/3/runs/0b882e804d5b4e4b92c78b587115dd58
🧪 View experiment at: http://localhost:5000/#/experiments/3


In [29]:
from lightgbm import LGBMRegressor

param_grid = {
    "n_estimators": [50, 100],
    "num_leaves": [31, 50]
}
model = LGBMRegressor()

train_and_register_model(
    model=model,
    param_grid=param_grid,
    model_name="lightgbm",
    registered_name="used-car-lightgbm",
    data_path="../airflow/data/preprocessed_data.npz"
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014983 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 770
[LightGBM] [Info] Number of data points in the train set: 185402, number of used features: 243
[LightGBM] [Info] Start training from score 19459.428458


Successfully registered model 'used-car-lightgbm'.
2025/05/28 23:42:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: used-car-lightgbm, version 1
Created version '1' of model 'used-car-lightgbm'.


🏆 Registered: used-car-lightgbm | RMSE: 4493.01
🔗 View run at: http://localhost:5000/#/experiments/3/runs/4574b73998e74c8392662d7319db7a9d
🏃 View run lightgbm at: http://localhost:5000/#/experiments/3/runs/4574b73998e74c8392662d7319db7a9d
🧪 View experiment at: http://localhost:5000/#/experiments/3


In [30]:
from mlflow.tracking import MlflowClient

def register_and_promote_model(run_id, model_name, stage="Production"):
    # Register
    model_uri = f"runs:/{run_id}/model"
    result = mlflow.register_model(model_uri=model_uri, name=model_name)

    print("⚠️ WARNING mlflow.models.model: Model logged without a signature and input example.")
    print("✅ Please set `input_example` parameter when logging the model to auto infer the model signature.")

    print(f"Successfully registered model '{model_name}'.")
    print(f"🕒 Waiting for model version creation...")

    client = MlflowClient()
    client.transition_model_version_stage(
        name=model_name,
        version=result.version,
        stage=stage
    )

    print(f"Created version '{result.version}' of model '{model_name}'.")
    print(f"🏆 Registered: {model_name} | version: {result.version}")
    print(f"🔗 View run at: http://localhost:5000/#/experiments/{result.run_id}/runs/{result.run_id}")
    print(f"🏃 View run {model_name} at: http://localhost:5000/#/experiments/{result.run_id}/runs/{result.run_id}")
    print(f"🧪 View experiment at: http://localhost:5000/#/experiments/{mlflow.get_experiment_by_name('used-car-price-prediction').experiment_id}")


In [31]:
register_and_promote_model(
    run_id="5aba8fa12bf242beb63261b1fb8fb77f",
    model_name="used-car-random-forest",
    stage="Production"
)


Registered model 'used-car-random-forest' already exists. Creating a new version of this model...
2025/05/28 23:47:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: used-car-random-forest, version 2


✅ Please set `input_example` parameter when logging the model to auto infer the model signature.
Successfully registered model 'used-car-random-forest'.
🕒 Waiting for model version creation...
Created version '2' of model 'used-car-random-forest'.
🏆 Registered: used-car-random-forest | version: 2
🔗 View run at: http://localhost:5000/#/experiments/5aba8fa12bf242beb63261b1fb8fb77f/runs/5aba8fa12bf242beb63261b1fb8fb77f
🏃 View run used-car-random-forest at: http://localhost:5000/#/experiments/5aba8fa12bf242beb63261b1fb8fb77f/runs/5aba8fa12bf242beb63261b1fb8fb77f
🧪 View experiment at: http://localhost:5000/#/experiments/3


Created version '2' of model 'used-car-random-forest'.
  client.transition_model_version_stage(


In [32]:
register_and_promote_model(
    run_id="4574b73998e74c8392662d7319db7a9d",
    model_name="used-car-lightgbm",
    stage="Staging"
)


Registered model 'used-car-lightgbm' already exists. Creating a new version of this model...
2025/05/28 23:52:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: used-car-lightgbm, version 2


✅ Please set `input_example` parameter when logging the model to auto infer the model signature.
Successfully registered model 'used-car-lightgbm'.
🕒 Waiting for model version creation...
Created version '2' of model 'used-car-lightgbm'.
🏆 Registered: used-car-lightgbm | version: 2
🔗 View run at: http://localhost:5000/#/experiments/4574b73998e74c8392662d7319db7a9d/runs/4574b73998e74c8392662d7319db7a9d
🏃 View run used-car-lightgbm at: http://localhost:5000/#/experiments/4574b73998e74c8392662d7319db7a9d/runs/4574b73998e74c8392662d7319db7a9d
🧪 View experiment at: http://localhost:5000/#/experiments/3


Created version '2' of model 'used-car-lightgbm'.
  client.transition_model_version_stage(
