In [53]:

import math
import os
import pickle

import mlflow
import mlflow.sklearn
import pandas as pd
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Name of the registered model that every registry call in this notebook targets.
pname = "cpetaxi-dur"

# Point MLflow at the local tracking server and select the experiment.
# set_experiment creates the experiment when it does not exist yet.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("cpe-taxi")

# Registry client; constructed after the tracking URI has been set.
client = MlflowClient()

2025/03/28 02:28:46 INFO mlflow.tracking.fluent: Experiment with name 'cpe-taxi' does not exist. Creating a new experiment.


In [54]:

# Generate the Production baseline: a linear regression predicting fare_amount
# from pickup/drop-off location and trip distance.
df = pd.read_csv("green_tripdata_2021-07.csv", low_memory=False)

# Keep only trips with a positive distance and a positive fare. The explicit
# .copy() detaches the filtered frame from the raw one, so the column
# assignments below do not write through a slice (SettingWithCopyWarning).
df = df[(df["trip_distance"] > 0) & (df["fare_amount"] > 0)].copy()

# Location IDs are cast to str so DictVectorizer treats them as categories
# (one indicator column per value) rather than as numeric magnitudes.
df["PULocationID"] = df["PULocationID"].astype(str)
df["DOLocationID"] = df["DOLocationID"].astype(str)

categorical = ["PULocationID", "DOLocationID"]
numerical = ["trip_distance"]
target = "fare_amount"

dv = DictVectorizer()
train_dicts_prod = df[categorical + numerical].to_dict(orient="records")
X_train_prod = dv.fit_transform(train_dicts_prod)
y_train_prod = df[target].values

# First model test
model = LinearRegression()
model.fit(X_train_prod, y_train_prod)

# NOTE: this RMSE is computed on the same rows the model was fitted on, so it
# is a training error, not a held-out estimate.
y_pred_prod = model.predict(X_train_prod)
rmse = math.sqrt(mean_squared_error(y_train_prod, y_pred_prod))

with mlflow.start_run():
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted vectorizer next to the model so the exact same
    # feature encoding can be restored at evaluation time.
    os.makedirs("./artifacts", exist_ok=True)
    with open("./artifacts/preprocessor_prod.b", "wb") as f_out:
        print(f_out.name)
        pickle.dump(dv, f_out)

    mlflow.log_artifact("./artifacts/preprocessor_prod.b", artifact_path="preprocessor_prod")

    # Logging with registered_model_name also creates a new version of the
    # registered model in the model registry.
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        registered_model_name=pname
    )

./artifacts/preprocessor_prod.b


Successfully registered model 'cpetaxi-dur'.
2025/03/28 02:28:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cpetaxi-dur, version 1


🏃 View run powerful-calf-675 at: http://127.0.0.1:5000/#/experiments/3/runs/fee33483d59c4163a4b1d49052af21a2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3


Created version '1' of model 'cpetaxi-dur'.


In [55]:
# Move the baseline model to the Production stage.
# The version is resolved from the registry instead of being hard-coded to 1:
# once the registered model already exists (e.g. on a second Restart & Run All),
# the model registered by the previous cell gets a higher version number and a
# literal 1 would promote a stale version. Version numbers come back as strings,
# hence the int() key. This assumes the cell runs right after the baseline was
# registered, so the highest version is the baseline.
baseline_version = max(
    client.search_model_versions(f"name='{pname}'"),
    key=lambda mv: int(mv.version),
).version

client.transition_model_version_stage(
    name=pname,
    version=baseline_version,
    stage="Production"
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1743103738287, current_stage='Production', description='', last_updated_timestamp=1743103741209, name='cpetaxi-dur', run_id='fee33483d59c4163a4b1d49052af21a2', run_link='', source='mlflow-artifacts:/3/fee33483d59c4163a4b1d49052af21a2/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='1'>

In [56]:
# Staging candidate: a random forest that additionally uses passenger_count
# as a categorical feature.

# Missing passenger counts default to 1; the column is cast to str so it is
# vectorized as a category. assign() rebinds df to a new frame rather than
# writing into the existing (filtered) one, which avoids chained-assignment
# warnings while leaving df with the same columns for later cells.
df = df.assign(
    passenger_count=df["passenger_count"].fillna(1).astype(int).astype(str)
)

categorical = ["PULocationID", "DOLocationID", "passenger_count"]
numerical = ["trip_distance"]

dv = DictVectorizer()
train_dicts_stage = df[categorical + numerical].to_dict(orient="records")
X_train_stage = dv.fit_transform(train_dicts_stage)
y_train_stage = df["fare_amount"].values

# random_state is fixed so the fitted forest is reproducible across runs.
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train_stage, y_train_stage)

with mlflow.start_run():
    # NOTE: RMSE on the training rows themselves (training error).
    mlflow.log_metric(
        "rmse",
        math.sqrt(mean_squared_error(y_train_stage, model.predict(X_train_stage))),
    )

    # Create the output directory here as well, so this cell does not depend
    # on an earlier cell having created it.
    os.makedirs("./artifacts", exist_ok=True)
    with open("./artifacts/preprocessor_stage.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("./artifacts/preprocessor_stage.b", artifact_path="preprocessor_stage")

    # Registers a new version under the same registered-model name.
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        registered_model_name=pname
    )


Registered model 'cpetaxi-dur' already exists. Creating a new version of this model...
2025/03/28 02:30:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: cpetaxi-dur, version 2
Created version '2' of model 'cpetaxi-dur'.


🏃 View run gregarious-kite-120 at: http://127.0.0.1:5000/#/experiments/3/runs/c6a44e9a0e98443fbe2e90b78d821bb0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3


In [57]:
# Move the candidate model to the Staging stage.
# The version is resolved from the registry instead of being hard-coded to 2:
# the number only equals 2 on the very first run against an empty registry.
# Version numbers come back as strings, hence the int() key. This assumes the
# cell runs right after the candidate was registered, so the highest version
# is the candidate.
candidate_version = max(
    client.search_model_versions(f"name='{pname}'"),
    key=lambda mv: int(mv.version),
).version

client.transition_model_version_stage(
    name=pname,
    version=candidate_version,
    stage="Staging"
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1743103801883, current_stage='Staging', description='', last_updated_timestamp=1743103805945, name='cpetaxi-dur', run_id='c6a44e9a0e98443fbe2e90b78d821bb0', run_link='', source='mlflow-artifacts:/3/c6a44e9a0e98443fbe2e90b78d821bb0/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='2'>

In [58]:
# Load prod: rebuild the evaluation inputs for the Production model.

# Restore the vectorizer that was pickled alongside the Production model and
# re-encode the Production feature dicts with it.
# NOTE: pickle.load is only safe because this file was written by this notebook.
with open("./artifacts/preprocessor_prod.b", "rb") as f_in:
    dv = pickle.load(f_in)
X_test_prod = dv.transform(train_dicts_prod)

# Resolve the Production stage through the registry; the loaded model object
# itself is discarded here.
model_uri = f"models:/{pname}/Production"
_ = mlflow.pyfunc.load_model(model_uri)

# Ground-truth fares for every row of df.
target_prod = df["fare_amount"]



Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 95.40it/s]


In [49]:
# Load stage: rebuild the evaluation inputs for the Staging model.

# Restore the vectorizer that was pickled alongside the Staging model and
# re-encode the Staging feature dicts with it.
# NOTE: pickle.load is only safe because this file was written by this notebook.
with open("./artifacts/preprocessor_stage.b", "rb") as f_in:
    dv = pickle.load(f_in)
X_test_stage = dv.transform(train_dicts_stage)

# Resolve the Staging stage through the registry; the loaded model object
# itself is discarded here.
model_uri = f"models:/{pname}/Staging"
_ = mlflow.pyfunc.load_model(model_uri)

# Ground-truth fares for every row of df.
target_stage = df["fare_amount"]


Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 15.36it/s]


In [59]:

# Score the Production and Staging models on their respective feature matrices
# and report the RMSE of each.
# NOTE: X_test_prod / X_test_stage are encoded from the training dicts, so
# these figures are training errors rather than held-out estimates.

# Production
prod_model = mlflow.pyfunc.load_model(f"models:/{pname}/Production")
y_pred_prod = prod_model.predict(X_test_prod)
mse_prod = mean_squared_error(target_prod, y_pred_prod)
rmse_prod = math.sqrt(mse_prod)

# Staging
staging_model = mlflow.pyfunc.load_model(f"models:/{pname}/Staging")
y_pred_staging = staging_model.predict(X_test_stage)
mse_staging = mean_squared_error(target_stage, y_pred_staging)
rmse_staging = math.sqrt(mse_staging)

print(f"RMSE Pro: {rmse_prod:.3f}")
print(f"RMSE Stage: {rmse_staging:.3f}")


Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 95.21it/s] 
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 15.81it/s]


RMSE Pro: 11.682
RMSE Stage: 5.955


In [61]:

# Promote the Staging model to Production only when its RMSE is strictly lower
# than that of the current Production model.
staging_is_better = rmse_staging < rmse_prod

if staging_is_better:
    print("Promote to Production...")
    # Look up which version currently sits in Staging.
    staging_candidates = client.get_latest_versions(pname, stages=["Staging"])
    staging_version = staging_candidates[0].version
    # archive_existing_versions=True moves the current Production version(s)
    # to Archived as part of the transition.
    client.transition_model_version_stage(
        name=pname,
        version=staging_version,
        stage="Production",
        archive_existing_versions=True,
    )
else:
    print("Curr model is better.")


Promote to Production...


  staging_version = client.get_latest_versions(pname, stages=["Staging"])[0].version
