In [1]:
import time
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split
from mlflow.models.signature import infer_signature
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    classification_report
)

# Train a model and send it to MLflow

In [5]:
# mlflow server connection
mlflow.set_tracking_uri("https://mlflow-s3-5c46c0d9d46b.herokuapp.com/")
#mlflow.set_tracking_uri("../mlruns")
EXPERIMENT_NAME="demo_experiment"
mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment:
    print("Experiment ID:", experiment.experiment_id)
    print("Artifact Location:", experiment.artifact_location)
else:
    print(f"Experiment '{EXPERIMENT_NAME}' does not exist.")

# start experiment time tracking
start_time = time.time()
mlflow.sklearn.autolog(log_models=False)

# load dataset for training
dataset = pd.read_csv('template_dataset.csv').drop('Unnamed: 0', axis=1)
Y = dataset['rental_price_per_day']
X = dataset.drop('rental_price_per_day', axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, random_state=1)

# list numeric/categorical columns
numeric_features = []
categorical_features = []
for i, t in X.dtypes.items():
    if ("float" in str(t)) or ("int" in str(t)):
        numeric_features.append(i)
    else:
        categorical_features.append(i)
# list check
print("Found numeric features ", numeric_features)
print("Found categorical features ", categorical_features)

# assemble the model with preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
    ]
)
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor())
])

# train the model
with mlflow.start_run(experiment_id = experiment.experiment_id):
    model.fit(X_train, Y_train)
    predictions = model.predict(X_train)

    mlflow.sklearn.log_model(model, "xgboost")
    '''mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="getaround_price_prediction",
        registered_model_name="xgboost",
        signature=infer_signature(X_train, predictions)
    )'''

print("...Done!")
print(f"---Total training time: {time.time()-start_time}")
print('train_score', model.score(X_train, Y_train))
print('test_score', model.score(X_test, Y_test))

mlflow.end_run()

Experiment ID: 38
Artifact Location: mlflow-artifacts:/38
Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']




...Done!
---Total training time: 18.527926921844482
train_score 0.9417796329507986




test_score 0.7119470671202109


# Run prediction using a model from MLflow

In [6]:
mlflow.set_tracking_uri("https://mlflow-s3-5c46c0d9d46b.herokuapp.com/")
# Read data 
input = dataset.loc[2].to_list()    
print(input)

columns = ['model_key', 'mileage', 'engine_power', 'fuel', 'paint_color',
       'car_type', 'private_parking_available', 'has_gps',
       'has_air_conditioning', 'automatic_car', 'has_getaround_connect',
       'has_speed_regulator', 'winter_tires', 'rental_price_per_day']
features = pd.DataFrame([input], 
                        columns=columns,
                        )
display('features',features)

logged_model = 'runs:/2d469d9db04b4bb7b63c8ec9c8aae5c2/xgboost'
#logged_model = f'logged_models/{EXPERIMENT_NAME}'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

prediction = loaded_model.predict(features)
print('prediction',prediction)

# Format response
response = {"prediction": prediction.tolist()[0]}
response

['Citroën', 183297, 120, 'diesel', 'white', 'convertible', False, False, False, False, True, False, True, 101]


'features'

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 9/9 [00:02<00:00,  3.25it/s]


prediction [110.788]


{'prediction': 110.78800201416016}

# Download model from MLflow server to local folder

This is useful to COPY the model in the DOCKER image we want to run the API

In [7]:
import os
from mlflow.tracking import MlflowClient

# Initialize MLflow client
mlflow.set_tracking_uri("https://mlflow-s3-5c46c0d9d46b.herokuapp.com/")
client = MlflowClient()

# Create the local directory if it doesn't exist
local_dir = "logged_models"
if not os.path.exists(local_dir):
    os.mkdir(local_dir)

# Download the artifact to local storage >> Change Artifact_id & Model_name below
local_path = client.download_artifacts('2d469d9db04b4bb7b63c8ec9c8aae5c2', "xgboost", local_dir)
print(f"Artifacts downloaded in: {local_dir}")

Downloading artifacts: 100%|██████████| 9/9 [00:03<00:00,  2.32it/s]

Artifacts downloaded in: logged_models



