In [1]:
!python -V

Python 3.10.6


In [1]:
import pandas as pd

In [2]:
import os
import pickle

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [5]:
import mlflow


# Record runs in a local SQLite database file (mlflow.db) and group every run
# started by this notebook under the "nyc-taxi-experiment" experiment.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/home/luca/teaching/simplon/IA_2024/teaching-resources/Ressources/MLOps/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1730210315440, experiment_id='1', last_update_time=1730210315440, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [6]:
def read_dataframe(filename):
    """Load a green-taxi trip CSV and prepare it for duration modelling.

    Parameters
    ----------
    filename : str or path-like
        CSV file with at least the columns ``lpep_pickup_datetime``,
        ``lpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        Only the trips lasting between 1 and 60 minutes (inclusive), with an
        added ``duration`` column expressed in minutes and both location-ID
        columns cast to strings.
    """
    df = pd.read_csv(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    # Vectorized timedelta -> minutes conversion instead of a per-row lambda.
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Copy the filtered rows explicitly: assigning columns on a boolean-mask
    # slice otherwise triggers pandas' SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [7]:
# Load the January 2021 file as the training split and February 2021 as the
# validation split, using the same preparation function for both.
df_train, df_val = map(
    read_dataframe,
    ('../data/green_tripdata_2021-01.csv', '../data/green_tripdata_2021-02.csv'),
)

  df = pd.read_csv(filename)


In [8]:
# Number of rows kept in each split after the duration filter.
df_train.shape[0], df_val.shape[0]

(73908, 61921)

In [9]:
# Join the pickup and drop-off location IDs into one "<pickup>_<dropoff>"
# string column on both splits.
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [10]:
# Columns handed to the vectorizer.
categorical = ['PU_DO']  # alternative: ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# One dict per trip for each split.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training records only, then apply the same
# mapping to the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [11]:
# Pull the prediction target out of both splits as arrays.
target = 'duration'
y_train, y_val = (split[target].values for split in (df_train, df_val))

In [14]:
# Baseline: plain linear regression on the vectorized features.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error is deprecated since
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5



np.float64(7.75871521021275)

In [16]:
# Persist the fitted vectorizer together with the model so they can be
# reloaded as a pair. open() does not create missing parent directories, so
# make sure the output folder exists first.
os.makedirs('models', exist_ok=True)
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [18]:
# Train a Lasso model inside an MLflow run, logging its data paths,
# hyperparameter, validation RMSE and the pickled (vectorizer, model) pair.
with mlflow.start_run():

    mlflow.set_tag("developer", "luca")

    mlflow.log_param("train-data-path", "../data/green_tripdata_2021-01.csv")
    mlflow.log_param("valid-data-path", "../data/green_tripdata_2021-02.csv")

    alpha = 0.01
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    # Explicit square root: `squared=False` is deprecated since
    # scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Serialize the model trained in THIS run before logging it. Previously
    # the file on disk still held the earlier LinearRegression, so the logged
    # artifact did not match the logged alpha / rmse.
    os.makedirs("models", exist_ok=True)
    with open("models/lin_reg.bin", "wb") as f_out:
        pickle.dump((dv, lr), f_out)

    mlflow.log_artifact(local_path="models/lin_reg.bin", artifact_path="models_pickle")



### With Parameter Tuning

In [12]:
import xgboost as xgb

In [13]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [14]:
# Wrap features and targets of both splits in XGBoost DMatrix objects.
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [15]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and log it to MLflow.

    Parameters
    ----------
    params : dict
        XGBoost training parameters for this trial; logged as-is to MLflow.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}`` where ``rmse`` is the RMSE of
        the trained booster on the validation DMatrix.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # Explicit square root: `squared=False` is deprecated since
        # scikit-learn 1.4 and removed in 1.6.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

Hyperopt search-space reference: https://hyperopt.github.io/hyperopt/getting-started/search_spaces/

In [24]:
# Hyperparameter search space. hp.loguniform bounds are given in log space,
# e.g. learning_rate is sampled between exp(-3) and exp(0).
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # `reg:linear` is the deprecated alias of `reg:squarederror`.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Keep a handle on the Trials object so the per-trial results remain
# available after the search instead of being discarded.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:7.07990                           
[1]	validation-rmse:6.73721                           
[2]	validation-rmse:6.70237                           
[3]	validation-rmse:6.68999                           
[4]	validation-rmse:6.68528                           
[5]	validation-rmse:6.67578                           
[6]	validation-rmse:6.67510                           
[7]	validation-rmse:6.66843                           
[8]	validation-rmse:6.65853                           
[9]	validation-rmse:6.65503                           
[10]	validation-rmse:6.65078                          
[11]	validation-rmse:6.64610                          
[12]	validation-rmse:6.63696                          
[13]	validation-rmse:6.63250                          
[14]	validation-rmse:6.63151                          
[15]	validation-rmse:6.62829                          
[16]	validation-rmse:6.62365                          
[17]	validation-rmse:6.62053                          
[18]	valid





[0]	validation-rmse:7.64690                                                    
[1]	validation-rmse:6.76605                                                    
[2]	validation-rmse:6.59333                                                    
[3]	validation-rmse:6.52936                                                    
[4]	validation-rmse:6.50718                                                    
[5]	validation-rmse:6.49108                                                    
[6]	validation-rmse:6.48232                                                    
[7]	validation-rmse:6.47572                                                    
[8]	validation-rmse:6.46846                                                    
[9]	validation-rmse:6.46360                                                    
[10]	validation-rmse:6.46036                                                   
[11]	validation-rmse:6.45618                                                   
[12]	validation-rmse:6.45007            





[0]	validation-rmse:11.24407                                                    
[1]	validation-rmse:10.43053                                                    
[2]	validation-rmse:9.74973                                                     
[3]	validation-rmse:9.18404                                                     
[4]	validation-rmse:8.71969                                                     
[5]	validation-rmse:8.33681                                                     
[6]	validation-rmse:8.02451                                                     
[7]	validation-rmse:7.76846                                                     
[8]	validation-rmse:7.56075                                                     
[9]	validation-rmse:7.39248                                                     
[10]	validation-rmse:7.25445                                                    
[11]	validation-rmse:7.14418                                                    
[12]	validation-rmse:7.05341





[0]	validation-rmse:9.03421                                                       
[1]	validation-rmse:7.64995                                                       
[2]	validation-rmse:7.07646                                                       
[3]	validation-rmse:6.84706                                                       
[4]	validation-rmse:6.74420                                                       
[5]	validation-rmse:6.69433                                                       
[6]	validation-rmse:6.67013                                                       
[7]	validation-rmse:6.65539                                                       
[8]	validation-rmse:6.64307                                                       
[9]	validation-rmse:6.63538                                                       
[10]	validation-rmse:6.63034                                                      
[11]	validation-rmse:6.62604                                                      
[12]





[0]	validation-rmse:10.93038                                                      
[1]	validation-rmse:9.92036                                                       
[2]	validation-rmse:9.13266                                                       
[3]	validation-rmse:8.52593                                                       
[4]	validation-rmse:8.06345                                                       
[5]	validation-rmse:7.71306                                                       
[6]	validation-rmse:7.44966                                                       
[7]	validation-rmse:7.25127                                                       
[8]	validation-rmse:7.10131                                                       
[9]	validation-rmse:6.98678                                                       
[10]	validation-rmse:6.89924                                                      
[11]	validation-rmse:6.82976                                                      
[12]





[0]	validation-rmse:6.78023                                                       
[1]	validation-rmse:6.69311                                                       
[2]	validation-rmse:6.67572                                                       
[3]	validation-rmse:6.66699                                                       
[4]	validation-rmse:6.66400                                                       
[5]	validation-rmse:6.65191                                                       
[6]	validation-rmse:6.64613                                                       
[7]	validation-rmse:6.63911                                                       
[8]	validation-rmse:6.63326                                                       
[9]	validation-rmse:6.62145                                                       
[10]	validation-rmse:6.61991                                                      
[11]	validation-rmse:6.62083                                                      
[12]





[0]	validation-rmse:7.40303                                                       
[1]	validation-rmse:6.78694                                                       
[2]	validation-rmse:6.68602                                                       
[3]	validation-rmse:6.65553                                                       
[4]	validation-rmse:6.64531                                                       
[5]	validation-rmse:6.64035                                                       
[6]	validation-rmse:6.63533                                                       
[7]	validation-rmse:6.63118                                                       
[8]	validation-rmse:6.62368                                                       
[9]	validation-rmse:6.61866                                                       
[10]	validation-rmse:6.61522                                                      
[11]	validation-rmse:6.61107                                                      
[12]





[0]	validation-rmse:11.77684                                                      
[1]	validation-rmse:11.37046                                                      
[2]	validation-rmse:10.99519                                                      
[3]	validation-rmse:10.64437                                                      
[4]	validation-rmse:10.31966                                                      
[5]	validation-rmse:10.02099                                                      
[6]	validation-rmse:9.74466                                                       
[7]	validation-rmse:9.48818                                                       
[8]	validation-rmse:9.25276                                                       
[9]	validation-rmse:9.03283                                                       
[10]	validation-rmse:8.83235                                                      
[11]	validation-rmse:8.64644                                                      
[12]

KeyboardInterrupt: 

In [26]:
# Switch off XGBoost autologging: the next run logs its parameters, metric and
# model explicitly.
mlflow.xgboost.autolog(disable=True)

Re-running the best model, using the best parameters copied from the interface:

In [27]:
# Retrain XGBoost with the selected hyperparameters and log parameters,
# validation RMSE, the fitted preprocessor and the model to one MLflow run.
with mlflow.start_run():
    mlflow.set_tag("model", "xgboost")

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        # `reg:linear` is the deprecated alias of `reg:squarederror`.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    # Explicit square root: `squared=False` is deprecated since
    # scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # open() does not create missing parent directories.
    os.makedirs("models", exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")



[0]	validation-rmse:11.44482
[1]	validation-rmse:10.77202
[2]	validation-rmse:10.18363
[3]	validation-rmse:9.67396
[4]	validation-rmse:9.23166
[5]	validation-rmse:8.84808
[6]	validation-rmse:8.51883
[7]	validation-rmse:8.23597
[8]	validation-rmse:7.99320
[9]	validation-rmse:7.78709
[10]	validation-rmse:7.61022
[11]	validation-rmse:7.45952
[12]	validation-rmse:7.33049
[13]	validation-rmse:7.22098
[14]	validation-rmse:7.12713
[15]	validation-rmse:7.04752
[16]	validation-rmse:6.98005
[17]	validation-rmse:6.92232
[18]	validation-rmse:6.87112
[19]	validation-rmse:6.82740
[20]	validation-rmse:6.78995
[21]	validation-rmse:6.75792
[22]	validation-rmse:6.72994
[23]	validation-rmse:6.70547
[24]	validation-rmse:6.68390
[25]	validation-rmse:6.66421
[26]	validation-rmse:6.64806
[27]	validation-rmse:6.63280
[28]	validation-rmse:6.61924
[29]	validation-rmse:6.60773
[30]	validation-rmse:6.59777
[31]	validation-rmse:6.58875
[32]	validation-rmse:6.58107
[33]	validation-rmse:6.57217
[34]	validation-rmse:



### Running one inference on the inference server

In [12]:
import numpy as np

# Prepare the March 2021 file as test data, mirroring the train/validation steps.

df_test = read_dataframe('../data/green_tripdata_2021-03.csv')
df_test['PU_DO'] = df_test['PULocationID'] + '_' + df_test['DOLocationID']
y_test = df_test[target].values

categorical = ['PU_DO']  # alternative: ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Encode the test records with the vectorizer that was already fitted.
test_dicts = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(test_dicts)

# Optional: dump the first encoded row to a text file to build a request payload.
# np.savetxt('test.txt', X_test.toarray()[0], delimiter=',', newline=',', fmt='%f')

# Then use Postman or a terminal to send a POST request to the inference server.

  df = pd.read_csv(filename)


### Running Batch Inference in Python

In [49]:
# Load the registered model as a generic pyfunc model via a
# "models:/<name>/<stage>" URI.
# NOTE(review): the captured output for this cell points at
# `client.get_latest_versions(...)`, which looks like a deprecation warning for
# stage-based lookups — consider a version- or alias-based URI; confirm against
# the installed MLflow version.
name="duration-prediction-xgb"
stage="None"
loaded_model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")

  latest = client.get_latest_versions(name, None if stage is None else [stage])


In [47]:
# Score the encoded test features with the loaded model.
predictions = loaded_model.predict(X_test)

In [None]:
# Alternative to the registry lookup: load the model straight from a run's
# artifact directory.
# NOTE(review): the path is relative to the working directory and pins one
# specific run ID, so it only resolves where that run's artifacts exist locally.
loaded_model = mlflow.pyfunc.load_model("mlruns/1/f0c472b508b548888b18f85b5616940b/artifacts/models_mlflow")

predictions = loaded_model.predict(X_test)

# Display the predictions as the cell output.
predictions

### Autolog

In [17]:

best_params = {
    'learning_rate': 0.09585355369315604,
    'max_depth': 30,
    'min_child_weight': 1.060597050922164,
    # `reg:linear` is the deprecated alias of `reg:squarederror`.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.018060244040060163,
    'reg_lambda': 0.011658731377413597,
    'seed': 42
}


# Enable XGBoost autologging and tag every autologged run with model=xgboost.
mlflow.xgboost.autolog(extra_tags={"model": "xgboost"})

# the following line is technically optional as well, but we need it if we want to add custom metrics/other info
with mlflow.start_run() as run:

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=50,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )

    # optional, to show how to add custom metric
    X_test_xgb = xgb.DMatrix(X_test)
    y_pred = booster.predict(X_test_xgb)
    # Explicit square root: `squared=False` is deprecated since
    # scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    mlflow.log_metric("test-rmse", rmse)




[0]	validation-rmse:11.44482
[1]	validation-rmse:10.77202
[2]	validation-rmse:10.18363
[3]	validation-rmse:9.67396
[4]	validation-rmse:9.23166
[5]	validation-rmse:8.84808
[6]	validation-rmse:8.51883
[7]	validation-rmse:8.23597
[8]	validation-rmse:7.99320
[9]	validation-rmse:7.78709
[10]	validation-rmse:7.61022
[11]	validation-rmse:7.45952
[12]	validation-rmse:7.33049
[13]	validation-rmse:7.22098
[14]	validation-rmse:7.12713
[15]	validation-rmse:7.04752
[16]	validation-rmse:6.98005
[17]	validation-rmse:6.92232
[18]	validation-rmse:6.87112
[19]	validation-rmse:6.82740
[20]	validation-rmse:6.78995
[21]	validation-rmse:6.75792
[22]	validation-rmse:6.72994
[23]	validation-rmse:6.70547
[24]	validation-rmse:6.68390
[25]	validation-rmse:6.66421
[26]	validation-rmse:6.64806
[27]	validation-rmse:6.63280
[28]	validation-rmse:6.61924
[29]	validation-rmse:6.60773
[30]	validation-rmse:6.59777
[31]	validation-rmse:6.58875
[32]	validation-rmse:6.58107
[33]	validation-rmse:6.57217
[34]	validation-rmse:



Validate that the model will work when served (code taken from the documentation):

In [22]:
# Check that a logged model accepts a serving payload before deploying it.
from mlflow.models import validate_serving_input

# Model logged under the "model" artifact path of one specific, hard-coded run.
model_uri = 'runs:/21980e83684b492db5abd46461e4467a/model'

# The logged model does not contain an input_example.
# Manually generate a serving payload to verify your model prior to deployment.
from mlflow.models import convert_input_example_to_serving_input

# Define INPUT_EXAMPLE via assignment with your own input example to the model
# A valid input example is a data instance suitable for pyfunc prediction
# NOTE(review): `data_to_predict` is not defined anywhere in the visible
# notebook — it must be created before this cell can run on a fresh kernel.

serving_payload = convert_input_example_to_serving_input(data_to_predict[0])

# Validate the serving payload works on the model
validate_serving_input(model_uri, serving_payload)

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 578.88it/s]


array([66.00237], dtype=float32)

In [25]:
# You can also load the xgboost model and not a python function
xgboost_model = mlflow.xgboost.load_model(model_uri)
xgboost_model

<xgboost.core.Booster at 0x7f97863a5e40>

Another example, this time using scikit-learn models with autologging enabled:

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import LinearSVR

mlflow.sklearn.autolog()

# Train several scikit-learn regressors with default settings, one MLflow run
# per model class.
for model_class in (LinearRegression, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        # Log the paths the data was actually read from ("../data/..."); the
        # previous "./data/..." values did not match the files loaded earlier.
        mlflow.log_param("train-data-path", "../data/green_tripdata_2021-01.csv")
        mlflow.log_param("valid-data-path", "../data/green_tripdata_2021-02.csv")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)
        # Explicit square root: `squared=False` is deprecated since
        # scikit-learn 1.4 and removed in 1.6.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("validation-rmse", rmse)


In [14]:
mlflow.sklearn.autolog()

# Single autologged run for a plain LinearRegression.
with mlflow.start_run():

    # Log the paths the data was actually read from ("../data/..."); the
    # previous "./data/..." values did not match the files loaded earlier.
    mlflow.log_param("train-data-path", "../data/green_tripdata_2021-01.csv")
    mlflow.log_param("valid-data-path", "../data/green_tripdata_2021-02.csv")
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlmodel = LinearRegression()
    mlmodel.fit(X_train, y_train)

    y_pred = mlmodel.predict(X_val)
    # Explicit square root: `squared=False` is deprecated since
    # scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("validation-rmse", rmse)



: 

In [None]:
# Fit a fresh LinearRegression on the training data; fit() returns the
# estimator itself, so construction and fitting can be chained.
mlmodel = LinearRegression().fit(X_train, y_train)