# Tutoriel : introduction au MLOps avec MLFlow

In [54]:
# Uncomment to install MLflow into the kernel's environment (pin a version for reproducibility):
# %pip install mlflow

In [74]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import mlflow.pyfunc

In [2]:
# Random seed shared by the train/test split and the random forests in this notebook,
# so that re-running the notebook gives the same split and the same fitted models.
SEED = 0

## Import et pré-traitement des données

In [3]:
# Location of the census CSV; `url_data` is kept as a variable because it is
# also logged as a run parameter when training with MLflow tracking.
url_data = "https://minio.lab.sspcloud.fr/projet-formation/diffusion/mlops/data/adult-census-us.csv"
# Download and parse the full dataset (requires network access)
df_census = pd.read_csv(url_data)

In [4]:
# Hold out 20% of the rows as a test set; the split is seeded for reproducibility.
# NOTE(review): df_test is not used in the cells visible here — metrics below are computed on the training set.
df_train, df_test = train_test_split(df_census, test_size=0.2, random_state=SEED)

In [5]:
# Preview five random training rows; seeded so the same rows are shown on every run
df_train.sample(5, random_state=SEED)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
41747,1,State-gov,27051,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,3,United-States,>50K
10052,4,Private,193479,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,1,United-States,<=50K
26318,3,Private,139127,HS-grad,9,Divorced,Other-service,Not-in-family,White,Female,0,0,2,United-States,<=50K
40931,4,Self-emp-not-inc,227906,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,1,0,1,Germany,<=50K
37663,1,Private,109920,Bachelors,13,Never-married,Adm-clerical,Not-in-family,Black,Female,0,0,2,United-States,<=50K


In [6]:
# Column dtypes and non-null counts: shows which columns contain missing values
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 22729 to 2732
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             39073 non-null  int64 
 1   workclass       36830 non-null  object
 2   fnlwgt          39073 non-null  int64 
 3   education       39073 non-null  object
 4   education-num   39073 non-null  int64 
 5   marital-status  39073 non-null  object
 6   occupation      36821 non-null  object
 7   relationship    39073 non-null  object
 8   race            39073 non-null  object
 9   sex             39073 non-null  object
 10  capitalgain     39073 non-null  int64 
 11  capitalloss     39073 non-null  int64 
 12  hoursperweek    39073 non-null  int64 
 13  native-country  38404 non-null  object
 14  class           39073 non-null  object
dtypes: int64(6), object(9)
memory usage: 4.8+ MB


In [7]:
# Encoder that maps the string target labels to integers
le = LabelEncoder()

# Features: every column except the target "class"
X_train = df_train.drop(columns="class")
# Target: fit the encoder on the training labels and encode them in one step
y_train = le.fit_transform(df_train["class"].values)

In [8]:
# Inspect the encoded target (integer class labels)
y_train

array([0, 0, 0, ..., 1, 1, 0])

In [9]:
# Original labels, in the order of their encoded integer (position 0 -> label 0, etc.)
le.classes_

array(['<=50K', '>50K'], dtype=object)

In [10]:
# Imputers: median for numerical columns, most frequent value for categorical ones
median_imputer = SimpleImputer(strategy="median", missing_values=np.nan)
mode_imputer = SimpleImputer(strategy="most_frequent", missing_values=np.nan)

# Categories not seen during fit are encoded as -1 instead of raising an error
ordinal_encoder = OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")

# Categorical columns: impute first, then encode as ordinal integers
categorical_transformer = make_pipeline(mode_imputer, ordinal_encoder)

# One (name, transformer, column selector) triple per column family;
# columns are routed by dtype: int64 -> numerical, object -> categorical
numerical_branch = (
    "numerical",
    median_imputer,
    make_column_selector(dtype_include=np.int64),
)
categorical_branch = (
    "categorical",
    categorical_transformer,
    make_column_selector(dtype_include=object),
)

# Columns matched by neither selector are passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[numerical_branch, categorical_branch],
    remainder="passthrough",
)

In [11]:
# Sanity check: fit the preprocessor on the training features and display the transformed array
preprocessor.fit_transform(X_train)

array([[0.00000e+00, 1.17372e+05, 7.00000e+00, ..., 4.00000e+00,
        1.00000e+00, 3.80000e+01],
       [2.00000e+00, 3.57720e+05, 1.10000e+01, ..., 4.00000e+00,
        0.00000e+00, 3.80000e+01],
       [4.00000e+00, 2.02242e+05, 9.00000e+00, ..., 4.00000e+00,
        1.00000e+00, 3.80000e+01],
       ...,
       [2.00000e+00, 3.44624e+05, 1.00000e+01, ..., 4.00000e+00,
        1.00000e+00, 3.80000e+01],
       [3.00000e+00, 1.04489e+05, 1.30000e+01, ..., 4.00000e+00,
        1.00000e+00, 3.80000e+01],
       [0.00000e+00, 1.86925e+05, 1.00000e+01, ..., 4.00000e+00,
        1.00000e+00, 3.80000e+01]])

## Entraîner des modèles : méthode classique

In [41]:
# Baseline classifier: random forest with default hyperparameters, seeded for reproducibility
rf_clf = RandomForestClassifier(random_state=SEED)

# Full model: preprocessing followed by the classifier, so both are fitted together
pipe_rf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", rf_clf),
    ]
)

In [42]:
# Hyperparameter grid; the "classifier__" prefix targets the pipeline's "classifier" step
param_grid = {
    "classifier__n_estimators": [50, 100, 200],
    "classifier__max_leaf_nodes": [5, 10, 50],
}

# 5-fold cross-validated grid search scored on four metrics;
# the final model is refit with the combination that has the best F1
pipe_gscv = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring=["accuracy", "precision", "recall", "f1"],
    refit="f1",
    cv=5,
    n_jobs=5,
    verbose=1,
)

In [43]:
# Run the grid search on the training set (every grid combination is cross-validated)
pipe_gscv.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [44]:
# Tabulate the per-candidate cross-validation results (timings and scores) and preview the first rows
gscv_results = pd.DataFrame(pipe_gscv.cv_results_)
gscv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_leaf_nodes,param_classifier__n_estimators,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,std_test_recall,rank_test_recall,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,0.907181,0.096214,0.088951,0.009771,5,50,"{'classifier__max_leaf_nodes': 5, 'classifier_...",0.827639,0.833781,0.832118,...,0.013073,7,0.541993,0.545963,0.543811,0.557421,0.528157,0.543469,0.009356,7
1,1.59409,0.20633,0.120288,0.016464,5,100,"{'classifier__max_leaf_nodes': 5, 'classifier_...",0.82508,0.831734,0.827895,...,0.007155,8,0.514732,0.536809,0.523557,0.530239,0.517448,0.524557,0.00813,8
2,2.94778,0.209209,0.165557,0.013797,5,200,"{'classifier__max_leaf_nodes': 5, 'classifier_...",0.8238,0.828279,0.825848,...,0.00581,9,0.503067,0.520714,0.514449,0.513224,0.512221,0.512735,0.005667,9
3,0.984936,0.128525,0.076306,0.013316,10,50,"{'classifier__max_leaf_nodes': 10, 'classifier...",0.831862,0.841331,0.837876,...,0.010626,4,0.567763,0.595828,0.590894,0.574122,0.583145,0.582351,0.010351,4
4,1.728769,0.17854,0.117487,0.013329,10,100,"{'classifier__max_leaf_nodes': 10, 'classifier...",0.831094,0.840691,0.837236,...,0.009021,5,0.564931,0.591401,0.587281,0.575057,0.577098,0.579154,0.009374,5


In [45]:
# Hyperparameters of the candidate selected by the grid search (ranked on F1, the `refit` metric)
pipe_gscv.best_params_

{'classifier__max_leaf_nodes': 50, 'classifier__n_estimators': 50}

In [46]:
# "Classic" bookkeeping: persist the fitted grid-search object and its score table
# to date-stamped files in the current working directory.
joblib.dump(pipe_gscv, 'pipeline_train_model_20230118.joblib')
gscv_results.to_csv("pipeline_train_scores_20230118.csv")

## Entraîner des modèles avec MLFlow

### Configuration de MLFlow

In [67]:
# Point MLflow at the S3-compatible (MinIO) endpoint and at the tracking server.
# NOTE(review): the tracking URI below looks specific to one user's MLflow deployment —
# replace it with the URL of your own tracking server before running.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://minio.lab.sspcloud.fr"
os.environ["MLFLOW_TRACKING_URI"] = "https://user-avouacr-962711.user.lab.sspcloud.fr/"

### Entraînement et *tracking*

In [71]:
def train_model_with_mlflow_tracking(mlflow_experiment_name,
                                     n_estimators,
                                     max_leaf_nodes):
    """Train a random-forest pipeline and record the run in MLflow.

    Fits `preprocessor` + `RandomForestClassifier` on the notebook-level
    `X_train` / `y_train`, computes metrics on that same training set, and logs
    the data URL, the hyperparameters, the metrics and the fitted pipeline to a
    new MLflow run.

    Relies on the notebook globals `preprocessor`, `X_train`, `y_train`,
    `SEED` and `url_data`.

    Parameters
    ----------
    mlflow_experiment_name : str
        Name of the MLflow experiment in which the run is recorded.
    n_estimators : int
        Number of trees of the random forest.
    max_leaf_nodes : int
        Maximum number of leaf nodes per tree.

    Returns
    -------
    None
        Results are only available through the MLflow tracking server.
    """
    # Set up MLFlow context
    mlflow.set_experiment(experiment_name=mlflow_experiment_name)

    with mlflow.start_run():

        # Training step
        rf_clf = RandomForestClassifier(n_estimators=n_estimators,
                                        max_leaf_nodes=max_leaf_nodes,
                                        random_state=SEED)
        pipe_rf = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', rf_clf)
        ])
        pipe_rf.fit(X_train, y_train)
        y_train_pred = pipe_rf.predict(X_train)

        # Compute fit metrics (in-sample: evaluated on the training set).
        # scikit-learn metrics take (y_true, y_pred) in that order; swapping the
        # arguments leaves accuracy and F1 unchanged but exchanges precision and recall.
        accuracy = accuracy_score(y_train, y_train_pred)
        precision = precision_score(y_train, y_train_pred)
        recall = recall_score(y_train, y_train_pred)
        f1 = f1_score(y_train, y_train_pred)

        # Track training data
        mlflow.log_param("data_url", url_data)

        # Track hyperparameters
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_leaf_nodes", max_leaf_nodes)

        # Track fit metrics
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1", f1)

        # Track model binary
        mlflow.sklearn.log_model(pipe_rf, "model")

In [73]:
n_estimators_range = [50, 100, 200]
max_leaf_nodes_range = [5, 10, 50]

# Every (n_estimators, max_leaf_nodes) pair of the grid, with n_estimators
# varying slowest so the runs are created in the same order as a nested loop
hyperparameter_grid = [
    (n_trees, n_leaves)
    for n_trees in n_estimators_range
    for n_leaves in max_leaf_nodes_range
]

# One tracked MLflow run per pair, all in the same experiment
for n_trees, n_leaves in hyperparameter_grid:
    train_model_with_mlflow_tracking(
        mlflow_experiment_name="tutorial-mlflow",
        n_estimators=n_trees,
        max_leaf_nodes=n_leaves,
    )

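Étape manuelle dans l'interface MLflow (tracking -> model registry -> tag production) : repérer le meilleur *run* dans l'onglet de *tracking*, enregistrer son modèle dans le *model registry* sous le nom `20230118_rf_census`, puis attribuer à cette version le *stage* `Production`. La cellule suivante charge le modèle à partir de ce nom et de ce *stage*.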

### Utiliser le modèle de production pour faire des prédictions

In [75]:
# Name under which the model is registered in the MLflow model registry, and the stage to load
model_name = "20230118_rf_census"
stage = 'Production'

# Load the model through the registry URI "models:/<name>/<stage>".
# NOTE(review): presumably fails unless a version of this model has been registered
# and moved to the 'Production' stage beforehand — confirm in the MLflow UI.
model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{stage}"
)

In [79]:
# Score the registry-loaded model on the training set.
# scikit-learn metrics take (y_true, y_pred): accuracy is the same in either
# order, but keeping the documented order avoids silently swapping
# precision/recall if another metric is substituted here.
y_train_pred = model.predict(X_train)
accuracy_score(y_train, y_train_pred)

0.8513551557341387

### Réentraînement et changement du modèle de production

### Servir le modèle comme une API locale

## Entraîner des modèles en parallèle avec MLFlow et Argo Workflow