# Tutoriel : introduction au MLOps avec MLflow

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

In [2]:
# Random seed for the notebook: it is passed as `random_state` to the
# train/test split and to the random forest. Kept equal to the value assigned
# in the train/test split cell so this configuration cell shows the seed that
# is actually in effect.
SEED = 32

## Import et pré-traitement des données

In [3]:
# Remote CSV file (adult-census-us.csv) read over HTTPS directly into a
# DataFrame; running this cell requires network access.
url_data = "https://minio.lab.sspcloud.fr/projet-formation/diffusion/mlops/data/adult-census-us.csv"
df_census = pd.read_csv(url_data)

In [4]:
# NOTE(review): SEED is (re)assigned in this cell, so 32 is the value in effect
# from here on (the split below and any later `random_state=SEED`), whatever
# the configuration cell set. Consider defining it in a single place.
SEED = 32

# Hold out 20% of the rows as a test set; `random_state` makes the split
# reproducible.
df_train, df_test = train_test_split(df_census, test_size=0.2, random_state=SEED)

In [5]:
# Preview 5 randomly drawn training rows. The draw is seeded so that a
# "Restart & Run All" shows the same rows every time, like the rest of the
# notebook's stochastic steps.
df_train.sample(5, random_state=SEED)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
18214,2,Private,119098,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,3,United-States,>50K
47264,3,Private,217850,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,Black,Male,0,0,2,United-States,>50K
46329,4,,141409,Assoc-voc,11,Married-civ-spouse,,Husband,White,Male,0,0,2,United-States,<=50K
28771,1,Self-emp-not-inc,267161,Bachelors,13,Divorced,Exec-managerial,Unmarried,Black,Female,0,0,1,United-States,<=50K
9820,2,Local-gov,251396,Assoc-acdm,12,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,2,Canada,>50K


In [6]:
# Print dtypes and non-null counts per column. In the recorded output,
# workclass, occupation and native-country have fewer non-null values than
# rows, i.e. they contain missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 42252 to 10967
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             39073 non-null  int64 
 1   workclass       36843 non-null  object
 2   fnlwgt          39073 non-null  int64 
 3   education       39073 non-null  object
 4   education-num   39073 non-null  int64 
 5   marital-status  39073 non-null  object
 6   occupation      36834 non-null  object
 7   relationship    39073 non-null  object
 8   race            39073 non-null  object
 9   sex             39073 non-null  object
 10  capitalgain     39073 non-null  int64 
 11  capitalloss     39073 non-null  int64 
 12  hoursperweek    39073 non-null  int64 
 13  native-country  38391 non-null  object
 14  class           39073 non-null  object
dtypes: int64(6), object(9)
memory usage: 4.8+ MB


In [7]:
# Imputers: fill NaN with the column median (numerical branch) or with the
# most frequent value (categorical branch).
median_imputer = SimpleImputer(strategy="median", missing_values=np.nan)
mode_imputer = SimpleImputer(strategy="most_frequent", missing_values=np.nan)

# Encode categories as integer codes; a category not seen during fit is
# encoded as -1 instead of raising an error.
ordinal_encoder = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Categorical branch: impute first, then encode.
categorical_transformer = make_pipeline(mode_imputer, ordinal_encoder)

# Route columns by dtype: int64 columns go to the numerical branch, object
# columns to the categorical branch. Columns matched by neither selector are
# passed through unchanged.
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        (
            "numerical",
            median_imputer,
            make_column_selector(dtype_include=np.int64),
        ),
        (
            "categorical",
            categorical_transformer,
            make_column_selector(dtype_include=object),
        ),
    ],
)

In [8]:
# Features: every column of the training split except the target.
X_train = df_train.drop(columns=["class"])

# Target: the "class" strings are turned into integer codes. The encoder is
# kept in `le` so the codes can be mapped back to the original labels.
le = LabelEncoder()
y_train = le.fit_transform(df_train["class"].to_numpy())

In [9]:
# Display the encoded target array (integer codes produced by `le`).
y_train

array([0, 1, 0, ..., 0, 0, 0])

In [10]:
# Display the labels learned by the encoder; the position of a label in this
# array is its integer code in y_train.
le.classes_

array(['<=50K', '>50K'], dtype=object)

## Entraînement du modèle : méthode classique

In [17]:
# Classifier: random forest with 200 trees, 10 parallel jobs, and a fixed
# seed so repeated fits give the same model.
rf_clf = RandomForestClassifier(
    random_state=SEED,
    n_estimators=200,
    n_jobs=10,
)

# Complete model: preprocessing followed by the classifier. The step names
# ("preprocessor", "classifier") are the prefixes used to address
# hyper-parameters, e.g. "classifier__n_estimators".
pipe_rf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", rf_clf),
    ]
)

# Fit on the training split; as the last expression of the cell, the fitted
# pipeline is also displayed.
pipe_rf.fit(X_train, y_train)

In [19]:
# Hyper-parameter grid for the pipeline's "classifier" step:
# 4 values x 4 values = 16 candidate combinations.
param_grid = dict(
    classifier__n_estimators=[50, 100, 200, 500],
    classifier__max_leaf_nodes=[2, 10, 50, 100],
)

# Exhaustive search over the grid, scored with F1 on 5-fold cross-validation,
# using 5 parallel workers and minimal progress logging.
# NOTE(review): the forest inside pipe_rf itself requests n_jobs=10, so each
# search worker may start its own pool of threads — check this against the
# number of available cores.
pipe_gscv = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=5,
    verbose=1,
)

In [22]:
# Run the grid search on the training split: 16 candidates x 5 folds = 80 fits
# (matches the progress line printed below).
pipe_gscv.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [28]:
# Collect the cross-validation results (timings, per-fold and mean F1, rank)
# into a DataFrame, one row per candidate, and show the first 5 rows.
# In the recorded output, every candidate with max_leaf_nodes=2 scores an F1
# of 0.0 on all folds.
gscv_results = pd.DataFrame(pipe_gscv.cv_results_)
gscv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_leaf_nodes,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.347151,0.052306,0.068669,0.007507,2,50,"{'classifier__max_leaf_nodes': 2, 'classifier_...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,13
1,0.522223,0.05256,0.098644,0.005824,2,100,"{'classifier__max_leaf_nodes': 2, 'classifier_...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,13
2,0.876562,0.062651,0.13547,0.006517,2,200,"{'classifier__max_leaf_nodes': 2, 'classifier_...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,13
3,1.965009,0.145277,0.282255,0.019379,2,500,"{'classifier__max_leaf_nodes': 2, 'classifier_...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,13
4,0.342445,0.017025,0.05801,0.003779,10,50,"{'classifier__max_leaf_nodes': 10, 'classifier...",0.54482,0.562046,0.547753,0.558393,0.574043,0.557411,0.010496,12


In [29]:
# Persist the run by hand, in the current working directory:
# - the whole fitted GridSearchCV object, serialized with joblib;
# - the cross-validation results table as CSV (the DataFrame index is written
#   as the first, unnamed column).
# The date is hard-coded in both file names.
joblib.dump(pipe_gscv, 'pipeline_train_model_20230118.joblib')
gscv_results.to_csv("pipeline_train_scores_20230118.csv")

## Entraînement du modèle : avec MLflow