# Tutoriel : introduction au MLOps avec MLFlow

In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [30]:
SEED = 0

## Import et pré-traitement des données

In [31]:
url_data = "https://minio.lab.sspcloud.fr/projet-formation/diffusion/mlops/data/adult-census-us.csv"
df_census = pd.read_csv(url_data)

In [32]:
SEED = 32

df_train, df_test = train_test_split(df_census, test_size=0.2, random_state=SEED)

In [33]:
df_train.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,class
11670,3,Private,227714,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,3,United-States,<=50K
33221,2,Private,54038,Some-college,10,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,3,0,United-States,>50K
25685,2,,205396,HS-grad,9,Widowed,,Not-in-family,White,Female,0,0,0,United-States,<=50K
6621,4,Self-emp-inc,70720,Masters,14,Divorced,Exec-managerial,Not-in-family,White,Male,4,0,3,United-States,>50K
25144,2,Private,215150,9th,5,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,3,United-States,<=50K


In [50]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 42252 to 10967
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             39073 non-null  int64 
 1   workclass       36843 non-null  object
 2   fnlwgt          39073 non-null  int64 
 3   education       39073 non-null  object
 4   education-num   39073 non-null  int64 
 5   marital-status  39073 non-null  object
 6   occupation      36834 non-null  object
 7   relationship    39073 non-null  object
 8   race            39073 non-null  object
 9   sex             39073 non-null  object
 10  capitalgain     39073 non-null  int64 
 11  capitalloss     39073 non-null  int64 
 12  hoursperweek    39073 non-null  int64 
 13  native-country  38391 non-null  object
 14  class           39073 non-null  object
dtypes: int64(6), object(9)
memory usage: 4.8+ MB


In [42]:
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

categorical_transformer = make_pipeline(mode_imputer, ordinal_encoder)

preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", median_imputer, make_column_selector(dtype_include=np.int64)),
        ("categorical", categorical_transformer, make_column_selector(dtype_include=object))
    ], remainder="passthrough"
)

In [51]:
le = LabelEncoder()

X_train = df_train.drop(columns="class")
y_train = le.fit_transform(df_train["class"].values)

In [53]:
y_train

array([0, 1, 0, ..., 0, 0, 0])

In [52]:
le.classes_

array(['<=50K', '>50K'], dtype=object)

## Entraînement du modèle : méthode classique

In [54]:
rf_clf = RandomForestClassifier(n_estimators=200, n_jobs=10, 
                                random_state=SEED)

pipe_rf = Pipeline([
    ('preprocessor', preprocessor), 
    ('classifier', rf_clf)
])

pipe_rf.fit(X_train, y_train)

In [64]:
y_train_pred = cross_val_predict(pipe_rf, X_train, y_train, 
                                 cv=5, n_jobs=5)

accuracy = accuracy_score(y_train_pred, y_train)
precision = precision_score(y_train_pred, y_train)
recall = recall_score(y_train_pred, y_train)
f1 = f1_score(y_train_pred, y_train)

print(
f"""
Accuracy : {accuracy}
Precision : {precision}
Recall : {recall}
F1-score : {f1}
""")


Accuracy : 0.8343613236761959
Precision : 0.5988508193232602
Recall : 0.6756302521008404
F1-score : 0.634927797833935



## Entraînement du modèle : avec MLFlow