In [14]:
# Data wranggling
import numpy as np
import pandas as pd

# Machine learning management
import mlflow
from mlflow.models.signature import infer_signature

# Visualização de dados
import matplotlib.pyplot as plt
from yellowbrick.regressor import residuals_plot, prediction_error

# Pipelines para tratar dados futuros
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Seleção e validação de modelos
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV

# Pré-processamento para  incluir no pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Modelos
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Manter a saída do processamento do sklearn como DataFrame pandas
from sklearn import set_config
set_config(transform_output="pandas")

In [2]:
# Definição de função para mostrar as métricas do nosso modelo
def get_metrics(y_true, y_pred):
    dict_metrics = {
        'R2': metrics.r2_score(y_true, y_pred),
        'MAE': metrics.mean_absolute_error(y_true, y_pred),
        'MAPE': metrics.mean_absolute_percentage_error(y_true, y_pred),
        'RMSE': np.sqrt(metrics.mean_squared_error(y_true, y_pred))
    }
    return dict_metrics

In [4]:
# Carregar a base
df = pd.read_csv('../data/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Modeling

In [5]:
NUMERICAL_FEATURES = ['age', 'bmi', 'children']
CATEGORICALA_FEATURES = ['smoker']

FEATURES = NUMERICAL_FEATURES + CATEGORICALA_FEATURES
TARGET = 'charges'

In [6]:
X = df[FEATURES]
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=2023)

In [7]:
# Fazer a pipeline
numerical_transfomer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # aqui estamos colocando nos valores faltantes a média
    ('scaler', MinMaxScaler()) # Padronização
])

categorical_transfomer = OneHotEncoder(drop='if_binary', sparse_output=False)

preprocessor = ColumnTransformer([
    ('num', numerical_transfomer, NUMERICAL_FEATURES),
    ('cat', categorical_transfomer, CATEGORICALA_FEATURES)
])

preprocessor

In [8]:
preprocessor.fit_transform(X_train)

Unnamed: 0,num__age,num__bmi,num__children,cat__smoker_yes
1068,0.978261,0.153349,0.2,0.0
1018,0.782609,0.534167,0.6,0.0
995,0.456522,0.196798,0.6,0.0
1248,0.000000,0.641916,0.0,0.0
549,0.543478,0.813559,0.0,1.0
...,...,...,...,...
884,0.152174,0.288808,0.8,0.0
515,0.869565,0.531073,0.0,0.0
695,0.173913,0.651735,0.0,0.0
454,0.304348,0.822437,0.4,0.0


In [9]:
preprocessor.transform(X_test)

Unnamed: 0,num__age,num__bmi,num__children,cat__smoker_yes
748,0.630435,0.539144,0.2,0.0
745,0.695652,0.380818,0.2,0.0
57,0.000000,0.422922,0.4,1.0
546,0.217391,0.523944,0.0,0.0
279,0.717391,0.150659,0.2,0.0
...,...,...,...,...
681,0.021739,0.116761,0.0,0.0
340,0.130435,0.313156,0.0,0.0
1199,0.282609,0.264730,0.4,0.0
61,0.152174,0.476190,0.8,0.0


In [10]:
# Especificando modelos que vamos usar
models = {
    'Dummy': DummyRegressor(),
    'LinearRegression': LinearRegression(),
    'LassoCV': LassoCV(),
    'RidgeCV': RidgeCV(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

In [11]:
# Agora com o MLflow vamos setar um experimento
mlflow.search_experiments('Insurance')

[]

In [13]:
# Como queremos salvar vários modelos no MLflow, vamos fazer um loop
for model_name, model in models.items():

    # Model pipeline
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    # Treiar modelo
    model_pipeline.fit(X_train, y_train)
    # Pegar predições
    y_pred = model_pipeline.predict(X_test)
    model_metrics = get_metrics(y_test, y_pred)
    # Pegar parâmetros do modelo
    model_params = model_pipeline.named_steps['model'].get_params()

    # MLFlow
    with mlflow.start_run():
        # Log métricas do modelo
        mlflow.log_metrics(model_metrics)
        # Log hiperparâmetros do modelo
        mlflow.log_params(model_params)

        # Log figuras
        fig,ax = plt.subplots(figsize=(12,8))
        prediction_error(model_pipeline, X_train, y_train, X_test, y_test, show=False, ax=ax)
        mlflow.log_figure(fig, artifact_file='plots/prediction_error.png')
        fig,ax = plt.subplots(figsize=(12,8))
        residuals_plot(model_pipeline, X_train, y_train, X_test, y_test, show=False, ax=ax)
        mlflow.log_figure(fig, artifact_file='plots/residuals_plot.png')

        # Log do modelo
        signature = infer_signature(X_test, model_pipeline.predict(X_test))
        mlflow.sklearn.log_model(model_pipeline, 'model_pipeline', registered_model_name=model_name, signature=signature)


Dummy DummyRegressor()
LinearRegression LinearRegression()
LassoCV LassoCV()
RidgeCV RidgeCV()
RandomForestRegressor RandomForestRegressor()
GradientBoostingRegressor GradientBoostingRegressor()
