## Análise comparativa de dados

In [21]:
from IPython.display import Image, display, Markdown
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import jinja2
import joblib

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


## 1. Obtenção de Dados

Nessa etapa obtemos novamente os arquivos brutos de dados e o dicionário antes de iniciar o pre-processamento.

In [22]:
df = pd.read_csv("../data/raw/data.csv")
df_dict = pd.read_csv("../data/external/dictonary.csv")
df_dict

Unnamed: 0,variable,description,type,subtype
0,total_bill,total da conta,quantitativo,contínuo
1,tip,valor da gorjeta,quantitativo,contínuo
2,sex,sexo do pagador,qualitativo,nominal
3,smoker,há fumantes ?,qualitativo,nominal
4,day,dia da semana,qualitativo,ordinal
5,time,horário do dia,qualitativo,nominal
6,size,tamanho do grupo,quantitativo,discreto


## 2. Preparação de Dados

Aqui realizamos a normalização, codificação e o tratamento de dados discrepantes e/ou faltantes dentro do conjunto de dados.

--Dados Faltantes

Como anteriormente dito na analise exploratoria, não há colunas com dados faltantes:

In [23]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [24]:
target_column = 'tip'
nominal_columns = (
    df_dict
    .query("subtype == 'nominal' and variable != @target_column")
    .variable
    .to_list()
)
continuous_columns = (
    df_dict
    .query("subtype == 'continuo'")
    .variable
    .to_list()
)
continuous_columns = (
    df_dict
    .query("subtype == 'continuo'")
    .variable
    .to_list()
)
ordinal_columns = (
    df_dict
    .query("subtype == 'ordinal'")
    .variable
    .to_list()
)
discrete_columns = (
    df_dict
    .query("subtype == 'discreto'")
    .variable
    .to_list()
)

X = df.drop(columns=[target_column], axis=1)
y = df[target_column]

In [25]:
# tratamento de dados discrepantes
nominal_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), # tratamento de dados faltantes
    ('encoding', OneHotEncoder(sparse_output=False, drop='first')), # codificação de variáveis
    ('normalization', StandardScaler()) # normalização de dados
])
continuous_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='mean')), # tratamento de dados faltantes
    ('normalization', StandardScaler()) # normalização de dados
])

preprocessor = ColumnTransformer([
    ('nominal', nominal_preprocessor, nominal_columns),
    ('continuous', continuous_preprocessor, continuous_columns)
])

model = LogisticRegression()

## 3. Seleção de Modelos

Iremos análisar três modelos, que serão testados utilizando um método de validação, a saber:

- K-Nearest-Neighbors
- Decision Tree
- Random Forest

Além disso, cada um desses algoritmos será testado com diferentes hiper-parametros, para que possamos encontrar o melhor modelo e a melhor configuração possível para esse modelo.

Utilizaremos as seguintes métricas para análise:

- Acurácia (accuracy): proporção entre os dados que foram corretamente previstos (como positivos ou negativos) com o total de dados observados;
- Precisão (precision): proporção entre dados corretamente previstos como positivos e o total de observações positivas.
- Recall: proporção entre dados corretamente previstos como positivos com o total de observações.
- F1-score: média entre precision e recall, portanto levando em conta tanto falsos positivos quanto falsos negativos.

In [26]:
# experiment settings
n_splits_comparative_analysis = 10
n_folds_grid_search = 5
test_size = .3
random_state = 0
metrics = {
    'MAE': make_scorer(mean_absolute_error),
    'MSE': make_scorer(mean_squared_error),
    'R2': make_scorer(r2_score)
}

# model settings
max_iter = 1000000
models = [
    ('K-Nearest Neighbors', KNeighborsRegressor(), {"n_neighbors": range(3, 20, 2), 'weights': ['uniform', 'distance']}),
    ('Decision Tree',  DecisionTreeRegressor(random_state=random_state), {'criterion':['squared_error', 'friedman_mse'],'max_depth': [3, 25, 40]}),
    ('Random Forest',  RandomForestRegressor(random_state=random_state), {'criterion':['squared_error', 'friedman_mse'],'max_depth': [3, 25, 40], 'n_estimators': [10, 50]}),
]

In [27]:
results = pd.DataFrame({})
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(n_splits=n_splits_comparative_analysis, test_size=test_size, random_state=random_state)
for model_name, model_object, model_parameters in models:
    print(f"running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring='neg_mean_absolute_error',
        n_jobs=2,
        cv=cross_validate_grid_search
    )
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        cv=cross_validate_comparative_analysis,
        n_jobs=2,
        scoring=metrics,
        return_train_score=False
    )
    scores['model_name'] = [model_name] * n_splits_comparative_analysis
    df_scores = pd.DataFrame(scores)
    df_scores =  df_scores.drop(columns=['model_name'])
    df_scores = df_scores.agg(['mean', 'std'])
    display(df_scores)
    results = pd.concat([results, pd.DataFrame(scores)], ignore_index=True)

running K-Nearest Neighbors...


Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_R2
mean,0.347325,0.006291,1.101923,2.221517,-0.065936
std,0.169068,0.004567,0.096805,0.550068,0.066995


running Decision Tree...


Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_R2
mean,0.074765,0.003472,1.0895,2.191843,-0.050567
std,0.009103,0.001004,0.086187,0.551734,0.043123


running Random Forest...


Unnamed: 0,fit_time,score_time,test_MAE,test_MSE,test_R2
mean,1.516379,0.00412,1.092872,2.200429,-0.055008
std,0.115753,0.001125,0.086992,0.547184,0.039989


In [28]:
def highlight_best(s, props=''):
    if s.name[1] != 'std':
        if s.name[0].endswith('time'):
            return np.where(s == np.nanmin(s.values), props, '')
        return np.where(s == np.nanmax(s.values), props, '')

display(Markdown("### 3.1 Resultados gerais"))
(
    results
    .groupby('model_name')
    .agg(['mean', 'std']).T
    .style
    .apply(highlight_best, props='color:white;background-color:gray;font-weight: bold;', axis=1)
    .set_table_styles([{'selector': 'td', 'props': 'text-align: center;'}])
)

### 3.1 Resultados gerais

Unnamed: 0,model_name,Decision Tree,K-Nearest Neighbors,Random Forest
fit_time,mean,0.074765,0.347325,1.516379
fit_time,std,0.009103,0.169068,0.115753
score_time,mean,0.003472,0.006291,0.00412
score_time,std,0.001004,0.004567,0.001125
test_MAE,mean,1.0895,1.101923,1.092872
test_MAE,std,0.086187,0.096805,0.086992
test_MSE,mean,2.191843,2.221517,2.200429
test_MSE,std,0.551734,0.550068,0.547184
test_R2,mean,-0.050567,-0.065936,-0.055008
test_R2,std,0.043123,0.066995,0.039989


Como pode ser visto, o classificador Decision Tree obteve melhores resultados para todas as métricas, portanto, podemos obter os melhores parâmetros deste modelo e salvá-lo em disco para utilização em uma próxima etapa.

## 3.2 Persistencia do modelo

In [29]:
# Obtem o modelo e os parametros ganhadores
model_name, model_object, model_parameters  = [foo for foo in models if foo[0] == "Decision Tree"][0] 

model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring='neg_mean_absolute_error',
        n_jobs=None,
        cv=cross_validate_grid_search
    )

approach = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model_grid_search)
])

approach.fit(X, y) #Seleciona o approach

print(f"Hiper parâmetros do modelo: {approach.steps[1][1].best_params_}")

Hiper parâmetros do modelo: {'criterion': 'squared_error', 'max_depth': 3}


In [31]:
joblib.dump(approach, '../models/model.joblib') # Salva o modelo em disco.

['../models/model.joblib']