# Análise comparativa de modelos

In [25]:
# Bibliotecas
 
from IPython.display import display, Markdown, HTML
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer,KNNImputer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder 
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVR

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_validate









## 1. Obtenção de dados

In [2]:
#Dicionário de dados
df = pd.read_csv("../data/raw/penguins.csv")
df_dict = pd.read_csv("../data/external/dictionary.csv")
df_dict

Unnamed: 0,variavel,descricao,tipo,subtipo
0,species,"Espécie do pinguim (Adelie, Chinstrap, Gentoo)",qualitativa,nominal
1,island,"Ilha onde o pinguim foi encontrado (Biscoe, Dr...",qualitativa,nominal
2,bill_length_mm,Comprimento do bico em milímetros,quantitativa,contínua
3,bill_depth_mm,Profundidade do bico em milímetros,quantitativa,contínua
4,flipper_length_mm,Comprimento da nadadeira em milímetros,quantitativa,contínua
5,body_mass_g,Massa corporal em gramas,quantitativa,contínua
6,sex,Sexo do pinguim (Masculino ou Feminino),qualitativa,nominal
7,year,Ano em que os dados foram coletados,quantitativa,discreta


# 2. Preparação de dados 

- Aqui realizamos a normalização, codificação e o tratamento de dados discrepantes e/ou faltantes dentro do conjunto de dados.



In [3]:
target_variable = 'species'
useless_variables =  (
    df_dict
    .query("tipo == 'inútil'")
    .variavel
    .to_list()
)

nominal_variables = (
    df_dict
    .query("subtipo == 'nominal' and variavel != @target_variable")
    .variavel
    .to_list()
)
ordinal_variables = (
    df_dict
    .query("subtipo == 'ordinal' and variavel != @target_variable")
    .variavel
    .to_list()
)
continuous_variables = (
    df_dict
    .query("subtipo == 'contínua' and variavel != @target_variable")
    .variavel
    .to_list()
)
discrete_variables = (
    df_dict
    .query("subtipo == 'discreta' and variavel != @target_variable")
    .variavel
    .to_list()
)

X = df.drop(columns=[target_variable] + useless_variables)
y = df[target_variable]

# 2. Pipeline de Pré-processamento de dados

In [4]:
nominal_preprocessor = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='most_frequent')), # tratamento para dados faltantes
    ("encoding", OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist')), # codificação de variáveis
])

ordinal_preprocessor = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='median')), # tratamento para dados faltantes
    ("encoding", OrdinalEncoder()), # codificação de variáveis
])

continuous_preprocessor = Pipeline(steps=[
    ("missing", SimpleImputer(strategy='mean')), # tratamento para dados faltantes
    ("normalization", StandardScaler()), # normalização de dados
])

discrete_preprocessor = Pipeline(steps=[
    ("missing", KNNImputer()), # tratamento para dados faltantes
    ("normalization", StandardScaler()), # normalização de dados
])

In [5]:
preprocessor = ColumnTransformer([
    ("nominal", nominal_preprocessor, nominal_variables),
    ("ordinal", ordinal_preprocessor, ordinal_variables),
    ("continuous", continuous_preprocessor, continuous_variables),
    ("discrete", discrete_preprocessor, discrete_variables),
])

preprocessor

In [6]:
# Print das variáveis separadas para conferência
print("Variáveis nominais:", nominal_variables)
print("Variáveis ordinais:", ordinal_variables)
print("Variáveis contínuas:", continuous_variables)
print("Variáveis discretas:", discrete_variables)

Variáveis nominais: ['island', 'sex']
Variáveis ordinais: []
Variáveis contínuas: ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
Variáveis discretas: ['year']


# 3. Seleção de modelos

In [18]:
models = [
    DummyClassifier(strategy="most_frequent"),
    RandomForestClassifier(random_state=42),
    SVC(random_state=42),
    LogisticRegression(random_state=42, max_iter=1000)
]
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
monte_carlo = ShuffleSplit(n_splits=10, test_size=.2, random_state=42)
# hold_out = ShuffleSplit(n_splits=1, test_size=.2, random_state=42)
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [19]:
X_transformed = preprocessor.fit_transform(X)

X_transformed

array([[ 0.        ,  0.        ,  1.        , ..., -1.42248782,
        -0.56578921, -1.25931608],
       [ 0.        ,  0.        ,  1.        , ..., -1.06535169,
        -0.50316788, -1.25931608],
       [ 0.        ,  0.        ,  1.        , ..., -0.42250666,
        -1.19200251, -1.25931608],
       ...,
       [ 0.        ,  1.        ,  0.        , ..., -0.56536111,
        -0.53447855,  1.18816828],
       [ 0.        ,  1.        ,  0.        , ...,  0.64890172,
        -0.1274399 ,  1.18816828],
       [ 0.        ,  1.        ,  0.        , ..., -0.20822498,
        -0.53447855,  1.18816828]])

In [27]:
results_total = None
for model in models:
    model_name = model.__class__.__name__
    print(f"rodando para o modelo: {model_name}")
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    scores = cross_validate(
        approach, X, y,
        scoring=metrics,
        cv=monte_carlo
    )
    results_model = pd.DataFrame(scores)
    results_model['model'] = model_name
    if results_total is None:
        results_total = results_model
    else:
        results_total = pd.concat([results_total, results_model])

rodando para o modelo: DummyClassifier
rodando para o modelo: RandomForestClassifier


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


rodando para o modelo: SVC
rodando para o modelo: LogisticRegression


In [28]:
results_total.groupby('model').agg(['mean', 'std']).T

Unnamed: 0,model,DummyClassifier,LogisticRegression,RandomForestClassifier,SVC
fit_time,mean,0.007898,0.008794,0.073579,0.007662
fit_time,std,0.006088,0.003,0.004318,0.001235
score_time,mean,0.005806,0.005898,0.008359,0.006187
score_time,std,0.0008,0.00038,0.000506,0.000264
test_accuracy,mean,0.436232,0.989855,0.988406,0.989855
test_accuracy,std,0.034466,0.011931,0.011432,0.011931
test_precision_macro,mean,0.145411,0.9929,0.988647,0.9929
test_precision_macro,std,0.011489,0.008128,0.011532,0.008128
test_recall_macro,mean,0.333333,0.984668,0.984725,0.984668
test_recall_macro,std,0.0,0.017398,0.016491,0.017398
