In [14]:
from IPython.display import display, Markdown
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Obtenção de dados

Nessa etapa obtemos novamnete os arquivos brutos de dados e o dicionário antes de iniciar o pre-processamento.

In [15]:
# Carregar o dataset
df = pd.read_csv('../data/raw/data.csv') 
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [16]:
dicionario = pd.read_csv('../data/external/dictionary.csv')
dicionario

Unnamed: 0,variavel,descricao,tipo,subtipo
0,survived,Indica se o passageiro sobreviveu (1) ou não (0).,qualitativa,nominal
1,pclass,"Classe do bilhete do passageiro (1 = Primeiro,...",qualitativa,ordinal
2,sex,"Sexo do passageiro (male = masculino, female =...",qualitativa,nominal
3,age,Idade do passageiro em anos.,quantitativa,contínua
4,sibsp,Número de irmãos ou cônjuges a bordo.,quantitativa,discreta
5,parch,Número de pais ou filhos a bordo.,quantitativa,discreta
6,fare,Tarifa paga pelo passageiro.,quantitativa,contínua
7,embarked,Porto de embarque do passageiro (C = Cherbourg...,qualitativa,nominal
8,class,"Classe do bilhete do passageiro (First, Second...",qualitativa,ordinal
9,who,"Categoria do passageiro (man = homem, woman = ...",qualitativa,nominal


## 2. Preparação dos dados

In [17]:
target_column = 'survived' # prever coluna survived

nominal_columns = (
    dicionario
    .query("subtipo == 'nominal' and variavel != @target_column")
    .variavel
    .tolist()
)
discrete_columns = (
    dicionario
    .query("subtipo == 'discreta'")
    .variavel
    .tolist()
)
continuous_columns = (
    dicionario
    .query("subtipo == 'continua'")
    .variavel
    .tolist()
)
ordinal_columns = (
    dicionario
    .query("subtipo == 'ordinal'")
    .variavel
    .tolist()
)

X = df.drop(columns=[target_column], axis=1)
y = df[target_column]

### Tratar Outliers

In [19]:
def identify_outliers(df, column):
    column_data = df[column].copy()
    Q1 = column_data.quantile(0.25)
    Q3 = column_data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[f'{column}_outlier'] = (column_data < lower_bound) | (column_data > upper_bound)
    return df

numeric_columns = ['fare', 'age', 'sibsp', 'parch']


### Tratar Dados Faltantes

In [18]:
# Visualizar a quantidade de dados faltantes
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [20]:
# excluir coluna 'deck', por ter mais da metade de dados faltates
df = df.drop(columns=['deck'])

# Tratar dados faltantes

nominal_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), 
    ('encoding', OneHotEncoder(sparse_output=False, drop='first')), 
    ('normalization', StandardScaler())  
])
discrete_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), 
    ('normalization', StandardScaler()) 
])
continuous_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='mean')), 
    ('normalization', StandardScaler())  
])
ordinal_preprocessor = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')), 
    ('encoding', OrdinalEncoder()) 
])
preprocessor = ColumnTransformer([
    ('nominal', nominal_preprocessor, nominal_columns),
    ('continuous', continuous_preprocessor, continuous_columns),
    ('ordinal', ordinal_preprocessor, ordinal_columns),
    ('discrete', discrete_preprocessor, discrete_columns)
])

model = LogisticRegression()


## 3. Seleção de Modelos

Iremos análisar quatro modelos, que serão testados utilizando um método de validação, a saber:

* K-Nearest-Neighbors
* Support Vector Machine
* Decision Tree
* Random Forest

Além disso, cada um desses algoritmos será testado com diferentes hiper-parametros, para que possamos encontrar o melhor modelo e a melhor configuração possível para esse modelo.

Utilizaremos as seguintes métricas para análise:

* Acurácia (accuracy): proporção entre os dados que foram corretamente previstos (como positivos ou negativos) com o total de dados observados;
* Precisão (precision): proporção entre dados corretamente previstos como positivos e o total de observações positivas.
* Recall: proporção entre dados corretamente previstos como positivos com o total de observações.
* F1-score: média entre precision e recall, portanto levando em conta tanto falsos positivos quanto falsos negativos.

In [30]:
# experiment settings
n_splits_comparative_analysis = 10
n_folds_grid_search = 5
test_size = .2
random_state = 42
scoring = 'accuracy'
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

# model settings
max_iter = 10000
models = [
    ('K-Nearest Neighbors', KNeighborsClassifier(), {"n_neighbors": range(3, 20, 2), 'weights': ['uniform', 'distance']}),
    ('Suport Vector Machines', SVC(random_state=random_state, max_iter=max_iter), {"kernel": ["linear", "rbf"], 'C':[1,10,100,1000],'gamma':[0.0001, 0.001, 0.1, 1]}),
    ('Decision Tree',  DecisionTreeClassifier(random_state=random_state), {'criterion':['gini','entropy'],'max_depth': [3, 6, 8]}),
    ('Random Forest',  RandomForestClassifier(random_state=random_state), {'criterion':['gini','entropy'],'max_depth': [3, 6, 8], 'n_estimators': [10, 30]}),
]

In [35]:
results = pd.DataFrame({})
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(n_splits=n_splits_comparative_analysis, test_size=test_size, random_state=random_state)
for model_name, model_object, model_parameters in models:
    print(f"running {model_name}...")
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cross_validate_grid_search
    )
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search)
    ])
    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        cv=cross_validate_comparative_analysis,
        n_jobs=-1,
        scoring=metrics
    )
    scores['model_name'] = [model_name] * n_splits_comparative_analysis
    display(pd.DataFrame(scores).agg(['mean', 'std']))
    results = pd.concat([results, pd.DataFrame(scores)], ignore_index=True)
    
  

running K-Nearest Neighbors...


  display(pd.DataFrame(scores).agg(['mean', 'std']))


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,0.537538,0.023367,0.97095,0.970555,0.968561,0.969312
std,0.031088,0.009272,0.010131,0.010045,0.011708,0.010544


running Suport Vector Machines...


  display(pd.DataFrame(scores).agg(['mean', 'std']))


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,1.392756,0.028128,1.0,1.0,1.0,1.0
std,0.361225,0.016923,0.0,0.0,0.0,0.0


running Decision Tree...


  display(pd.DataFrame(scores).agg(['mean', 'std']))


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,0.210071,0.01939,1.0,1.0,1.0,1.0
std,0.040699,0.005529,0.0,0.0,0.0,0.0


running Random Forest...


  display(pd.DataFrame(scores).agg(['mean', 'std']))


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,4.224876,0.026763,0.998883,0.998848,0.998848,0.998848
std,0.107908,0.009527,0.003533,0.003643,0.003643,0.003643


In [36]:
def highlight_best(s, props=''):
    if s.name[1] != 'std':
        if s.name[0].endswith('time'):
            return np.where(s == np.nanmin(s.values), props, '')
        return np.where(s == np.nanmax(s.values), props, '')

display(Markdown("### 3.1 Resultados gerais"))
(
    results
    .groupby('model_name')
    .agg(['mean', 'std']).T
    .style
    .apply(highlight_best, props='color:white;background-color:gray;font-weight: bold;', axis=1)
    .set_table_styles([{'selector': 'td', 'props': 'text-align: center;'}])
)

### 3.1 Resultados gerais

Unnamed: 0,model_name,Decision Tree,K-Nearest Neighbors,Random Forest,Suport Vector Machines
fit_time,mean,0.210071,0.537538,4.224876,1.392756
fit_time,std,0.040699,0.031088,0.107908,0.361225
score_time,mean,0.01939,0.023367,0.026763,0.028128
score_time,std,0.005529,0.009272,0.009527,0.016923
test_accuracy,mean,1.0,0.97095,0.998883,1.0
test_accuracy,std,0.0,0.010131,0.003533,0.0
test_precision_macro,mean,1.0,0.970555,0.998848,1.0
test_precision_macro,std,0.0,0.010045,0.003643,0.0
test_recall_macro,mean,1.0,0.968561,0.998848,1.0
test_recall_macro,std,0.0,0.011708,0.003643,0.0


O Decision Tree obteve melhores resultados, equilibrando perfeitamente a alta performance com o tempo de ajuste rápido.

### 3.2 Persistência do modelo

In [37]:
#Obtem o modelo e os parametros ganhadores
model_name, model_object, model_parameters  = [foo for foo in models if foo[0] == "K-Nearest Neighbors"][0] 


model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cross_validate_grid_search
    )

approach = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model_grid_search)
])

approach.fit(X, y) #Seleciona o approach

print(f"Hiper parâmetros do modelo: {approach.steps[1][1].best_params_}")

Hiper parâmetros do modelo: {'n_neighbors': 15, 'weights': 'distance'}


In [39]:
joblib.dump(approach, '../models/model.joblib') # Salva o modelo em disco

['../models/model.joblib']