In [1]:
import pandas as pd
import numpy as np

In [2]:
dados_clinicos = pd.read_csv('./dados/dados_ajustados.csv')
dados_clinicos.head()

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,TEMPERATURE_DIFF,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU
0,3,0,40th,1,0,0,0,0,0,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,0
1,4,0,10th,0,0,0,0,0,0,0,...,-0.952381,-0.979798,-1.0,-0.883669,-0.956805,-0.870968,-0.953536,-0.980333,0-2,0
2,5,0,10th,0,0,0,0,0,0,0,...,-0.97619,-0.979798,-0.86087,-0.71446,-0.986481,-1.0,-0.975891,-0.980129,0-2,0
3,6,1,70th,1,0,0,0,0,0,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,0
4,7,0,20th,0,0,0,0,0,0,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,0


In [3]:
dados_clinicos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Columns: 231 entries, PATIENT_VISIT_IDENTIFIER to ICU
dtypes: float64(215), int64(14), object(2)
memory usage: 637.2+ KB


In [4]:
dados_clinicos.select_dtypes(include='object')

Unnamed: 0,AGE_PERCENTIL,WINDOW
0,40th,0-2
1,10th,0-2
2,10th,0-2
3,70th,0-2
4,20th,0-2
...,...,...
348,80th,0-2
349,90th,0-2
350,90th,0-2
351,40th,0-2


O primeiro modelo será criado sem parametros, para servir de base

Antes, as transformações que são necessárias no dataset:
* Converter a coluna `AGE_PERCENTIL` para valor numérico;
* Retirar as colunas `PATIENT_VISIT_IDENTIFIER` e `WINDOW`, já que eles não são relevantes para predição;
* Definir a(s) métrica(s) para avaliação do modelo;

## Criar sessão para discutir Métrica

## Pré Processamento

In [5]:
pre_processado = dados_clinicos.copy()

pre_processado.drop(['PATIENT_VISIT_IDENTIFIER', 'WINDOW'], axis=1, inplace=True)

### Separação dos Dados  
Antes de selecionarmos o método para separação dos dados de teste e o tamanho, vamos avaliar a a distribuição da nossa variável alvo e tamanho do dataset.

In [6]:
pre_processado.shape

(353, 229)

In [7]:
pre_processado['ICU'].value_counts(normalize=True)

0    0.538244
1    0.461756
Name: ICU, dtype: float64

A distribuição dos dados parece estar balanceada, então a princípio não corremos o risco de na separação de dados concentrar uma classe, prejudicando o treinamento do modelo. O sklearn possui diferentes métodos de separação de dados, seja para dados balanceados e não balanceados, e dentre a classe dos balanceados vamos utilizar o [`Shufflesplit`](https://scikit-learn.org/stable/modules/cross_validation.html#shufflesplit), que antes de separar os dados embaralha eles, para garantir uma boa distribuição.

### Tratamento da coluna AGE_PERCENTIL  
Vamos verificar como os valores estão distribuído no dataset.

In [8]:
pre_processado['AGE_PERCENTIL'].value_counts().sort_index()

10th          38
20th          42
30th          39
40th          38
50th          34
60th          30
70th          34
80th          36
90th          28
Above 90th    34
Name: AGE_PERCENTIL, dtype: int64

Percebam que, essa coluna representa a faixa etária do paciente e essa informação possui uma sequencia lógica, ou seja, tratase de uma `variável quantitativa ordinal`. Como ele possui uma sequencia, podemos utilizar o `OrdinalEncoder` para converter os essa coluna para numérico - ele atribui um valor respeitando a sequencia lógica. Dessa forma o modelo vai entender a importancia de um valor.  
Um adendo: Como essa coluna possui apenas 10 categorias, poderíamos utilizar também o `OneHotEncoder`, que costuma funcionar bem.

Agora que definimos como trataremos os casos, ao invés de tratar um a um, vamos utilizar a classe Pipeline, que permite encapsular todas as etapas do desenvolvimento do modelo.

## Desenvolvendo o Modelo

In [9]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


from sklearn.model_selection import GridSearchCV


from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate


np.random.seed(789872)

In [10]:
performance = pd.DataFrame()

In [11]:
def roda_modelo(descricao_modelo, dados, modelo, paramns):
    print(dados.shape)
    X = dados.drop('ICU', axis=1)
    y = dados['ICU']
    
    ordinal_encoder = OrdinalEncoder()

    preprocessor = ColumnTransformer(transformers=[
        ('ordinal', ordinal_encoder, ['AGE_PERCENTIL'])
    ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', modelo(**paramns))
    ])
    
    cv = StratifiedShuffleSplit(n_splits=10, random_state=False)
    results = cross_validate(model, X, y, cv=cv, return_train_score=True, scoring='roc_auc')
    
    auc_medio = results['test_score'].mean()
    auc_std = results['test_score'].std()
    
    #print(f'AUC {auc_medio} Intervalo: {auc_medio - (2* auc_std)} - {auc_medio + (2* auc_std)}')
    
    return performance.append({'model': descricao_modelo,
                               'auc_medio' : auc_medio,
                               'intervalo' : f'\
                               {auc_medio-(2* auc_std) : .3f} - {auc_medio+(2* auc_std) : .3f}',
                               }, 
                             ignore_index=True)
    
    
    

In [12]:
performance = roda_modelo('reg_log', pre_processado, LogisticRegression, {})

(353, 229)


In [13]:
performance = roda_modelo('SVC', pre_processado, SVC, {})

(353, 229)


In [14]:
performance = roda_modelo('RandomForest', pre_processado, 
                          RandomForestClassifier, {'random_state': False})

(353, 229)


In [15]:
performance = roda_modelo('XGBoost', pre_processado, XGBClassifier, 
                          {'eval_metric' : 'mlogloss', 'use_label_encoder' : False})

(353, 229)


In [16]:
performance

Unnamed: 0,model,auc_medio,intervalo
0,reg_log,0.76548,0.628 - 0.903
1,SVC,0.720898,0.546 - 0.896
2,RandomForest,0.730495,0.601 - 0.860
3,XGBoost,0.732198,0.604 - 0.860


Podemos observar que, sem especificar nenhum parametro, o auc médio é bem parecido. Chama atenção intervalo de confiança apresentado em todos os modelos, que ainda é muito grande, especialmente nos modelos baseados em arvore (`DecisionTree` e `RandomForest` e `XGBoost`). Para tentar melhorar o modelo, agora vamos partir para feature selection, e treinar o modelo com menos variáveis e ver se o desempenho melhora.

In [17]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=0.20)

_ = vt.fit(pre_processado.drop(['AGE_PERCENTIL', 'ICU'], axis=1))

mask = vt.get_support()

In [18]:
_

VarianceThreshold(threshold=0.2)

In [19]:
dados_wo_low_variance = pre_processado.drop(['AGE_PERCENTIL', 'ICU'], axis=1).loc[:, mask]

In [20]:
dados_wo_low_variance['AGE_PERCENTIL'] = pre_processado['AGE_PERCENTIL']
dados_wo_low_variance['ICU'] = pre_processado['ICU']

In [21]:
dados_wo_low_variance

Unnamed: 0,AGE_ABOVE65,GENDER,LACTATE_MEDIAN,LACTATE_MEAN,LACTATE_MIN,LACTATE_MAX,AGE_PERCENTIL,ICU
0,0,1,-0.828421,-0.828421,-0.828421,-0.828421,40th,0
1,0,0,1.000000,1.000000,1.000000,1.000000,10th,0
2,0,0,1.000000,1.000000,1.000000,1.000000,10th,0
3,1,1,1.000000,1.000000,1.000000,1.000000,70th,0
4,0,0,1.000000,1.000000,1.000000,1.000000,20th,0
...,...,...,...,...,...,...,...,...
348,1,0,1.000000,1.000000,1.000000,1.000000,80th,1
349,1,1,-0.926434,-0.926434,-0.926434,-0.926434,90th,1
350,1,0,1.000000,1.000000,1.000000,1.000000,90th,1
351,0,1,1.000000,1.000000,1.000000,1.000000,40th,1


In [22]:
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier

X = pre_processado.drop(['AGE_PERCENTIL', 'ICU'], axis=1)
y = pre_processado['ICU']

selector = SelectFromModel(estimator=DecisionTreeClassifier()).fit(X, y)

selector.get_support().sum()

41

In [23]:
mask = selector.get_support()

In [24]:
data_selected = pre_processado.drop(['AGE_PERCENTIL', 'ICU'], axis=1).loc[:, mask]

In [25]:
data_selected['AGE_PERCENTIL'] = pre_processado['AGE_PERCENTIL']
data_selected['ICU'] = pre_processado['ICU']

In [26]:
data_selected

Unnamed: 0,AGE_ABOVE65,BILLIRUBIN_MEDIAN,BILLIRUBIN_MAX,CALCIUM_MEDIAN,CREATININ_MEDIAN,CREATININ_MAX,FFA_MEAN,GLUCOSE_MEDIAN,HEMATOCRITE_MAX,HEMOGLOBIN_MEAN,...,RESPIRATORY_RATE_MEAN,OXYGEN_SATURATION_MEAN,BLOODPRESSURE_DIASTOLIC_MEDIAN,BLOODPRESSURE_DIASTOLIC_MIN,HEART_RATE_MIN,RESPIRATORY_RATE_MIN,TEMPERATURE_MIN,BLOODPRESSURE_DIASTOLIC_MAX,AGE_PERCENTIL,ICU
0,0,-0.972789,-0.972789,0.326531,-0.968861,-0.968861,-0.194030,-0.891993,-0.203354,-0.219512,...,-0.457627,0.684211,0.012346,0.175258,-0.384615,-0.357143,0.208791,-0.299145,40th,0
1,0,-0.935113,-0.935113,0.357143,-0.913659,-0.913659,-0.829424,-0.851024,0.358491,0.304878,...,-0.593220,0.868421,0.333333,0.443299,0.196581,-0.571429,0.538462,-0.076923,10th,0
2,0,-0.938950,-0.938950,0.357143,-0.891012,-0.891012,-0.742004,-0.891993,0.291405,0.243902,...,-0.525424,0.815789,-0.037037,0.030928,-0.401709,-0.428571,0.252747,-0.247863,10th,0
3,1,-0.938950,-0.938950,0.357143,-0.944798,-0.944798,-0.742004,-0.891993,-0.471698,-0.475610,...,-0.593220,0.894737,-0.209877,-0.010309,-0.282051,-0.500000,0.494505,-0.452991,70th,0
4,0,-0.966510,-0.966510,0.357143,-0.912243,-0.912243,-0.742004,-0.851024,-0.052411,-0.024390,...,-0.389831,0.842105,0.185185,0.319588,-0.435897,-0.285714,0.208791,-0.179487,20th,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,1,-0.938950,-0.938950,0.357143,-0.923567,-0.923567,-0.742004,-0.891993,0.060797,0.024390,...,-0.480226,0.754386,-0.308642,-0.092784,0.333333,-0.428571,0.494505,-0.487179,80th,1
349,1,-0.933019,-0.933019,0.183673,-0.864119,-0.864119,-0.742004,-0.891993,-0.002096,-0.091463,...,-0.457627,0.644737,-0.456790,-0.257732,0.162393,-0.357143,0.670330,-0.589744,90th,1
350,1,-0.938950,-0.938950,0.357143,-0.908402,-0.908402,-0.742004,-0.891993,-0.140461,-0.207317,...,-0.457627,0.526316,-0.283951,-0.072165,-0.401709,-0.357143,0.604396,-0.504274,90th,1
351,0,-0.293564,-0.293564,0.326531,-0.937721,-0.937721,1.000000,-0.824953,-0.253669,-0.195122,...,-0.457627,0.736842,-0.160494,0.030928,0.401709,-0.357143,0.472527,-0.418803,40th,1


In [27]:
selector

SelectFromModel(estimator=DecisionTreeClassifier())

In [28]:
ordinal_encoder = OrdinalEncoder()

preprocessor = ColumnTransformer(transformers=[
    ('ordinal', ordinal_encoder, ['AGE_PERCENTIL'])
])

model_teste1 = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectFromModel(estimator=RandomForestClassifier())),
    ('model', LogisticRegression())
])

In [29]:
model_teste1

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('ordinal', OrdinalEncoder(),
                                                  ['AGE_PERCENTIL'])])),
                ('selector',
                 SelectFromModel(estimator=RandomForestClassifier())),
                ('model', LogisticRegression())])

In [30]:
cv_stratified = StratifiedShuffleSplit(n_splits=10, random_state=False)

In [31]:
X = pre_processado.drop('ICU', axis=1)
y = pre_processado['ICU']

In [32]:
resultado = cross_validate(model_teste1, X, y, cv=cv_stratified, scoring='roc_auc')

print(f'{resultado["test_score"].mean() : .5f}')

 0.76548


In [33]:
clf = GridSearchCV(model_teste1, param_grid={}, scoring='roc_auc', cv=cv_stratified)

In [34]:
clf.fit(X, y)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=False, test_size=None,
            train_size=None),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('ordinal',
                                                                         OrdinalEncoder(),
                                                                         ['AGE_PERCENTIL'])])),
                                       ('selector',
                                        SelectFromModel(estimator=RandomForestClassifier())),
                                       ('model', LogisticRegression())]),
             param_grid={}, scoring='roc_auc')

In [35]:
print(f"{clf.best_score_ : .5f}")

 0.76548


In [36]:
performance

Unnamed: 0,model,auc_medio,intervalo
0,reg_log,0.76548,0.628 - 0.903
1,SVC,0.720898,0.546 - 0.896
2,RandomForest,0.730495,0.601 - 0.860
3,XGBoost,0.732198,0.604 - 0.860


In [37]:
corr = pre_processado.corr()

icu_corr = corr['ICU'][:-1]

columns_with_low_correlation = []

for column, row in icu_corr.items():
    if abs(float(row) <= .99):
        columns_with_low_correlation.append(column)
        print(
            '%s has lower correlation with sale price: %.2f' % (column, row), \
            ', so will be dropped')


dados_corr = pre_processado.drop(columns_with_low_correlation, axis=1)

AGE_ABOVE65 has lower correlation with sale price: 0.29 , so will be dropped
GENDER has lower correlation with sale price: -0.12 , so will be dropped
DISEASE GROUPING 1 has lower correlation with sale price: 0.07 , so will be dropped
DISEASE GROUPING 2 has lower correlation with sale price: 0.09 , so will be dropped
DISEASE GROUPING 3 has lower correlation with sale price: 0.12 , so will be dropped
DISEASE GROUPING 4 has lower correlation with sale price: 0.11 , so will be dropped
DISEASE GROUPING 5 has lower correlation with sale price: 0.12 , so will be dropped
DISEASE GROUPING 6 has lower correlation with sale price: -0.03 , so will be dropped
HTN has lower correlation with sale price: 0.18 , so will be dropped
IMMUNOCOMPROMISED has lower correlation with sale price: 0.07 , so will be dropped
OTHER has lower correlation with sale price: 0.05 , so will be dropped
ALBUMIN_MEAN has lower correlation with sale price: -0.11 , so will be dropped
ALBUMIN_MIN has lower correlation with sale

In [38]:
dados_corr.shape

(353, 39)

vamos verificar o novo desempenho do modelo.

In [39]:
performance = roda_modelo('reg_log_corr1', data_selected, LogisticRegression, {})
performance = roda_modelo('SVC_corr1', data_selected, SVC, {})
performance = roda_modelo('RandomForest_corr1', data_selected, 
                          RandomForestClassifier, {'random_state': False})

performance = roda_modelo('XGBoost_corr1', data_selected, XGBClassifier, 
                          {'eval_metric' : 'mlogloss', 'use_label_encoder' : False})

performance

(353, 43)
(353, 43)
(353, 43)
(353, 43)


Unnamed: 0,model,auc_medio,intervalo
0,reg_log,0.76548,0.628 - 0.903
1,SVC,0.720898,0.546 - 0.896
2,RandomForest,0.730495,0.601 - 0.860
3,XGBoost,0.732198,0.604 - 0.860
4,reg_log_corr1,0.76548,0.628 - 0.903
5,SVC_corr1,0.720898,0.546 - 0.896
6,RandomForest_corr1,0.730495,0.601 - 0.860
7,XGBoost_corr1,0.732198,0.604 - 0.860


In [40]:
def find_best_params(model, dados, parameters):
    
    X = dados.drop('ICU', axis=1)
    y = dados['ICU']
    
    ordinal_encoder = OrdinalEncoder()
    
    preprocessor = ColumnTransformer(transformers=[
        ('ordinal', ordinal_encoder, ['AGE_PERCENTIL'])
    ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model())
    ])
    
    cv = ShuffleSplit(n_splits=10, random_state=False)
    grid = GridSearchCV(model, param_grid=parameters, cv=cv, scoring='roc_auc', n_jobs=1)    
    grid.fit(X, y)
    
    return grid.best_params_

Agora que definimos a função, vamos rodar e descobrir os melhores parametros para os modelos que selecionamos.

In [41]:
parametros_log = {
    'model__penalty' : ('l1', 'l2', 'elasticnet'),
    'model__C' : [100, 10, 1.0, 0.1, 0.01],
    
}

melhores_parametros_log = find_best_params(LogisticRegression, dados_wo_low_variance, 
                                              parameters=parametros_log)

melhores_parametros_log

Traceback (most recent call last):
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/johnny/Documentos/dev/data-science/ds/l

Traceback (most recent call last):
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

Traceback (most recent call last):
  File "/home/johnny/Documentos/dev/data-scie

Traceback (most recent call last):
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

Traceback (most recent call last):
  File "/home/johnny/Documentos/dev/data-scie

Traceback (most recent call last):
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/johnny/Documentos/dev/data-science/ds/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

Traceback (most recent call last):
  File "/home/johnny/Documentos/dev/data-scie

{'model__C': 100, 'model__penalty': 'l2'}

In [42]:
def organiza_parametros(dicionario):
    lista_chave = list(dicionario)
    for chave in lista_chave:
        nova_chave = chave.replace('model__', '')
        dicionario[nova_chave] = dicionario.pop(chave)
        
    return dicionario

In [43]:
melhores_parametros_log = organiza_parametros(melhores_parametros_log)

In [44]:
parametros_svc = {
    'model__kernel' : ('linear', 'rbf', 'sigmoid'),
    'model__gamma' : [0.1, 1, 10, 100],
    'model__C' : [0.1, 1, 10, 100],
}

melhores_parametros_svc = find_best_params(SVC, dados_wo_low_variance, parameters=parametros_svc)

melhores_parametros_svc


{'model__C': 0.1, 'model__gamma': 0.1, 'model__kernel': 'linear'}

In [45]:
melhores_parametros_svc = organiza_parametros(melhores_parametros_svc)
melhores_parametros_svc

{'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}

In [46]:
parametros_floresta = {
    'model__n_estimators' : [10, 50, 100],
    'model__criterion': ('gini', 'entropy'),
    'model__min_samples_split': [4, 6, 8],
    'model__max_depth' : [3, 5, 6],
    'model__max_features': ('auto', 'sqrt', 'log2')
}

melhores_parametros_floresta = find_best_params(RandomForestClassifier, dados_wo_low_variance, 
                                                parameters=parametros_floresta)

melhores_parametros_floresta

{'model__criterion': 'entropy',
 'model__max_depth': 3,
 'model__max_features': 'sqrt',
 'model__min_samples_split': 8,
 'model__n_estimators': 10}

In [47]:
melhores_parametros_floresta = organiza_parametros(melhores_parametros_floresta)

In [48]:
parametros_xgboost = {
    'model__n_estimators' : [10, 50, 90, 100],
    'model__learning_rate': [0.01, 0.05, 0.09, 0.1],
    'model__max_depth': [3, 4, 5, 6],    
}

melhores_parametros_xgboost = find_best_params(XGBClassifier, dados_wo_low_variance, 
                                                parameters=parametros_xgboost)

melhores_parametros_xgboost











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































{'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 10}

In [49]:
melhores_parametros_xgboost = organiza_parametros(melhores_parametros_xgboost)

In [50]:
melhores_parametros_xgboost

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10}

Agora que temos os parametros selecionados, vamos rodar novamnte.

In [51]:
performance = roda_modelo('log_grid', dados_wo_low_variance, 
                          LogisticRegression, melhores_parametros_log)

(353, 8)


In [52]:
performance = roda_modelo('SVC_grid', dados_wo_low_variance, SVC, melhores_parametros_svc)

(353, 8)


In [53]:
performance = roda_modelo('RandomForest_grid', dados_wo_low_variance, 
                          RandomForestClassifier, melhores_parametros_floresta)

(353, 8)


In [54]:
performance = roda_modelo('XGBoost_grid', dados_wo_low_variance, 
                          XGBClassifier, melhores_parametros_xgboost)

(353, 8)








In [55]:
performance

Unnamed: 0,model,auc_medio,intervalo
0,reg_log,0.76548,0.628 - 0.903
1,SVC,0.720898,0.546 - 0.896
2,RandomForest,0.730495,0.601 - 0.860
3,XGBoost,0.732198,0.604 - 0.860
4,reg_log_corr1,0.76548,0.628 - 0.903
5,SVC_corr1,0.720898,0.546 - 0.896
6,RandomForest_corr1,0.730495,0.601 - 0.860
7,XGBoost_corr1,0.732198,0.604 - 0.860
8,log_grid,0.76548,0.628 - 0.903
9,SVC_grid,0.76548,0.628 - 0.903


In [56]:
performance.sort_values('auc_medio', ascending=False)

Unnamed: 0,model,auc_medio,intervalo
0,reg_log,0.76548,0.628 - 0.903
4,reg_log_corr1,0.76548,0.628 - 0.903
8,log_grid,0.76548,0.628 - 0.903
9,SVC_grid,0.76548,0.628 - 0.903
3,XGBoost,0.732198,0.604 - 0.860
7,XGBoost_corr1,0.732198,0.604 - 0.860
10,RandomForest_grid,0.732043,0.595 - 0.869
2,RandomForest,0.730495,0.601 - 0.860
6,RandomForest_corr1,0.730495,0.601 - 0.860
11,XGBoost_grid,0.729257,0.606 - 0.852
