# Instalando novas bibliotecas

In [119]:
%%capture
!pip install optuna

In [120]:
%%capture
!pip install lightgbm

# Importando módulos

In [121]:
import warnings
import numpy as np
import pandas as pd
import optuna as opt
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.utils import compute_class_weight
from sklearn.model_selection import train_test_split
from optuna.integration.lightgbm import LightGBMTuner

In [122]:
# Só encher o saco uma vez
warnings.filterwarnings('once')

# __Execução I__

Passos
- Importar dados
- Pré-processamento
- Divisão treino e validação usando *Kennard-Stone*
- Imputação de dados - Mode
- Light GBM padrão com class_weight
- Ensemble de modelos binários

# Importando dados

In [123]:
# Importar tabelas
df_train = pd.read_csv('ml-olympiad-ensure-healthy-lives/train.csv', low_memory=False)
df_test = pd.read_csv('ml-olympiad-ensure-healthy-lives/test.csv', low_memory=False)
df_subm = pd.read_csv('ml-olympiad-ensure-healthy-lives/sample_submission.csv', low_memory=False)

In [124]:
# Montar base para previsão do desafio
df_sub_desafio = pd.DataFrame()
df_sub_desafio['ID'] = df_test['ID']
df_sub_desafio['CLASSI_FIN'] = np.nan

In [125]:
# Aplicar correcao nas variáveis coringa
df_train['FNT_IN_COV'].replace({1:'MANUAL',2:'INTEGRACAO',np.nan:'FALTANTE'}, inplace=True)
df_test['FNT_IN_COV'].replace({1:'MANUAL',2:'INTEGRACAO',np.nan:'FALTANTE'}, inplace=True)

df_train['COD_IDADE'].replace({np.nan:'FALTANTE'}, inplace=True)
df_test['COD_IDADE'].replace({np.nan:'FALTANTE'}, inplace=True)

# Pré-processamento

In [126]:
# Definir colunas para serem derrubadas
cols_to_drop = ['ID',         # Identificação
                'DELTA_UTI',  # Constante
                'PAC_COCBO',  # Completamente faltante
                'PAC_DSCBO',  # Completamente faltante
                'OBES_IMC',   # Completamente faltante
                'COD_IDADE',  # Redundante tendo as outras duas variáveis de idade
                'FNT_IN_COV'] # Não relacionado ao problema

# Derrubar
df_train.drop(columns=cols_to_drop, inplace=True)
df_test.drop(columns=cols_to_drop, inplace=True)

In [127]:
# Especificar label do desafio
label = 'CLASSI_FIN'

**Variáveis relacionadas a sexo + idade, porém o foco está em mães / bebês**

In [128]:
# Definir colunas de análise nesta seção
cols = ['CS_SEXO','CS_GESTANT','PUERPERA','NU_IDADE_N','TP_IDADE','MAE_VAC','M_AMAMENTA']

Alterações
- Adicionar '99' para identificar que são homens, (código universal para não aplicável) nas colunas referentes a gestação e paridade
- Demais que possuem 'F' em CS_SEXO, adicionar '9' para identificar as mulheres como ignorados, pois provavelmente não estavam classificadas neste grupo

`NU_IDADE_N` + `TP_IDADE`

- Converter coluna de idade para anos
- Remover idades impossíveis

[Expectativa de vida no Brasil](https://pt.wikipedia.org/wiki/Lista_de_unidades_federativas_do_Brasil_por_expectativa_de_vida)

In [129]:
def corrigir_idade(nu_idade, tp_idade):
    '''
    Corrigir padrão de idade para anos
    '''
    if tp_idade == 3:
        return abs(nu_idade)
    elif tp_idade == 2:
        return abs(nu_idade)/12
    else:
        return abs(nu_idade)/365

In [130]:
# Aplicar correção de anos
df_train['NU_IDADE_N'] = df_train.apply(lambda df:corrigir_idade(df['NU_IDADE_N'], df['TP_IDADE']), axis=1).astype(int)

# Qualquer valor acima dos 95 serão tratados e fixados em 95 devido a impossibilidade + expectativa de vida no país
df_train.loc[df_train['NU_IDADE_N'] >= 95, 'NU_IDADE_N'] = 95

In [131]:
# Aplicar correção de anos
df_test['NU_IDADE_N'] = df_test.apply(lambda df:corrigir_idade(df['NU_IDADE_N'], df['TP_IDADE']), axis=1).astype(int)

# Qualquer valor acima dos 95 serão tratados e fixados em 95 devido a impossibilidade + expectativa de vida no país
df_test.loc[df_test['NU_IDADE_N'] >= 95, 'NU_IDADE_N'] = 95

`CS_SEXO`

[Motivo de imputação em último nível](https://educa.ibge.gov.br/jovens/conheca-o-brasil/populacao/18320-quantidade-de-homens-e-mulheres.html#:~:text=Segundo%20dados%20da%20PNAD%20Cont%C3%ADnu,51%2C1%25%20de%20mulheres.)

In [132]:
# Remover classe desconhecida e rara
df_train.loc[df_train['CS_SEXO'] == 'I', 'CS_SEXO'] = np.nan

# Aplicar primeiro nível
c1 = (df_train['CS_SEXO'].isna()) & (df_train['CS_GESTANT'] == 6) & (df_train['NU_IDADE_N'] >= 9)
df_train.loc[c1, 'CS_SEXO'] = 'M'

# Aplicar segundo nível
c2 = (df_train['CS_GESTANT'] != 6) & (df_train['CS_SEXO'].isna())
df_train.loc[c2, 'CS_SEXO'] = 'F'

# Aplicar terceiro nível
c3 = df_train['CS_SEXO'].isna()
df_train.loc[c3, 'CS_SEXO'] = 'F'

In [133]:
# Remover classe desconhecida e rara
df_test.loc[df_test['CS_SEXO'] == 'I', 'CS_SEXO'] = np.nan

# Aplicar primeiro nível
c1 = (df_test['CS_SEXO'].isna()) & (df_test['CS_GESTANT'] == 6) & (df_test['NU_IDADE_N'] >= 9)
df_test.loc[c1, 'CS_SEXO'] = 'M'

# Aplicar segundo nível
c2 = (df_test['CS_GESTANT'] != 6) & (df_test['CS_SEXO'].isna())
df_test.loc[c2, 'CS_SEXO'] = 'F'

# Aplicar terceiro nível
c3 = df_test['CS_SEXO'].isna()
df_test.loc[c3, 'CS_SEXO'] = 'F'

`CS_GESTANT`

In [134]:
# Remover classe desconhecida e rara
df_train.loc[df_train['CS_GESTANT'] == 0, 'CS_GESTANT'] = np.nan

# Aplicar primeiro nível
c1 = (df_train['CS_GESTANT'].isna()) & ((df_train['CS_SEXO'] == 'M') | (df_train['NU_IDADE_N'] <= 9))
df_train.loc[c1, 'CS_GESTANT'] = 6

# Aplicar segundo nível
c2 = (df_train['CS_GESTANT'].isna()) & (df_train['CS_SEXO'] == 'F') & (df_train['NU_IDADE_N'] > 9)
df_train.loc[c2, 'CS_GESTANT'] = 5

# Aplicar terceiro e último nível
c3 = (df_train['CS_GESTANT'].isna())
df_train.loc[c3, 'CS_GESTANT'] = 9

In [135]:
# Remover classe desconhecida e rara
df_test.loc[df_test['CS_GESTANT'] == 0, 'CS_GESTANT'] = np.nan

# Aplicar primeiro nível
c1 = (df_test['CS_GESTANT'].isna()) & ((df_test['CS_SEXO'] == 'M') | (df_test['NU_IDADE_N'] <= 9))
df_test.loc[c1, 'CS_GESTANT'] = 6

# Aplicar segundo nível
c2 = (df_test['CS_GESTANT'].isna()) & (df_test['CS_SEXO'] == 'F') & (df_test['NU_IDADE_N'] > 9)
df_test.loc[c2, 'CS_GESTANT'] = 5

# Aplicar terceiro e último nível
c3 = (df_test['CS_GESTANT'].isna())
df_test.loc[c3, 'CS_GESTANT'] = 9

`PUERPERA`

In [136]:
# Aplicar primeiro nível
c1 = df_train['CS_SEXO'] == 'M'
df_train.loc[c1, 'PUERPERA'] = 99

# Aplicar segundo nível
c2 = (df_train['PUERPERA'].isna()) & (df_train['CS_SEXO'] == 'F') & (df_train['CS_GESTANT'] != 6)
df_train.loc[c2, 'PUERPERA'] = 2

# Aplicar terceiro nível
c3 = (df_train['CS_SEXO'] == 'F') & (df_train['PUERPERA'].isna())
df_train.loc[c3, 'PUERPERA'] = 9

In [137]:
# Aplicar primeiro nível
c1 = df_test['CS_SEXO'] == 'M'
df_test.loc[c1, 'PUERPERA'] = 99

# Aplicar segundo nível
c2 = (df_test['PUERPERA'].isna()) & (df_test['CS_SEXO'] == 'F') & (df_test['CS_GESTANT'] != 6)
df_test.loc[c2, 'PUERPERA'] = 2

# Aplicar terceiro nível
c3 = (df_test['CS_SEXO'] == 'F') & (df_test['PUERPERA'].isna())
df_test.loc[c3, 'PUERPERA'] = 9

`MAE_VAC` + `M_AMAMENTA`

In [138]:
# Aplicar primeiro nível
c1 = df_train['NU_IDADE_N'] > 0
df_train.loc[c1, ['MAE_VAC','M_AMAMENTA']] = 99

# Aplicar segundo nível
df_train.loc[df_train['MAE_VAC'].isna(), 'MAE_VAC'] = 9
df_train.loc[df_train['M_AMAMENTA'].isna(), 'M_AMAMENTA'] = 9

In [139]:
# Aplicar primeiro nível
c1 = df_test['NU_IDADE_N'] > 0
df_test.loc[c1, ['MAE_VAC','M_AMAMENTA']] = 99

# Aplicar segundo nível
df_test.loc[df_test['MAE_VAC'].isna(), 'MAE_VAC'] = 9
df_test.loc[df_test['M_AMAMENTA'].isna(), 'M_AMAMENTA'] = 9

**Variáveis relacionadas ao COVID**

In [140]:
# Definir colunas de análise nesta seção
cols = ['VACINA_COV','DOSE_1_COV','DOSE_2_COV','DOSE_REF']

`VACINA_COV`

Possibilidade de binarizar e converter `9` em `2`

OBS - Criar nova coluna com a contagem de doses tomadas.

In [141]:
# Aplicar primeiro nível
c1 = df_train[['DOSE_1_COV','DOSE_2_COV','DOSE_REF']].sum(axis=1) >= 1
df_train.loc[c1, 'VACINA_COV'] = 1

# Aplicar segundo nível
c2 = df_train['VACINA_COV'].isna()
df_train.loc[c2, 'VACINA_COV'] = 9

In [142]:
# Aplicar primeiro nível
c1 = df_test[['DOSE_1_COV','DOSE_2_COV','DOSE_REF']].sum(axis=1) >= 1
df_test.loc[c1, 'VACINA_COV'] = 1

# Aplicar segundo nível
c2 = df_test['VACINA_COV'].isna()
df_test.loc[c2, 'VACINA_COV'] = 9

In [143]:
# Criar nova coluna
df_train['DOSE_COV_N'] = df_train[['DOSE_1_COV','DOSE_2_COV','DOSE_REF']].sum(axis=1)
df_test['DOSE_COV_N'] = df_test[['DOSE_1_COV','DOSE_2_COV','DOSE_REF']].sum(axis=1)

**Variáveis relacionadas ao EXAME**

In [144]:
# Definir colunas de análise nesta seção
cols = ['RAIOX_RES','RAIOX_OUT','TOMO_RES','TOMO_OUT','AMOSTRA','OUT_AMOST','TP_AMOSTRA']

In [145]:
# Especificar todas as possibilidades de outros para raio x e tomografia
outros_raioX = list(set(df_train['RAIOX_OUT'].dropna().unique().tolist() + df_test['RAIOX_OUT'].dropna().unique().tolist()))
outros_tomog = list(set(df_train['TOMO_OUT'].dropna().unique().tolist() + df_test['TOMO_OUT'].dropna().unique().tolist()))
outros_amost = list(set(df_train['OUT_AMOST'].dropna().unique().tolist() + df_test['OUT_AMOST'].dropna().unique().tolist()))

Melhorar colunas referente a OUTROS

In [146]:
# Gerar listas hierarquicas especificas para consolidar categorias em tomografia (prioridade de baixo para cima)
outros_tomog_laudo = [c for c in outros_tomog if 'LAUDO' in c or 'AGUARDA' in c or 'EM ANDAMENTO' in c or 'SEM INFORMACAO' in c]
outros_tomog_norml = [c for c in outros_tomog if 'NORMAL' in c or 'SEM ALTERAC' in c or 'SEM SINAIS DE INFECCAO' in c]
outros_tomog_vidro = [c for c in outros_tomog if 'VIDRO' in c or 'OPACIDADE' in c]
outros_tomog_bronc = [c for c in outros_tomog if 'BRONCOPATIA' in c]
outros_tomog_covid = [c for c in outros_tomog if 'COVID' in c]
outros_tomog_derrp = [c for c in outros_tomog if 'DERRAME' in c and 'PLEURAL' in c]
outros_tomog_enfis = [c for c in outros_tomog if 'ENFISEMA' in c]
outros_tomog_pneum = [c for c in outros_tomog if 'PNEUMONIA' in c or 'PNM' in c]
outros_tomog_conso = [c for c in outros_tomog if 'CONSOLIDA' in c]
outros_tomog_solic = [c for c in outros_tomog if 'SOLICITADO' in c]
outros_tomog_atele = [c for c in outros_tomog if 'ATELECTAS' in c]
outros_tomog_tuber = [c for c in outros_tomog if 'TUBERCUL' in c or 'TB' in c]

# Especificar novas categorias de agrupamento
outros_tomog_nCats = ['REL_TUBERCULOSE', 'REL_OPACIDADE_VIDRO_FOSCO', 'REL_BRONCOPATIA', 'REL_COVID',
                      'REL_DERRAME', 'REL_ENFISEMA', 'REL_PNEUMONIA', 'REL_CONSOLIDACAO',
                      'REL_ATELECTASIA', 'REL_SEM_LAUDO', 'REL_RES_NORMAL', 'REL_SOLICITADO']

outros_tomog_oCats = [outros_tomog_tuber, outros_tomog_vidro, outros_tomog_bronc, outros_tomog_covid,
                      outros_tomog_derrp, outros_tomog_enfis, outros_tomog_pneum, outros_tomog_conso,
                      outros_tomog_atele, outros_tomog_laudo, outros_tomog_norml, outros_tomog_solic]

# Criar colunas
df_train['TOMO_OUT_NOVO'] = np.nan
df_test['TOMO_OUT_NOVO'] = np.nan

# Aplicar novas categorias evitando sobreposição
for n_cat, o_cats in zip(outros_tomog_nCats, outros_tomog_oCats):
    df_train.loc[(df_train['TOMO_OUT'].isin(o_cats)) & (df_train['TOMO_OUT_NOVO'].isna()), 'TOMO_OUT_NOVO'] = n_cat
    df_test.loc[(df_test['TOMO_OUT'].isin(o_cats)) & (df_test['TOMO_OUT_NOVO'].isna()), 'TOMO_OUT_NOVO'] = n_cat

# Converter todas as demais categorias restantes provalvemente raras em umas só para reduzir ruído ao modelo
c1 = (~df_train['TOMO_OUT_NOVO'].isin(outros_tomog_nCats)) & (df_train['TOMO_OUT'].notnull())
df_train.loc[c1, 'TOMO_OUT_NOVO'] = 'REL_OUTROS_RESULTADOS'
c1 = (~df_test['TOMO_OUT_NOVO'].isin(outros_tomog_nCats)) & (df_test['TOMO_OUT'].notnull())
df_test.loc[c1, 'TOMO_OUT_NOVO'] = n_cat

In [147]:
# Gerar listas hierarquicas especificas para consolidar categorias em raio X (prioridade de baixo para cima)
outros_raiox_infil = [c for c in outros_raioX if 'INFILT' in c]
outros_raiox_vidro = [c for c in outros_raioX if 'VIDRO' in c or 'FOSCO' in c]
outros_raiox_opaci = [c for c in outros_raioX if 'OPACIDADE' in c or 'OPACIF' in c]
outros_raiox_tomog = [c for c in outros_raioX if 'TC' in c or 'TOMOGRAFIA' in c or 'TOMO' in c or 'CT' in c]
outros_raiox_derrp = [c for c in outros_raioX if 'DERRAME' in c and 'PLEU' in c]
outros_raiox_atele = [c for c in outros_raioX if 'ATELECTAS' in c]
outros_raiox_congt = [c for c in outros_raioX if 'CONGESTAO' in c or 'CONGEST' in c]
outros_raiox_pneum = [c for c in outros_raioX if 'PNEUMONIA' in c or 'PNM' in c]
outros_raiox_infla = [c for c in outros_raioX if 'INFLAMA' in c]
outros_raiox_espes = [c for c in outros_raioX if 'ESPESSA' in c]
outros_raiox_solic = [c for c in outros_raioX if 'SOLICITADO' in c or 'PEDIDO' in c]
outros_raiox_conde = [c for c in outros_raioX if 'CONDENSA' in c]
outros_raiox_hiper = [c for c in outros_raioX if 'HIPERINSUF' in c]
outros_raiox_arcos = [c for c in outros_raioX if 'ARCO' in c or 'RETIFICACAO' in c or 'COSTAIS' in c]
outros_raiox_norml = [c for c in outros_raioX if 'NORMAL' in c or 'SEM ALTERAC' in c or 'SEM SINAIS DE INFECCAO' in c]
outros_raiox_laudo = [c for c in outros_raioX if 'LAUDO' in c or 'SEM DESCRICAO' in c or 'SEM RELATO' in c or 'SEM INFO' in c or 'AGUARDA' in c or 'NAO INFO' in c]

# Especificar novas categorias de agrupamento por ordem de prioridade
outros_raiox_nCats = ['REL_INFILTRACAO', 'REL_VIDRO_FOSCO','REL_OPACIDADE', 'REL_DERRAME_PLEURAL',
                      'REL_ATELECTASIA', 'REL_CONGESTAO', 'REL_PNEUMONIA', 'REL_INFLAMACAO', 
                      'REL_ESPESSAMENTO', 'REL_CONDENSACAO', 'REL_HIPERINSUFLACAO', 'REL_SOLICITADO',
                      'REL_ARCOS_COSTAIS', 'REL_SEM_LAUDO', 'REL_RES_NORMAL','REL_PEDIDO_TOMOG']
outros_raiox_oCats = [outros_raiox_infil, outros_raiox_vidro, outros_raiox_opaci, outros_raiox_derrp,
                      outros_raiox_atele, outros_raiox_congt, outros_raiox_pneum, outros_raiox_infla,
                      outros_raiox_espes, outros_raiox_conde, outros_raiox_hiper, outros_raiox_solic,
                      outros_raiox_arcos, outros_raiox_laudo, outros_raiox_norml, outros_raiox_tomog]

# Criar colunas
df_train['RAIOX_OUT_NOVO'] = np.nan
df_test['RAIOX_OUT_NOVO'] = np.nan

# Aplicar novas categorias evitando sobreposição
for n_cat, o_cats in zip(outros_raiox_nCats, outros_raiox_oCats):
    df_train.loc[(df_train['RAIOX_OUT'].isin(o_cats)) & (df_train['RAIOX_OUT_NOVO'].isna()), 'RAIOX_OUT_NOVO'] = n_cat
    df_test.loc[(df_test['RAIOX_OUT'].isin(o_cats)) & (df_test['RAIOX_OUT_NOVO'].isna()), 'RAIOX_OUT_NOVO'] = n_cat

# Converter todas as demais categorias restantes provalvemente raras em umas só para reduzir ruído ao modelo
c1 = (~df_train['RAIOX_OUT_NOVO'].isin(outros_raiox_nCats)) & (df_train['RAIOX_OUT'].notnull())
df_train.loc[c1, 'RAIOX_OUT_NOVO'] = 'REL_OUTROS_RESULTADOS'
c1 = (~df_test['RAIOX_OUT_NOVO'].isin(outros_raiox_nCats)) & (df_test['RAIOX_OUT'].notnull())
df_test.loc[c1, 'RAIOX_OUT_NOVO'] = n_cat

Corrigir variáveis relacionadas ao raio X

In [148]:
# Caso o motivo seja outro e não tenha descrição, adicionar string referente
c1 = (df_train['RAIOX_RES'].isin([5])) & (df_train['RAIOX_OUT'].isna())
df_train.loc[c1, 'RAIOX_OUT'] = 'FEITO_E_SEM_DETALHE'
df_train.loc[c1, 'RAIOX_OUT_NOVO'] = 'FEITO_E_SEM_DETALHE'

c1 = (df_test['RAIOX_RES'].isin([5])) & (df_test['RAIOX_OUT'].isna())
df_test.loc[c1, 'RAIOX_OUT'] = 'FEITO_E_SEM_DETALHE'
df_test.loc[c1, 'RAIOX_OUT_NOVO'] = 'FEITO_E_SEM_DETALHE'

# Caso seja nulo, será considerado que não foi feito nenhum exame
c2 = df_train['RAIOX_RES'].isna()
df_train.loc[c2, 'RAIOX_RES'] = 6
df_train.loc[c2, 'RAIOX_OUT'] = 'NDA'
df_train.loc[c2, 'RAIOX_OUT_NOVO'] = 'NDA'

c2 = df_test['RAIOX_RES'].isna()
df_test.loc[c2, 'RAIOX_RES'] = 6
df_test.loc[c2, 'RAIOX_OUT'] = 'NDA'
df_test.loc[c2, 'RAIOX_OUT_NOVO'] = 'NDA'

Corrigir variáveis relacionadas a tomografia

In [149]:
# Caso o motivo seja outro e não tenha descrição, adicionar string referente
c1 = (df_train['TOMO_RES'].isin([5])) & (df_train['TOMO_OUT'].isna())
df_train.loc[c1, 'TOMO_OUT'] = 'FEITO_E_SEM_DETALHE'
df_train.loc[c1, 'TOMO_OUT_NOVO'] = 'FEITO_E_SEM_DETALHE'

c1 = (df_test['TOMO_RES'].isin([5])) & (df_test['TOMO_OUT'].isna())
df_test.loc[c1, 'RAIOX_OUT'] = 'FEITO_E_SEM_DETALHE'
df_test.loc[c1, 'RAIOX_OUT_NOVO'] = 'FEITO_E_SEM_DETALHE'

# Caso seja nulo, será considerado que não foi feito nenhum exame
c2 = df_train['TOMO_RES'].isna()
df_train.loc[c2, 'TOMO_RES'] = 6
df_train.loc[c2, 'TOMO_OUT'] = 'NDA'
df_train.loc[c2, 'TOMO_OUT_NOVO'] = 'NDA'

c2 = df_test['TOMO_RES'].isna()
df_test.loc[c2, 'TOMO_RES'] = 6
df_test.loc[c2, 'TOMO_OUT'] = 'NDA'
df_test.loc[c2, 'TOMO_OUT_NOVO'] = 'NDA'

Corrigir variáveis referentes a amostra

In [150]:
# Caso seja nulo em AMOSTRA, será considerado que a coleta foi ignorada
c1 = df_train['AMOSTRA'].isna()
df_train.loc[c1, 'AMOSTRA'] = 9

# Caso AMOSTRA seja 2 ou 9 (não e ignorado respectivamente), ignoramos o tipo de amostra
c2 = df_train['AMOSTRA'].isin([2, 9])
df_train.loc[c2, 'TP_AMOSTRA'] = 9

# Caso TP_AMOSTRA seja 4 (outra) e não tenha informação em OUT_AMOST colocamos uma string especifica
c3 = (df_train['TP_AMOSTRA'] == 4) & (df_train['OUT_AMOST'].isna())
df_train.loc[c3, 'OUT_AMOST'] = 'TIPO_AMOST_NAO_ESPECIFICADO'

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja diferente de 4, inserimos string especifica em OUT_AMOST
c4 = (df_train['AMOSTRA'] == 1) & (df_train['TP_AMOSTRA'].isin([1,2,3,5]))
df_train.loc[c4, 'OUT_AMOST'] = 'COLETA_FEITA_TIPO_DEFINIDO'

# Caso AMOSTRA seja 2 ou 9, inserimos string especifica em OUT_AMOSTRA
c5 = (df_train['AMOSTRA'].isin([2, 9]))
df_train.loc[c5, 'OUT_AMOST'] = 'COLETA_NAO_REALIZADA_OU_IGNORADA'

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja nulo, inserimos 9
c6 = (df_train['AMOSTRA'] == 1) & (df_train['TP_AMOSTRA'].isna())
df_train.loc[c6, 'TP_AMOSTRA'] = 9

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja diferente de 4, inserimos string especifica em OUT_AMOST
c7 = (df_train['AMOSTRA'] == 1) & (df_train['TP_AMOSTRA'] != 4)
df_train.loc[c7, 'OUT_AMOST'] = 'COLETA_FEITA_TIPO_IGNORADO'

In [151]:
# Caso seja nulo em AMOSTRA, será considerado que a coleta foi ignorada
c1 = df_test['AMOSTRA'].isna()
df_test.loc[c1, 'AMOSTRA'] = 9

# Caso AMOSTRA seja 2 ou 9 (não e ignorado respectivamente), ignoramos o tipo de amostra
c2 = df_test['AMOSTRA'].isin([2, 9])
df_test.loc[c2, 'TP_AMOSTRA'] = 9

# Caso TP_AMOSTRA seja 4 (outra) e não tenha informação em OUT_AMOST colocamos uma string especifica
c3 = (df_test['TP_AMOSTRA'] == 4) & (df_test['OUT_AMOST'].isna())
df_test.loc[c3, 'OUT_AMOST'] = 'TIPO_AMOST_NAO_ESPECIFICADO'

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja diferente de 4, inserimos string especifica em OUT_AMOST
c4 = (df_test['AMOSTRA'] == 1) & (df_test['TP_AMOSTRA'].isin([1,2,3,5]))
df_test.loc[c4, 'OUT_AMOST'] = 'COLETA_FEITA_TIPO_DEFINIDO'

# Caso AMOSTRA seja 2 ou 9, inserimos string especifica em OUT_AMOSTRA
c5 = (df_test['AMOSTRA'].isin([2, 9]))
df_test.loc[c5, 'OUT_AMOST'] = 'COLETA_NAO_REALIZADA_OU_IGNORADA'

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja nulo, inserimos 9
c6 = (df_test['AMOSTRA'] == 1) & (df_test['TP_AMOSTRA'].isna())
df_test.loc[c6, 'TP_AMOSTRA'] = 9

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja diferente de 4, inserimos string especifica em OUT_AMOST
c7 = (df_test['AMOSTRA'] == 1) & (df_test['TP_AMOSTRA'] != 4)
df_test.loc[c7, 'OUT_AMOST'] = 'COLETA_FEITA_TIPO_IGNORADO'

Melhorar colunas referente a OUTROS II

In [152]:
# Gerar listas hierarquicas especificas para consolidar categorias em tipo de amostra X (prioridade de baixo para cima)
outros_amost_nasof = [c for c in outros_amost if 'NASOFARINGE' in c or 'NASO' in c or 'NF' in c or 'NOSOFARINGE' in c]
outros_amost_rapid = [c for c in outros_amost if 'TESTE RAPIDO' in c or 'RAPIDO' in c or 'TR' in c or 'ANTIGEN' in c]
outros_amost_blood = [c for c in outros_amost if 'SANGUE' in c or 'SAGUE' in c]
outros_amost_swab = [c for c in outros_amost if 'SWAB' in c]
outros_amost_soro = [c for c in outros_amost if 'SORO' in c]
outros_amost_tpcr = [c for c in outros_amost if 'PCR' in c]
outros_amost_nasal = [c for c in outros_amost if 'NASAL' in c]


# Especificar novas categorias de agrupamento por ordem de prioridade
outros_amost_nCats = ['REL_NASOFARINGE', 'REL_TESTE_RAPIDO','REL_PCR', 'REL_TESTE_SANGUE',
                      'REL_SWAB', 'REL_SORO', 'REL_NASAL']
outros_amost_oCats = [outros_amost_nasof, outros_amost_rapid, outros_amost_tpcr, outros_amost_blood,
                      outros_amost_swab, outros_amost_soro, outros_amost_nasal]

# Criar colunas
df_train['AMOSTRA_OUT_NOVO'] = np.nan
df_test['AMOSTRA_OUT_NOVO'] = np.nan

# Aplicar novas categorias evitando sobreposição
for n_cat, o_cats in zip(outros_amost_nCats, outros_amost_oCats):
    df_train.loc[(df_train['OUT_AMOST'].isin(o_cats)) & (df_train['AMOSTRA_OUT_NOVO'].isna()), 'AMOSTRA_OUT_NOVO'] = n_cat
    df_test.loc[(df_test['OUT_AMOST'].isin(o_cats)) & (df_test['AMOSTRA_OUT_NOVO'].isna()), 'AMOSTRA_OUT_NOVO'] = n_cat

# Converter todas as demais categorias restantes provalvemente raras em umas só para reduzir ruído ao modelo
c1 = (~df_train['AMOSTRA_OUT_NOVO'].isin(outros_amost_nCats)) & (df_train['OUT_AMOST'].notnull())
df_train.loc[c1, 'AMOSTRA_OUT_NOVO'] = 'REL_OUTROS_RESULTADOS'

c1 = (~df_test['AMOSTRA_OUT_NOVO'].isin(outros_amost_nCats)) & (df_test['OUT_AMOST'].notnull())
df_test.loc[c1, 'AMOSTRA_OUT_NOVO'] = n_cat

**Variáveis relacionadas ao SINTOMA**

In [153]:
# Definir colunas de análise nesta seção
cols = ['SEM_PRI','SEM_NOT','FEBRE','TOSSE','GARGANTA','DISPNEIA','DESC_RESP','SATURACAO','DIARREIA','VOMITO',
        'OUTRO_SIN','OUTRO_DES','PERD_PALA','PERD_OLFT','DOR_ABD','FADIGA']

In [154]:
# Checar nulos
df_train[cols].isna().sum()

SEM_PRI           0
SEM_NOT           0
FEBRE        231370
TOSSE        172085
GARGANTA     396565
DISPNEIA     176050
DESC_RESP    261255
SATURACAO    223622
DIARREIA     414223
VOMITO       422753
OUTRO_SIN    395283
OUTRO_DES    904480
PERD_PALA    434644
PERD_OLFT    434124
DOR_ABD      439039
FADIGA       389988
dtype: int64

In [155]:
# Criar coluna de acumulo de sintomas em dois formatos: progressivo
cols_sin = ['FEBRE','TOSSE','GARGANTA','DISPNEIA','DESC_RESP','SATURACAO','DIARREIA','VOMITO',
            'PERD_PALA','PERD_OLFT','DOR_ABD','FADIGA','OUTRO_SIN']
df_train['ACUMULADO_SINTOMAS'] = (df_train[cols_sin] == 1).sum(axis=1)
df_test['ACUMULADO_SINTOMAS'] = (df_test[cols_sin] == 1).sum(axis=1)

**Nota** - Não tem muito o que trabalhar nessas variáveis em relação a imputação.

**Variáveis relacionadas ao HOSPITAL**

In [156]:
# Definir colunas de análise nesta seção
cols = ['HOSPITAL','UTI','SUPORT_VEN','NOSOCOMIAL']

In [157]:
# Checar nulos
df_train[cols].isna().sum()

HOSPITAL       26483
UTI           162527
SUPORT_VEN    164962
NOSOCOMIAL    201505
dtype: int64

Imputar variáveis a partir de condições lógicas

In [158]:
# Caso HOSPITAL seja nulo, inserir como Sim caso certas condições sejam apresentadas
c1 = df_train['HOSPITAL'].isna()
c2 = (df_train['UTI'] == 1) | (df_train['SUPORT_VEN'].isin([1, 2])) | (df_train['NOSOCOMIAL'] == 1)
df_train.loc[c1 & c2, 'HOSPITAL'] = 1

# Caso HOSPITAL seja 1 e NOSOCOMIAL nulo, inserir 9
c3 = (df_train['HOSPITAL'] == 1) & (df_train['NOSOCOMIAL'].isna())
df_train.loc[c3, 'NOSOCOMIAL'] = 9

In [159]:
# Caso HOSPITAL seja nulo, inserir como Sim caso certas condições sejam apresentadas
c1 = df_test['HOSPITAL'].isna()
c2 = (df_test['UTI'] == 1) | (df_test['SUPORT_VEN'].isin([1, 2])) | (df_test['NOSOCOMIAL'] == 1)
df_test.loc[c1 & c2, 'HOSPITAL'] = 1

# Caso HOSPITAL seja 1 e NOSOCOMIAL nulo, inserir 9
c3 = (df_test['HOSPITAL'] == 1) & (df_test['NOSOCOMIAL'].isna())
df_test.loc[c3, 'NOSOCOMIAL'] = 9

**Variáveis relacionadas ao MEDICACAO**

In [160]:
# Definir colunas de análise nesta seção
cols = ['VACINA','ANTIVIRAL','TP_ANTIVIR']

In [161]:
# Checar nulos
df_train[cols].isna().sum()

VACINA         384795
ANTIVIRAL      224350
TP_ANTIVIR    1268247
dtype: int64

In [162]:
# Caso ANTIVIRAL seja nulo, inserir como Sim caso certas condições sejam apresentadas
c1 = (df_train['ANTIVIRAL'].isna()) & (df_train['TP_ANTIVIR'].isin([1,2,3]))
df_train.loc[c1, 'ANTIVIRAL'] = 1

In [163]:
# Caso ANTIVIRAL seja, inserir como Não em TP_ANTIVIR como 99
c1 = (df_train['ANTIVIRAL'] == 2) & (df_train['TP_ANTIVIR'].isna())
df_train.loc[c1, 'TP_ANTIVIR'] = 99

**Variáveis relacionadas ao PACIENTE_SAUDE_DETALHE**

In [164]:
# Definir colunas de análise nesta seção
cols = ['FATOR_RISC','CARDIOPATI','HEMATOLOGI','SIND_DOWN','HEPATICA','ASMA','DIABETES','NEUROLOGIC','PNEUMOPATI',
        'IMUNODEPRE','RENAL','OBESIDADE','OBES_IMC','OUT_MORBI','MORB_DESC']

In [165]:
# Especificar colunas de SAUDE_DETALHE
cols_saude = ['CARDIOPATI','HEMATOLOGI','SIND_DOWN','HEPATICA','ASMA','DIABETES','NEUROLOGIC','PNEUMOPATI',
              'IMUNODEPRE','RENAL','OBESIDADE','OUT_MORBI','PUERPERA']

# Criar acumulado
df_train['ACUMULADO_SAUDE'] = (df_train[cols_saude] == 1).sum(axis=1)
df_test['ACUMULADO_SAUDE'] = (df_test[cols_saude] == 1).sum(axis=1)

In [166]:
# Criar outras variações
df_train['ACUMULADO_SAUDE_MAIS_SINTOMA'] = df_train[['ACUMULADO_SAUDE','ACUMULADO_SINTOMAS']].sum(axis=1)
df_test['ACUMULADO_SAUDE_MAIS_SINTOMA'] = df_test[['ACUMULADO_SAUDE','ACUMULADO_SINTOMAS']].sum(axis=1)

df_train['ACUMULADO_SAUDE_SINTOMA_VACCOV'] = df_train['ACUMULADO_SAUDE_MAIS_SINTOMA']+(df_train['VACINA_COV'] == 1).astype(int)
df_test['ACUMULADO_SAUDE_SINTOMA_VACCOV'] = df_test['ACUMULADO_SAUDE_MAIS_SINTOMA']+(df_test['VACINA_COV'] == 1).astype(int)

In [167]:
# Caso FATOR de RISCO seja 2, então para as variáveis saúde inserir 2 indicando ausência nas de fator de riso
df_train.loc[df_train['FATOR_RISC'] == 2, cols_saude] = 2
df_train.loc[(df_train['FATOR_RISC'] == 2)&(df_train['PUERPERA'] != 99), 'PUERPERA'] = 2

df_test.loc[df_test['FATOR_RISC'] == 2, cols_saude] = 2
df_test.loc[(df_test['FATOR_RISC'] == 2)&(df_test['PUERPERA'] != 99), 'PUERPERA'] = 2

In [168]:
# Caso tenha outro tipo de fator de risco porém sem info, adicionar string específica
df_train.loc[(df_train['OUT_MORBI'] == 1)&(df_train['MORB_DESC'].isna()), 'MORB_DESC'] = 'FATOR_RISCO_NAO_DESCRITO'
df_test.loc[(df_test['OUT_MORBI'] == 1)&(df_test['MORB_DESC'].isna()), 'MORB_DESC'] = 'FATOR_RISCO_NAO_DESCRITO'

In [169]:
# Caso tenha nulo em MORB_DESC inserir string de ignorado
df_train.loc[df_train['MORB_DESC'].isna(), 'MORB_DESC'] = 'IGNORADO'
df_test.loc[df_test['MORB_DESC'].isna(), 'MORB_DESC'] = 'IGNORADO'

**Variáveis relacionadas ao PACIENTE_INFORMACAO**

In [170]:
# Definir colunas de análise nesta seção
cols = ['SG_UF','SG_UF_NOT','ID_REGIONA','CO_REGIONA','ID_MUNICIP','CO_MUN_NOT','CS_SEXO','NU_IDADE_N','TP_IDADE',
        'CS_RACA','CS_ESCOL_N','CS_ZONA','AVE_SUINO','OUT_ANIM']

In [171]:
# Checar nulos
(100*df_train[cols].isna().sum()/len(df_train)).round(2)

SG_UF          0.02
SG_UF_NOT      0.00
ID_REGIONA    11.36
CO_REGIONA    11.36
ID_MUNICIP     0.00
CO_MUN_NOT     0.00
CS_SEXO        0.00
NU_IDADE_N     0.00
TP_IDADE       0.00
CS_RACA        0.00
CS_ESCOL_N    34.98
CS_ZONA       10.36
AVE_SUINO     16.90
OUT_ANIM      99.91
dtype: float64

In [172]:
# Imputar SG_UF através da versão not
c1 = (df_train['SG_UF'].isna())&(df_train['SG_UF_NOT'].notnull())
df_train.loc[c1, 'SG_UF'] = df_train.loc[c1, 'SG_UF_NOT']

c1 = (df_test['SG_UF'].isna())&(df_test['SG_UF_NOT'].notnull())
df_test.loc[c1, 'SG_UF'] = df_test.loc[c1, 'SG_UF_NOT']

In [173]:
# Definir colunas para derrubar - redundantes e repetidas
cols_to_drop = ['ID_REGIONA','CO_REGIONA','SG_UF','CO_MUN_NOT']

df_train.drop(columns=cols_to_drop, inplace=True)
df_test.drop(columns=cols_to_drop, inplace=True)

In [174]:
# Definir colunas de análise nesta seção novamente
cols = [c for c in cols if c not in cols_to_drop]

Aqui vamos otimizar a coluna `AVES_SUINO` unificando ela a `OUT_ANIM` criando uma nova variável:

In [175]:
# Criar
outros_anim = list(set(df_train['OUT_ANIM'].dropna().unique().tolist() + df_test['OUT_ANIM'].dropna().unique().tolist()))

In [176]:
# Gerar listas hierarquicas especificas para consolidar categorias em tipo de amostra X (prioridade de baixo para cima)
outros_anim_dog = [c for c in outros_anim if 'CACHOR' in c or 'CANINO' in c or 'CAES' in c or 'CAO' in c or 'CAHORRO' in c]
outros_anim_cat = [c for c in outros_anim if 'GATO' in c or 'FELIN' in c]
outros_anim_boi = [c for c in outros_anim if 'GADO' in c or 'BOVINO' in c or 'BOI' in c or 'VACA' in c]
outros_anim_gal = [c for c in outros_anim if 'GALINHA' in c]

# Especificar novas categorias de agrupamento por ordem de prioridade
outros_anim_nCats = ['REL_CACHORRO', 'REL_GATO','REL_BOI', 'REL_GALINHA']
outros_anim_oCats = [outros_anim_dog, outros_anim_cat, outros_anim_boi, outros_anim_gal]

# Criar colunas
df_train['ANIM_OUT_NOVO'] = np.nan
df_test['ANIM_OUT_NOVO'] = np.nan

# Aplicar novas categorias evitando sobreposição
for n_cat, o_cats in zip(outros_anim_nCats, outros_anim_oCats):
    df_train.loc[(df_train['OUT_ANIM'].isin(o_cats)) & (df_train['ANIM_OUT_NOVO'].isna()), 'ANIM_OUT_NOVO'] = n_cat
    df_test.loc[(df_test['OUT_ANIM'].isin(o_cats)) & (df_test['ANIM_OUT_NOVO'].isna()), 'ANIM_OUT_NOVO'] = n_cat

# Converter todas as demais categorias restantes provalvemente raras em umas só para reduzir ruído ao modelo
c1 = (~df_train['ANIM_OUT_NOVO'].isin(outros_anim_nCats)) & (df_train['OUT_ANIM'].notnull())
df_train.loc[c1, 'ANIM_OUT_NOVO'] = 'REL_OUTROS_RESULTADOS'

c1 = (~df_test['ANIM_OUT_NOVO'].isin(outros_anim_nCats)) & (df_test['OUT_ANIM'].notnull())
df_test.loc[c1, 'ANIM_OUT_NOVO'] = n_cat

In [177]:
# Caso AVE_SUINO seja vazio, inserir ignorado
df_train['AVE_SUINO'].fillna(9, inplace=True)
df_test['AVE_SUINO'].fillna(9, inplace=True)

In [178]:
# Converter AVE_SUINO em string
c1 = (df_train['AVE_SUINO'] == 1)
df_train.loc[c1, 'AVE_SUINO_NOVO'] = 'CONVIVIO_AVE_SUINO'

c2 = (df_train['AVE_SUINO'] == 2)
df_train.loc[c2, 'AVE_SUINO_NOVO'] = 'SEM_CONVIVIO_ANIMAL'

c3 = (df_train['AVE_SUINO'] == 3) & (df_train['ANIM_OUT_NOVO'].notnull())
df_train.loc[c3, 'AVE_SUINO_NOVO'] = df_train.loc[c3, 'ANIM_OUT_NOVO']

c4 = (df_train['AVE_SUINO'] == 3) & (df_train['ANIM_OUT_NOVO'].isna())
df_train.loc[c4, 'AVE_SUINO_NOVO'] = 'CONVIVIO_NAO_DETALHADO'

c5 = (df_train['AVE_SUINO'] == 9)
df_train.loc[c5, 'AVE_SUINO_NOVO'] = 'CONVIVIO_IGNORADO'

# Visualizar nova coluna
df_train['AVE_SUINO_NOVO'].value_counts(dropna=False)

SEM_CONVIVIO_ANIMAL      879296
CONVIVIO_IGNORADO        409145
CONVIVIO_AVE_SUINO        11375
REL_CACHORRO                843
REL_GATO                    158
REL_OUTROS_RESULTADOS        77
REL_BOI                      68
REL_GALINHA                  18
Name: AVE_SUINO_NOVO, dtype: int64

In [179]:
# Converter AVE_SUINO em string
c1 = (df_test['AVE_SUINO'] == 1)
df_test.loc[c1, 'AVE_SUINO_NOVO'] = 'CONVIVIO_AVE_SUINO'

c2 = (df_test['AVE_SUINO'] == 2)
df_test.loc[c2, 'AVE_SUINO_NOVO'] = 'SEM_CONVIVIO_ANIMAL'

c3 = (df_test['AVE_SUINO'] == 3) & (df_test['ANIM_OUT_NOVO'].notnull())
df_test.loc[c3, 'AVE_SUINO_NOVO'] = df_test.loc[c3, 'ANIM_OUT_NOVO']

c4 = (df_test['AVE_SUINO'] == 3) & (df_test['ANIM_OUT_NOVO'].isna())
df_test.loc[c4, 'AVE_SUINO_NOVO'] = 'CONVIVIO_NAO_DETALHADO'

c5 = (df_test['AVE_SUINO'] == 9)
df_test.loc[c5, 'AVE_SUINO_NOVO'] = 'CONVIVIO_IGNORADO'

# Visualizar nova coluna
df_test['AVE_SUINO_NOVO'].value_counts(dropna=False)

SEM_CONVIVIO_ANIMAL    586685
CONVIVIO_IGNORADO      272295
CONVIVIO_AVE_SUINO       7558
REL_CACHORRO              561
REL_GATO                  106
REL_GALINHA                63
REL_BOI                    53
Name: AVE_SUINO_NOVO, dtype: int64

In [180]:
# Derrubar colunas
cols_to_drop = ['AVE_SUINO', 'OUT_ANIM','ANIM_OUT_NOVO']

df_train.drop(columns=cols_to_drop, inplace=True)
df_test.drop(columns=cols_to_drop, inplace=True)

In [181]:
# Modificar cols novamente
cols = ['SG_UF_NOT','ID_MUNICIP','CS_ZONA','CS_SEXO','CS_RACA','NU_IDADE_N','TP_IDADE','CS_ESCOL_N','AVE_SUINO_NOVO']

In [182]:
# Checar nulos
(100*df_train[cols].isna().sum()/len(df_train)).round(2).sort_values(ascending=False)

CS_ESCOL_N        34.98
CS_ZONA           10.36
SG_UF_NOT          0.00
ID_MUNICIP         0.00
CS_SEXO            0.00
CS_RACA            0.00
NU_IDADE_N         0.00
TP_IDADE           0.00
AVE_SUINO_NOVO     0.00
dtype: float64

### **Listar variáveis de acordo a natureza**

In [183]:
# Listar todas as colunas que são categóricas por definição e que podem estar codificadas como numéricas
cols_cat = ['SG_UF_NOT', 'ID_MUNICIP', 'CS_ZONA', 'SURTO_SG',                                              # LOCALIDADE
            'CS_SEXO', 'CS_GESTANT', 'CS_RACA', 'CS_ESCOL_N', 'HISTO_VGM','AVE_SUINO_NOVO',                # INFO PACIENTE
            'FEBRE', 'TOSSE', 'GARGANTA', 'DISPNEIA', 'DESC_RESP', 'SATURACAO', 'DIARREIA', 'VOMITO',      # SINTOMA
            'OUTRO_SIN', 'OUTRO_DES', 'DOR_ABD', 'FADIGA', 'PERD_OLFT', 'PERD_PALA',                       # SINTOMA
            'FATOR_RISC', 'PUERPERA', 'CARDIOPATI', 'HEMATOLOGI', 'SIND_DOWN', 'HEPATICA', 'ASMA',         # FATOR DE RISCO
            'DIABETES', 'NEUROLOGIC', 'PNEUMOPATI', 'IMUNODEPRE', 'RENAL', 'OBESIDADE',                    # FATOR DE RISCO 
            'MORB_DESC','OUT_MORBI',                                                                       # FATOR DE RISCO
            'RAIOX_RES',  'RAIOX_OUT_NOVO', 'TOMO_RES', 'TOMO_OUT_NOVO',                                   # EXAME
            #'TOMO_OUT', 'RAIOX_OUT',
            'VACINA', 'ANTIVIRAL', 'TP_ANTIVIR',                                                           # GRIPE / ANTI VIRAL
            'VACINA_COV',                                                                                  # VACINA COVID
            'HOSPITAL', 'UTI', 'SUPORT_VEN', 'NOSOCOMIAL',                                                 # HOSPITALIZAÇÃO
            'AMOSTRA', 'TP_AMOSTRA', 'AMOSTRA_OUT_NOVO',                                                   # COLETA AMOSTRA
            #'OUT_AMOST', 
            'MAE_VAC', 'M_AMAMENTA']                                                                       # MAE RELACIONADO

# Listar colunas numéricas
cols_num = ['SEM_NOT','SEM_PRI',                      # INFECCAO
            'NU_IDADE_N','TP_IDADE',                  # INFO PACIENTE
            'DOSE_COV_N',                             # QTD DOSES TOMADAS
            'ACUMULADO_SINTOMAS',                     # ACUMULADO SINTOMAS
            'ACUMULADO_SAUDE',                        # ACUMULADO FATOR DE RISCO
            'ACUMULADO_SAUDE_MAIS_SINTOMA',           # ACUMULADO SINTOMAS + FATOR DE RISCO
            'ACUMULADO_SAUDE_SINTOMA_VACCOV']         # ACUMULADO SINTOMAS + FATOR DE RISCO + DOSES VACINA
# Listar colunas binárias por definição
cols_bin = ['DOSE_1_COV','DOSE_2_COV', 'DOSE_REF']    # VACINA COVID

### Codificar corretamente variáveis categóricas

In [184]:
# Especificar dicionário para casos especificos
de_para_cat = {'CS_ZONA': {1:'URBANA', 2:'RURAL', 3:'PERIURBANA',9:'URBANA'},
               'HISTO_VGM':{1:'SIM', 2:'NAO', 9:'IGNORADO', 0:'IGNORADO'},
               'CS_GESTANT': {1:'PRI_TRI',2:'SEG_TRI',3:'TRC_TRI',4:'IDD_GEST_IGNORADA',5:'NAO',6:'NAO_APLICA',9:'IGNORADO'},
               'CS_RACA': {1:'BRANCA',2:'PRETA',3:'AMARELA',4:'PARDA',5:'INDIGENA',9:'IGNORADO'},
               'CS_ESCOL_N': {0:'ANALFAB',1:'1_5_SERIE',2:'6_9_SERIE',3:'ENS_MEDIO',4:'ENS_SUP',5:'NAO_APLICA',9:'IGNORADO'},
               'PUERPERA': {1:'SIM', 2:'NAO', 9:'IGNORADO',99:'NAO_SE_APLICA_HOMEM'},
               'RAIOX_RES': {1:'NORMAL',2:'INFILTRADO_INTERSTICIAL',3:'CONSOLIDACAO',
                             4:'MISTO',5:'OUTRO',6:'NAO_FEITO',9:'IGNORADO'},
               'TOMO_RES': {1:'TIPICO_COV19',2:'INDETERMINADO_COV19',3:'ATIPICO_COV19',
                            4:'NEGATIVO_PNM',5:'OUTRO',6:'NAO_FEITO',9:'IGNORADO'},
               'TP_ANTIVIR': {1:'OSELTAMIVIR',2:'ZANAMIVIR',3:'OUTRO'},
               'SUPORT_VEN': {1:'SIM_INVASIVO',2:'SIM_NAO_INVASIVO',3:'NAO',9:'IGNORADO'},
               'TP_AMOSTRA': {1:'SECREC_NASO_OROFARINGE',2:'LAVADO_BROCO_ALVEOLAR',3:'TECIDO_POST_MORTEM',
                              4:'OUTRA',5:'LCR',9:'IGNORADO'},
               'MAE_VAC': {1:'SIM', 2:'NAO', 9:'IGNORADO',99:'NAO_SE_APLICA_HOMEM'},
               'M_AMAMENTA': {1:'SIM', 2:'NAO', 9:'IGNORADO',99:'NAO_SE_APLICA_HOMEM'}}

# Loop para modificar
for c, d in de_para_cat.items():
    df_train[c].replace(d, inplace=True)
    df_test[c].replace(d, inplace=True)

In [185]:
# Especificar lista de colunas que possuem seus valores numéricos equivalentes as mesmas strings
cols_cat_gen = ['FEBRE', 'TOSSE', 'GARGANTA', 'DISPNEIA', 'DESC_RESP', 'SATURACAO', 'DIARREIA', 'VOMITO',
                'OUTRO_SIN', 'DOR_ABD', 'FADIGA', 'PERD_OLFT', 'PERD_PALA', 'FATOR_RISC', 'CARDIOPATI', 
                'HEMATOLOGI', 'SIND_DOWN', 'HEPATICA', 'ASMA', 'DIABETES', 'NEUROLOGIC', 'PNEUMOPATI', 
                'IMUNODEPRE', 'RENAL', 'OBESIDADE', 'OUT_MORBI', 
                 'SURTO_SG','VACINA','ANTIVIRAL','VACINA_COV','HOSPITAL','UTI','NOSOCOMIAL','AMOSTRA']
# Loop para modificar
for c in cols_cat_gen:
    df_train[c].replace({1:'SIM', 2:'NAO', 9:'IGNORADO'}, inplace=True)
    df_test[c].replace({1:'SIM', 2:'NAO', 9:'IGNORADO'}, inplace=True)

In [186]:
# Validação das colunas categoricas corrigida
for c in cols_cat_gen + list(de_para_cat.keys()):
    print(f'##################################### {c}')
    print(df_test[c].unique())

##################################### FEBRE
['SIM' nan 'NAO' 'IGNORADO']
##################################### TOSSE
['SIM' nan 'NAO' 'IGNORADO']
##################################### GARGANTA
['NAO' nan 'SIM' 'IGNORADO']
##################################### DISPNEIA
['NAO' nan 'SIM' 'IGNORADO']
##################################### DESC_RESP
['SIM' nan 'NAO' 'IGNORADO']
##################################### SATURACAO
['NAO' nan 'SIM' 'IGNORADO']
##################################### DIARREIA
['NAO' nan 'IGNORADO' 'SIM']
##################################### VOMITO
['NAO' nan 'SIM' 'IGNORADO']
##################################### OUTRO_SIN
['NAO' nan 'SIM' 'IGNORADO']
##################################### DOR_ABD
['NAO' nan 'SIM' 'IGNORADO']
##################################### FADIGA
['NAO' nan 'SIM' 'IGNORADO']
##################################### PERD_OLFT
['NAO' nan 'IGNORADO' 'SIM']
##################################### PERD_PALA
['NAO' nan 'IGNORADO' 'SIM']
##################

### Aplicar + criação de variáveis

In [187]:
# Quando a notificao foi na semana dos primeiros sintomas
df_train['SEM_PRI_NOT_IGUAIS'] = (df_train['SEM_PRI'] == df_train['SEM_NOT']).astype(int)
df_test['SEM_PRI_NOT_IGUAIS'] = (df_test['SEM_PRI'] == df_test['SEM_NOT']).astype(int)

# Calcular diff absoluta
df_train['SEM_PRI_NOT_DIFF_ABS'] = (df_train['SEM_PRI'] - df_train['SEM_NOT']).abs()
df_test['SEM_PRI_NOT_DIFF_ABS'] = (df_test['SEM_PRI'] - df_test['SEM_NOT']).abs()

In [188]:
# Apontar realização de Raio X + TC e Ambos
df_train['FEZ_RAIOX'] = df_train['RAIOX_RES'].isin(['NORMAL','INFILTRADO_INTERSTICIAL','CONSOLIDACAO',
                                                    'MISTO','OUTRO']).astype(int)
df_train['FEZ_TOMOG'] = df_train['TOMO_RES'].isin(['TIPICO_COV19','INDETERMINADO_COV19','ATIPICO_COV19',
                                                   'NEGATIVO_PNM','OUTRO']).astype(int)
df_train['FEZ_RAIOX_TOMOG'] = df_train['FEZ_RAIOX'] + df_train['FEZ_TOMOG']

# Apontar realização de Raio X + TC e Ambos
df_test['FEZ_RAIOX'] = df_test['RAIOX_RES'].isin(['NORMAL','INFILTRADO_INTERSTICIAL','CONSOLIDACAO',
                                                    'MISTO','OUTRO']).astype(int)
df_test['FEZ_TOMOG'] = df_test['TOMO_RES'].isin(['TIPICO_COV19','INDETERMINADO_COV19','ATIPICO_COV19',
                                                   'NEGATIVO_PNM','OUTRO']).astype(int)
df_test['FEZ_RAIOX_TOMOG'] = df_test['FEZ_RAIOX'] + df_test['FEZ_TOMOG']

In [189]:
# Apontar TOMOG que apresentaram covid
df_train['TOMOG_COV19'] = ((df_train['TOMO_RES'] == 'TIPICO_COV19')|(df_train['TOMO_OUT_NOVO'] == 'REL_COVID')).astype(int)
df_test['TOMOG_COV19'] = ((df_test['TOMO_RES'] == 'TIPICO_COV19')|(df_test['TOMO_OUT_NOVO'] == 'REL_COVID')).astype(int)

In [190]:
# Aplicar a unidade de fato em TP para diferenciar melhor os valores
df_train.loc[df_train['TP_IDADE'] == 1, 'TP_IDADE'] = 1
df_train.loc[df_train['TP_IDADE'] == 2, 'TP_IDADE'] = 30
df_train.loc[df_train['TP_IDADE'] == 3, 'TP_IDADE'] = 365

df_test.loc[df_test['TP_IDADE'] == 1, 'TP_IDADE'] = 1
df_test.loc[df_test['TP_IDADE'] == 2, 'TP_IDADE'] = 30
df_test.loc[df_test['TP_IDADE'] == 3, 'TP_IDADE'] = 365

### Listar novamente todas as variáveis por natureza II

Adicionar as criadas anteriormente.

In [191]:
# Checar variáveis faltantes na listagem
[c for c in df_train.columns if c not in cols_cat + cols_num + cols_bin and c != 'CLASSI_FIN']

['RAIOX_OUT',
 'OUT_AMOST',
 'TOMO_OUT',
 'SEM_PRI_NOT_IGUAIS',
 'SEM_PRI_NOT_DIFF_ABS',
 'FEZ_RAIOX',
 'FEZ_TOMOG',
 'FEZ_RAIOX_TOMOG',
 'TOMOG_COV19']

In [192]:
# Adicionar de acordo a natureza
cols_num += ['SEM_PRI_NOT_IGUAIS','SEM_PRI_NOT_DIFF_ABS','FEZ_RAIOX_TOMOG']
cols_bin += ['FEZ_RAIOX','FEZ_TOMOG','TOMOG_COV19']

In [193]:
# Checar variáveis faltantes na listagem
[c for c in df_train.columns if c not in cols_cat + cols_num + cols_bin and c != 'CLASSI_FIN']

['RAIOX_OUT', 'OUT_AMOST', 'TOMO_OUT']

### Trocar `IGNORADO` por nulo para reduzir complexidade

In [194]:
# Aplicar
#df_train.replace({'IGNORADO':np.nan}, inplace=True)
#df_test.replace({'IGNORADO':np.nan}, inplace=True)

# Divisão de treino e validação

In [195]:
# Separar dados em X e Y
X = df_train[cols_num + cols_bin + cols_cat].copy()
Y = df_train['CLASSI_FIN'].copy()

In [196]:
# Separar
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.20, random_state=1206, stratify=Y)

In [197]:
# Criar teste seguindo a mesma ordem das colunas
x_test = df_test[cols_num + cols_bin + cols_cat].copy()

In [198]:
# Liberar espaço deletando dataframes antigos
del X, Y, df_test, df_train

In [199]:
# Checar distribuição de Y
y_train.value_counts(normalize=True)

5    0.666859
4    0.292403
2    0.024164
1    0.012012
3    0.004562
Name: CLASSI_FIN, dtype: float64

In [200]:
# Checar distribuição de Y
y_valid.value_counts(normalize=True)

5    0.666859
4    0.292403
2    0.024163
1    0.012014
3    0.004562
Name: CLASSI_FIN, dtype: float64

# Imputação de dados

Estratégia:
- Categóricas: Imputar string "INFO_FALTANTE"
- Numéricas e Binárias: Imputar a moda

In [201]:
# Aplicar estratégia para categoricas
for c in cols_cat:
    x_train.loc[x_train[c].isna(), c] = 'INFO_FALTANTE'
    x_valid.loc[x_valid[c].isna(), c] = 'INFO_FALTANTE'
    x_test.loc[x_test[c].isna(), c] = 'INFO_FALTANTE'

In [202]:
# Aplicar estratégia para numéricas
for c in cols_num + cols_bin:
    # Calcular a moda no treino
    valor_moda = x_train[c].dropna().mode()[0]
    print(f'> {c} possui moda de {valor_moda}')
    # Imputar
    x_train.loc[x_train[c].isna(), c] = valor_moda
    x_valid.loc[x_valid[c].isna(), c] = valor_moda
    x_test.loc[x_test[c].isna(), c] = valor_moda

> SEM_NOT possui moda de 12
> SEM_PRI possui moda de 9
> NU_IDADE_N possui moda de 0
> TP_IDADE possui moda de 365
> DOSE_COV_N possui moda de 0
> ACUMULADO_SINTOMAS possui moda de 4
> ACUMULADO_SAUDE possui moda de 0
> ACUMULADO_SAUDE_MAIS_SINTOMA possui moda de 5
> ACUMULADO_SAUDE_SINTOMA_VACCOV possui moda de 5
> SEM_PRI_NOT_IGUAIS possui moda de 0
> SEM_PRI_NOT_DIFF_ABS possui moda de 1
> FEZ_RAIOX_TOMOG possui moda de 1
> DOSE_1_COV possui moda de 0
> DOSE_2_COV possui moda de 0
> DOSE_REF possui moda de 0
> FEZ_RAIOX possui moda de 0
> FEZ_TOMOG possui moda de 0
> TOMOG_COV19 possui moda de 0


# Modelagem

**Preparar dados**

In [203]:
# Criar conjuntos de treino Y binários
y1_train = (y_train == 1).astype(int)
y2_train = (y_train == 2).astype(int)
y3_train = (y_train == 3).astype(int)
y4_train = (y_train == 4).astype(int)
y5_train = (y_train == 5).astype(int)

In [204]:
# Criar conjuntos de validação Y binários
y1_valid = (y_valid == 1).astype(int)
y2_valid = (y_valid == 2).astype(int)
y3_valid = (y_valid == 3).astype(int)
y4_valid = (y_valid == 4).astype(int)
y5_valid = (y_valid == 5).astype(int)

In [205]:
# Converter para categorical type
for c in cols_cat:
    x_train[c] = x_train[c].astype('category')
    x_valid[c] = pd.Categorical(x_valid[c], x_train[c].astype('category').cat.categories) 
    x_test[c] = pd.Categorical(x_test[c], x_train[c].astype('category').cat.categories)

**Validação de Modelos já estabelecidos**

[Referência](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.integration.lightgbm.LightGBMTuner.html)

[Parâmetros](https://lightgbm.readthedocs.io/en/latest/Parameters.html)

In [207]:
# Listar nomes das variáveis
feats_all = cols_num + cols_bin + cols_cat
feats_cat = cols_cat

In [208]:
# Converter X e Y para formato de dataset do lightgbm - treino
ds1_train = lgb.Dataset(data=x_train, label=y1_train, feature_name=feats_all, categorical_feature=feats_cat)
ds2_train = lgb.Dataset(data=x_train, label=y2_train, feature_name=feats_all, categorical_feature=feats_cat)
ds3_train = lgb.Dataset(data=x_train, label=y3_train, feature_name=feats_all, categorical_feature=feats_cat)
ds4_train = lgb.Dataset(data=x_train, label=y4_train, feature_name=feats_all, categorical_feature=feats_cat)
ds5_train = lgb.Dataset(data=x_train, label=y5_train, feature_name=feats_all, categorical_feature=feats_cat)

# Converter X e Y para formato de dataset do lightgbm - validação
ds1_valid = lgb.Dataset(data=x_valid, label=y1_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds2_valid = lgb.Dataset(data=x_valid, label=y2_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds3_valid = lgb.Dataset(data=x_valid, label=y3_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds4_valid = lgb.Dataset(data=x_valid, label=y4_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds5_valid = lgb.Dataset(data=x_valid, label=y5_valid, feature_name=feats_all, categorical_feature=feats_cat)

In [209]:
# Criar versões unificadas para novo treinamento
x_train_valid = pd.concat([x_train, x_valid])
y1_train_valid = pd.concat([y1_train, y1_valid])
y2_train_valid = pd.concat([y2_train, y2_valid])
y3_train_valid = pd.concat([y3_train, y3_valid])
y4_train_valid = pd.concat([y4_train, y4_valid])
y5_train_valid = pd.concat([y5_train, y5_valid])

# Converter X e Y para formato de dataset do lightgbm - validação
ds1_train_valid = lgb.Dataset(data=x_train_valid, label=y1_train_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds2_train_valid = lgb.Dataset(data=x_train_valid, label=y2_train_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds3_train_valid = lgb.Dataset(data=x_train_valid, label=y3_train_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds4_train_valid = lgb.Dataset(data=x_train_valid, label=y4_train_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds5_train_valid = lgb.Dataset(data=x_train_valid, label=y5_train_valid, feature_name=feats_all, categorical_feature=feats_cat)

In [210]:
# Lidar com excesso de warnings de usuário não solucionáveis do LGBM
warnings.filterwarnings('ignore')

**Montar modelos para avaliar em prompt a validação**

In [211]:
# Definir número de rounds
nbr = 2500

In [297]:
# Especificar hiper parâmetros finais
hp5 = {'objective':'binary',
       'is_unbalance': False,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 224,
       'feature_fraction': 0.85,
       'early_stopping_round': 25,
       "lambda_l1":8.791369324758216,
       "lambda_l2":0.05026876807406415,
       'seed': 1206,
       'verbosity': -1,
       'num_threads':4}
hp4 = {'objective':'binary',
       'is_unbalance': False,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       'early_stopping_round': 25,
       "lambda_l1":8.791369324758216,
       "lambda_l2":0.05026876807406415,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp3 = {'objective':'binary',
       'is_unbalance': False,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       'early_stopping_round': 25,
       "lambda_l1":8.791369324758216,
       "lambda_l2":0.05026876807406415,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp2 = {'objective':'binary',
       'is_unbalance': False,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       'early_stopping_round': 25,
       "lambda_l1":8.791369324758216,
       "lambda_l2":0.05026876807406415,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp1 = {'objective':'binary',
       'is_unbalance': False,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       'early_stopping_round': 25,
       "lambda_l1":8.791369324758216,
       "lambda_l2":0.05026876807406415,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}

In [298]:
# Treinar 100% modelo da classe 5
m5 = lgb.train(params=hp5, num_boost_round=nbr, train_set=ds5_train, valid_sets=[ds5_valid], 
               verbose_eval=True, categorical_feature=feats_cat)
print('Modelo da classe 5 finalizado')

[1]	valid_0's binary_logloss: 0.632241
Training until validation scores don't improve for 25 rounds
[2]	valid_0's binary_logloss: 0.628736
[3]	valid_0's binary_logloss: 0.624749
[4]	valid_0's binary_logloss: 0.621175
[5]	valid_0's binary_logloss: 0.617342
[6]	valid_0's binary_logloss: 0.613589
[7]	valid_0's binary_logloss: 0.609917
[8]	valid_0's binary_logloss: 0.606326
[9]	valid_0's binary_logloss: 0.602816
[10]	valid_0's binary_logloss: 0.599361
[11]	valid_0's binary_logloss: 0.595987
[12]	valid_0's binary_logloss: 0.59268
[13]	valid_0's binary_logloss: 0.589755
[14]	valid_0's binary_logloss: 0.586573
[15]	valid_0's binary_logloss: 0.58345
[16]	valid_0's binary_logloss: 0.580391
[17]	valid_0's binary_logloss: 0.577382
[18]	valid_0's binary_logloss: 0.574433
[19]	valid_0's binary_logloss: 0.571983
[20]	valid_0's binary_logloss: 0.56914
[21]	valid_0's binary_logloss: 0.566662
[22]	valid_0's binary_logloss: 0.564028
[23]	valid_0's binary_logloss: 0.561317
[24]	valid_0's binary_logloss: 

[202]	valid_0's binary_logloss: 0.393072
[203]	valid_0's binary_logloss: 0.392786
[204]	valid_0's binary_logloss: 0.392504
[205]	valid_0's binary_logloss: 0.392236
[206]	valid_0's binary_logloss: 0.391997
[207]	valid_0's binary_logloss: 0.391729
[208]	valid_0's binary_logloss: 0.391454
[209]	valid_0's binary_logloss: 0.391208
[210]	valid_0's binary_logloss: 0.390948
[211]	valid_0's binary_logloss: 0.390688
[212]	valid_0's binary_logloss: 0.390431
[213]	valid_0's binary_logloss: 0.390177
[214]	valid_0's binary_logloss: 0.389928
[215]	valid_0's binary_logloss: 0.389682
[216]	valid_0's binary_logloss: 0.38944
[217]	valid_0's binary_logloss: 0.389194
[218]	valid_0's binary_logloss: 0.388952
[219]	valid_0's binary_logloss: 0.3887
[220]	valid_0's binary_logloss: 0.388468
[221]	valid_0's binary_logloss: 0.388236
[222]	valid_0's binary_logloss: 0.38801
[223]	valid_0's binary_logloss: 0.387777
[224]	valid_0's binary_logloss: 0.387552
[225]	valid_0's binary_logloss: 0.387327
[226]	valid_0's bina

[403]	valid_0's binary_logloss: 0.36477
[404]	valid_0's binary_logloss: 0.364694
[405]	valid_0's binary_logloss: 0.364632
[406]	valid_0's binary_logloss: 0.364557
[407]	valid_0's binary_logloss: 0.364477
[408]	valid_0's binary_logloss: 0.364418
[409]	valid_0's binary_logloss: 0.364349
[410]	valid_0's binary_logloss: 0.36429
[411]	valid_0's binary_logloss: 0.36421
[412]	valid_0's binary_logloss: 0.364136
[413]	valid_0's binary_logloss: 0.364062
[414]	valid_0's binary_logloss: 0.363996
[415]	valid_0's binary_logloss: 0.363905
[416]	valid_0's binary_logloss: 0.363823
[417]	valid_0's binary_logloss: 0.363754
[418]	valid_0's binary_logloss: 0.363667
[419]	valid_0's binary_logloss: 0.363596
[420]	valid_0's binary_logloss: 0.36352
[421]	valid_0's binary_logloss: 0.363464
[422]	valid_0's binary_logloss: 0.363391
[423]	valid_0's binary_logloss: 0.36332
[424]	valid_0's binary_logloss: 0.363244
[425]	valid_0's binary_logloss: 0.363178
[426]	valid_0's binary_logloss: 0.363119
[427]	valid_0's binar

[604]	valid_0's binary_logloss: 0.354565
[605]	valid_0's binary_logloss: 0.354538
[606]	valid_0's binary_logloss: 0.354507
[607]	valid_0's binary_logloss: 0.354485
[608]	valid_0's binary_logloss: 0.354448
[609]	valid_0's binary_logloss: 0.354419
[610]	valid_0's binary_logloss: 0.354358
[611]	valid_0's binary_logloss: 0.354335
[612]	valid_0's binary_logloss: 0.354306
[613]	valid_0's binary_logloss: 0.354275
[614]	valid_0's binary_logloss: 0.354245
[615]	valid_0's binary_logloss: 0.354212
[616]	valid_0's binary_logloss: 0.354187
[617]	valid_0's binary_logloss: 0.354161
[618]	valid_0's binary_logloss: 0.354132
[619]	valid_0's binary_logloss: 0.354114
[620]	valid_0's binary_logloss: 0.354087
[621]	valid_0's binary_logloss: 0.354052
[622]	valid_0's binary_logloss: 0.354034
[623]	valid_0's binary_logloss: 0.354009
[624]	valid_0's binary_logloss: 0.353982
[625]	valid_0's binary_logloss: 0.353946
[626]	valid_0's binary_logloss: 0.353905
[627]	valid_0's binary_logloss: 0.353879
[628]	valid_0's 

[805]	valid_0's binary_logloss: 0.3497
[806]	valid_0's binary_logloss: 0.349678
[807]	valid_0's binary_logloss: 0.349657
[808]	valid_0's binary_logloss: 0.349644
[809]	valid_0's binary_logloss: 0.349615
[810]	valid_0's binary_logloss: 0.349595
[811]	valid_0's binary_logloss: 0.349575
[812]	valid_0's binary_logloss: 0.349544
[813]	valid_0's binary_logloss: 0.34954
[814]	valid_0's binary_logloss: 0.349531
[815]	valid_0's binary_logloss: 0.349514
[816]	valid_0's binary_logloss: 0.349481
[817]	valid_0's binary_logloss: 0.349471
[818]	valid_0's binary_logloss: 0.349439
[819]	valid_0's binary_logloss: 0.349427
[820]	valid_0's binary_logloss: 0.349409
[821]	valid_0's binary_logloss: 0.349384
[822]	valid_0's binary_logloss: 0.349374
[823]	valid_0's binary_logloss: 0.349338
[824]	valid_0's binary_logloss: 0.349333
[825]	valid_0's binary_logloss: 0.349315
[826]	valid_0's binary_logloss: 0.349287
[827]	valid_0's binary_logloss: 0.349266
[828]	valid_0's binary_logloss: 0.349241
[829]	valid_0's bin

[1006]	valid_0's binary_logloss: 0.346575
[1007]	valid_0's binary_logloss: 0.346562
[1008]	valid_0's binary_logloss: 0.346554
[1009]	valid_0's binary_logloss: 0.346545
[1010]	valid_0's binary_logloss: 0.346542
[1011]	valid_0's binary_logloss: 0.346535
[1012]	valid_0's binary_logloss: 0.346525
[1013]	valid_0's binary_logloss: 0.346507
[1014]	valid_0's binary_logloss: 0.346502
[1015]	valid_0's binary_logloss: 0.346493
[1016]	valid_0's binary_logloss: 0.346485
[1017]	valid_0's binary_logloss: 0.346482
[1018]	valid_0's binary_logloss: 0.34647
[1019]	valid_0's binary_logloss: 0.34645
[1020]	valid_0's binary_logloss: 0.346433
[1021]	valid_0's binary_logloss: 0.34643
[1022]	valid_0's binary_logloss: 0.34642
[1023]	valid_0's binary_logloss: 0.346399
[1024]	valid_0's binary_logloss: 0.346397
[1025]	valid_0's binary_logloss: 0.346389
[1026]	valid_0's binary_logloss: 0.34638
[1027]	valid_0's binary_logloss: 0.346366
[1028]	valid_0's binary_logloss: 0.346359
[1029]	valid_0's binary_logloss: 0.3463

[1203]	valid_0's binary_logloss: 0.344619
[1204]	valid_0's binary_logloss: 0.344618
[1205]	valid_0's binary_logloss: 0.344613
[1206]	valid_0's binary_logloss: 0.344605
[1207]	valid_0's binary_logloss: 0.344601
[1208]	valid_0's binary_logloss: 0.344582
[1209]	valid_0's binary_logloss: 0.344582
[1210]	valid_0's binary_logloss: 0.344582
[1211]	valid_0's binary_logloss: 0.344577
[1212]	valid_0's binary_logloss: 0.344562
[1213]	valid_0's binary_logloss: 0.344553
[1214]	valid_0's binary_logloss: 0.344538
[1215]	valid_0's binary_logloss: 0.344527
[1216]	valid_0's binary_logloss: 0.344511
[1217]	valid_0's binary_logloss: 0.344495
[1218]	valid_0's binary_logloss: 0.344482
[1219]	valid_0's binary_logloss: 0.344474
[1220]	valid_0's binary_logloss: 0.344467
[1221]	valid_0's binary_logloss: 0.344466
[1222]	valid_0's binary_logloss: 0.344451
[1223]	valid_0's binary_logloss: 0.344441
[1224]	valid_0's binary_logloss: 0.344429
[1225]	valid_0's binary_logloss: 0.344419
[1226]	valid_0's binary_logloss: 0

[1399]	valid_0's binary_logloss: 0.343301
[1400]	valid_0's binary_logloss: 0.343294
[1401]	valid_0's binary_logloss: 0.343281
[1402]	valid_0's binary_logloss: 0.343275
[1403]	valid_0's binary_logloss: 0.343277
[1404]	valid_0's binary_logloss: 0.343274
[1405]	valid_0's binary_logloss: 0.343265
[1406]	valid_0's binary_logloss: 0.343265
[1407]	valid_0's binary_logloss: 0.343262
[1408]	valid_0's binary_logloss: 0.343253
[1409]	valid_0's binary_logloss: 0.343245
[1410]	valid_0's binary_logloss: 0.343244
[1411]	valid_0's binary_logloss: 0.343238
[1412]	valid_0's binary_logloss: 0.343234
[1413]	valid_0's binary_logloss: 0.343222
[1414]	valid_0's binary_logloss: 0.343215
[1415]	valid_0's binary_logloss: 0.343205
[1416]	valid_0's binary_logloss: 0.343199
[1417]	valid_0's binary_logloss: 0.343186
[1418]	valid_0's binary_logloss: 0.343176
[1419]	valid_0's binary_logloss: 0.343172
[1420]	valid_0's binary_logloss: 0.343161
[1421]	valid_0's binary_logloss: 0.343156
[1422]	valid_0's binary_logloss: 0

[1595]	valid_0's binary_logloss: 0.342354
[1596]	valid_0's binary_logloss: 0.342349
[1597]	valid_0's binary_logloss: 0.342345
[1598]	valid_0's binary_logloss: 0.342345
[1599]	valid_0's binary_logloss: 0.342345
[1600]	valid_0's binary_logloss: 0.342341
[1601]	valid_0's binary_logloss: 0.342338
[1602]	valid_0's binary_logloss: 0.34234
[1603]	valid_0's binary_logloss: 0.342335
[1604]	valid_0's binary_logloss: 0.342334
[1605]	valid_0's binary_logloss: 0.342334
[1606]	valid_0's binary_logloss: 0.342329
[1607]	valid_0's binary_logloss: 0.342328
[1608]	valid_0's binary_logloss: 0.342324
[1609]	valid_0's binary_logloss: 0.342318
[1610]	valid_0's binary_logloss: 0.34231
[1611]	valid_0's binary_logloss: 0.342302
[1612]	valid_0's binary_logloss: 0.342301
[1613]	valid_0's binary_logloss: 0.342296
[1614]	valid_0's binary_logloss: 0.342291
[1615]	valid_0's binary_logloss: 0.342288
[1616]	valid_0's binary_logloss: 0.342281
[1617]	valid_0's binary_logloss: 0.342274
[1618]	valid_0's binary_logloss: 0.3

[1792]	valid_0's binary_logloss: 0.341673
[1793]	valid_0's binary_logloss: 0.341673
[1794]	valid_0's binary_logloss: 0.341669
[1795]	valid_0's binary_logloss: 0.341666
[1796]	valid_0's binary_logloss: 0.341666
[1797]	valid_0's binary_logloss: 0.341661
[1798]	valid_0's binary_logloss: 0.341658
[1799]	valid_0's binary_logloss: 0.341651
[1800]	valid_0's binary_logloss: 0.341648
[1801]	valid_0's binary_logloss: 0.341647
[1802]	valid_0's binary_logloss: 0.341646
[1803]	valid_0's binary_logloss: 0.341647
[1804]	valid_0's binary_logloss: 0.341644
[1805]	valid_0's binary_logloss: 0.34164
[1806]	valid_0's binary_logloss: 0.341635
[1807]	valid_0's binary_logloss: 0.341633
[1808]	valid_0's binary_logloss: 0.341632
[1809]	valid_0's binary_logloss: 0.341632
[1810]	valid_0's binary_logloss: 0.341625
[1811]	valid_0's binary_logloss: 0.341628
[1812]	valid_0's binary_logloss: 0.341625
[1813]	valid_0's binary_logloss: 0.341625
[1814]	valid_0's binary_logloss: 0.341619
[1815]	valid_0's binary_logloss: 0.

[1989]	valid_0's binary_logloss: 0.341105
[1990]	valid_0's binary_logloss: 0.341103
[1991]	valid_0's binary_logloss: 0.3411
[1992]	valid_0's binary_logloss: 0.341098
[1993]	valid_0's binary_logloss: 0.341094
[1994]	valid_0's binary_logloss: 0.341094
[1995]	valid_0's binary_logloss: 0.341089
[1996]	valid_0's binary_logloss: 0.341088
[1997]	valid_0's binary_logloss: 0.341084
[1998]	valid_0's binary_logloss: 0.34108
[1999]	valid_0's binary_logloss: 0.341076
[2000]	valid_0's binary_logloss: 0.341073
[2001]	valid_0's binary_logloss: 0.341071
[2002]	valid_0's binary_logloss: 0.341064
[2003]	valid_0's binary_logloss: 0.34106
[2004]	valid_0's binary_logloss: 0.341058
[2005]	valid_0's binary_logloss: 0.341058
[2006]	valid_0's binary_logloss: 0.341054
[2007]	valid_0's binary_logloss: 0.341052
[2008]	valid_0's binary_logloss: 0.341049
[2009]	valid_0's binary_logloss: 0.341048
[2010]	valid_0's binary_logloss: 0.341047
[2011]	valid_0's binary_logloss: 0.341045
[2012]	valid_0's binary_logloss: 0.341

[2186]	valid_0's binary_logloss: 0.340636
[2187]	valid_0's binary_logloss: 0.340638
[2188]	valid_0's binary_logloss: 0.340638
[2189]	valid_0's binary_logloss: 0.340634
[2190]	valid_0's binary_logloss: 0.340634
[2191]	valid_0's binary_logloss: 0.340635
[2192]	valid_0's binary_logloss: 0.340632
[2193]	valid_0's binary_logloss: 0.34063
[2194]	valid_0's binary_logloss: 0.340629
[2195]	valid_0's binary_logloss: 0.34063
[2196]	valid_0's binary_logloss: 0.340627
[2197]	valid_0's binary_logloss: 0.340622
[2198]	valid_0's binary_logloss: 0.340618
[2199]	valid_0's binary_logloss: 0.340616
[2200]	valid_0's binary_logloss: 0.340617
[2201]	valid_0's binary_logloss: 0.340619
[2202]	valid_0's binary_logloss: 0.340618
[2203]	valid_0's binary_logloss: 0.340615
[2204]	valid_0's binary_logloss: 0.340615
[2205]	valid_0's binary_logloss: 0.340615
[2206]	valid_0's binary_logloss: 0.340617
[2207]	valid_0's binary_logloss: 0.340611
[2208]	valid_0's binary_logloss: 0.34061
[2209]	valid_0's binary_logloss: 0.34

[2383]	valid_0's binary_logloss: 0.340329
[2384]	valid_0's binary_logloss: 0.340328
[2385]	valid_0's binary_logloss: 0.340328
[2386]	valid_0's binary_logloss: 0.340328
[2387]	valid_0's binary_logloss: 0.340327
[2388]	valid_0's binary_logloss: 0.340324
[2389]	valid_0's binary_logloss: 0.34032
[2390]	valid_0's binary_logloss: 0.340316
[2391]	valid_0's binary_logloss: 0.340313
[2392]	valid_0's binary_logloss: 0.340312
[2393]	valid_0's binary_logloss: 0.340311
[2394]	valid_0's binary_logloss: 0.34031
[2395]	valid_0's binary_logloss: 0.340305
[2396]	valid_0's binary_logloss: 0.340305
[2397]	valid_0's binary_logloss: 0.340302
[2398]	valid_0's binary_logloss: 0.3403
[2399]	valid_0's binary_logloss: 0.340295
[2400]	valid_0's binary_logloss: 0.340291
[2401]	valid_0's binary_logloss: 0.340289
[2402]	valid_0's binary_logloss: 0.34029
[2403]	valid_0's binary_logloss: 0.34029
[2404]	valid_0's binary_logloss: 0.340293
[2405]	valid_0's binary_logloss: 0.340292
[2406]	valid_0's binary_logloss: 0.34028

In [299]:
# Treinar 100% modelo da classe 4
m4 = lgb.train(params=hp4, num_boost_round=nbr, train_set=ds4_train, valid_sets=[ds4_valid], 
               verbose_eval=True, categorical_feature=feats_cat)
print('Modelo da classe 4 finalizado')

[1]	valid_0's binary_logloss: 0.601124
Training until validation scores don't improve for 25 rounds
[2]	valid_0's binary_logloss: 0.598625
[3]	valid_0's binary_logloss: 0.595582
[4]	valid_0's binary_logloss: 0.592895
[5]	valid_0's binary_logloss: 0.589964
[6]	valid_0's binary_logloss: 0.587096
[7]	valid_0's binary_logloss: 0.584288
[8]	valid_0's binary_logloss: 0.581542
[9]	valid_0's binary_logloss: 0.578842
[10]	valid_0's binary_logloss: 0.576211
[11]	valid_0's binary_logloss: 0.573621
[12]	valid_0's binary_logloss: 0.571087
[13]	valid_0's binary_logloss: 0.568844
[14]	valid_0's binary_logloss: 0.566407
[15]	valid_0's binary_logloss: 0.564017
[16]	valid_0's binary_logloss: 0.561674
[17]	valid_0's binary_logloss: 0.559373
[18]	valid_0's binary_logloss: 0.557115
[19]	valid_0's binary_logloss: 0.555197
[20]	valid_0's binary_logloss: 0.553018
[21]	valid_0's binary_logloss: 0.551085
[22]	valid_0's binary_logloss: 0.549153
[23]	valid_0's binary_logloss: 0.547073
[24]	valid_0's binary_loglos

[204]	valid_0's binary_logloss: 0.417023
[205]	valid_0's binary_logloss: 0.416788
[206]	valid_0's binary_logloss: 0.416606
[207]	valid_0's binary_logloss: 0.416359
[208]	valid_0's binary_logloss: 0.416139
[209]	valid_0's binary_logloss: 0.415919
[210]	valid_0's binary_logloss: 0.415698
[211]	valid_0's binary_logloss: 0.415467
[212]	valid_0's binary_logloss: 0.415251
[213]	valid_0's binary_logloss: 0.41502
[214]	valid_0's binary_logloss: 0.414825
[215]	valid_0's binary_logloss: 0.414607
[216]	valid_0's binary_logloss: 0.414406
[217]	valid_0's binary_logloss: 0.414191
[218]	valid_0's binary_logloss: 0.413996
[219]	valid_0's binary_logloss: 0.413791
[220]	valid_0's binary_logloss: 0.413594
[221]	valid_0's binary_logloss: 0.413399
[222]	valid_0's binary_logloss: 0.413199
[223]	valid_0's binary_logloss: 0.412996
[224]	valid_0's binary_logloss: 0.412786
[225]	valid_0's binary_logloss: 0.412596
[226]	valid_0's binary_logloss: 0.412411
[227]	valid_0's binary_logloss: 0.412203
[228]	valid_0's b

[405]	valid_0's binary_logloss: 0.392249
[406]	valid_0's binary_logloss: 0.392182
[407]	valid_0's binary_logloss: 0.392113
[408]	valid_0's binary_logloss: 0.39206
[409]	valid_0's binary_logloss: 0.39199
[410]	valid_0's binary_logloss: 0.391929
[411]	valid_0's binary_logloss: 0.391863
[412]	valid_0's binary_logloss: 0.3918
[413]	valid_0's binary_logloss: 0.391744
[414]	valid_0's binary_logloss: 0.391678
[415]	valid_0's binary_logloss: 0.391616
[416]	valid_0's binary_logloss: 0.391551
[417]	valid_0's binary_logloss: 0.391493
[418]	valid_0's binary_logloss: 0.391436
[419]	valid_0's binary_logloss: 0.391375
[420]	valid_0's binary_logloss: 0.391308
[421]	valid_0's binary_logloss: 0.391255
[422]	valid_0's binary_logloss: 0.391195
[423]	valid_0's binary_logloss: 0.391134
[424]	valid_0's binary_logloss: 0.391073
[425]	valid_0's binary_logloss: 0.391012
[426]	valid_0's binary_logloss: 0.390963
[427]	valid_0's binary_logloss: 0.390918
[428]	valid_0's binary_logloss: 0.39088
[429]	valid_0's binar

[606]	valid_0's binary_logloss: 0.383231
[607]	valid_0's binary_logloss: 0.383208
[608]	valid_0's binary_logloss: 0.383152
[609]	valid_0's binary_logloss: 0.383116
[610]	valid_0's binary_logloss: 0.38308
[611]	valid_0's binary_logloss: 0.383058
[612]	valid_0's binary_logloss: 0.383022
[613]	valid_0's binary_logloss: 0.383001
[614]	valid_0's binary_logloss: 0.382962
[615]	valid_0's binary_logloss: 0.382943
[616]	valid_0's binary_logloss: 0.382906
[617]	valid_0's binary_logloss: 0.382873
[618]	valid_0's binary_logloss: 0.382857
[619]	valid_0's binary_logloss: 0.382827
[620]	valid_0's binary_logloss: 0.382805
[621]	valid_0's binary_logloss: 0.382763
[622]	valid_0's binary_logloss: 0.382734
[623]	valid_0's binary_logloss: 0.382726
[624]	valid_0's binary_logloss: 0.382689
[625]	valid_0's binary_logloss: 0.382648
[626]	valid_0's binary_logloss: 0.382623
[627]	valid_0's binary_logloss: 0.382585
[628]	valid_0's binary_logloss: 0.382572
[629]	valid_0's binary_logloss: 0.382536
[630]	valid_0's b

[807]	valid_0's binary_logloss: 0.378442
[808]	valid_0's binary_logloss: 0.378428
[809]	valid_0's binary_logloss: 0.37842
[810]	valid_0's binary_logloss: 0.378407
[811]	valid_0's binary_logloss: 0.378382
[812]	valid_0's binary_logloss: 0.378365
[813]	valid_0's binary_logloss: 0.378357
[814]	valid_0's binary_logloss: 0.37834
[815]	valid_0's binary_logloss: 0.3783
[816]	valid_0's binary_logloss: 0.378291
[817]	valid_0's binary_logloss: 0.378277
[818]	valid_0's binary_logloss: 0.37826
[819]	valid_0's binary_logloss: 0.378237
[820]	valid_0's binary_logloss: 0.378222
[821]	valid_0's binary_logloss: 0.378198
[822]	valid_0's binary_logloss: 0.378182
[823]	valid_0's binary_logloss: 0.378161
[824]	valid_0's binary_logloss: 0.378131
[825]	valid_0's binary_logloss: 0.378115
[826]	valid_0's binary_logloss: 0.378102
[827]	valid_0's binary_logloss: 0.378076
[828]	valid_0's binary_logloss: 0.378059
[829]	valid_0's binary_logloss: 0.378045
[830]	valid_0's binary_logloss: 0.378036
[831]	valid_0's binar

[1008]	valid_0's binary_logloss: 0.375445
[1009]	valid_0's binary_logloss: 0.375441
[1010]	valid_0's binary_logloss: 0.375426
[1011]	valid_0's binary_logloss: 0.375419
[1012]	valid_0's binary_logloss: 0.375397
[1013]	valid_0's binary_logloss: 0.375375
[1014]	valid_0's binary_logloss: 0.37536
[1015]	valid_0's binary_logloss: 0.375339
[1016]	valid_0's binary_logloss: 0.375327
[1017]	valid_0's binary_logloss: 0.375314
[1018]	valid_0's binary_logloss: 0.375304
[1019]	valid_0's binary_logloss: 0.375289
[1020]	valid_0's binary_logloss: 0.375266
[1021]	valid_0's binary_logloss: 0.37526
[1022]	valid_0's binary_logloss: 0.375255
[1023]	valid_0's binary_logloss: 0.375249
[1024]	valid_0's binary_logloss: 0.375244
[1025]	valid_0's binary_logloss: 0.375228
[1026]	valid_0's binary_logloss: 0.375221
[1027]	valid_0's binary_logloss: 0.375209
[1028]	valid_0's binary_logloss: 0.375184
[1029]	valid_0's binary_logloss: 0.375179
[1030]	valid_0's binary_logloss: 0.37517
[1031]	valid_0's binary_logloss: 0.37

[1204]	valid_0's binary_logloss: 0.373526
[1205]	valid_0's binary_logloss: 0.373516
[1206]	valid_0's binary_logloss: 0.373509
[1207]	valid_0's binary_logloss: 0.373504
[1208]	valid_0's binary_logloss: 0.373495
[1209]	valid_0's binary_logloss: 0.373493
[1210]	valid_0's binary_logloss: 0.373489
[1211]	valid_0's binary_logloss: 0.373478
[1212]	valid_0's binary_logloss: 0.373466
[1213]	valid_0's binary_logloss: 0.37346
[1214]	valid_0's binary_logloss: 0.373449
[1215]	valid_0's binary_logloss: 0.373438
[1216]	valid_0's binary_logloss: 0.373425
[1217]	valid_0's binary_logloss: 0.373416
[1218]	valid_0's binary_logloss: 0.373412
[1219]	valid_0's binary_logloss: 0.373398
[1220]	valid_0's binary_logloss: 0.373387
[1221]	valid_0's binary_logloss: 0.373376
[1222]	valid_0's binary_logloss: 0.373374
[1223]	valid_0's binary_logloss: 0.373372
[1224]	valid_0's binary_logloss: 0.37336
[1225]	valid_0's binary_logloss: 0.373353
[1226]	valid_0's binary_logloss: 0.373346
[1227]	valid_0's binary_logloss: 0.3

[1400]	valid_0's binary_logloss: 0.372208
[1401]	valid_0's binary_logloss: 0.372203
[1402]	valid_0's binary_logloss: 0.372197
[1403]	valid_0's binary_logloss: 0.372191
[1404]	valid_0's binary_logloss: 0.372189
[1405]	valid_0's binary_logloss: 0.37218
[1406]	valid_0's binary_logloss: 0.372177
[1407]	valid_0's binary_logloss: 0.372179
[1408]	valid_0's binary_logloss: 0.372163
[1409]	valid_0's binary_logloss: 0.372161
[1410]	valid_0's binary_logloss: 0.37216
[1411]	valid_0's binary_logloss: 0.372149
[1412]	valid_0's binary_logloss: 0.372142
[1413]	valid_0's binary_logloss: 0.37214
[1414]	valid_0's binary_logloss: 0.372135
[1415]	valid_0's binary_logloss: 0.372138
[1416]	valid_0's binary_logloss: 0.372137
[1417]	valid_0's binary_logloss: 0.372132
[1418]	valid_0's binary_logloss: 0.372131
[1419]	valid_0's binary_logloss: 0.372126
[1420]	valid_0's binary_logloss: 0.372108
[1421]	valid_0's binary_logloss: 0.372097
[1422]	valid_0's binary_logloss: 0.372088
[1423]	valid_0's binary_logloss: 0.37

[1596]	valid_0's binary_logloss: 0.371043
[1597]	valid_0's binary_logloss: 0.37104
[1598]	valid_0's binary_logloss: 0.371039
[1599]	valid_0's binary_logloss: 0.371027
[1600]	valid_0's binary_logloss: 0.371028
[1601]	valid_0's binary_logloss: 0.371021
[1602]	valid_0's binary_logloss: 0.371018
[1603]	valid_0's binary_logloss: 0.371015
[1604]	valid_0's binary_logloss: 0.371014
[1605]	valid_0's binary_logloss: 0.371007
[1606]	valid_0's binary_logloss: 0.370995
[1607]	valid_0's binary_logloss: 0.370992
[1608]	valid_0's binary_logloss: 0.370981
[1609]	valid_0's binary_logloss: 0.370979
[1610]	valid_0's binary_logloss: 0.370967
[1611]	valid_0's binary_logloss: 0.370969
[1612]	valid_0's binary_logloss: 0.370965
[1613]	valid_0's binary_logloss: 0.370965
[1614]	valid_0's binary_logloss: 0.370954
[1615]	valid_0's binary_logloss: 0.370954
[1616]	valid_0's binary_logloss: 0.370948
[1617]	valid_0's binary_logloss: 0.370943
[1618]	valid_0's binary_logloss: 0.370934
[1619]	valid_0's binary_logloss: 0.

[1792]	valid_0's binary_logloss: 0.370317
[1793]	valid_0's binary_logloss: 0.370318
[1794]	valid_0's binary_logloss: 0.370313
[1795]	valid_0's binary_logloss: 0.370311
[1796]	valid_0's binary_logloss: 0.370304
[1797]	valid_0's binary_logloss: 0.370297
[1798]	valid_0's binary_logloss: 0.370299
[1799]	valid_0's binary_logloss: 0.370296
[1800]	valid_0's binary_logloss: 0.370291
[1801]	valid_0's binary_logloss: 0.370285
[1802]	valid_0's binary_logloss: 0.370281
[1803]	valid_0's binary_logloss: 0.370277
[1804]	valid_0's binary_logloss: 0.370278
[1805]	valid_0's binary_logloss: 0.370279
[1806]	valid_0's binary_logloss: 0.370276
[1807]	valid_0's binary_logloss: 0.370273
[1808]	valid_0's binary_logloss: 0.37027
[1809]	valid_0's binary_logloss: 0.370262
[1810]	valid_0's binary_logloss: 0.370254
[1811]	valid_0's binary_logloss: 0.370257
[1812]	valid_0's binary_logloss: 0.370252
[1813]	valid_0's binary_logloss: 0.370248
[1814]	valid_0's binary_logloss: 0.370244
[1815]	valid_0's binary_logloss: 0.

[1988]	valid_0's binary_logloss: 0.369673
[1989]	valid_0's binary_logloss: 0.369674
[1990]	valid_0's binary_logloss: 0.369671
[1991]	valid_0's binary_logloss: 0.369668
[1992]	valid_0's binary_logloss: 0.369663
[1993]	valid_0's binary_logloss: 0.369656
[1994]	valid_0's binary_logloss: 0.369653
[1995]	valid_0's binary_logloss: 0.369652
[1996]	valid_0's binary_logloss: 0.369651
[1997]	valid_0's binary_logloss: 0.369647
[1998]	valid_0's binary_logloss: 0.369642
[1999]	valid_0's binary_logloss: 0.369643
[2000]	valid_0's binary_logloss: 0.36964
[2001]	valid_0's binary_logloss: 0.369638
[2002]	valid_0's binary_logloss: 0.36964
[2003]	valid_0's binary_logloss: 0.369637
[2004]	valid_0's binary_logloss: 0.369634
[2005]	valid_0's binary_logloss: 0.369636
[2006]	valid_0's binary_logloss: 0.369634
[2007]	valid_0's binary_logloss: 0.36963
[2008]	valid_0's binary_logloss: 0.369622
[2009]	valid_0's binary_logloss: 0.369622
[2010]	valid_0's binary_logloss: 0.369623
[2011]	valid_0's binary_logloss: 0.36

[2184]	valid_0's binary_logloss: 0.369166
[2185]	valid_0's binary_logloss: 0.369164
[2186]	valid_0's binary_logloss: 0.369165
[2187]	valid_0's binary_logloss: 0.369164
[2188]	valid_0's binary_logloss: 0.369166
[2189]	valid_0's binary_logloss: 0.369163
[2190]	valid_0's binary_logloss: 0.369164
[2191]	valid_0's binary_logloss: 0.369163
[2192]	valid_0's binary_logloss: 0.369161
[2193]	valid_0's binary_logloss: 0.369161
[2194]	valid_0's binary_logloss: 0.369162
[2195]	valid_0's binary_logloss: 0.369159
[2196]	valid_0's binary_logloss: 0.369155
[2197]	valid_0's binary_logloss: 0.369155
[2198]	valid_0's binary_logloss: 0.369153
[2199]	valid_0's binary_logloss: 0.369151
[2200]	valid_0's binary_logloss: 0.369153
[2201]	valid_0's binary_logloss: 0.369152
[2202]	valid_0's binary_logloss: 0.369148
[2203]	valid_0's binary_logloss: 0.369147
[2204]	valid_0's binary_logloss: 0.369143
[2205]	valid_0's binary_logloss: 0.369143
[2206]	valid_0's binary_logloss: 0.369145
[2207]	valid_0's binary_logloss: 0

[2381]	valid_0's binary_logloss: 0.368742
[2382]	valid_0's binary_logloss: 0.368743
[2383]	valid_0's binary_logloss: 0.36874
[2384]	valid_0's binary_logloss: 0.368738
[2385]	valid_0's binary_logloss: 0.368735
[2386]	valid_0's binary_logloss: 0.368736
[2387]	valid_0's binary_logloss: 0.368739
[2388]	valid_0's binary_logloss: 0.368734
[2389]	valid_0's binary_logloss: 0.368731
[2390]	valid_0's binary_logloss: 0.368731
[2391]	valid_0's binary_logloss: 0.36873
[2392]	valid_0's binary_logloss: 0.368728
[2393]	valid_0's binary_logloss: 0.368727
[2394]	valid_0's binary_logloss: 0.368728
[2395]	valid_0's binary_logloss: 0.368726
[2396]	valid_0's binary_logloss: 0.368724
[2397]	valid_0's binary_logloss: 0.368719
[2398]	valid_0's binary_logloss: 0.368715
[2399]	valid_0's binary_logloss: 0.368715
[2400]	valid_0's binary_logloss: 0.368711
[2401]	valid_0's binary_logloss: 0.368708
[2402]	valid_0's binary_logloss: 0.368708
[2403]	valid_0's binary_logloss: 0.368703
[2404]	valid_0's binary_logloss: 0.3

In [300]:
# Treinar 100% modelo da classe 3
m3 = lgb.train(params=hp3, num_boost_round=nbr, train_set=ds3_train, valid_sets=[ds3_valid], 
               verbose_eval=True, categorical_feature=feats_cat)
print('Modelo da classe 3 finalizado')

[1]	valid_0's binary_logloss: 0.0279432
Training until validation scores don't improve for 25 rounds
[2]	valid_0's binary_logloss: 0.0277376
[3]	valid_0's binary_logloss: 0.0272779
[4]	valid_0's binary_logloss: 0.0269112
[5]	valid_0's binary_logloss: 0.0265756
[6]	valid_0's binary_logloss: 0.026297
[7]	valid_0's binary_logloss: 0.026049
[8]	valid_0's binary_logloss: 0.0258286
[9]	valid_0's binary_logloss: 0.0256253
[10]	valid_0's binary_logloss: 0.0254339
[11]	valid_0's binary_logloss: 0.0252598
[12]	valid_0's binary_logloss: 0.0250955
[13]	valid_0's binary_logloss: 0.0249406
[14]	valid_0's binary_logloss: 0.0247933
[15]	valid_0's binary_logloss: 0.0246555
[16]	valid_0's binary_logloss: 0.0245181
[17]	valid_0's binary_logloss: 0.0243867
[18]	valid_0's binary_logloss: 0.0242676
[19]	valid_0's binary_logloss: 0.0241498
[20]	valid_0's binary_logloss: 0.0240356
[21]	valid_0's binary_logloss: 0.0239351
[22]	valid_0's binary_logloss: 0.0238672
[23]	valid_0's binary_logloss: 0.0237665
[24]	va

[199]	valid_0's binary_logloss: 0.0188422
[200]	valid_0's binary_logloss: 0.0188338
[201]	valid_0's binary_logloss: 0.0188221
[202]	valid_0's binary_logloss: 0.0188134
[203]	valid_0's binary_logloss: 0.0188034
[204]	valid_0's binary_logloss: 0.018795
[205]	valid_0's binary_logloss: 0.0187859
[206]	valid_0's binary_logloss: 0.0187791
[207]	valid_0's binary_logloss: 0.0187713
[208]	valid_0's binary_logloss: 0.0187643
[209]	valid_0's binary_logloss: 0.0187572
[210]	valid_0's binary_logloss: 0.0187473
[211]	valid_0's binary_logloss: 0.0187384
[212]	valid_0's binary_logloss: 0.0187299
[213]	valid_0's binary_logloss: 0.018721
[214]	valid_0's binary_logloss: 0.0187134
[215]	valid_0's binary_logloss: 0.0187057
[216]	valid_0's binary_logloss: 0.0186984
[217]	valid_0's binary_logloss: 0.0186905
[218]	valid_0's binary_logloss: 0.0186829
[219]	valid_0's binary_logloss: 0.0186745
[220]	valid_0's binary_logloss: 0.0186669
[221]	valid_0's binary_logloss: 0.0186583
[222]	valid_0's binary_logloss: 0.01

[396]	valid_0's binary_logloss: 0.0179053
[397]	valid_0's binary_logloss: 0.0179034
[398]	valid_0's binary_logloss: 0.0179016
[399]	valid_0's binary_logloss: 0.0178999
[400]	valid_0's binary_logloss: 0.0178966
[401]	valid_0's binary_logloss: 0.0178946
[402]	valid_0's binary_logloss: 0.0178922
[403]	valid_0's binary_logloss: 0.0178889
[404]	valid_0's binary_logloss: 0.0178869
[405]	valid_0's binary_logloss: 0.0178847
[406]	valid_0's binary_logloss: 0.0178834
[407]	valid_0's binary_logloss: 0.0178809
[408]	valid_0's binary_logloss: 0.0178796
[409]	valid_0's binary_logloss: 0.0178778
[410]	valid_0's binary_logloss: 0.0178763
[411]	valid_0's binary_logloss: 0.017874
[412]	valid_0's binary_logloss: 0.0178724
[413]	valid_0's binary_logloss: 0.0178704
[414]	valid_0's binary_logloss: 0.0178676
[415]	valid_0's binary_logloss: 0.0178647
[416]	valid_0's binary_logloss: 0.0178634
[417]	valid_0's binary_logloss: 0.017861
[418]	valid_0's binary_logloss: 0.0178586
[419]	valid_0's binary_logloss: 0.01

[592]	valid_0's binary_logloss: 0.0176122
[593]	valid_0's binary_logloss: 0.0176105
[594]	valid_0's binary_logloss: 0.0176091
[595]	valid_0's binary_logloss: 0.0176089
[596]	valid_0's binary_logloss: 0.0176095
[597]	valid_0's binary_logloss: 0.0176088
[598]	valid_0's binary_logloss: 0.0176074
[599]	valid_0's binary_logloss: 0.0176072
[600]	valid_0's binary_logloss: 0.0176055
[601]	valid_0's binary_logloss: 0.0176054
[602]	valid_0's binary_logloss: 0.0176044
[603]	valid_0's binary_logloss: 0.0176036
[604]	valid_0's binary_logloss: 0.0176027
[605]	valid_0's binary_logloss: 0.0176027
[606]	valid_0's binary_logloss: 0.0176017
[607]	valid_0's binary_logloss: 0.0176006
[608]	valid_0's binary_logloss: 0.0175998
[609]	valid_0's binary_logloss: 0.0175991
[610]	valid_0's binary_logloss: 0.0175986
[611]	valid_0's binary_logloss: 0.0175982
[612]	valid_0's binary_logloss: 0.0175967
[613]	valid_0's binary_logloss: 0.0175952
[614]	valid_0's binary_logloss: 0.017594
[615]	valid_0's binary_logloss: 0.0

[789]	valid_0's binary_logloss: 0.0174929
[790]	valid_0's binary_logloss: 0.0174924
[791]	valid_0's binary_logloss: 0.0174917
[792]	valid_0's binary_logloss: 0.0174908
[793]	valid_0's binary_logloss: 0.0174905
[794]	valid_0's binary_logloss: 0.0174903
[795]	valid_0's binary_logloss: 0.0174897
[796]	valid_0's binary_logloss: 0.0174892
[797]	valid_0's binary_logloss: 0.0174893
[798]	valid_0's binary_logloss: 0.0174881
[799]	valid_0's binary_logloss: 0.0174882
[800]	valid_0's binary_logloss: 0.0174885
[801]	valid_0's binary_logloss: 0.0174882
[802]	valid_0's binary_logloss: 0.017488
[803]	valid_0's binary_logloss: 0.017488
[804]	valid_0's binary_logloss: 0.0174881
[805]	valid_0's binary_logloss: 0.0174877
[806]	valid_0's binary_logloss: 0.0174876
[807]	valid_0's binary_logloss: 0.0174879
[808]	valid_0's binary_logloss: 0.0174882
[809]	valid_0's binary_logloss: 0.0174879
[810]	valid_0's binary_logloss: 0.0174881
[811]	valid_0's binary_logloss: 0.0174881
[812]	valid_0's binary_logloss: 0.01

[986]	valid_0's binary_logloss: 0.0174271
[987]	valid_0's binary_logloss: 0.0174275
[988]	valid_0's binary_logloss: 0.0174271
[989]	valid_0's binary_logloss: 0.0174271
[990]	valid_0's binary_logloss: 0.017427
[991]	valid_0's binary_logloss: 0.0174271
[992]	valid_0's binary_logloss: 0.0174269
[993]	valid_0's binary_logloss: 0.0174275
[994]	valid_0's binary_logloss: 0.0174272
[995]	valid_0's binary_logloss: 0.0174272
[996]	valid_0's binary_logloss: 0.0174272
[997]	valid_0's binary_logloss: 0.0174273
[998]	valid_0's binary_logloss: 0.0174274
[999]	valid_0's binary_logloss: 0.0174275
[1000]	valid_0's binary_logloss: 0.0174264
[1001]	valid_0's binary_logloss: 0.0174262
[1002]	valid_0's binary_logloss: 0.0174265
[1003]	valid_0's binary_logloss: 0.0174262
[1004]	valid_0's binary_logloss: 0.0174258
[1005]	valid_0's binary_logloss: 0.0174258
[1006]	valid_0's binary_logloss: 0.0174254
[1007]	valid_0's binary_logloss: 0.0174248
[1008]	valid_0's binary_logloss: 0.0174252
[1009]	valid_0's binary_lo

In [301]:
# Treinar 100% modelo da classe 2
m2 = lgb.train(params=hp2, num_boost_round=nbr, train_set=ds2_train, valid_sets=[ds2_valid], 
               verbose_eval=True, categorical_feature=feats_cat)
print('Modelo da classe 2 finalizado')

[1]	valid_0's binary_logloss: 0.111189
Training until validation scores don't improve for 25 rounds
[2]	valid_0's binary_logloss: 0.109087
[3]	valid_0's binary_logloss: 0.107097
[4]	valid_0's binary_logloss: 0.105283
[5]	valid_0's binary_logloss: 0.103643
[6]	valid_0's binary_logloss: 0.102146
[7]	valid_0's binary_logloss: 0.100767
[8]	valid_0's binary_logloss: 0.099472
[9]	valid_0's binary_logloss: 0.0982501
[10]	valid_0's binary_logloss: 0.0971251
[11]	valid_0's binary_logloss: 0.0960573
[12]	valid_0's binary_logloss: 0.0950475
[13]	valid_0's binary_logloss: 0.0941728
[14]	valid_0's binary_logloss: 0.0932589
[15]	valid_0's binary_logloss: 0.0923837
[16]	valid_0's binary_logloss: 0.09156
[17]	valid_0's binary_logloss: 0.0907644
[18]	valid_0's binary_logloss: 0.0900092
[19]	valid_0's binary_logloss: 0.089355
[20]	valid_0's binary_logloss: 0.0886435
[21]	valid_0's binary_logloss: 0.0880378
[22]	valid_0's binary_logloss: 0.0874101
[23]	valid_0's binary_logloss: 0.0867778
[24]	valid_0's b

[198]	valid_0's binary_logloss: 0.0580624
[199]	valid_0's binary_logloss: 0.0580223
[200]	valid_0's binary_logloss: 0.0579803
[201]	valid_0's binary_logloss: 0.0579386
[202]	valid_0's binary_logloss: 0.057899
[203]	valid_0's binary_logloss: 0.0578614
[204]	valid_0's binary_logloss: 0.0578225
[205]	valid_0's binary_logloss: 0.0577852
[206]	valid_0's binary_logloss: 0.0577471
[207]	valid_0's binary_logloss: 0.0577091
[208]	valid_0's binary_logloss: 0.057668
[209]	valid_0's binary_logloss: 0.0576346
[210]	valid_0's binary_logloss: 0.0575973
[211]	valid_0's binary_logloss: 0.05756
[212]	valid_0's binary_logloss: 0.0575174
[213]	valid_0's binary_logloss: 0.0574763
[214]	valid_0's binary_logloss: 0.0574399
[215]	valid_0's binary_logloss: 0.0574054
[216]	valid_0's binary_logloss: 0.057368
[217]	valid_0's binary_logloss: 0.0573339
[218]	valid_0's binary_logloss: 0.0572961
[219]	valid_0's binary_logloss: 0.0572575
[220]	valid_0's binary_logloss: 0.0572225
[221]	valid_0's binary_logloss: 0.05718

[395]	valid_0's binary_logloss: 0.0541056
[396]	valid_0's binary_logloss: 0.0540949
[397]	valid_0's binary_logloss: 0.0540843
[398]	valid_0's binary_logloss: 0.0540708
[399]	valid_0's binary_logloss: 0.0540626
[400]	valid_0's binary_logloss: 0.0540574
[401]	valid_0's binary_logloss: 0.0540463
[402]	valid_0's binary_logloss: 0.054039
[403]	valid_0's binary_logloss: 0.0540288
[404]	valid_0's binary_logloss: 0.0540243
[405]	valid_0's binary_logloss: 0.0540151
[406]	valid_0's binary_logloss: 0.0540054
[407]	valid_0's binary_logloss: 0.0539929
[408]	valid_0's binary_logloss: 0.0539871
[409]	valid_0's binary_logloss: 0.0539769
[410]	valid_0's binary_logloss: 0.0539712
[411]	valid_0's binary_logloss: 0.0539618
[412]	valid_0's binary_logloss: 0.0539553
[413]	valid_0's binary_logloss: 0.0539483
[414]	valid_0's binary_logloss: 0.0539413
[415]	valid_0's binary_logloss: 0.0539312
[416]	valid_0's binary_logloss: 0.0539229
[417]	valid_0's binary_logloss: 0.0539145
[418]	valid_0's binary_logloss: 0.0

[591]	valid_0's binary_logloss: 0.0529639
[592]	valid_0's binary_logloss: 0.0529607
[593]	valid_0's binary_logloss: 0.0529573
[594]	valid_0's binary_logloss: 0.0529563
[595]	valid_0's binary_logloss: 0.0529554
[596]	valid_0's binary_logloss: 0.0529475
[597]	valid_0's binary_logloss: 0.0529452
[598]	valid_0's binary_logloss: 0.0529433
[599]	valid_0's binary_logloss: 0.0529411
[600]	valid_0's binary_logloss: 0.0529409
[601]	valid_0's binary_logloss: 0.0529377
[602]	valid_0's binary_logloss: 0.0529348
[603]	valid_0's binary_logloss: 0.0529316
[604]	valid_0's binary_logloss: 0.0529289
[605]	valid_0's binary_logloss: 0.0529257
[606]	valid_0's binary_logloss: 0.0529231
[607]	valid_0's binary_logloss: 0.0529182
[608]	valid_0's binary_logloss: 0.0529167
[609]	valid_0's binary_logloss: 0.0529143
[610]	valid_0's binary_logloss: 0.0529105
[611]	valid_0's binary_logloss: 0.0529075
[612]	valid_0's binary_logloss: 0.0529012
[613]	valid_0's binary_logloss: 0.0528989
[614]	valid_0's binary_logloss: 0.

[787]	valid_0's binary_logloss: 0.0524956
[788]	valid_0's binary_logloss: 0.052495
[789]	valid_0's binary_logloss: 0.0524926
[790]	valid_0's binary_logloss: 0.0524906
[791]	valid_0's binary_logloss: 0.0524916
[792]	valid_0's binary_logloss: 0.0524889
[793]	valid_0's binary_logloss: 0.0524872
[794]	valid_0's binary_logloss: 0.0524836
[795]	valid_0's binary_logloss: 0.0524792
[796]	valid_0's binary_logloss: 0.0524766
[797]	valid_0's binary_logloss: 0.052473
[798]	valid_0's binary_logloss: 0.0524698
[799]	valid_0's binary_logloss: 0.0524678
[800]	valid_0's binary_logloss: 0.0524644
[801]	valid_0's binary_logloss: 0.0524631
[802]	valid_0's binary_logloss: 0.0524615
[803]	valid_0's binary_logloss: 0.0524616
[804]	valid_0's binary_logloss: 0.0524603
[805]	valid_0's binary_logloss: 0.0524576
[806]	valid_0's binary_logloss: 0.052455
[807]	valid_0's binary_logloss: 0.0524532
[808]	valid_0's binary_logloss: 0.0524494
[809]	valid_0's binary_logloss: 0.0524482
[810]	valid_0's binary_logloss: 0.052

[983]	valid_0's binary_logloss: 0.0522339
[984]	valid_0's binary_logloss: 0.0522326
[985]	valid_0's binary_logloss: 0.0522328
[986]	valid_0's binary_logloss: 0.0522327
[987]	valid_0's binary_logloss: 0.0522311
[988]	valid_0's binary_logloss: 0.0522299
[989]	valid_0's binary_logloss: 0.0522294
[990]	valid_0's binary_logloss: 0.0522294
[991]	valid_0's binary_logloss: 0.0522297
[992]	valid_0's binary_logloss: 0.0522289
[993]	valid_0's binary_logloss: 0.0522275
[994]	valid_0's binary_logloss: 0.052227
[995]	valid_0's binary_logloss: 0.0522271
[996]	valid_0's binary_logloss: 0.0522278
[997]	valid_0's binary_logloss: 0.0522271
[998]	valid_0's binary_logloss: 0.0522282
[999]	valid_0's binary_logloss: 0.0522272
[1000]	valid_0's binary_logloss: 0.0522249
[1001]	valid_0's binary_logloss: 0.0522228
[1002]	valid_0's binary_logloss: 0.0522213
[1003]	valid_0's binary_logloss: 0.0522197
[1004]	valid_0's binary_logloss: 0.0522198
[1005]	valid_0's binary_logloss: 0.0522182
[1006]	valid_0's binary_loglo

[1175]	valid_0's binary_logloss: 0.0521519
[1176]	valid_0's binary_logloss: 0.0521519
[1177]	valid_0's binary_logloss: 0.0521528
[1178]	valid_0's binary_logloss: 0.0521529
[1179]	valid_0's binary_logloss: 0.0521535
[1180]	valid_0's binary_logloss: 0.0521529
[1181]	valid_0's binary_logloss: 0.0521535
[1182]	valid_0's binary_logloss: 0.0521525
[1183]	valid_0's binary_logloss: 0.0521513
[1184]	valid_0's binary_logloss: 0.0521505
[1185]	valid_0's binary_logloss: 0.0521499
[1186]	valid_0's binary_logloss: 0.0521503
[1187]	valid_0's binary_logloss: 0.0521505
[1188]	valid_0's binary_logloss: 0.0521507
[1189]	valid_0's binary_logloss: 0.0521513
[1190]	valid_0's binary_logloss: 0.0521514
[1191]	valid_0's binary_logloss: 0.0521515
[1192]	valid_0's binary_logloss: 0.0521511
[1193]	valid_0's binary_logloss: 0.0521519
[1194]	valid_0's binary_logloss: 0.052152
[1195]	valid_0's binary_logloss: 0.0521529
[1196]	valid_0's binary_logloss: 0.0521516
[1197]	valid_0's binary_logloss: 0.0521509
[1198]	valid

In [302]:
# Treinar 100% modelo da classe 1
m1 = lgb.train(params=hp1, num_boost_round=nbr, train_set=ds1_train, valid_sets=[ds1_valid], 
               verbose_eval=True, categorical_feature=feats_cat)
print('Modelo da classe 1 finalizado')

[1]	valid_0's binary_logloss: 0.0633144
Training until validation scores don't improve for 25 rounds
[2]	valid_0's binary_logloss: 0.0620583
[3]	valid_0's binary_logloss: 0.0609959
[4]	valid_0's binary_logloss: 0.0600462
[5]	valid_0's binary_logloss: 0.0591971
[6]	valid_0's binary_logloss: 0.0584526
[7]	valid_0's binary_logloss: 0.0577502
[8]	valid_0's binary_logloss: 0.0571318
[9]	valid_0's binary_logloss: 0.0565437
[10]	valid_0's binary_logloss: 0.0559677
[11]	valid_0's binary_logloss: 0.0554408
[12]	valid_0's binary_logloss: 0.0549541
[13]	valid_0's binary_logloss: 0.0544898
[14]	valid_0's binary_logloss: 0.0540297
[15]	valid_0's binary_logloss: 0.0536046
[16]	valid_0's binary_logloss: 0.0531958
[17]	valid_0's binary_logloss: 0.0528203
[18]	valid_0's binary_logloss: 0.0524557
[19]	valid_0's binary_logloss: 0.0522183
[20]	valid_0's binary_logloss: 0.0518692
[21]	valid_0's binary_logloss: 0.0515491
[22]	valid_0's binary_logloss: 0.0512458
[23]	valid_0's binary_logloss: 0.0509366
[24]	

[198]	valid_0's binary_logloss: 0.0366174
[199]	valid_0's binary_logloss: 0.0365953
[200]	valid_0's binary_logloss: 0.0365744
[201]	valid_0's binary_logloss: 0.0365517
[202]	valid_0's binary_logloss: 0.0365296
[203]	valid_0's binary_logloss: 0.0365075
[204]	valid_0's binary_logloss: 0.0364858
[205]	valid_0's binary_logloss: 0.0364646
[206]	valid_0's binary_logloss: 0.0364472
[207]	valid_0's binary_logloss: 0.0364272
[208]	valid_0's binary_logloss: 0.0364073
[209]	valid_0's binary_logloss: 0.0363835
[210]	valid_0's binary_logloss: 0.0363604
[211]	valid_0's binary_logloss: 0.0363405
[212]	valid_0's binary_logloss: 0.0363178
[213]	valid_0's binary_logloss: 0.036299
[214]	valid_0's binary_logloss: 0.0362782
[215]	valid_0's binary_logloss: 0.0362581
[216]	valid_0's binary_logloss: 0.0362366
[217]	valid_0's binary_logloss: 0.0362173
[218]	valid_0's binary_logloss: 0.0361972
[219]	valid_0's binary_logloss: 0.0361781
[220]	valid_0's binary_logloss: 0.0361598
[221]	valid_0's binary_logloss: 0.0

[395]	valid_0's binary_logloss: 0.0345655
[396]	valid_0's binary_logloss: 0.0345626
[397]	valid_0's binary_logloss: 0.034559
[398]	valid_0's binary_logloss: 0.0345563
[399]	valid_0's binary_logloss: 0.0345522
[400]	valid_0's binary_logloss: 0.0345477
[401]	valid_0's binary_logloss: 0.034544
[402]	valid_0's binary_logloss: 0.0345398
[403]	valid_0's binary_logloss: 0.0345383
[404]	valid_0's binary_logloss: 0.0345345
[405]	valid_0's binary_logloss: 0.0345309
[406]	valid_0's binary_logloss: 0.0345246
[407]	valid_0's binary_logloss: 0.0345224
[408]	valid_0's binary_logloss: 0.034517
[409]	valid_0's binary_logloss: 0.0345139
[410]	valid_0's binary_logloss: 0.0345104
[411]	valid_0's binary_logloss: 0.0345058
[412]	valid_0's binary_logloss: 0.034501
[413]	valid_0's binary_logloss: 0.0345012
[414]	valid_0's binary_logloss: 0.0344969
[415]	valid_0's binary_logloss: 0.0344938
[416]	valid_0's binary_logloss: 0.0344897
[417]	valid_0's binary_logloss: 0.0344857
[418]	valid_0's binary_logloss: 0.0344

[591]	valid_0's binary_logloss: 0.0340921
[592]	valid_0's binary_logloss: 0.0340916
[593]	valid_0's binary_logloss: 0.0340876
[594]	valid_0's binary_logloss: 0.0340869
[595]	valid_0's binary_logloss: 0.0340864
[596]	valid_0's binary_logloss: 0.0340841
[597]	valid_0's binary_logloss: 0.034083
[598]	valid_0's binary_logloss: 0.0340816
[599]	valid_0's binary_logloss: 0.0340799
[600]	valid_0's binary_logloss: 0.0340792
[601]	valid_0's binary_logloss: 0.0340758
[602]	valid_0's binary_logloss: 0.0340745
[603]	valid_0's binary_logloss: 0.0340747
[604]	valid_0's binary_logloss: 0.0340728
[605]	valid_0's binary_logloss: 0.0340717
[606]	valid_0's binary_logloss: 0.0340707
[607]	valid_0's binary_logloss: 0.0340697
[608]	valid_0's binary_logloss: 0.0340672
[609]	valid_0's binary_logloss: 0.0340668
[610]	valid_0's binary_logloss: 0.0340659
[611]	valid_0's binary_logloss: 0.0340648
[612]	valid_0's binary_logloss: 0.0340637
[613]	valid_0's binary_logloss: 0.034063
[614]	valid_0's binary_logloss: 0.03

[788]	valid_0's binary_logloss: 0.0339227
[789]	valid_0's binary_logloss: 0.0339222
[790]	valid_0's binary_logloss: 0.0339222
[791]	valid_0's binary_logloss: 0.0339208
[792]	valid_0's binary_logloss: 0.0339183
[793]	valid_0's binary_logloss: 0.0339189
[794]	valid_0's binary_logloss: 0.0339191
[795]	valid_0's binary_logloss: 0.0339168
[796]	valid_0's binary_logloss: 0.0339147
[797]	valid_0's binary_logloss: 0.0339151
[798]	valid_0's binary_logloss: 0.0339123
[799]	valid_0's binary_logloss: 0.033912
[800]	valid_0's binary_logloss: 0.0339103
[801]	valid_0's binary_logloss: 0.0339114
[802]	valid_0's binary_logloss: 0.033911
[803]	valid_0's binary_logloss: 0.0339116
[804]	valid_0's binary_logloss: 0.0339118
[805]	valid_0's binary_logloss: 0.0339115
[806]	valid_0's binary_logloss: 0.0339111
[807]	valid_0's binary_logloss: 0.0339127
[808]	valid_0's binary_logloss: 0.0339119
[809]	valid_0's binary_logloss: 0.0339117
[810]	valid_0's binary_logloss: 0.0339134
[811]	valid_0's binary_logloss: 0.03

**Avaliação do F1-Score em conjunto de validação com thresholding móvel**

In [303]:
# Checar matrix de confusão
from sklearn.metrics import classification_report, confusion_matrix

In [304]:
def calcular_f1_score(yt, yp):
    '''
    
    '''
    # Calcular F1-Score Macro para cada threshold
    vTo = list()
    vF1 = list()
    maxF1 = 0
    maxTo = 0
    
    # Loop em diversas possibilidades
    for threshold in np.linspace(0,1, 100):
        # Calcular
        yd = (yp >= threshold).astype(int)
        f1s = f1_score(yt, yd, average='macro')
        # Adicionar
        vTo.append(threshold)
        vF1.append(f1s)
        # Salvar valor maior caso possível
        if f1s > maxF1:
            maxF1 = f1s
            maxTo = threshold
            
    
    # Pegar a mediana para report
    mF1 = round(np.median(vF1), 3)
    nF1 = round(np.mean(vF1), 3)
    # Report
    print(f'> F1 Score Macro mediano = {mF1} e médio = {nF1}')
    
    # Retornar
    return maxF1, maxTo, mF1, nF1

In [305]:
# Fazer previsões
y1_pred_valid = m1.predict(x_valid)
print(f' # Para classe 1, feito!')
v1 = calcular_f1_score(y1_valid.values, y1_pred_valid)

 # Para classe 1, feito!
> F1 Score Macro mediano = 0.667 e médio = 0.65


In [306]:
# Ver resultados mais em foco
print(classification_report(y1_valid.values, (y1_pred_valid >= v1[1]).astype(int)))
print(confusion_matrix(y1_valid.values, (y1_pred_valid >= v1[1]).astype(int)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    257070
           1       0.49      0.46      0.47      3126

    accuracy                           0.99    260196
   macro avg       0.74      0.73      0.73    260196
weighted avg       0.99      0.99      0.99    260196

[[255578   1492]
 [  1700   1426]]


In [307]:
# Fazer previsões
y2_pred_valid = m2.predict(x_valid)
print(f' # Para classe 2, feito!')
v2 = calcular_f1_score(y2_valid.values, y2_pred_valid)
# Ver resultados mais em foco
print(classification_report(y2_valid.values, (y2_pred_valid >= v2[1]).astype(int)))
print(confusion_matrix(y2_valid.values, (y2_pred_valid >= v2[1]).astype(int)))

 # Para classe 2, feito!
> F1 Score Macro mediano = 0.701 e médio = 0.659
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    253909
           1       0.52      0.61      0.56      6287

    accuracy                           0.98    260196
   macro avg       0.75      0.80      0.77    260196
weighted avg       0.98      0.98      0.98    260196

[[250279   3630]
 [  2432   3855]]


In [308]:
# Fazer previsões
y3_pred_valid = m3.predict(x_valid)
print(f' # Para classe 3, feito!')
v3 = calcular_f1_score(y3_valid.values, y3_pred_valid)
# Ver resultados mais em foco
print(classification_report(y3_valid.values, (y3_pred_valid >= v3[1]).astype(int)))
print(confusion_matrix(y3_valid.values, (y3_pred_valid >= v3[1]).astype(int)))

 # Para classe 3, feito!
> F1 Score Macro mediano = 0.645 e médio = 0.627
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    259009
           1       0.43      0.40      0.41      1187

    accuracy                           0.99    260196
   macro avg       0.71      0.70      0.70    260196
weighted avg       0.99      0.99      0.99    260196

[[258382    627]
 [   718    469]]


In [309]:
# Fazer previsões
y4_pred_valid = m4.predict(x_valid)
print(f' # Para classe 4, feito!')
v4 = calcular_f1_score(y4_valid.values, y4_pred_valid)
# Ver resultados mais em foco
print(classification_report(y4_valid.values, (y4_pred_valid >= v4[1]).astype(int)))
print(confusion_matrix(y4_valid.values, (y4_pred_valid >= v4[1]).astype(int)))

 # Para classe 4, feito!
> F1 Score Macro mediano = 0.738 e médio = 0.685
              precision    recall  f1-score   support

           0       0.89      0.86      0.88    184114
           1       0.69      0.74      0.71     76082

    accuracy                           0.83    260196
   macro avg       0.79      0.80      0.79    260196
weighted avg       0.83      0.83      0.83    260196

[[158927  25187]
 [ 20003  56079]]


In [310]:
# Fazer previsões
y5_pred_valid = m5.predict(x_valid)
print(f' # Para classe 5, feito!')
v5 = calcular_f1_score(y5_valid.values, y5_pred_valid)
# Ver resultados mais em foco
print(classification_report(y5_valid.values, (y5_pred_valid >= v5[1]).astype(int)))
print(confusion_matrix(y5_valid.values, (y5_pred_valid >= v5[1]).astype(int)))

 # Para classe 5, feito!
> F1 Score Macro mediano = 0.79 e médio = 0.741
              precision    recall  f1-score   support

           0       0.77      0.78      0.77     86682
           1       0.89      0.88      0.89    173514

    accuracy                           0.85    260196
   macro avg       0.83      0.83      0.83    260196
weighted avg       0.85      0.85      0.85    260196

[[ 67477  19205]
 [ 20263 153251]]


In [311]:
# Calcular F1 Score Macro para todos modelos
print(f' ############ Mediana Geral')
mg = round(np.mean([v1[2], v2[2], v3[2], v4[2], v5[2]]), 5)
print(f'> F1 Score Macro = {mg}')

 ############ Mediana Geral
> F1 Score Macro = 0.7082


In [312]:
# Calcular F1 Score Macro para todos modelos
print(f' ############ Média Geral')
mg = round(np.mean([v1[3], v2[3], v3[2], v4[3], v5[3]]), 5)
print(f'> F1 Score Macro = {mg}')

 ############ Média Geral
> F1 Score Macro = 0.676


In [313]:
# Calcular F1 Score Macro para todos modelos
print(f' ############ Máxima Geral')
mg = round(np.mean([v1[0], v2[0], v3[0], v4[0], v5[0]]), 5)
print(f'> F1 Score Macro = {mg}')

 ############ Máxima Geral
> F1 Score Macro = 0.76698


In [314]:
# Ver valores
for i, v in enumerate([v1, v2, v3, v4, v5]):
    print(f'> Classe {i+1} tem Máximo F1 Score de {v[0]} com threhsold de {v[1]}')

> Classe 1 tem Máximo F1 Score de 0.7328335083859125 com threhsold de 0.23232323232323235
> Classe 2 tem Máximo F1 Score de 0.7739329832414704 com threhsold de 0.2828282828282829
> Classe 3 tem Máximo F1 Score de 0.704133460427478 com threhsold de 0.16161616161616163
> Classe 4 tem Máximo F1 Score de 0.7941634630951195 com threhsold de 0.4141414141414142
> Classe 5 tem Máximo F1 Score de 0.8298209747335534 com threhsold de 0.5656565656565657


**Construção de Modelos Finais (treino e validação) após tentativas de otimização**

In [315]:
# Especificar hiper parâmetros finais
hp5 = {'objective':'binary',
       'is_unbalance': False,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 224,
       'feature_fraction': 0.85,
       #'early_stopping_round': 25,
       "lambda_l1":8.791369324758216,
       "lambda_l2":0.05026876807406415,
       'seed': 1206,
       'verbosity': -1,
       'num_threads':4}
hp4 = {'objective':'binary',
       'is_unbalance': False,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       #'early_stopping_round': 25,
       "lambda_l1":8.791369324758216,
       "lambda_l2":0.05026876807406415,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp3 = {'objective':'binary',
       'is_unbalance': False,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       #'early_stopping_round': 25,
       "lambda_l1":8.791369324758216,
       "lambda_l2":0.05026876807406415,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp2 = {'objective':'binary',
       'is_unbalance': False,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       #'early_stopping_round': 25,
       "lambda_l1":8.791369324758216,
       "lambda_l2":0.05026876807406415,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp1 = {'objective':'binary',
       'is_unbalance': False,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       #'early_stopping_round': 25,
       "lambda_l1":8.791369324758216,
       "lambda_l2":0.05026876807406415,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}

In [316]:
# Definir número de rounds
nbr = 2500
print('Inicio de treinamento ...')
# Treinar 100% modelo da classe 5
m5f = lgb.train(params=hp5, num_boost_round=nbr, train_set=ds5_train_valid, verbose_eval=False, categorical_feature=feats_cat)
print('Modelo da classe 5 finalizado')
# Treinar 100% modelo da classe 4
m4f = lgb.train(params=hp4, num_boost_round=nbr, train_set=ds4_train_valid, verbose_eval=False, categorical_feature=feats_cat)
print('Modelo da classe 4 finalizado')
# Treinar 100% modelo da classe 3
m3f = lgb.train(params=hp3, num_boost_round=nbr, train_set=ds3_train_valid, verbose_eval=False, categorical_feature=feats_cat)
print('Modelo da classe 3 finalizado')
# Treinar 100% modelo da classe 2
m2f = lgb.train(params=hp2, num_boost_round=nbr, train_set=ds2_train_valid, verbose_eval=False, categorical_feature=feats_cat)
print('Modelo da classe 2 finalizado')
# Treinar 100% modelo da classe 1
m1f = lgb.train(params=hp1, num_boost_round=nbr, train_set=ds1_train_valid, verbose_eval=False, categorical_feature=feats_cat)
print('Modelo da classe 1 finalizado')

Inicio de treinamento ...
Modelo da classe 5 finalizado
Modelo da classe 4 finalizado
Modelo da classe 3 finalizado
Modelo da classe 2 finalizado
Modelo da classe 1 finalizado


# Previsão

**Probabilidade Máxima**

In [None]:
# Carregar modelos
#m1_fl = lgb.Booster(model_file='modelos/m1.txt')
#m2_fl = lgb.Booster(model_file='modelos/m2.txt')
#m3_fl = lgb.Booster(model_file='modelos/m3.txt')
#m4_fl = lgb.Booster(model_file='modelos/m4.txt')
#m5_fl = lgb.Booster(model_file='modelos/m5.txt')

In [317]:
# Fazer previsões
print('Inicio de previsao ...')
y1_pred = m1f.predict(x_test)
print('Previsão da classe 1 finalizado')
y2_pred = m2f.predict(x_test)
print('Previsão da classe 2 finalizado')
y3_pred = m3f.predict(x_test)
print('Previsão da classe 3 finalizado')
y4_pred = m4f.predict(x_test)
print('Previsão da classe 4 finalizado')
y5_pred = m5f.predict(x_test)
print('Previsão da classe 5 finalizado')

Inicio de previsao ...
Previsão da classe 1 finalizado
Previsão da classe 2 finalizado
Previsão da classe 3 finalizado
Previsão da classe 4 finalizado
Previsão da classe 5 finalizado


In [318]:
# Unificar
y_pred = list()

# Loop em cada previsão
for y1,y2,y3,y4,y5 in zip(y1_pred, y2_pred, y3_pred, y4_pred, y5_pred):
    # Listar sequencialmente as previsoes
    l = [y1,y2,y3,y4,y5]
    # Puxar maior previsão
    y_pred.append(l.index(max(l)) + 1)

In [319]:
# Salvar previsão final
df_sub_desafio['CLASSI_FIN'] = y_pred

# Salvar probabilidades
df_sub_desafio['PROBA_1'] = y1_pred
df_sub_desafio['PROBA_2'] = y2_pred
df_sub_desafio['PROBA_3'] = y3_pred
df_sub_desafio['PROBA_4'] = y4_pred
df_sub_desafio['PROBA_5'] = y5_pred

In [320]:
# Viz
df_sub_desafio.head()

Unnamed: 0,ID,CLASSI_FIN,PROBA_1,PROBA_2,PROBA_3,PROBA_4,PROBA_5
0,c42342a4-cfca-497f-8ee6-a6c76ed4bc67,4,0.000833,0.000225,0.000136,0.920425,0.104799
1,6c86b555-8bf2-4306-8f2e-db809a98e3c3,4,7.4e-05,0.000167,1.4e-05,0.87526,0.098138
2,0ac9582e-2683-474b-8316-5e9c0ea6edd5,5,0.000646,0.000134,0.002277,0.358798,0.522043
3,7bc6de6f-041c-4866-8778-7542c230a498,4,0.00034,0.001049,0.000659,0.896631,0.14667
4,8eefb2e4-626e-4f40-9b93-5146906fe07b,5,0.000194,1.5e-05,0.000578,0.416102,0.615924


**Camada Adicional: Melhor modelo(s)**

In [335]:
# Adicionar previsão do melhor modelo com threshold optimizado
df_sub_desafio_to = df_sub_desafio.copy()
df_sub_desafio_to['THRESH_OPT4'] = (y4_pred >= 0.4141414141414142).astype(int)
df_sub_desafio_to['THRESH_OPT5'] = (y5_pred >= 0.5656565656565657).astype(int)


# Checar contagem de previsoes considerando TO
df_sub_desafio_to.loc[df_sub_desafio_to['THRESH_OPT5'] == 1, 'CLASSI_FIN'].value_counts()

5    571348
4       326
1        10
3         7
2         1
Name: CLASSI_FIN, dtype: int64

In [349]:
# Viz distribuição
df_sub_desafio_to['CLASSI_FIN'].value_counts(normalize=True)

5    0.699573
4    0.288047
2    0.006353
1    0.004755
3    0.001273
Name: CLASSI_FIN, dtype: float64

In [336]:
# Aplicar TO
df_sub_desafio_to.loc[(df_sub_desafio_to['THRESH_OPT5'] == 1), 'CLASSI_FIN'] = 5

In [337]:
# Salvar
df_sub_desafio_to[['ID','CLASSI_FIN']].to_csv('ml-olympiad-ensure-healthy-lives/submissao_try_8_pm_to5_without_unbalance.csv',
                                              index=False)

In [338]:
# Salvar localmente
df_sub_desafio[['ID','CLASSI_FIN']].to_csv('ml-olympiad-ensure-healthy-lives/submissao_try_8_pm_without_unbalance.csv',
                                           index=False)

In [339]:
# Salvar localmente
df_sub_desafio.to_csv('ml-olympiad-ensure-healthy-lives/completo_try_8_pm_without_unbalance.csv', index=False)
df_sub_desafio_to.to_csv('ml-olympiad-ensure-healthy-lives/completo_try_8_pm_to5_without_unbalance.csv', index=False)

# Salvar modelos treinados

In [340]:
# Salvar modelos localmente
m1f.save_model('modelos/m1_v8_unb.txt')
m2f.save_model('modelos/m2_v8_unb.txt')
m3f.save_model('modelos/m3_v8_unb.txt')
m4f.save_model('modelos/m4_v8_unb.txt')
m5f.save_model('modelos/m5_v8_unb.txt')

<lightgbm.basic.Booster at 0x1b4877ced00>

# Checar Feature Importance de cada modelo

In [341]:
# Criar
df_fi = pd.DataFrame()
df_fi['FEAT'] = x_train.columns
df_fi['FI_M1'] = (100*m1f.feature_importance() / m1f.feature_importance().sum()).round(2)
df_fi['FI_M2'] = (100*m2f.feature_importance() / m2f.feature_importance().sum()).round(2)
df_fi['FI_M3'] = (100*m3f.feature_importance() / m3f.feature_importance().sum()).round(2)
df_fi['FI_M4'] = (100*m4f.feature_importance() / m4f.feature_importance().sum()).round(2)
df_fi['FI_M5'] = (100*m5f.feature_importance() / m5f.feature_importance().sum()).round(2)
df_fi['FI_ORDER'] = df_fi[[c for c in df_fi.columns if 'FI_M' in c]].sum(axis=1)

In [342]:
# Ver
df_fi.sort_values('FI_ORDER', ascending=False).head(25)

Unnamed: 0,FEAT,FI_M1,FI_M2,FI_M3,FI_M4,FI_M5,FI_ORDER
19,ID_MUNICIP,23.11,18.26,12.93,42.71,44.54,141.55
37,OUTRO_DES,9.39,8.82,8.46,8.65,8.57,43.89
2,NU_IDADE_N,9.07,5.51,9.61,4.34,3.93,32.46
55,MORB_DESC,6.49,4.49,6.82,6.96,7.3,32.06
0,SEM_NOT,3.99,7.85,8.06,3.95,3.8,27.65
1,SEM_PRI,3.43,7.79,6.64,4.01,3.82,25.69
18,SG_UF_NOT,2.38,1.76,1.87,3.38,3.42,12.81
5,ACUMULADO_SINTOMAS,3.31,2.99,4.01,0.97,0.98,12.26
8,ACUMULADO_SAUDE_SINTOMA_VACCOV,3.27,2.43,4.39,0.59,0.55,11.23
10,SEM_PRI_NOT_DIFF_ABS,1.96,2.97,3.48,1.18,1.2,10.79


In [343]:
# Salvar localmente
df_fi.to_csv('ml-olympiad-ensure-healthy-lives/feature_importance_models.csv', index=False)

In [352]:
df_best = pd.read_csv('ml-olympiad-ensure-healthy-lives/blending1.csv', low_memory=False)

In [355]:
df_aux = pd.merge(df_best, df_sub_desafio_to[['ID','CLASSI_FIN']], on='ID', how='left')

In [356]:
((df_aux['CLASSI_FIN_x'] != df_aux['CLASSI_FIN_y']).sum()/len(df_aux),
(df_aux['CLASSI_FIN_x'] != df_aux['CLASSI_FIN_y']).sum())

(0.07313785784040741, 63434)

In [357]:
0.60875*(1-0.05552384872498187)

0.5749498570886673

In [358]:
df_aux.loc[(df_aux['CLASSI_FIN_x'] != df_aux['CLASSI_FIN_y']), 'CLASSI_FIN_x'].value_counts()

2    24358
4    23247
1     7731
5     5899
3     2199
Name: CLASSI_FIN_x, dtype: int64

In [359]:
df_aux.loc[(df_aux['CLASSI_FIN_x'] != df_aux['CLASSI_FIN_y']), 'CLASSI_FIN_y'].value_counts()

4    36202
5    27115
1       85
3       18
2       14
Name: CLASSI_FIN_y, dtype: int64

In [351]:
df_aux['CLASSI_FIN_x'].value_counts(normalize=True)

5    0.661831
4    0.285854
2    0.035949
1    0.012999
3    0.003368
Name: CLASSI_FIN_x, dtype: float64