# Instalando novas bibliotecas

In [196]:
%%capture
!pip install optuna

In [197]:
%%capture
!pip install lightgbm

# Importando módulos

In [198]:
import warnings
import numpy as np
import pandas as pd
import optuna as opt
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.utils import compute_class_weight
from sklearn.model_selection import train_test_split
from optuna.integration.lightgbm import LightGBMTuner

In [199]:
# Só encher o saco uma vez
warnings.filterwarnings('once')

# __Execução I__

Passos
- Importar dados
- Pré-processamento
- Divisão treino e validação
- Imputação de dados
- Light GBM padrão com `class_weight`
- Ensemble de modelos binários

# Importando dados

In [200]:
# Importar tabelas
df_train = pd.read_csv('ml-olympiad-ensure-healthy-lives/train.csv', low_memory=False)
df_test = pd.read_csv('ml-olympiad-ensure-healthy-lives/test.csv', low_memory=False)
df_subm = pd.read_csv('ml-olympiad-ensure-healthy-lives/sample_submission.csv', low_memory=False)

In [201]:
# Montar base para previsão do desafio
df_sub_desafio = pd.DataFrame()
df_sub_desafio['ID'] = df_test['ID']
df_sub_desafio['CLASSI_FIN'] = np.nan

In [202]:
# Aplicar correcao nas variáveis coringa
df_train['FNT_IN_COV'].replace({1:'MANUAL',2:'INTEGRACAO',np.nan:'INFO_FALTANTE'}, inplace=True)
df_test['FNT_IN_COV'].replace({1:'MANUAL',2:'INTEGRACAO',np.nan:'INFO_FALTANTE'}, inplace=True)

df_train['COD_IDADE'].replace({np.nan:'INFO_FALTANTE'}, inplace=True)
df_test['COD_IDADE'].replace({np.nan:'INFO_FALTANTE'}, inplace=True)

# Pré-processamento

In [203]:
# Definir colunas para serem derrubadas
cols_to_drop = ['ID',         # Identificação
                'DELTA_UTI',  # Constante
                'PAC_COCBO',  # Completamente faltante
                'PAC_DSCBO',  # Completamente faltante
                'OBES_IMC',   # Completamente faltante
                'COD_IDADE',  # Redundante tendo as outras duas variáveis de idade
                'FNT_IN_COV'] # Não relacionado ao problema

# Derrubar
df_train.drop(columns=cols_to_drop, inplace=True)
df_test.drop(columns=cols_to_drop, inplace=True)

In [204]:
# Especificar label do desafio
label = 'CLASSI_FIN'

**Variáveis relacionadas a sexo + idade, porém o foco está em mães / bebês**

In [205]:
# Definir colunas de análise nesta seção
cols = ['CS_SEXO','CS_GESTANT','PUERPERA','NU_IDADE_N','TP_IDADE','MAE_VAC','M_AMAMENTA']

Alterações
- Adicionar '99' para identificar que são homens, (código universal para não aplicável) nas colunas referentes a gestação e paridade
- Demais que possuem 'F' em CS_SEXO, adicionar '9' para identificar as mulheres como ignorados, pois provavelmente não estavam classificadas neste grupo

`NU_IDADE_N` + `TP_IDADE`

- Converter coluna de idade para anos
- Remover idades impossíveis

[Expectativa de vida no Brasil](https://pt.wikipedia.org/wiki/Lista_de_unidades_federativas_do_Brasil_por_expectativa_de_vida)

In [206]:
def corrigir_idade(nu_idade, tp_idade):
    '''
    Corrigir padrão de idade para anos
    '''
    if tp_idade == 3:
        return abs(nu_idade)
    elif tp_idade == 2:
        return abs(nu_idade)/12
    else:
        return abs(nu_idade)/365

In [207]:
# Aplicar correção de anos
df_train['NU_IDADE_N'] = df_train.apply(lambda df:corrigir_idade(df['NU_IDADE_N'], df['TP_IDADE']), axis=1).astype(int)

# Qualquer valor acima dos 95 serão tratados e fixados em 95 devido a impossibilidade + expectativa de vida no país
df_train.loc[df_train['NU_IDADE_N'] >= 95, 'NU_IDADE_N'] = 95

In [208]:
# Aplicar correção de anos
df_test['NU_IDADE_N'] = df_test.apply(lambda df:corrigir_idade(df['NU_IDADE_N'], df['TP_IDADE']), axis=1).astype(int)

# Qualquer valor acima dos 95 serão tratados e fixados em 95 devido a impossibilidade + expectativa de vida no país
df_test.loc[df_test['NU_IDADE_N'] >= 95, 'NU_IDADE_N'] = 95

`CS_SEXO`

[Motivo de imputação em último nível](https://educa.ibge.gov.br/jovens/conheca-o-brasil/populacao/18320-quantidade-de-homens-e-mulheres.html#:~:text=Segundo%20dados%20da%20PNAD%20Cont%C3%ADnu,51%2C1%25%20de%20mulheres.)

In [209]:
# Remover classe desconhecida e rara
df_train.loc[df_train['CS_SEXO'] == 'I', 'CS_SEXO'] = np.nan

# Aplicar primeiro nível
c1 = (df_train['CS_SEXO'].isna()) & (df_train['CS_GESTANT'] == 6) & (df_train['NU_IDADE_N'] >= 9)
df_train.loc[c1, 'CS_SEXO'] = 'M'

# Aplicar segundo nível
c2 = (df_train['CS_GESTANT'] != 6) & (df_train['CS_SEXO'].isna())
df_train.loc[c2, 'CS_SEXO'] = 'F'

# Aplicar terceiro nível
c3 = df_train['CS_SEXO'].isna()
df_train.loc[c3, 'CS_SEXO'] = 'F'

In [210]:
# Remover classe desconhecida e rara
df_test.loc[df_test['CS_SEXO'] == 'I', 'CS_SEXO'] = np.nan

# Aplicar primeiro nível
c1 = (df_test['CS_SEXO'].isna()) & (df_test['CS_GESTANT'] == 6) & (df_test['NU_IDADE_N'] >= 9)
df_test.loc[c1, 'CS_SEXO'] = 'M'

# Aplicar segundo nível
c2 = (df_test['CS_GESTANT'] != 6) & (df_test['CS_SEXO'].isna())
df_test.loc[c2, 'CS_SEXO'] = 'F'

# Aplicar terceiro nível
c3 = df_test['CS_SEXO'].isna()
df_test.loc[c3, 'CS_SEXO'] = 'F'

`CS_GESTANT`

In [211]:
# Remover classe desconhecida e rara
df_train.loc[df_train['CS_GESTANT'] == 0, 'CS_GESTANT'] = np.nan

# Aplicar primeiro nível
c1 = (df_train['CS_GESTANT'].isna()) & ((df_train['CS_SEXO'] == 'M') | (df_train['NU_IDADE_N'] <= 9))
df_train.loc[c1, 'CS_GESTANT'] = 6

# Aplicar segundo nível
c2 = (df_train['CS_GESTANT'].isna()) & (df_train['CS_SEXO'] == 'F') & (df_train['NU_IDADE_N'] > 9)
df_train.loc[c2, 'CS_GESTANT'] = 5

# Aplicar terceiro e último nível
c3 = (df_train['CS_GESTANT'].isna())
df_train.loc[c3, 'CS_GESTANT'] = 9

In [212]:
# Remover classe desconhecida e rara
df_test.loc[df_test['CS_GESTANT'] == 0, 'CS_GESTANT'] = np.nan

# Aplicar primeiro nível
c1 = (df_test['CS_GESTANT'].isna()) & ((df_test['CS_SEXO'] == 'M') | (df_test['NU_IDADE_N'] <= 9))
df_test.loc[c1, 'CS_GESTANT'] = 6

# Aplicar segundo nível
c2 = (df_test['CS_GESTANT'].isna()) & (df_test['CS_SEXO'] == 'F') & (df_test['NU_IDADE_N'] > 9)
df_test.loc[c2, 'CS_GESTANT'] = 5

# Aplicar terceiro e último nível
c3 = (df_test['CS_GESTANT'].isna())
df_test.loc[c3, 'CS_GESTANT'] = 9

`PUERPERA`

In [213]:
# Aplicar primeiro nível
c1 = df_train['CS_SEXO'] == 'M'
df_train.loc[c1, 'PUERPERA'] = 99

# Aplicar segundo nível
c2 = (df_train['PUERPERA'].isna()) & (df_train['CS_SEXO'] == 'F') & (df_train['CS_GESTANT'] != 6)
df_train.loc[c2, 'PUERPERA'] = 2

# Aplicar terceiro nível
c3 = (df_train['CS_SEXO'] == 'F') & (df_train['PUERPERA'].isna())
df_train.loc[c3, 'PUERPERA'] = 9

In [214]:
# Aplicar primeiro nível
c1 = df_test['CS_SEXO'] == 'M'
df_test.loc[c1, 'PUERPERA'] = 99

# Aplicar segundo nível
c2 = (df_test['PUERPERA'].isna()) & (df_test['CS_SEXO'] == 'F') & (df_test['CS_GESTANT'] != 6)
df_test.loc[c2, 'PUERPERA'] = 2

# Aplicar terceiro nível
c3 = (df_test['CS_SEXO'] == 'F') & (df_test['PUERPERA'].isna())
df_test.loc[c3, 'PUERPERA'] = 9

`MAE_VAC` + `M_AMAMENTA`

In [215]:
# Aplicar primeiro nível
c1 = df_train['NU_IDADE_N'] > 0
df_train.loc[c1, ['MAE_VAC','M_AMAMENTA']] = 99

# Aplicar segundo nível
df_train.loc[df_train['MAE_VAC'].isna(), 'MAE_VAC'] = 9
df_train.loc[df_train['M_AMAMENTA'].isna(), 'M_AMAMENTA'] = 9

In [216]:
# Aplicar primeiro nível
c1 = df_test['NU_IDADE_N'] > 0
df_test.loc[c1, ['MAE_VAC','M_AMAMENTA']] = 99

# Aplicar segundo nível
df_test.loc[df_test['MAE_VAC'].isna(), 'MAE_VAC'] = 9
df_test.loc[df_test['M_AMAMENTA'].isna(), 'M_AMAMENTA'] = 9

**Variáveis relacionadas ao COVID**

In [217]:
# Definir colunas de análise nesta seção
cols = ['VACINA_COV','DOSE_1_COV','DOSE_2_COV','DOSE_REF']

`VACINA_COV`

Possibilidade de binarizar e converter `9` em `2`

OBS - Criar nova coluna com a contagem de doses tomadas.

In [218]:
# Aplicar primeiro nível
c1 = df_train[['DOSE_1_COV','DOSE_2_COV','DOSE_REF']].sum(axis=1) >= 1
df_train.loc[c1, 'VACINA_COV'] = 1

# Aplicar segundo nível
c2 = df_train['VACINA_COV'].isna()
df_train.loc[c2, 'VACINA_COV'] = 9

In [219]:
# Aplicar primeiro nível
c1 = df_test[['DOSE_1_COV','DOSE_2_COV','DOSE_REF']].sum(axis=1) >= 1
df_test.loc[c1, 'VACINA_COV'] = 1

# Aplicar segundo nível
c2 = df_test['VACINA_COV'].isna()
df_test.loc[c2, 'VACINA_COV'] = 9

In [220]:
# Criar nova coluna
df_train['DOSE_COV_N'] = df_train[['DOSE_1_COV','DOSE_2_COV','DOSE_REF']].sum(axis=1)
df_test['DOSE_COV_N'] = df_test[['DOSE_1_COV','DOSE_2_COV','DOSE_REF']].sum(axis=1)

**Variáveis relacionadas ao EXAME**

In [221]:
# Definir colunas de análise nesta seção
cols = ['RAIOX_RES','RAIOX_OUT','TOMO_RES','TOMO_OUT','AMOSTRA','OUT_AMOST','TP_AMOSTRA']

In [222]:
# Especificar todas as possibilidades de outros para raio x e tomografia
outros_raioX = list(set(df_train['RAIOX_OUT'].dropna().unique().tolist() + df_test['RAIOX_OUT'].dropna().unique().tolist()))
outros_tomog = list(set(df_train['TOMO_OUT'].dropna().unique().tolist() + df_test['TOMO_OUT'].dropna().unique().tolist()))
outros_amost = list(set(df_train['OUT_AMOST'].dropna().unique().tolist() + df_test['OUT_AMOST'].dropna().unique().tolist()))

Melhorar colunas referente a OUTROS

In [223]:
# Gerar listas hierarquicas especificas para consolidar categorias em tomografia (prioridade de baixo para cima)
outros_tomog_laudo = [c for c in outros_tomog if 'LAUDO' in c or 'AGUARDA' in c or 'EM ANDAMENTO' in c or 'SEM INFORMACAO' in c]
outros_tomog_norml = [c for c in outros_tomog if 'NORMAL' in c or 'SEM ALTERAC' in c or 'SEM SINAIS DE INFECCAO' in c]
outros_tomog_vidro = [c for c in outros_tomog if 'VIDRO' in c or 'OPACIDADE' in c]
outros_tomog_bronc = [c for c in outros_tomog if 'BRONCOPATIA' in c]
outros_tomog_covid = [c for c in outros_tomog if 'COVID' in c]
outros_tomog_derrp = [c for c in outros_tomog if 'DERRAME' in c and 'PLEURAL' in c]
outros_tomog_enfis = [c for c in outros_tomog if 'ENFISEMA' in c]
outros_tomog_pneum = [c for c in outros_tomog if 'PNEUMONIA' in c or 'PNM' in c]
outros_tomog_conso = [c for c in outros_tomog if 'CONSOLIDA' in c]
outros_tomog_solic = [c for c in outros_tomog if 'SOLICITADO' in c]
outros_tomog_atele = [c for c in outros_tomog if 'ATELECTAS' in c]
outros_tomog_tuber = [c for c in outros_tomog if 'TUBERCUL' in c or 'TB' in c]

# Especificar novas categorias de agrupamento
outros_tomog_nCats = ['REL_TUBERCULOSE', 'REL_OPACIDADE_VIDRO_FOSCO', 'REL_BRONCOPATIA', 'REL_COVID',
                      'REL_DERRAME', 'REL_ENFISEMA', 'REL_PNEUMONIA', 'REL_CONSOLIDACAO',
                      'REL_ATELECTASIA', 'REL_SEM_LAUDO', 'REL_RES_NORMAL', 'REL_SOLICITADO']

outros_tomog_oCats = [outros_tomog_tuber, outros_tomog_vidro, outros_tomog_bronc, outros_tomog_covid,
                      outros_tomog_derrp, outros_tomog_enfis, outros_tomog_pneum, outros_tomog_conso,
                      outros_tomog_atele, outros_tomog_laudo, outros_tomog_norml, outros_tomog_solic]

# Criar colunas
df_train['TOMO_OUT_NOVO'] = np.nan
df_test['TOMO_OUT_NOVO'] = np.nan

# Aplicar novas categorias evitando sobreposição
for n_cat, o_cats in zip(outros_tomog_nCats, outros_tomog_oCats):
    df_train.loc[(df_train['TOMO_OUT'].isin(o_cats)) & (df_train['TOMO_OUT_NOVO'].isna()), 'TOMO_OUT_NOVO'] = n_cat
    df_test.loc[(df_test['TOMO_OUT'].isin(o_cats)) & (df_test['TOMO_OUT_NOVO'].isna()), 'TOMO_OUT_NOVO'] = n_cat

# Converter todas as demais categorias restantes provalvemente raras em umas só para reduzir ruído ao modelo
c1 = (~df_train['TOMO_OUT_NOVO'].isin(outros_tomog_nCats)) & (df_train['TOMO_OUT'].notnull())
df_train.loc[c1, 'TOMO_OUT_NOVO'] = 'REL_OUTROS_RESULTADOS'
c1 = (~df_test['TOMO_OUT_NOVO'].isin(outros_tomog_nCats)) & (df_test['TOMO_OUT'].notnull())
df_test.loc[c1, 'TOMO_OUT_NOVO'] = n_cat

In [224]:
# Gerar listas hierarquicas especificas para consolidar categorias em raio X (prioridade de baixo para cima)
outros_raiox_infil = [c for c in outros_raioX if 'INFILT' in c]
outros_raiox_vidro = [c for c in outros_raioX if 'VIDRO' in c or 'FOSCO' in c]
outros_raiox_opaci = [c for c in outros_raioX if 'OPACIDADE' in c or 'OPACIF' in c]
outros_raiox_tomog = [c for c in outros_raioX if 'TC' in c or 'TOMOGRAFIA' in c or 'TOMO' in c or 'CT' in c]
outros_raiox_derrp = [c for c in outros_raioX if 'DERRAME' in c and 'PLEU' in c]
outros_raiox_atele = [c for c in outros_raioX if 'ATELECTAS' in c]
outros_raiox_congt = [c for c in outros_raioX if 'CONGESTAO' in c or 'CONGEST' in c]
outros_raiox_pneum = [c for c in outros_raioX if 'PNEUMONIA' in c or 'PNM' in c]
outros_raiox_infla = [c for c in outros_raioX if 'INFLAMA' in c]
outros_raiox_espes = [c for c in outros_raioX if 'ESPESSA' in c]
outros_raiox_solic = [c for c in outros_raioX if 'SOLICITADO' in c or 'PEDIDO' in c]
outros_raiox_conde = [c for c in outros_raioX if 'CONDENSA' in c]
outros_raiox_hiper = [c for c in outros_raioX if 'HIPERINSUF' in c]
outros_raiox_arcos = [c for c in outros_raioX if 'ARCO' in c or 'RETIFICACAO' in c or 'COSTAIS' in c]
outros_raiox_norml = [c for c in outros_raioX if 'NORMAL' in c or 'SEM ALTERAC' in c or 'SEM SINAIS DE INFECCAO' in c]
outros_raiox_laudo = [c for c in outros_raioX if 'LAUDO' in c or 'SEM DESCRICAO' in c or 'SEM RELATO' in c or 'SEM INFO' in c or 'AGUARDA' in c or 'NAO INFO' in c]

# Especificar novas categorias de agrupamento por ordem de prioridade
outros_raiox_nCats = ['REL_INFILTRACAO', 'REL_VIDRO_FOSCO','REL_OPACIDADE', 'REL_DERRAME_PLEURAL',
                      'REL_ATELECTASIA', 'REL_CONGESTAO', 'REL_PNEUMONIA', 'REL_INFLAMACAO', 
                      'REL_ESPESSAMENTO', 'REL_CONDENSACAO', 'REL_HIPERINSUFLACAO', 'REL_SOLICITADO',
                      'REL_ARCOS_COSTAIS', 'REL_SEM_LAUDO', 'REL_RES_NORMAL','REL_PEDIDO_TOMOG']
outros_raiox_oCats = [outros_raiox_infil, outros_raiox_vidro, outros_raiox_opaci, outros_raiox_derrp,
                      outros_raiox_atele, outros_raiox_congt, outros_raiox_pneum, outros_raiox_infla,
                      outros_raiox_espes, outros_raiox_conde, outros_raiox_hiper, outros_raiox_solic,
                      outros_raiox_arcos, outros_raiox_laudo, outros_raiox_norml, outros_raiox_tomog]

# Criar colunas
df_train['RAIOX_OUT_NOVO'] = np.nan
df_test['RAIOX_OUT_NOVO'] = np.nan

# Aplicar novas categorias evitando sobreposição
for n_cat, o_cats in zip(outros_raiox_nCats, outros_raiox_oCats):
    df_train.loc[(df_train['RAIOX_OUT'].isin(o_cats)) & (df_train['RAIOX_OUT_NOVO'].isna()), 'RAIOX_OUT_NOVO'] = n_cat
    df_test.loc[(df_test['RAIOX_OUT'].isin(o_cats)) & (df_test['RAIOX_OUT_NOVO'].isna()), 'RAIOX_OUT_NOVO'] = n_cat

# Converter todas as demais categorias restantes provalvemente raras em umas só para reduzir ruído ao modelo
c1 = (~df_train['RAIOX_OUT_NOVO'].isin(outros_raiox_nCats)) & (df_train['RAIOX_OUT'].notnull())
df_train.loc[c1, 'RAIOX_OUT_NOVO'] = 'REL_OUTROS_RESULTADOS'
c1 = (~df_test['RAIOX_OUT_NOVO'].isin(outros_raiox_nCats)) & (df_test['RAIOX_OUT'].notnull())
df_test.loc[c1, 'RAIOX_OUT_NOVO'] = n_cat

Corrigir variáveis relacionadas ao raio X

In [225]:
# Caso o motivo seja outro e não tenha descrição, adicionar string referente
c1 = (df_train['RAIOX_RES'].isin([5])) & (df_train['RAIOX_OUT'].isna())
df_train.loc[c1, 'RAIOX_OUT'] = 'FEITO_E_SEM_DETALHE'
df_train.loc[c1, 'RAIOX_OUT_NOVO'] = 'FEITO_E_SEM_DETALHE'

c1 = (df_test['RAIOX_RES'].isin([5])) & (df_test['RAIOX_OUT'].isna())
df_test.loc[c1, 'RAIOX_OUT'] = 'FEITO_E_SEM_DETALHE'
df_test.loc[c1, 'RAIOX_OUT_NOVO'] = 'FEITO_E_SEM_DETALHE'

# Caso seja nulo, será considerado que não foi feito nenhum exame
c2 = df_train['RAIOX_RES'].isna()
df_train.loc[c2, 'RAIOX_RES'] = 6
df_train.loc[c2, 'RAIOX_OUT'] = 'NDA'
df_train.loc[c2, 'RAIOX_OUT_NOVO'] = 'NDA'

c2 = df_test['RAIOX_RES'].isna()
df_test.loc[c2, 'RAIOX_RES'] = 6
df_test.loc[c2, 'RAIOX_OUT'] = 'NDA'
df_test.loc[c2, 'RAIOX_OUT_NOVO'] = 'NDA'

Corrigir variáveis relacionadas a tomografia

In [226]:
# Caso o motivo seja outro e não tenha descrição, adicionar string referente
c1 = (df_train['TOMO_RES'].isin([5])) & (df_train['TOMO_OUT'].isna())
df_train.loc[c1, 'TOMO_OUT'] = 'FEITO_E_SEM_DETALHE'
df_train.loc[c1, 'TOMO_OUT_NOVO'] = 'FEITO_E_SEM_DETALHE'

c1 = (df_test['TOMO_RES'].isin([5])) & (df_test['TOMO_OUT'].isna())
df_test.loc[c1, 'RAIOX_OUT'] = 'FEITO_E_SEM_DETALHE'
df_test.loc[c1, 'RAIOX_OUT_NOVO'] = 'FEITO_E_SEM_DETALHE'

# Caso seja nulo, será considerado que não foi feito nenhum exame
c2 = df_train['TOMO_RES'].isna()
df_train.loc[c2, 'TOMO_RES'] = 6
df_train.loc[c2, 'TOMO_OUT'] = 'NDA'
df_train.loc[c2, 'TOMO_OUT_NOVO'] = 'NDA'

c2 = df_test['TOMO_RES'].isna()
df_test.loc[c2, 'TOMO_RES'] = 6
df_test.loc[c2, 'TOMO_OUT'] = 'NDA'
df_test.loc[c2, 'TOMO_OUT_NOVO'] = 'NDA'

Corrigir variáveis referentes a amostra

In [227]:
# Caso seja nulo em AMOSTRA, será considerado que a coleta foi ignorada
c1 = df_train['AMOSTRA'].isna()
df_train.loc[c1, 'AMOSTRA'] = 9

# Caso AMOSTRA seja 2 ou 9 (não e ignorado respectivamente), ignoramos o tipo de amostra
c2 = df_train['AMOSTRA'].isin([2, 9])
df_train.loc[c2, 'TP_AMOSTRA'] = 9

# Caso TP_AMOSTRA seja 4 (outra) e não tenha informação em OUT_AMOST colocamos uma string especifica
c3 = (df_train['TP_AMOSTRA'] == 4) & (df_train['OUT_AMOST'].isna())
df_train.loc[c3, 'OUT_AMOST'] = 'TIPO_AMOST_NAO_ESPECIFICADO'

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja diferente de 4, inserimos string especifica em OUT_AMOST
c4 = (df_train['AMOSTRA'] == 1) & (df_train['TP_AMOSTRA'].isin([1,2,3,5]))
df_train.loc[c4, 'OUT_AMOST'] = 'COLETA_FEITA_TIPO_DEFINIDO'

# Caso AMOSTRA seja 2 ou 9, inserimos string especifica em OUT_AMOSTRA
c5 = (df_train['AMOSTRA'].isin([2, 9]))
df_train.loc[c5, 'OUT_AMOST'] = 'COLETA_NAO_REALIZADA_OU_IGNORADA'

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja nulo, inserimos 9
c6 = (df_train['AMOSTRA'] == 1) & (df_train['TP_AMOSTRA'].isna())
df_train.loc[c6, 'TP_AMOSTRA'] = 9

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja diferente de 4, inserimos string especifica em OUT_AMOST
c7 = (df_train['AMOSTRA'] == 1) & (df_train['TP_AMOSTRA'] != 4)
df_train.loc[c7, 'OUT_AMOST'] = 'COLETA_FEITA_TIPO_IGNORADO'

In [228]:
# Caso seja nulo em AMOSTRA, será considerado que a coleta foi ignorada
c1 = df_test['AMOSTRA'].isna()
df_test.loc[c1, 'AMOSTRA'] = 9

# Caso AMOSTRA seja 2 ou 9 (não e ignorado respectivamente), ignoramos o tipo de amostra
c2 = df_test['AMOSTRA'].isin([2, 9])
df_test.loc[c2, 'TP_AMOSTRA'] = 9

# Caso TP_AMOSTRA seja 4 (outra) e não tenha informação em OUT_AMOST colocamos uma string especifica
c3 = (df_test['TP_AMOSTRA'] == 4) & (df_test['OUT_AMOST'].isna())
df_test.loc[c3, 'OUT_AMOST'] = 'TIPO_AMOST_NAO_ESPECIFICADO'

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja diferente de 4, inserimos string especifica em OUT_AMOST
c4 = (df_test['AMOSTRA'] == 1) & (df_test['TP_AMOSTRA'].isin([1,2,3,5]))
df_test.loc[c4, 'OUT_AMOST'] = 'COLETA_FEITA_TIPO_DEFINIDO'

# Caso AMOSTRA seja 2 ou 9, inserimos string especifica em OUT_AMOSTRA
c5 = (df_test['AMOSTRA'].isin([2, 9]))
df_test.loc[c5, 'OUT_AMOST'] = 'COLETA_NAO_REALIZADA_OU_IGNORADA'

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja nulo, inserimos 9
c6 = (df_test['AMOSTRA'] == 1) & (df_test['TP_AMOSTRA'].isna())
df_test.loc[c6, 'TP_AMOSTRA'] = 9

# Caso AMOSTRA seja 1 e TP_AMOSTRA seja diferente de 4, inserimos string especifica em OUT_AMOST
c7 = (df_test['AMOSTRA'] == 1) & (df_test['TP_AMOSTRA'] != 4)
df_test.loc[c7, 'OUT_AMOST'] = 'COLETA_FEITA_TIPO_IGNORADO'

Melhorar colunas referente a OUTROS II

In [229]:
# Gerar listas hierarquicas especificas para consolidar categorias em tipo de amostra X (prioridade de baixo para cima)
outros_amost_nasof = [c for c in outros_amost if 'NASOFARINGE' in c or 'NASO' in c or 'NF' in c or 'NOSOFARINGE' in c]
outros_amost_rapid = [c for c in outros_amost if 'TESTE RAPIDO' in c or 'RAPIDO' in c or 'TR' in c or 'ANTIGEN' in c]
outros_amost_blood = [c for c in outros_amost if 'SANGUE' in c or 'SAGUE' in c]
outros_amost_swab = [c for c in outros_amost if 'SWAB' in c]
outros_amost_soro = [c for c in outros_amost if 'SORO' in c]
outros_amost_tpcr = [c for c in outros_amost if 'PCR' in c]
outros_amost_nasal = [c for c in outros_amost if 'NASAL' in c]


# Especificar novas categorias de agrupamento por ordem de prioridade
outros_amost_nCats = ['REL_NASOFARINGE', 'REL_TESTE_RAPIDO','REL_PCR', 'REL_TESTE_SANGUE',
                      'REL_SWAB', 'REL_SORO', 'REL_NASAL']
outros_amost_oCats = [outros_amost_nasof, outros_amost_rapid, outros_amost_tpcr, outros_amost_blood,
                      outros_amost_swab, outros_amost_soro, outros_amost_nasal]

# Criar colunas
df_train['AMOSTRA_OUT_NOVO'] = np.nan
df_test['AMOSTRA_OUT_NOVO'] = np.nan

# Aplicar novas categorias evitando sobreposição
for n_cat, o_cats in zip(outros_amost_nCats, outros_amost_oCats):
    df_train.loc[(df_train['OUT_AMOST'].isin(o_cats)) & (df_train['AMOSTRA_OUT_NOVO'].isna()), 'AMOSTRA_OUT_NOVO'] = n_cat
    df_test.loc[(df_test['OUT_AMOST'].isin(o_cats)) & (df_test['AMOSTRA_OUT_NOVO'].isna()), 'AMOSTRA_OUT_NOVO'] = n_cat

# Converter todas as demais categorias restantes provalvemente raras em umas só para reduzir ruído ao modelo
c1 = (~df_train['AMOSTRA_OUT_NOVO'].isin(outros_amost_nCats)) & (df_train['OUT_AMOST'].notnull())
df_train.loc[c1, 'AMOSTRA_OUT_NOVO'] = 'REL_OUTROS_RESULTADOS'

c1 = (~df_test['AMOSTRA_OUT_NOVO'].isin(outros_amost_nCats)) & (df_test['OUT_AMOST'].notnull())
df_test.loc[c1, 'AMOSTRA_OUT_NOVO'] = n_cat

**Variáveis relacionadas ao SINTOMA**

In [230]:
# Definir colunas de análise nesta seção
cols = ['SEM_PRI','SEM_NOT','FEBRE','TOSSE','GARGANTA','DISPNEIA','DESC_RESP','SATURACAO','DIARREIA','VOMITO',
        'OUTRO_SIN','OUTRO_DES','PERD_PALA','PERD_OLFT','DOR_ABD','FADIGA']

In [231]:
# Checar nulos
df_train[cols].isna().sum()

SEM_PRI           0
SEM_NOT           0
FEBRE        231370
TOSSE        172085
GARGANTA     396565
DISPNEIA     176050
DESC_RESP    261255
SATURACAO    223622
DIARREIA     414223
VOMITO       422753
OUTRO_SIN    395283
OUTRO_DES    904480
PERD_PALA    434644
PERD_OLFT    434124
DOR_ABD      439039
FADIGA       389988
dtype: int64

In [232]:
# Criar coluna de acumulo de sintomas em dois formatos: progressivo
cols_sin = ['FEBRE','TOSSE','GARGANTA','DISPNEIA','DESC_RESP','SATURACAO','DIARREIA','VOMITO',
            'PERD_PALA','PERD_OLFT','DOR_ABD','FADIGA','OUTRO_SIN']
df_train['ACUMULADO_SINTOMAS'] = (df_train[cols_sin] == 1).sum(axis=1)
df_test['ACUMULADO_SINTOMAS'] = (df_test[cols_sin] == 1).sum(axis=1)

**Nota** - Não tem muito o que trabalhar nessas variáveis em relação a imputação.

**Variáveis relacionadas ao HOSPITAL**

In [233]:
# Definir colunas de análise nesta seção
cols = ['HOSPITAL','UTI','SUPORT_VEN','NOSOCOMIAL']

In [234]:
# Checar nulos
df_train[cols].isna().sum()

HOSPITAL       26483
UTI           162527
SUPORT_VEN    164962
NOSOCOMIAL    201505
dtype: int64

Imputar variáveis a partir de condições lógicas

In [235]:
# Caso HOSPITAL seja nulo, inserir como Sim caso certas condições sejam apresentadas
c1 = df_train['HOSPITAL'].isna()
c2 = (df_train['UTI'] == 1) | (df_train['SUPORT_VEN'].isin([1, 2])) | (df_train['NOSOCOMIAL'] == 1)
df_train.loc[c1 & c2, 'HOSPITAL'] = 1

# Caso HOSPITAL seja 1 e NOSOCOMIAL nulo, inserir 9
c3 = (df_train['HOSPITAL'] == 1) & (df_train['NOSOCOMIAL'].isna())
df_train.loc[c3, 'NOSOCOMIAL'] = 9

In [236]:
# Caso HOSPITAL seja nulo, inserir como Sim caso certas condições sejam apresentadas
c1 = df_test['HOSPITAL'].isna()
c2 = (df_test['UTI'] == 1) | (df_test['SUPORT_VEN'].isin([1, 2])) | (df_test['NOSOCOMIAL'] == 1)
df_test.loc[c1 & c2, 'HOSPITAL'] = 1

# Caso HOSPITAL seja 1 e NOSOCOMIAL nulo, inserir 9
c3 = (df_test['HOSPITAL'] == 1) & (df_test['NOSOCOMIAL'].isna())
df_test.loc[c3, 'NOSOCOMIAL'] = 9

**Variáveis relacionadas ao MEDICACAO**

In [237]:
# Definir colunas de análise nesta seção
cols = ['VACINA','ANTIVIRAL','TP_ANTIVIR']

In [238]:
# Checar nulos
df_train[cols].isna().sum()

VACINA         384795
ANTIVIRAL      224350
TP_ANTIVIR    1268247
dtype: int64

In [239]:
# Caso ANTIVIRAL seja nulo, inserir como Sim caso certas condições sejam apresentadas
c1 = (df_train['ANTIVIRAL'].isna()) & (df_train['TP_ANTIVIR'].isin([1,2,3]))
df_train.loc[c1, 'ANTIVIRAL'] = 1

In [240]:
# Caso ANTIVIRAL seja, inserir como Não em TP_ANTIVIR como 99
c1 = (df_train['ANTIVIRAL'] == 2) & (df_train['TP_ANTIVIR'].isna())
df_train.loc[c1, 'TP_ANTIVIR'] = 99

**Variáveis relacionadas ao PACIENTE_SAUDE_DETALHE**

In [241]:
# Definir colunas de análise nesta seção
cols = ['FATOR_RISC','CARDIOPATI','HEMATOLOGI','SIND_DOWN','HEPATICA','ASMA','DIABETES','NEUROLOGIC','PNEUMOPATI',
        'IMUNODEPRE','RENAL','OBESIDADE','OBES_IMC','OUT_MORBI','MORB_DESC']

In [242]:
# Especificar colunas de SAUDE_DETALHE
cols_saude = ['CARDIOPATI','HEMATOLOGI','SIND_DOWN','HEPATICA','ASMA','DIABETES','NEUROLOGIC','PNEUMOPATI',
              'IMUNODEPRE','RENAL','OBESIDADE','OUT_MORBI','PUERPERA']

# Criar acumulado
df_train['ACUMULADO_SAUDE'] = (df_train[cols_saude] == 1).sum(axis=1)
df_test['ACUMULADO_SAUDE'] = (df_test[cols_saude] == 1).sum(axis=1)

In [243]:
# Criar outras variações
df_train['ACUMULADO_SAUDE_MAIS_SINTOMA'] = df_train[['ACUMULADO_SAUDE','ACUMULADO_SINTOMAS']].sum(axis=1)
df_test['ACUMULADO_SAUDE_MAIS_SINTOMA'] = df_test[['ACUMULADO_SAUDE','ACUMULADO_SINTOMAS']].sum(axis=1)

df_train['ACUMULADO_SAUDE_SINTOMA_VACCOV'] = df_train['ACUMULADO_SAUDE_MAIS_SINTOMA']+(df_train['VACINA_COV'] == 1).astype(int)
df_test['ACUMULADO_SAUDE_SINTOMA_VACCOV'] = df_test['ACUMULADO_SAUDE_MAIS_SINTOMA']+(df_test['VACINA_COV'] == 1).astype(int)

In [244]:
# Caso FATOR de RISCO seja 2, então para as variáveis saúde inserir 2 indicando ausência nas de fator de riso
df_train.loc[df_train['FATOR_RISC'] == 2, cols_saude] = 2
df_train.loc[(df_train['FATOR_RISC'] == 2)&(df_train['PUERPERA'] != 99), 'PUERPERA'] = 2

df_test.loc[df_test['FATOR_RISC'] == 2, cols_saude] = 2
df_test.loc[(df_test['FATOR_RISC'] == 2)&(df_test['PUERPERA'] != 99), 'PUERPERA'] = 2

In [245]:
# Caso tenha outro tipo de fator de risco porém sem info, adicionar string específica
df_train.loc[(df_train['OUT_MORBI'] == 1)&(df_train['MORB_DESC'].isna()), 'MORB_DESC'] = 'FATOR_RISCO_NAO_DESCRITO'
df_test.loc[(df_test['OUT_MORBI'] == 1)&(df_test['MORB_DESC'].isna()), 'MORB_DESC'] = 'FATOR_RISCO_NAO_DESCRITO'

In [246]:
# Caso tenha nulo em MORB_DESC inserir string de ignorado
df_train.loc[df_train['MORB_DESC'].isna(), 'MORB_DESC'] = 'IGNORADO'
df_test.loc[df_test['MORB_DESC'].isna(), 'MORB_DESC'] = 'IGNORADO'

**Variáveis relacionadas ao PACIENTE_INFORMACAO**

In [247]:
# Definir colunas de análise nesta seção
cols = ['SG_UF','SG_UF_NOT','ID_REGIONA','CO_REGIONA','ID_MUNICIP','CO_MUN_NOT','CS_SEXO','NU_IDADE_N','TP_IDADE',
        'CS_RACA','CS_ESCOL_N','CS_ZONA','AVE_SUINO','OUT_ANIM']

In [248]:
# Checar nulos
(100*df_train[cols].isna().sum()/len(df_train)).round(2)

SG_UF          0.02
SG_UF_NOT      0.00
ID_REGIONA    11.36
CO_REGIONA    11.36
ID_MUNICIP     0.00
CO_MUN_NOT     0.00
CS_SEXO        0.00
NU_IDADE_N     0.00
TP_IDADE       0.00
CS_RACA        0.00
CS_ESCOL_N    34.98
CS_ZONA       10.36
AVE_SUINO     16.90
OUT_ANIM      99.91
dtype: float64

In [249]:
# Imputar SG_UF através da versão not
c1 = (df_train['SG_UF'].isna())&(df_train['SG_UF_NOT'].notnull())
df_train.loc[c1, 'SG_UF'] = df_train.loc[c1, 'SG_UF_NOT']

c1 = (df_test['SG_UF'].isna())&(df_test['SG_UF_NOT'].notnull())
df_test.loc[c1, 'SG_UF'] = df_test.loc[c1, 'SG_UF_NOT']

In [250]:
# Definir colunas para derrubar - redundantes e repetidas
cols_to_drop = ['ID_REGIONA','CO_REGIONA','SG_UF','CO_MUN_NOT']

df_train.drop(columns=cols_to_drop, inplace=True)
df_test.drop(columns=cols_to_drop, inplace=True)

In [251]:
# Definir colunas de análise nesta seção novamente
cols = [c for c in cols if c not in cols_to_drop]

Aqui vamos otimizar a coluna `AVES_SUINO` unificando ela a `OUT_ANIM` criando uma nova variável:

In [252]:
# Criar
outros_anim = list(set(df_train['OUT_ANIM'].dropna().unique().tolist() + df_test['OUT_ANIM'].dropna().unique().tolist()))

In [253]:
# Gerar listas hierarquicas especificas para consolidar categorias em tipo de amostra X (prioridade de baixo para cima)
outros_anim_dog = [c for c in outros_anim if 'CACHOR' in c or 'CANINO' in c or 'CAES' in c or 'CAO' in c or 'CAHORRO' in c]
outros_anim_cat = [c for c in outros_anim if 'GATO' in c or 'FELIN' in c]
outros_anim_boi = [c for c in outros_anim if 'GADO' in c or 'BOVINO' in c or 'BOI' in c or 'VACA' in c]
outros_anim_gal = [c for c in outros_anim if 'GALINHA' in c]

# Especificar novas categorias de agrupamento por ordem de prioridade
outros_anim_nCats = ['REL_CACHORRO', 'REL_GATO','REL_BOI', 'REL_GALINHA']
outros_anim_oCats = [outros_anim_dog, outros_anim_cat, outros_anim_boi, outros_anim_gal]

# Criar colunas
df_train['ANIM_OUT_NOVO'] = np.nan
df_test['ANIM_OUT_NOVO'] = np.nan

# Aplicar novas categorias evitando sobreposição
for n_cat, o_cats in zip(outros_anim_nCats, outros_anim_oCats):
    df_train.loc[(df_train['OUT_ANIM'].isin(o_cats)) & (df_train['ANIM_OUT_NOVO'].isna()), 'ANIM_OUT_NOVO'] = n_cat
    df_test.loc[(df_test['OUT_ANIM'].isin(o_cats)) & (df_test['ANIM_OUT_NOVO'].isna()), 'ANIM_OUT_NOVO'] = n_cat

# Converter todas as demais categorias restantes provalvemente raras em umas só para reduzir ruído ao modelo
c1 = (~df_train['ANIM_OUT_NOVO'].isin(outros_anim_nCats)) & (df_train['OUT_ANIM'].notnull())
df_train.loc[c1, 'ANIM_OUT_NOVO'] = 'REL_OUTROS_RESULTADOS'

c1 = (~df_test['ANIM_OUT_NOVO'].isin(outros_anim_nCats)) & (df_test['OUT_ANIM'].notnull())
df_test.loc[c1, 'ANIM_OUT_NOVO'] = n_cat

In [254]:
# Caso AVE_SUINO seja vazio, inserir ignorado
df_train['AVE_SUINO'].fillna(9, inplace=True)
df_test['AVE_SUINO'].fillna(9, inplace=True)

In [255]:
# Converter AVE_SUINO em string
c1 = (df_train['AVE_SUINO'] == 1)
df_train.loc[c1, 'AVE_SUINO_NOVO'] = 'CONVIVIO_AVE_SUINO'

c2 = (df_train['AVE_SUINO'] == 2)
df_train.loc[c2, 'AVE_SUINO_NOVO'] = 'SEM_CONVIVIO_ANIMAL'

c3 = (df_train['AVE_SUINO'] == 3) & (df_train['ANIM_OUT_NOVO'].notnull())
df_train.loc[c3, 'AVE_SUINO_NOVO'] = df_train.loc[c3, 'ANIM_OUT_NOVO']

c4 = (df_train['AVE_SUINO'] == 3) & (df_train['ANIM_OUT_NOVO'].isna())
df_train.loc[c4, 'AVE_SUINO_NOVO'] = 'CONVIVIO_NAO_DETALHADO'

c5 = (df_train['AVE_SUINO'] == 9)
df_train.loc[c5, 'AVE_SUINO_NOVO'] = 'CONVIVIO_IGNORADO'

# Visualizar nova coluna
df_train['AVE_SUINO_NOVO'].value_counts(dropna=False)

SEM_CONVIVIO_ANIMAL      879296
CONVIVIO_IGNORADO        409145
CONVIVIO_AVE_SUINO        11375
REL_CACHORRO                843
REL_GATO                    158
REL_OUTROS_RESULTADOS        77
REL_BOI                      68
REL_GALINHA                  18
Name: AVE_SUINO_NOVO, dtype: int64

In [256]:
# Converter AVE_SUINO em string
c1 = (df_test['AVE_SUINO'] == 1)
df_test.loc[c1, 'AVE_SUINO_NOVO'] = 'CONVIVIO_AVE_SUINO'

c2 = (df_test['AVE_SUINO'] == 2)
df_test.loc[c2, 'AVE_SUINO_NOVO'] = 'SEM_CONVIVIO_ANIMAL'

c3 = (df_test['AVE_SUINO'] == 3) & (df_test['ANIM_OUT_NOVO'].notnull())
df_test.loc[c3, 'AVE_SUINO_NOVO'] = df_test.loc[c3, 'ANIM_OUT_NOVO']

c4 = (df_test['AVE_SUINO'] == 3) & (df_test['ANIM_OUT_NOVO'].isna())
df_test.loc[c4, 'AVE_SUINO_NOVO'] = 'CONVIVIO_NAO_DETALHADO'

c5 = (df_test['AVE_SUINO'] == 9)
df_test.loc[c5, 'AVE_SUINO_NOVO'] = 'CONVIVIO_IGNORADO'

# Visualizar nova coluna
df_test['AVE_SUINO_NOVO'].value_counts(dropna=False)

SEM_CONVIVIO_ANIMAL    586685
CONVIVIO_IGNORADO      272295
CONVIVIO_AVE_SUINO       7558
REL_CACHORRO              561
REL_GATO                  106
REL_GALINHA                63
REL_BOI                    53
Name: AVE_SUINO_NOVO, dtype: int64

In [257]:
# Derrubar colunas
cols_to_drop = ['AVE_SUINO', 'OUT_ANIM','ANIM_OUT_NOVO']

df_train.drop(columns=cols_to_drop, inplace=True)
df_test.drop(columns=cols_to_drop, inplace=True)

In [258]:
# Modificar cols novamente
cols = ['SG_UF_NOT','ID_MUNICIP','CS_ZONA','CS_SEXO','CS_RACA','NU_IDADE_N','TP_IDADE','CS_ESCOL_N','AVE_SUINO_NOVO']

In [259]:
# Checar nulos
(100*df_train[cols].isna().sum()/len(df_train)).round(2).sort_values(ascending=False)

CS_ESCOL_N        34.98
CS_ZONA           10.36
SG_UF_NOT          0.00
ID_MUNICIP         0.00
CS_SEXO            0.00
CS_RACA            0.00
NU_IDADE_N         0.00
TP_IDADE           0.00
AVE_SUINO_NOVO     0.00
dtype: float64

### **Listar variáveis de acordo a natureza**

In [260]:
# Listar todas as colunas que são categóricas por definição e que podem estar codificadas como numéricas
cols_cat = ['SG_UF_NOT', 'ID_MUNICIP', 'CS_ZONA', 'SURTO_SG',                                              # LOCALIDADE
            'CS_SEXO', 'CS_GESTANT', 'CS_RACA', 'CS_ESCOL_N', 'HISTO_VGM','AVE_SUINO_NOVO',                # INFO PACIENTE
            'FEBRE', 'TOSSE', 'GARGANTA', 'DISPNEIA', 'DESC_RESP', 'SATURACAO', 'DIARREIA', 'VOMITO',      # SINTOMA
            'OUTRO_SIN', 'OUTRO_DES', 'DOR_ABD', 'FADIGA', 'PERD_OLFT', 'PERD_PALA',                       # SINTOMA
            'FATOR_RISC', 'PUERPERA', 'CARDIOPATI', 'HEMATOLOGI', 'SIND_DOWN', 'HEPATICA', 'ASMA',         # FATOR DE RISCO
            'DIABETES', 'NEUROLOGIC', 'PNEUMOPATI', 'IMUNODEPRE', 'RENAL', 'OBESIDADE',                    # FATOR DE RISCO 
            'MORB_DESC','OUT_MORBI',                                                                       # FATOR DE RISCO
            'RAIOX_RES', 'RAIOX_OUT_NOVO', 'TOMO_RES', 'TOMO_OUT_NOVO',                                    # EXAME
            'RAIOX_OUT', 'TOMO_OUT', 'OUT_AMOST',                                                          # OUTRO ORIGINAL
            'VACINA', 'ANTIVIRAL', 'TP_ANTIVIR',                                                           # GRIPE / ANTI VIRAL
            'VACINA_COV',                                                                                  # VACINA COVID
            'HOSPITAL', 'UTI', 'SUPORT_VEN', 'NOSOCOMIAL',                                                 # HOSPITALIZAÇÃO
            'AMOSTRA', 'TP_AMOSTRA', 'AMOSTRA_OUT_NOVO',                                                   # COLETA AMOSTRA
            'MAE_VAC', 'M_AMAMENTA']                                                                       # MAE RELACIONADO

# Listar colunas numéricas
cols_num = ['SEM_NOT','SEM_PRI',                      # INFECCAO
            'NU_IDADE_N','TP_IDADE',                  # INFO PACIENTE
            'DOSE_COV_N',                             # QTD DOSES TOMADAS
            'ACUMULADO_SINTOMAS',                     # ACUMULADO SINTOMAS
            'ACUMULADO_SAUDE',                        # ACUMULADO FATOR DE RISCO
            'ACUMULADO_SAUDE_MAIS_SINTOMA',           # ACUMULADO SINTOMAS + FATOR DE RISCO
            'ACUMULADO_SAUDE_SINTOMA_VACCOV']         # ACUMULADO SINTOMAS + FATOR DE RISCO + DOSES VACINA
# Listar colunas binárias por definição
cols_bin = ['DOSE_1_COV','DOSE_2_COV', 'DOSE_REF']    # VACINA COVID

### Codificar corretamente variáveis categóricas

In [261]:
# Especificar dicionário para casos especificos
de_para_cat = {'CS_ZONA': {1:'URBANA', 2:'RURAL', 3:'PERIURBANA',9:'URBANA'},
               'HISTO_VGM':{1:'SIM', 2:'NAO', 9:'IGNORADO', 0:'IGNORADO'},
               'CS_GESTANT': {1:'PRI_TRI',2:'SEG_TRI',3:'TRC_TRI',4:'IDD_GEST_IGNORADA',5:'NAO',6:'NAO_APLICA',9:'IGNORADO'},
               'CS_RACA': {1:'BRANCA',2:'PRETA',3:'AMARELA',4:'PARDA',5:'INDIGENA',9:'IGNORADO'},
               'CS_ESCOL_N': {0:'ANALFAB',1:'1_5_SERIE',2:'6_9_SERIE',3:'ENS_MEDIO',4:'ENS_SUP',5:'NAO_APLICA',9:'IGNORADO'},
               'PUERPERA': {1:'SIM', 2:'NAO', 9:'IGNORADO',99:'NAO_SE_APLICA_HOMEM'},
               'RAIOX_RES': {1:'NORMAL',2:'INFILTRADO_INTERSTICIAL',3:'CONSOLIDACAO',
                             4:'MISTO',5:'OUTRO',6:'NAO_FEITO',9:'IGNORADO'},
               'TOMO_RES': {1:'TIPICO_COV19',2:'INDETERMINADO_COV19',3:'ATIPICO_COV19',
                            4:'NEGATIVO_PNM',5:'OUTRO',6:'NAO_FEITO',9:'IGNORADO'},
               'TP_ANTIVIR': {1:'OSELTAMIVIR',2:'ZANAMIVIR',3:'OUTRO'},
               'SUPORT_VEN': {1:'SIM_INVASIVO',2:'SIM_NAO_INVASIVO',3:'NAO',9:'IGNORADO'},
               'TP_AMOSTRA': {1:'SECREC_NASO_OROFARINGE',2:'LAVADO_BROCO_ALVEOLAR',3:'TECIDO_POST_MORTEM',
                              4:'OUTRA',5:'LCR',9:'IGNORADO'},
               'MAE_VAC': {1:'SIM', 2:'NAO', 9:'IGNORADO',99:'NAO_SE_APLICA_HOMEM'},
               'M_AMAMENTA': {1:'SIM', 2:'NAO', 9:'IGNORADO',99:'NAO_SE_APLICA_HOMEM'}}

# Loop para modificar
for c, d in de_para_cat.items():
    df_train[c].replace(d, inplace=True)
    df_test[c].replace(d, inplace=True)

In [262]:
# Especificar lista de colunas que possuem seus valores numéricos equivalentes as mesmas strings
cols_cat_gen = ['FEBRE', 'TOSSE', 'GARGANTA', 'DISPNEIA', 'DESC_RESP', 'SATURACAO', 'DIARREIA', 'VOMITO',
                'OUTRO_SIN', 'DOR_ABD', 'FADIGA', 'PERD_OLFT', 'PERD_PALA', 'FATOR_RISC', 'CARDIOPATI', 
                'HEMATOLOGI', 'SIND_DOWN', 'HEPATICA', 'ASMA', 'DIABETES', 'NEUROLOGIC', 'PNEUMOPATI', 
                'IMUNODEPRE', 'RENAL', 'OBESIDADE', 'OUT_MORBI', 
                 'SURTO_SG','VACINA','ANTIVIRAL','VACINA_COV','HOSPITAL','UTI','NOSOCOMIAL','AMOSTRA']
# Loop para modificar
for c in cols_cat_gen:
    df_train[c].replace({1:'SIM', 2:'NAO', 9:'IGNORADO'}, inplace=True)
    df_test[c].replace({1:'SIM', 2:'NAO', 9:'IGNORADO'}, inplace=True)

In [263]:
# Validação das colunas categoricas corrigida
for c in cols_cat_gen + list(de_para_cat.keys()):
    print(f'##################################### {c}')
    print(df_test[c].unique())

##################################### FEBRE
['SIM' nan 'NAO' 'IGNORADO']
##################################### TOSSE
['SIM' nan 'NAO' 'IGNORADO']
##################################### GARGANTA
['NAO' nan 'SIM' 'IGNORADO']
##################################### DISPNEIA
['NAO' nan 'SIM' 'IGNORADO']
##################################### DESC_RESP
['SIM' nan 'NAO' 'IGNORADO']
##################################### SATURACAO
['NAO' nan 'SIM' 'IGNORADO']
##################################### DIARREIA
['NAO' nan 'IGNORADO' 'SIM']
##################################### VOMITO
['NAO' nan 'SIM' 'IGNORADO']
##################################### OUTRO_SIN
['NAO' nan 'SIM' 'IGNORADO']
##################################### DOR_ABD
['NAO' nan 'SIM' 'IGNORADO']
##################################### FADIGA
['NAO' nan 'SIM' 'IGNORADO']
##################################### PERD_OLFT
['NAO' nan 'IGNORADO' 'SIM']
##################################### PERD_PALA
['NAO' nan 'IGNORADO' 'SIM']
##################

### Aplicar + criação de variáveis

In [264]:
# Quando a notificao foi na semana dos primeiros sintomas
df_train['SEM_PRI_NOT_IGUAIS'] = (df_train['SEM_PRI'] == df_train['SEM_NOT']).astype(int)
df_test['SEM_PRI_NOT_IGUAIS'] = (df_test['SEM_PRI'] == df_test['SEM_NOT']).astype(int)

# Calcular diff absoluta
df_train['SEM_PRI_NOT_DIFF_ABS'] = (df_train['SEM_PRI'] - df_train['SEM_NOT']).abs()
df_test['SEM_PRI_NOT_DIFF_ABS'] = (df_test['SEM_PRI'] - df_test['SEM_NOT']).abs()

In [265]:
# Apontar realização de Raio X + TC e Ambos
df_train['FEZ_RAIOX'] = df_train['RAIOX_RES'].isin(['NORMAL','INFILTRADO_INTERSTICIAL','CONSOLIDACAO',
                                                    'MISTO','OUTRO']).astype(int)
df_train['FEZ_TOMOG'] = df_train['TOMO_RES'].isin(['TIPICO_COV19','INDETERMINADO_COV19','ATIPICO_COV19',
                                                   'NEGATIVO_PNM','OUTRO']).astype(int)
df_train['FEZ_RAIOX_TOMOG'] = df_train['FEZ_RAIOX'] + df_train['FEZ_TOMOG']

# Apontar realização de Raio X + TC e Ambos
df_test['FEZ_RAIOX'] = df_test['RAIOX_RES'].isin(['NORMAL','INFILTRADO_INTERSTICIAL','CONSOLIDACAO',
                                                    'MISTO','OUTRO']).astype(int)
df_test['FEZ_TOMOG'] = df_test['TOMO_RES'].isin(['TIPICO_COV19','INDETERMINADO_COV19','ATIPICO_COV19',
                                                   'NEGATIVO_PNM','OUTRO']).astype(int)
df_test['FEZ_RAIOX_TOMOG'] = df_test['FEZ_RAIOX'] + df_test['FEZ_TOMOG']

In [266]:
# Apontar TOMOG que apresentaram covid
df_train['TOMOG_COV19'] = ((df_train['TOMO_RES'] == 'TIPICO_COV19')|(df_train['TOMO_OUT_NOVO'] == 'REL_COVID')).astype(int)
df_test['TOMOG_COV19'] = ((df_test['TOMO_RES'] == 'TIPICO_COV19')|(df_test['TOMO_OUT_NOVO'] == 'REL_COVID')).astype(int)

In [267]:
# Aplicar a unidade de fato em TP para diferenciar melhor os valores
df_train.loc[df_train['TP_IDADE'] == 1, 'TP_IDADE'] = 1
df_train.loc[df_train['TP_IDADE'] == 2, 'TP_IDADE'] = 30
df_train.loc[df_train['TP_IDADE'] == 3, 'TP_IDADE'] = 365

df_test.loc[df_test['TP_IDADE'] == 1, 'TP_IDADE'] = 1
df_test.loc[df_test['TP_IDADE'] == 2, 'TP_IDADE'] = 30
df_test.loc[df_test['TP_IDADE'] == 3, 'TP_IDADE'] = 365

### Listar novamente todas as variáveis por natureza II

Adicionar as criadas anteriormente.

In [268]:
# Checar variáveis faltantes na listagem
[c for c in df_train.columns if c not in cols_cat + cols_num + cols_bin and c != 'CLASSI_FIN']

['SEM_PRI_NOT_IGUAIS',
 'SEM_PRI_NOT_DIFF_ABS',
 'FEZ_RAIOX',
 'FEZ_TOMOG',
 'FEZ_RAIOX_TOMOG',
 'TOMOG_COV19']

In [269]:
# Adicionar de acordo a natureza
cols_num += ['SEM_PRI_NOT_IGUAIS','SEM_PRI_NOT_DIFF_ABS','FEZ_RAIOX_TOMOG']
cols_bin += ['FEZ_RAIOX','FEZ_TOMOG','TOMOG_COV19']

In [270]:
# Checar variáveis faltantes na listagem
[c for c in df_train.columns if c not in cols_cat + cols_num + cols_bin and c != 'CLASSI_FIN']

[]

# Divisão de treino e validação

In [271]:
# Separar dados em X e Y
X = df_train[cols_num + cols_bin + cols_cat].copy()
Y = df_train['CLASSI_FIN'].copy()

In [272]:
# Separar
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.20, random_state=1206, stratify=Y)

In [273]:
# Criar teste seguindo a mesma ordem das colunas
x_test = df_test[cols_num + cols_bin + cols_cat].copy()

In [274]:
# Liberar espaço deletando dataframes antigos
del X, Y, df_test, df_train

In [275]:
# Checar distribuição de Y
y_train.value_counts(normalize=True)

5    0.666859
4    0.292403
2    0.024164
1    0.012012
3    0.004562
Name: CLASSI_FIN, dtype: float64

In [276]:
# Checar distribuição de Y
y_valid.value_counts(normalize=True)

5    0.666859
4    0.292403
2    0.024163
1    0.012014
3    0.004562
Name: CLASSI_FIN, dtype: float64

# Imputação de dados

Estratégia:
- Categóricas: Imputar string "INFO_FALTANTE"
- Numéricas e Binárias: Imputar a moda

In [277]:
# Aplicar estratégia para categoricas
for c in cols_cat:
    x_train.loc[x_train[c].isna(), c] = 'INFO_FALTANTE'
    x_valid.loc[x_valid[c].isna(), c] = 'INFO_FALTANTE'
    x_test.loc[x_test[c].isna(), c] = 'INFO_FALTANTE'

In [278]:
# Aplicar estratégia para numéricas
for c in cols_num + cols_bin:
    # Calcular a moda no treino
    valor_moda = x_train[c].dropna().mode()[0]
    print(f'> {c} possui moda de {valor_moda}')
    # Imputar
    x_train.loc[x_train[c].isna(), c] = valor_moda
    x_valid.loc[x_valid[c].isna(), c] = valor_moda
    x_test.loc[x_test[c].isna(), c] = valor_moda

> SEM_NOT possui moda de 12
> SEM_PRI possui moda de 9
> NU_IDADE_N possui moda de 0
> TP_IDADE possui moda de 365
> DOSE_COV_N possui moda de 0
> ACUMULADO_SINTOMAS possui moda de 4
> ACUMULADO_SAUDE possui moda de 0
> ACUMULADO_SAUDE_MAIS_SINTOMA possui moda de 5
> ACUMULADO_SAUDE_SINTOMA_VACCOV possui moda de 5
> SEM_PRI_NOT_IGUAIS possui moda de 0
> SEM_PRI_NOT_DIFF_ABS possui moda de 1
> FEZ_RAIOX_TOMOG possui moda de 1
> DOSE_1_COV possui moda de 0
> DOSE_2_COV possui moda de 0
> DOSE_REF possui moda de 0
> FEZ_RAIOX possui moda de 0
> FEZ_TOMOG possui moda de 0
> TOMOG_COV19 possui moda de 0


# Modelagem

**Preparar dados**

In [279]:
# Criar conjuntos de treino Y binários
y1_train = (y_train == 1).astype(int)
y2_train = (y_train == 2).astype(int)
y3_train = (y_train == 3).astype(int)
y4_train = (y_train == 4).astype(int)
y5_train = (y_train == 5).astype(int)

In [280]:
# Criar conjuntos de validação Y binários
y1_valid = (y_valid == 1).astype(int)
y2_valid = (y_valid == 2).astype(int)
y3_valid = (y_valid == 3).astype(int)
y4_valid = (y_valid == 4).astype(int)
y5_valid = (y_valid == 5).astype(int)

In [281]:
# Converter para categorical type
for c in cols_cat:
    x_train[c] = x_train[c].astype('category')
    x_valid[c] = pd.Categorical(x_valid[c], x_train[c].astype('category').cat.categories) 
    x_test[c] = pd.Categorical(x_test[c], x_train[c].astype('category').cat.categories)

**Validação de Modelos já estabelecidos**

[Referência](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.integration.lightgbm.LightGBMTuner.html)

[Parâmetros](https://lightgbm.readthedocs.io/en/latest/Parameters.html)

In [282]:
# Listar nomes das variáveis
feats_all = cols_num + cols_bin + cols_cat
feats_cat = cols_cat

In [283]:
# Converter X e Y para formato de dataset do lightgbm - treino
ds1_train = lgb.Dataset(data=x_train, label=y1_train, feature_name=feats_all, categorical_feature=feats_cat)
ds2_train = lgb.Dataset(data=x_train, label=y2_train, feature_name=feats_all, categorical_feature=feats_cat)
ds3_train = lgb.Dataset(data=x_train, label=y3_train, feature_name=feats_all, categorical_feature=feats_cat)
ds4_train = lgb.Dataset(data=x_train, label=y4_train, feature_name=feats_all, categorical_feature=feats_cat)
ds5_train = lgb.Dataset(data=x_train, label=y5_train, feature_name=feats_all, categorical_feature=feats_cat)

# Converter X e Y para formato de dataset do lightgbm - validação
ds1_valid = lgb.Dataset(data=x_valid, label=y1_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds2_valid = lgb.Dataset(data=x_valid, label=y2_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds3_valid = lgb.Dataset(data=x_valid, label=y3_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds4_valid = lgb.Dataset(data=x_valid, label=y4_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds5_valid = lgb.Dataset(data=x_valid, label=y5_valid, feature_name=feats_all, categorical_feature=feats_cat)

In [284]:
# Criar versões unificadas para novo treinamento
x_train_valid = pd.concat([x_train, x_valid])
y1_train_valid = pd.concat([y1_train, y1_valid])
y2_train_valid = pd.concat([y2_train, y2_valid])
y3_train_valid = pd.concat([y3_train, y3_valid])
y4_train_valid = pd.concat([y4_train, y4_valid])
y5_train_valid = pd.concat([y5_train, y5_valid])

# Converter X e Y para formato de dataset do lightgbm - validação
ds1_train_valid = lgb.Dataset(data=x_train_valid, label=y1_train_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds2_train_valid = lgb.Dataset(data=x_train_valid, label=y2_train_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds3_train_valid = lgb.Dataset(data=x_train_valid, label=y3_train_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds4_train_valid = lgb.Dataset(data=x_train_valid, label=y4_train_valid, feature_name=feats_all, categorical_feature=feats_cat)
ds5_train_valid = lgb.Dataset(data=x_train_valid, label=y5_train_valid, feature_name=feats_all, categorical_feature=feats_cat)

In [285]:
# Lidar com excesso de warnings de usuário não solucionáveis do LGBM
warnings.filterwarnings('ignore')

**Montar modelos para avaliar em prompt a validação**

In [286]:
# Definir número de rounds
nbr = 2500

In [287]:
# Especificar hiper parâmetros finais
hp5 = {'objective':'binary',
       'is_unbalance': True,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 224,
       'feature_fraction': 0.85,
       'early_stopping_round': 25,
       "lambda_l1":8.526,
       "lambda_l2":0.011,
       'seed': 1206,
       'verbosity': -1,
       'num_threads':4}
hp4 = {'objective':'binary',
       'is_unbalance': True,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       'early_stopping_round': 25,
       "lambda_l1":8.526,
       "lambda_l2":0.011,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp3 = {'objective':'binary',
       'is_unbalance': True,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       'early_stopping_round': 25,
       "lambda_l1":8.526,
       "lambda_l2":0.011,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp2 = {'objective':'binary',
       'is_unbalance': True,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       'early_stopping_round': 25,
       "lambda_l1":8.526,
       "lambda_l2":0.011,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp1 = {'objective':'binary',
       'is_unbalance': True,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
       'early_stopping_round': 25,
       "lambda_l1":8.526,
       "lambda_l2":0.011,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}

In [288]:
# Treinar 100% modelo da classe 5
m5 = lgb.train(params=hp5, num_boost_round=nbr, train_set=ds5_train, valid_sets=[ds5_valid], 
               verbose_eval=True, categorical_feature=feats_cat)
print('Modelo da classe 5 finalizado')

[1]	valid_0's binary_logloss: 0.632011
Training until validation scores don't improve for 25 rounds
[2]	valid_0's binary_logloss: 0.627744
[3]	valid_0's binary_logloss: 0.623591
[4]	valid_0's binary_logloss: 0.619528
[5]	valid_0's binary_logloss: 0.615586
[6]	valid_0's binary_logloss: 0.611977
[7]	valid_0's binary_logloss: 0.608229
[8]	valid_0's binary_logloss: 0.604568
[9]	valid_0's binary_logloss: 0.600985
[10]	valid_0's binary_logloss: 0.597697
[11]	valid_0's binary_logloss: 0.594293
[12]	valid_0's binary_logloss: 0.590935
[13]	valid_0's binary_logloss: 0.587663
[14]	valid_0's binary_logloss: 0.584648
[15]	valid_0's binary_logloss: 0.581552
[16]	valid_0's binary_logloss: 0.57848
[17]	valid_0's binary_logloss: 0.575577
[18]	valid_0's binary_logloss: 0.572652
[19]	valid_0's binary_logloss: 0.569788
[20]	valid_0's binary_logloss: 0.566981
[21]	valid_0's binary_logloss: 0.564225
[22]	valid_0's binary_logloss: 0.561534
[23]	valid_0's binary_logloss: 0.558895
[24]	valid_0's binary_logloss

[204]	valid_0's binary_logloss: 0.408506
[205]	valid_0's binary_logloss: 0.408284
[206]	valid_0's binary_logloss: 0.408067
[207]	valid_0's binary_logloss: 0.407872
[208]	valid_0's binary_logloss: 0.407701
[209]	valid_0's binary_logloss: 0.407469
[210]	valid_0's binary_logloss: 0.407243
[211]	valid_0's binary_logloss: 0.40705
[212]	valid_0's binary_logloss: 0.40683
[213]	valid_0's binary_logloss: 0.406633
[214]	valid_0's binary_logloss: 0.406408
[215]	valid_0's binary_logloss: 0.406269
[216]	valid_0's binary_logloss: 0.406076
[217]	valid_0's binary_logloss: 0.405863
[218]	valid_0's binary_logloss: 0.405675
[219]	valid_0's binary_logloss: 0.405465
[220]	valid_0's binary_logloss: 0.405299
[221]	valid_0's binary_logloss: 0.405116
[222]	valid_0's binary_logloss: 0.404963
[223]	valid_0's binary_logloss: 0.404764
[224]	valid_0's binary_logloss: 0.404585
[225]	valid_0's binary_logloss: 0.404398
[226]	valid_0's binary_logloss: 0.404275
[227]	valid_0's binary_logloss: 0.404106
[228]	valid_0's bi

[605]	valid_0's binary_logloss: 0.376733
[606]	valid_0's binary_logloss: 0.376705
[607]	valid_0's binary_logloss: 0.37667
[608]	valid_0's binary_logloss: 0.376635
[609]	valid_0's binary_logloss: 0.376608
[610]	valid_0's binary_logloss: 0.376579
[611]	valid_0's binary_logloss: 0.376536
[612]	valid_0's binary_logloss: 0.376484
[613]	valid_0's binary_logloss: 0.376458
[614]	valid_0's binary_logloss: 0.376433
[615]	valid_0's binary_logloss: 0.376405
[616]	valid_0's binary_logloss: 0.376375
[617]	valid_0's binary_logloss: 0.376341
[618]	valid_0's binary_logloss: 0.376297
[619]	valid_0's binary_logloss: 0.376269
[620]	valid_0's binary_logloss: 0.376235
[621]	valid_0's binary_logloss: 0.376211
[622]	valid_0's binary_logloss: 0.376188
[623]	valid_0's binary_logloss: 0.37617
[624]	valid_0's binary_logloss: 0.376148
[625]	valid_0's binary_logloss: 0.376096
[626]	valid_0's binary_logloss: 0.37607
[627]	valid_0's binary_logloss: 0.376041
[628]	valid_0's binary_logloss: 0.37601
[629]	valid_0's bina

[807]	valid_0's binary_logloss: 0.370693
[808]	valid_0's binary_logloss: 0.37067
[809]	valid_0's binary_logloss: 0.370655
[810]	valid_0's binary_logloss: 0.370615
[811]	valid_0's binary_logloss: 0.370592
[812]	valid_0's binary_logloss: 0.370575
[813]	valid_0's binary_logloss: 0.370558
[814]	valid_0's binary_logloss: 0.370545
[815]	valid_0's binary_logloss: 0.37053
[816]	valid_0's binary_logloss: 0.370478
[817]	valid_0's binary_logloss: 0.370446
[818]	valid_0's binary_logloss: 0.370414
[819]	valid_0's binary_logloss: 0.370393
[820]	valid_0's binary_logloss: 0.370363
[821]	valid_0's binary_logloss: 0.370354
[822]	valid_0's binary_logloss: 0.370311
[823]	valid_0's binary_logloss: 0.370285
[824]	valid_0's binary_logloss: 0.370254
[825]	valid_0's binary_logloss: 0.370241
[826]	valid_0's binary_logloss: 0.370215
[827]	valid_0's binary_logloss: 0.370175
[828]	valid_0's binary_logloss: 0.370157
[829]	valid_0's binary_logloss: 0.370141
[830]	valid_0's binary_logloss: 0.370115
[831]	valid_0's bi

[1008]	valid_0's binary_logloss: 0.366576
[1009]	valid_0's binary_logloss: 0.366572
[1010]	valid_0's binary_logloss: 0.366548
[1011]	valid_0's binary_logloss: 0.366541
[1012]	valid_0's binary_logloss: 0.366522
[1013]	valid_0's binary_logloss: 0.366505
[1014]	valid_0's binary_logloss: 0.366493
[1015]	valid_0's binary_logloss: 0.366483
[1016]	valid_0's binary_logloss: 0.366478
[1017]	valid_0's binary_logloss: 0.366448
[1018]	valid_0's binary_logloss: 0.366439
[1019]	valid_0's binary_logloss: 0.366416
[1020]	valid_0's binary_logloss: 0.366401
[1021]	valid_0's binary_logloss: 0.366397
[1022]	valid_0's binary_logloss: 0.366365
[1023]	valid_0's binary_logloss: 0.366338
[1024]	valid_0's binary_logloss: 0.366327
[1025]	valid_0's binary_logloss: 0.366299
[1026]	valid_0's binary_logloss: 0.366279
[1027]	valid_0's binary_logloss: 0.366271
[1028]	valid_0's binary_logloss: 0.36626
[1029]	valid_0's binary_logloss: 0.366251
[1030]	valid_0's binary_logloss: 0.366239
[1031]	valid_0's binary_logloss: 0.

[1204]	valid_0's binary_logloss: 0.363978
[1205]	valid_0's binary_logloss: 0.363967
[1206]	valid_0's binary_logloss: 0.363955
[1207]	valid_0's binary_logloss: 0.363945
[1208]	valid_0's binary_logloss: 0.363943
[1209]	valid_0's binary_logloss: 0.363918
[1210]	valid_0's binary_logloss: 0.363895
[1211]	valid_0's binary_logloss: 0.363877
[1212]	valid_0's binary_logloss: 0.363861
[1213]	valid_0's binary_logloss: 0.363853
[1214]	valid_0's binary_logloss: 0.363852
[1215]	valid_0's binary_logloss: 0.363848
[1216]	valid_0's binary_logloss: 0.36384
[1217]	valid_0's binary_logloss: 0.363832
[1218]	valid_0's binary_logloss: 0.363822
[1219]	valid_0's binary_logloss: 0.363806
[1220]	valid_0's binary_logloss: 0.363785
[1221]	valid_0's binary_logloss: 0.363775
[1222]	valid_0's binary_logloss: 0.363763
[1223]	valid_0's binary_logloss: 0.363753
[1224]	valid_0's binary_logloss: 0.363733
[1225]	valid_0's binary_logloss: 0.363708
[1226]	valid_0's binary_logloss: 0.363692
[1227]	valid_0's binary_logloss: 0.

[1401]	valid_0's binary_logloss: 0.361993
[1402]	valid_0's binary_logloss: 0.361983
[1403]	valid_0's binary_logloss: 0.361974
[1404]	valid_0's binary_logloss: 0.36197
[1405]	valid_0's binary_logloss: 0.361958
[1406]	valid_0's binary_logloss: 0.361945
[1407]	valid_0's binary_logloss: 0.361936
[1408]	valid_0's binary_logloss: 0.361925
[1409]	valid_0's binary_logloss: 0.361922
[1410]	valid_0's binary_logloss: 0.361917
[1411]	valid_0's binary_logloss: 0.361915
[1412]	valid_0's binary_logloss: 0.36191
[1413]	valid_0's binary_logloss: 0.3619
[1414]	valid_0's binary_logloss: 0.361893
[1415]	valid_0's binary_logloss: 0.36189
[1416]	valid_0's binary_logloss: 0.361875
[1417]	valid_0's binary_logloss: 0.36186
[1418]	valid_0's binary_logloss: 0.361855
[1419]	valid_0's binary_logloss: 0.361837
[1420]	valid_0's binary_logloss: 0.361831
[1421]	valid_0's binary_logloss: 0.361821
[1422]	valid_0's binary_logloss: 0.361813
[1423]	valid_0's binary_logloss: 0.361808
[1424]	valid_0's binary_logloss: 0.36179

[1597]	valid_0's binary_logloss: 0.360794
[1598]	valid_0's binary_logloss: 0.360786
[1599]	valid_0's binary_logloss: 0.360785
[1600]	valid_0's binary_logloss: 0.360779
[1601]	valid_0's binary_logloss: 0.360776
[1602]	valid_0's binary_logloss: 0.360764
[1603]	valid_0's binary_logloss: 0.360762
[1604]	valid_0's binary_logloss: 0.360755
[1605]	valid_0's binary_logloss: 0.360757
[1606]	valid_0's binary_logloss: 0.360753
[1607]	valid_0's binary_logloss: 0.360733
[1608]	valid_0's binary_logloss: 0.360732
[1609]	valid_0's binary_logloss: 0.360715
[1610]	valid_0's binary_logloss: 0.360711
[1611]	valid_0's binary_logloss: 0.360709
[1612]	valid_0's binary_logloss: 0.360702
[1613]	valid_0's binary_logloss: 0.360692
[1614]	valid_0's binary_logloss: 0.360691
[1615]	valid_0's binary_logloss: 0.360685
[1616]	valid_0's binary_logloss: 0.360683
[1617]	valid_0's binary_logloss: 0.360667
[1618]	valid_0's binary_logloss: 0.360659
[1619]	valid_0's binary_logloss: 0.360655
[1620]	valid_0's binary_logloss: 0

[1989]	valid_0's binary_logloss: 0.358949
[1990]	valid_0's binary_logloss: 0.358945
[1991]	valid_0's binary_logloss: 0.358945
[1992]	valid_0's binary_logloss: 0.358943
[1993]	valid_0's binary_logloss: 0.358943
[1994]	valid_0's binary_logloss: 0.358941
[1995]	valid_0's binary_logloss: 0.358938
[1996]	valid_0's binary_logloss: 0.358932
[1997]	valid_0's binary_logloss: 0.358931
[1998]	valid_0's binary_logloss: 0.358928
[1999]	valid_0's binary_logloss: 0.358927
[2000]	valid_0's binary_logloss: 0.358922
[2001]	valid_0's binary_logloss: 0.358917
[2002]	valid_0's binary_logloss: 0.358915
[2003]	valid_0's binary_logloss: 0.358896
[2004]	valid_0's binary_logloss: 0.358894
[2005]	valid_0's binary_logloss: 0.358885
[2006]	valid_0's binary_logloss: 0.358879
[2007]	valid_0's binary_logloss: 0.358877
[2008]	valid_0's binary_logloss: 0.358877
[2009]	valid_0's binary_logloss: 0.358873
[2010]	valid_0's binary_logloss: 0.358864
[2011]	valid_0's binary_logloss: 0.358858
[2012]	valid_0's binary_logloss: 0

[2186]	valid_0's binary_logloss: 0.358352
[2187]	valid_0's binary_logloss: 0.358345
[2188]	valid_0's binary_logloss: 0.358345
[2189]	valid_0's binary_logloss: 0.358338
[2190]	valid_0's binary_logloss: 0.35833
[2191]	valid_0's binary_logloss: 0.35833
[2192]	valid_0's binary_logloss: 0.358331
[2193]	valid_0's binary_logloss: 0.35833
[2194]	valid_0's binary_logloss: 0.358325
[2195]	valid_0's binary_logloss: 0.35832
[2196]	valid_0's binary_logloss: 0.358315
[2197]	valid_0's binary_logloss: 0.358311
[2198]	valid_0's binary_logloss: 0.358307
[2199]	valid_0's binary_logloss: 0.3583
[2200]	valid_0's binary_logloss: 0.358297
[2201]	valid_0's binary_logloss: 0.358292
[2202]	valid_0's binary_logloss: 0.358283
[2203]	valid_0's binary_logloss: 0.358284
[2204]	valid_0's binary_logloss: 0.358285
[2205]	valid_0's binary_logloss: 0.35828
[2206]	valid_0's binary_logloss: 0.358275
[2207]	valid_0's binary_logloss: 0.358266
[2208]	valid_0's binary_logloss: 0.358264
[2209]	valid_0's binary_logloss: 0.358266

[2383]	valid_0's binary_logloss: 0.357777
[2384]	valid_0's binary_logloss: 0.357769
[2385]	valid_0's binary_logloss: 0.357769
[2386]	valid_0's binary_logloss: 0.357768
[2387]	valid_0's binary_logloss: 0.357764
[2388]	valid_0's binary_logloss: 0.357766
[2389]	valid_0's binary_logloss: 0.357764
[2390]	valid_0's binary_logloss: 0.357764
[2391]	valid_0's binary_logloss: 0.357764
[2392]	valid_0's binary_logloss: 0.357762
[2393]	valid_0's binary_logloss: 0.357761
[2394]	valid_0's binary_logloss: 0.357757
[2395]	valid_0's binary_logloss: 0.357754
[2396]	valid_0's binary_logloss: 0.357754
[2397]	valid_0's binary_logloss: 0.357754
[2398]	valid_0's binary_logloss: 0.35775
[2399]	valid_0's binary_logloss: 0.357745
[2400]	valid_0's binary_logloss: 0.357743
[2401]	valid_0's binary_logloss: 0.35774
[2402]	valid_0's binary_logloss: 0.357743
[2403]	valid_0's binary_logloss: 0.357744
[2404]	valid_0's binary_logloss: 0.357741
[2405]	valid_0's binary_logloss: 0.357739
[2406]	valid_0's binary_logloss: 0.3

In [289]:
# Treinar 100% modelo da classe 4
m4 = lgb.train(params=hp4, num_boost_round=nbr, train_set=ds4_train, valid_sets=[ds4_valid], 
               verbose_eval=True, categorical_feature=feats_cat)
print('Modelo da classe 4 finalizado')

[1]	valid_0's binary_logloss: 0.600664
Training until validation scores don't improve for 25 rounds
[2]	valid_0's binary_logloss: 0.597133
[3]	valid_0's binary_logloss: 0.593722
[4]	valid_0's binary_logloss: 0.590392
[5]	valid_0's binary_logloss: 0.587165
[6]	valid_0's binary_logloss: 0.584307
[7]	valid_0's binary_logloss: 0.581266
[8]	valid_0's binary_logloss: 0.578323
[9]	valid_0's binary_logloss: 0.575455
[10]	valid_0's binary_logloss: 0.572909
[11]	valid_0's binary_logloss: 0.570193
[12]	valid_0's binary_logloss: 0.567523
[13]	valid_0's binary_logloss: 0.564936
[14]	valid_0's binary_logloss: 0.562643
[15]	valid_0's binary_logloss: 0.5602
[16]	valid_0's binary_logloss: 0.557797
[17]	valid_0's binary_logloss: 0.555526
[18]	valid_0's binary_logloss: 0.55325
[19]	valid_0's binary_logloss: 0.551029
[20]	valid_0's binary_logloss: 0.548845
[21]	valid_0's binary_logloss: 0.546743
[22]	valid_0's binary_logloss: 0.544678
[23]	valid_0's binary_logloss: 0.542672
[24]	valid_0's binary_logloss: 

[203]	valid_0's binary_logloss: 0.449155
[204]	valid_0's binary_logloss: 0.449066
[205]	valid_0's binary_logloss: 0.448935
[206]	valid_0's binary_logloss: 0.448851
[207]	valid_0's binary_logloss: 0.448747
[208]	valid_0's binary_logloss: 0.448713
[209]	valid_0's binary_logloss: 0.44862
[210]	valid_0's binary_logloss: 0.448486
[211]	valid_0's binary_logloss: 0.448386
[212]	valid_0's binary_logloss: 0.448295
[213]	valid_0's binary_logloss: 0.448203
[214]	valid_0's binary_logloss: 0.448105
[215]	valid_0's binary_logloss: 0.448069
[216]	valid_0's binary_logloss: 0.447964
[217]	valid_0's binary_logloss: 0.447875
[218]	valid_0's binary_logloss: 0.447796
[219]	valid_0's binary_logloss: 0.447712
[220]	valid_0's binary_logloss: 0.447662
[221]	valid_0's binary_logloss: 0.447558
[222]	valid_0's binary_logloss: 0.447469
[223]	valid_0's binary_logloss: 0.447389
[224]	valid_0's binary_logloss: 0.447292
[225]	valid_0's binary_logloss: 0.447206
[226]	valid_0's binary_logloss: 0.447179
[227]	valid_0's b

[404]	valid_0's binary_logloss: 0.436063
[405]	valid_0's binary_logloss: 0.435985
[406]	valid_0's binary_logloss: 0.435904
[407]	valid_0's binary_logloss: 0.435837
[408]	valid_0's binary_logloss: 0.435771
[409]	valid_0's binary_logloss: 0.435718
[410]	valid_0's binary_logloss: 0.435628
[411]	valid_0's binary_logloss: 0.43558
[412]	valid_0's binary_logloss: 0.435525
[413]	valid_0's binary_logloss: 0.435502
[414]	valid_0's binary_logloss: 0.435447
[415]	valid_0's binary_logloss: 0.435391
[416]	valid_0's binary_logloss: 0.435326
[417]	valid_0's binary_logloss: 0.435267
[418]	valid_0's binary_logloss: 0.435195
[419]	valid_0's binary_logloss: 0.435118
[420]	valid_0's binary_logloss: 0.435067
[421]	valid_0's binary_logloss: 0.435038
[422]	valid_0's binary_logloss: 0.435014
[423]	valid_0's binary_logloss: 0.434961
[424]	valid_0's binary_logloss: 0.434898
[425]	valid_0's binary_logloss: 0.434841
[426]	valid_0's binary_logloss: 0.434792
[427]	valid_0's binary_logloss: 0.434737
[428]	valid_0's b

[605]	valid_0's binary_logloss: 0.426323
[606]	valid_0's binary_logloss: 0.426274
[607]	valid_0's binary_logloss: 0.426233
[608]	valid_0's binary_logloss: 0.426202
[609]	valid_0's binary_logloss: 0.42616
[610]	valid_0's binary_logloss: 0.426128
[611]	valid_0's binary_logloss: 0.426087
[612]	valid_0's binary_logloss: 0.426033
[613]	valid_0's binary_logloss: 0.426013
[614]	valid_0's binary_logloss: 0.425963
[615]	valid_0's binary_logloss: 0.425934
[616]	valid_0's binary_logloss: 0.425911
[617]	valid_0's binary_logloss: 0.425888
[618]	valid_0's binary_logloss: 0.425842
[619]	valid_0's binary_logloss: 0.425815
[620]	valid_0's binary_logloss: 0.425776
[621]	valid_0's binary_logloss: 0.425747
[622]	valid_0's binary_logloss: 0.425711
[623]	valid_0's binary_logloss: 0.425689
[624]	valid_0's binary_logloss: 0.425668
[625]	valid_0's binary_logloss: 0.425624
[626]	valid_0's binary_logloss: 0.425606
[627]	valid_0's binary_logloss: 0.425575
[628]	valid_0's binary_logloss: 0.425535
[629]	valid_0's b

[1006]	valid_0's binary_logloss: 0.414979
[1007]	valid_0's binary_logloss: 0.414957
[1008]	valid_0's binary_logloss: 0.414925
[1009]	valid_0's binary_logloss: 0.414915
[1010]	valid_0's binary_logloss: 0.414887
[1011]	valid_0's binary_logloss: 0.414872
[1012]	valid_0's binary_logloss: 0.41486
[1013]	valid_0's binary_logloss: 0.414806
[1014]	valid_0's binary_logloss: 0.414786
[1015]	valid_0's binary_logloss: 0.414761
[1016]	valid_0's binary_logloss: 0.414723
[1017]	valid_0's binary_logloss: 0.41469
[1018]	valid_0's binary_logloss: 0.41468
[1019]	valid_0's binary_logloss: 0.414664
[1020]	valid_0's binary_logloss: 0.414646
[1021]	valid_0's binary_logloss: 0.414639
[1022]	valid_0's binary_logloss: 0.414628
[1023]	valid_0's binary_logloss: 0.414598
[1024]	valid_0's binary_logloss: 0.41458
[1025]	valid_0's binary_logloss: 0.414567
[1026]	valid_0's binary_logloss: 0.414549
[1027]	valid_0's binary_logloss: 0.414536
[1028]	valid_0's binary_logloss: 0.414491
[1029]	valid_0's binary_logloss: 0.414

[1202]	valid_0's binary_logloss: 0.411395
[1203]	valid_0's binary_logloss: 0.411373
[1204]	valid_0's binary_logloss: 0.411362
[1205]	valid_0's binary_logloss: 0.41135
[1206]	valid_0's binary_logloss: 0.411307
[1207]	valid_0's binary_logloss: 0.411263
[1208]	valid_0's binary_logloss: 0.411254
[1209]	valid_0's binary_logloss: 0.411242
[1210]	valid_0's binary_logloss: 0.411235
[1211]	valid_0's binary_logloss: 0.411227
[1212]	valid_0's binary_logloss: 0.411213
[1213]	valid_0's binary_logloss: 0.411197
[1214]	valid_0's binary_logloss: 0.411183
[1215]	valid_0's binary_logloss: 0.411147
[1216]	valid_0's binary_logloss: 0.411131
[1217]	valid_0's binary_logloss: 0.411123
[1218]	valid_0's binary_logloss: 0.411115
[1219]	valid_0's binary_logloss: 0.411101
[1220]	valid_0's binary_logloss: 0.411068
[1221]	valid_0's binary_logloss: 0.411048
[1222]	valid_0's binary_logloss: 0.411023
[1223]	valid_0's binary_logloss: 0.410989
[1224]	valid_0's binary_logloss: 0.410955
[1225]	valid_0's binary_logloss: 0.

[1399]	valid_0's binary_logloss: 0.408736
[1400]	valid_0's binary_logloss: 0.408731
[1401]	valid_0's binary_logloss: 0.408718
[1402]	valid_0's binary_logloss: 0.408704
[1403]	valid_0's binary_logloss: 0.408686
[1404]	valid_0's binary_logloss: 0.408681
[1405]	valid_0's binary_logloss: 0.408661
[1406]	valid_0's binary_logloss: 0.408657
[1407]	valid_0's binary_logloss: 0.408653
[1408]	valid_0's binary_logloss: 0.408645
[1409]	valid_0's binary_logloss: 0.408632
[1410]	valid_0's binary_logloss: 0.408621
[1411]	valid_0's binary_logloss: 0.408601
[1412]	valid_0's binary_logloss: 0.40859
[1413]	valid_0's binary_logloss: 0.408572
[1414]	valid_0's binary_logloss: 0.408559
[1415]	valid_0's binary_logloss: 0.408546
[1416]	valid_0's binary_logloss: 0.408533
[1417]	valid_0's binary_logloss: 0.408509
[1418]	valid_0's binary_logloss: 0.408505
[1419]	valid_0's binary_logloss: 0.40849
[1420]	valid_0's binary_logloss: 0.408484
[1421]	valid_0's binary_logloss: 0.408473
[1422]	valid_0's binary_logloss: 0.4

[1595]	valid_0's binary_logloss: 0.406818
[1596]	valid_0's binary_logloss: 0.406812
[1597]	valid_0's binary_logloss: 0.406792
[1598]	valid_0's binary_logloss: 0.406778
[1599]	valid_0's binary_logloss: 0.406772
[1600]	valid_0's binary_logloss: 0.406758
[1601]	valid_0's binary_logloss: 0.40675
[1602]	valid_0's binary_logloss: 0.406734
[1603]	valid_0's binary_logloss: 0.406727
[1604]	valid_0's binary_logloss: 0.406723
[1605]	valid_0's binary_logloss: 0.406722
[1606]	valid_0's binary_logloss: 0.406718
[1607]	valid_0's binary_logloss: 0.4067
[1608]	valid_0's binary_logloss: 0.40669
[1609]	valid_0's binary_logloss: 0.406678
[1610]	valid_0's binary_logloss: 0.406673
[1611]	valid_0's binary_logloss: 0.406662
[1612]	valid_0's binary_logloss: 0.406655
[1613]	valid_0's binary_logloss: 0.40665
[1614]	valid_0's binary_logloss: 0.406646
[1615]	valid_0's binary_logloss: 0.406641
[1616]	valid_0's binary_logloss: 0.406637
[1617]	valid_0's binary_logloss: 0.406625
[1618]	valid_0's binary_logloss: 0.4066

[1792]	valid_0's binary_logloss: 0.405173
[1793]	valid_0's binary_logloss: 0.405164
[1794]	valid_0's binary_logloss: 0.405161
[1795]	valid_0's binary_logloss: 0.405157
[1796]	valid_0's binary_logloss: 0.405155
[1797]	valid_0's binary_logloss: 0.405151
[1798]	valid_0's binary_logloss: 0.405139
[1799]	valid_0's binary_logloss: 0.405134
[1800]	valid_0's binary_logloss: 0.405128
[1801]	valid_0's binary_logloss: 0.405125
[1802]	valid_0's binary_logloss: 0.40512
[1803]	valid_0's binary_logloss: 0.405104
[1804]	valid_0's binary_logloss: 0.4051
[1805]	valid_0's binary_logloss: 0.405086
[1806]	valid_0's binary_logloss: 0.405082
[1807]	valid_0's binary_logloss: 0.405079
[1808]	valid_0's binary_logloss: 0.405074
[1809]	valid_0's binary_logloss: 0.405067
[1810]	valid_0's binary_logloss: 0.405063
[1811]	valid_0's binary_logloss: 0.405051
[1812]	valid_0's binary_logloss: 0.405048
[1813]	valid_0's binary_logloss: 0.40504
[1814]	valid_0's binary_logloss: 0.405036
[1815]	valid_0's binary_logloss: 0.405

[1989]	valid_0's binary_logloss: 0.403902
[1990]	valid_0's binary_logloss: 0.403892
[1991]	valid_0's binary_logloss: 0.40388
[1992]	valid_0's binary_logloss: 0.403877
[1993]	valid_0's binary_logloss: 0.403872
[1994]	valid_0's binary_logloss: 0.403863
[1995]	valid_0's binary_logloss: 0.403857
[1996]	valid_0's binary_logloss: 0.40385
[1997]	valid_0's binary_logloss: 0.403845
[1998]	valid_0's binary_logloss: 0.403841
[1999]	valid_0's binary_logloss: 0.40383
[2000]	valid_0's binary_logloss: 0.403822
[2001]	valid_0's binary_logloss: 0.40382
[2002]	valid_0's binary_logloss: 0.403813
[2003]	valid_0's binary_logloss: 0.403805
[2004]	valid_0's binary_logloss: 0.403801
[2005]	valid_0's binary_logloss: 0.403794
[2006]	valid_0's binary_logloss: 0.403792
[2007]	valid_0's binary_logloss: 0.403789
[2008]	valid_0's binary_logloss: 0.403788
[2009]	valid_0's binary_logloss: 0.403783
[2010]	valid_0's binary_logloss: 0.403777
[2011]	valid_0's binary_logloss: 0.403774
[2012]	valid_0's binary_logloss: 0.403

[2185]	valid_0's binary_logloss: 0.40283
[2186]	valid_0's binary_logloss: 0.402828
[2187]	valid_0's binary_logloss: 0.40281
[2188]	valid_0's binary_logloss: 0.402806
[2189]	valid_0's binary_logloss: 0.402793
[2190]	valid_0's binary_logloss: 0.402781
[2191]	valid_0's binary_logloss: 0.402779
[2192]	valid_0's binary_logloss: 0.402778
[2193]	valid_0's binary_logloss: 0.402778
[2194]	valid_0's binary_logloss: 0.402772
[2195]	valid_0's binary_logloss: 0.402767
[2196]	valid_0's binary_logloss: 0.402764
[2197]	valid_0's binary_logloss: 0.402762
[2198]	valid_0's binary_logloss: 0.402754
[2199]	valid_0's binary_logloss: 0.402741
[2200]	valid_0's binary_logloss: 0.402735
[2201]	valid_0's binary_logloss: 0.402733
[2202]	valid_0's binary_logloss: 0.402723
[2203]	valid_0's binary_logloss: 0.40272
[2204]	valid_0's binary_logloss: 0.402719
[2205]	valid_0's binary_logloss: 0.402719
[2206]	valid_0's binary_logloss: 0.402714
[2207]	valid_0's binary_logloss: 0.402702
[2208]	valid_0's binary_logloss: 0.40

[2381]	valid_0's binary_logloss: 0.401982
[2382]	valid_0's binary_logloss: 0.401979
[2383]	valid_0's binary_logloss: 0.40197
[2384]	valid_0's binary_logloss: 0.401963
[2385]	valid_0's binary_logloss: 0.401962
[2386]	valid_0's binary_logloss: 0.40196
[2387]	valid_0's binary_logloss: 0.401953
[2388]	valid_0's binary_logloss: 0.401952
[2389]	valid_0's binary_logloss: 0.401949
[2390]	valid_0's binary_logloss: 0.401947
[2391]	valid_0's binary_logloss: 0.401944
[2392]	valid_0's binary_logloss: 0.40194
[2393]	valid_0's binary_logloss: 0.401934
[2394]	valid_0's binary_logloss: 0.401929
[2395]	valid_0's binary_logloss: 0.401923
[2396]	valid_0's binary_logloss: 0.40192
[2397]	valid_0's binary_logloss: 0.401919
[2398]	valid_0's binary_logloss: 0.401917
[2399]	valid_0's binary_logloss: 0.401911
[2400]	valid_0's binary_logloss: 0.401907
[2401]	valid_0's binary_logloss: 0.401904
[2402]	valid_0's binary_logloss: 0.401903
[2403]	valid_0's binary_logloss: 0.401904
[2404]	valid_0's binary_logloss: 0.401

In [290]:
# Treinar 100% modelo da classe 3
m3 = lgb.train(params=hp3, num_boost_round=nbr, train_set=ds3_train, valid_sets=[ds3_valid], 
               verbose_eval=True, categorical_feature=feats_cat)
print('Modelo da classe 3 finalizado')

[1]	valid_0's binary_logloss: 0.0264888
Training until validation scores don't improve for 25 rounds
[2]	valid_0's binary_logloss: 0.0268237
[3]	valid_0's binary_logloss: 0.0273197
[4]	valid_0's binary_logloss: 0.0279168
[5]	valid_0's binary_logloss: 0.0285636
[6]	valid_0's binary_logloss: 0.029417
[7]	valid_0's binary_logloss: 0.0301252
[8]	valid_0's binary_logloss: 0.0307773
[9]	valid_0's binary_logloss: 0.0314689
[10]	valid_0's binary_logloss: 0.0322604
[11]	valid_0's binary_logloss: 0.032888
[12]	valid_0's binary_logloss: 0.0335434
[13]	valid_0's binary_logloss: 0.0341167
[14]	valid_0's binary_logloss: 0.0348912
[15]	valid_0's binary_logloss: 0.035461
[16]	valid_0's binary_logloss: 0.0360104
[17]	valid_0's binary_logloss: 0.036666
[18]	valid_0's binary_logloss: 0.0372043
[19]	valid_0's binary_logloss: 0.0377281
[20]	valid_0's binary_logloss: 0.0382782
[21]	valid_0's binary_logloss: 0.0387899
[22]	valid_0's binary_logloss: 0.0392837
[23]	valid_0's binary_logloss: 0.0397643
[24]	vali

In [291]:
# Treinar 100% modelo da classe 2
m2 = lgb.train(params=hp2, num_boost_round=nbr, train_set=ds2_train, valid_sets=[ds2_valid], 
               verbose_eval=True, categorical_feature=feats_cat)
print('Modelo da classe 2 finalizado')

[1]	valid_0's binary_logloss: 0.106742
Training until validation scores don't improve for 25 rounds
[2]	valid_0's binary_logloss: 0.102128
[3]	valid_0's binary_logloss: 0.0987775
[4]	valid_0's binary_logloss: 0.0961718
[5]	valid_0's binary_logloss: 0.0941027
[6]	valid_0's binary_logloss: 0.0925107
[7]	valid_0's binary_logloss: 0.0911198
[8]	valid_0's binary_logloss: 0.0899318
[9]	valid_0's binary_logloss: 0.0889502
[10]	valid_0's binary_logloss: 0.0881923
[11]	valid_0's binary_logloss: 0.087496
[12]	valid_0's binary_logloss: 0.08689
[13]	valid_0's binary_logloss: 0.0863897
[14]	valid_0's binary_logloss: 0.0860275
[15]	valid_0's binary_logloss: 0.0856501
[16]	valid_0's binary_logloss: 0.0853342
[17]	valid_0's binary_logloss: 0.0850899
[18]	valid_0's binary_logloss: 0.0848879
[19]	valid_0's binary_logloss: 0.0847479
[20]	valid_0's binary_logloss: 0.0846526
[21]	valid_0's binary_logloss: 0.0845822
[22]	valid_0's binary_logloss: 0.0845497
[23]	valid_0's binary_logloss: 0.0845351
[24]	valid

In [292]:
# Treinar 100% modelo da classe 1
m1 = lgb.train(params=hp1, num_boost_round=nbr, train_set=ds1_train, valid_sets=[ds1_valid], 
               verbose_eval=True, categorical_feature=feats_cat)
print('Modelo da classe 1 finalizado')

[1]	valid_0's binary_logloss: 0.0591568
Training until validation scores don't improve for 25 rounds
[2]	valid_0's binary_logloss: 0.0568595
[3]	valid_0's binary_logloss: 0.0555816
[4]	valid_0's binary_logloss: 0.0548287
[5]	valid_0's binary_logloss: 0.0543482
[6]	valid_0's binary_logloss: 0.0541915
[7]	valid_0's binary_logloss: 0.0540331
[8]	valid_0's binary_logloss: 0.0540578
[9]	valid_0's binary_logloss: 0.0541581
[10]	valid_0's binary_logloss: 0.0544385
[11]	valid_0's binary_logloss: 0.0546682
[12]	valid_0's binary_logloss: 0.0549474
[13]	valid_0's binary_logloss: 0.0552308
[14]	valid_0's binary_logloss: 0.0556381
[15]	valid_0's binary_logloss: 0.0559757
[16]	valid_0's binary_logloss: 0.0562794
[17]	valid_0's binary_logloss: 0.056782
[18]	valid_0's binary_logloss: 0.0571563
[19]	valid_0's binary_logloss: 0.057509
[20]	valid_0's binary_logloss: 0.0578782
[21]	valid_0's binary_logloss: 0.0582834
[22]	valid_0's binary_logloss: 0.0586759
[23]	valid_0's binary_logloss: 0.0590431
[24]	va

**Avaliação do F1-Score em conjunto de validação com thresholding móvel**

In [293]:
def calcular_f1_score(yt, yp):
    '''
    
    '''
    # Calcular F1-Score Macro para cada threshold
    vTo = list()
    vF1 = list()
    maxF1 = 0
    maxTo = 0
    
    # Loop em diversas possibilidades
    for threshold in np.linspace(0,1, 100):
        # Calcular
        yd = (yp >= threshold).astype(int)
        f1s = f1_score(yt, yd, average='macro')
        # Adicionar
        vTo.append(threshold)
        vF1.append(f1s)
        # Salvar valor maior caso possível
        if f1s > maxF1:
            maxF1 = f1s
            maxTo = threshold
            
    
    # Pegar a mediana para report
    mF1 = round(np.median(vF1), 3)
    # Report
    print(f'> F1 Score Macro mediano = {mF1}')
    
    # Retornar
    return maxF1, maxTo, mF1 

In [294]:
# Fazer previsões
y1_pred_valid = m1.predict(x_valid)
print(f' # Para classe 1, feito!')

 # Para classe 1, feito!


In [295]:
# Fazer previsões
y2_pred_valid = m2.predict(x_valid)
print(f' # Para classe 2, feito!')

 # Para classe 2, feito!


In [296]:
# Fazer previsões
y3_pred_valid = m3.predict(x_valid)
print(f' # Para classe 3, feito!')

 # Para classe 3, feito!


In [297]:
# Fazer previsões
y4_pred_valid = m4.predict(x_valid)
print(f' # Para classe 4, feito!')

 # Para classe 4, feito!


In [298]:
# Fazer previsões
y5_pred_valid = m5.predict(x_valid)
print(f' # Para classe 5, feito!')

 # Para classe 5, feito!


In [299]:
# Calcular F1 Score Macro para todos modelos
print(f' ############ Classe 1')
v1 = calcular_f1_score(y1_valid.values, y1_pred_valid)
print(f' ############ Classe 2')
v2 = calcular_f1_score(y2_valid.values, y2_pred_valid)
print(f' ############ Classe 3')
v3 = calcular_f1_score(y3_valid.values, y3_pred_valid)
print(f' ############ Classe 4')
v4 = calcular_f1_score(y4_valid.values, y4_pred_valid)
print(f' ############ Classe 5')
v5 = calcular_f1_score(y5_valid.values, y5_pred_valid)
print(f' ############ Média Geral')
mg = round(np.mean([v1[2], v2[2], v3[2], v4[2], v5[2]]), 5)
print(f'> F1 Score Macro = {mg}')

 ############ Classe 1
> F1 Score Macro mediano = 0.497
 ############ Classe 2
> F1 Score Macro mediano = 0.494
 ############ Classe 3
> F1 Score Macro mediano = 0.499
 ############ Classe 4
> F1 Score Macro mediano = 0.739
 ############ Classe 5
> F1 Score Macro mediano = 0.79
 ############ Média Geral
> F1 Score Macro = 0.6038


In [300]:
# Ver valores
for i, v in enumerate([v1, v2, v3, v4, v5]):
    print(f'> Classe {i+1} tem Máximo F1 Score de {v[0]} com threhsold de {v[1]}')

> Classe 1 tem Máximo F1 Score de 0.6588207906117378 com threhsold de 0.08080808080808081
> Classe 2 tem Máximo F1 Score de 0.7275155748521854 com threhsold de 0.21212121212121213
> Classe 3 tem Máximo F1 Score de 0.5614070056353214 com threhsold de 0.030303030303030304
> Classe 4 tem Máximo F1 Score de 0.7923044107993006 com threhsold de 0.5858585858585859
> Classe 5 tem Máximo F1 Score de 0.8296986843489476 com threhsold de 0.393939393939394


**Construção de Modelos Finais (treino e validação) após tentativas de otimização**

In [301]:
# Especificar hiper parâmetros finais
hp5 = {'objective':'binary',
       'is_unbalance': True,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 224,
       'feature_fraction': 0.85,
#       'early_stopping_round': 25,
       "lambda_l1":8.526,
       "lambda_l2":0.011,
       'seed': 1206,
       'verbosity': -1,
       'num_threads':4}
hp4 = {'objective':'binary',
       'is_unbalance': True,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
#       'early_stopping_round': 25,
       "lambda_l1":8.526,
       "lambda_l2":0.011,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp3 = {'objective':'binary',
       'is_unbalance': True,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
#       'early_stopping_round': 25,
       "lambda_l1":8.526,
       "lambda_l2":0.011,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp2 = {'objective':'binary',
       'is_unbalance': True,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
#       'early_stopping_round': 25,
       "lambda_l1":8.526,
       "lambda_l2":0.011,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}
hp1 = {'objective':'binary',
       'is_unbalance': True,
       'boosting_type': 'gbdt',
       'metric': 'binary_logloss', 
       'eta': 0.01,
       'num_leaves': 243,
       'feature_fraction': 0.85,
#       'early_stopping_round': 25,
       "lambda_l1":8.526,
       "lambda_l2":0.011,
       'seed': 1206, 
       'verbosity': -1,
       'num_threads':4}

In [302]:
# Definir número de rounds
nbr = 2500
print('Inicio de treinamento ...')
# Treinar 100% modelo da classe 5
m5f = lgb.train(params=hp5, num_boost_round=nbr, train_set=ds5_train_valid, verbose_eval=False, categorical_feature=feats_cat)
print('Modelo da classe 5 finalizado')
# Treinar 100% modelo da classe 4
m4f = lgb.train(params=hp4, num_boost_round=nbr, train_set=ds4_train_valid, verbose_eval=False, categorical_feature=feats_cat)
print('Modelo da classe 4 finalizado')
# Treinar 100% modelo da classe 3
m3f = lgb.train(params=hp3, num_boost_round=nbr, train_set=ds3_train_valid, verbose_eval=False, categorical_feature=feats_cat)
print('Modelo da classe 3 finalizado')
# Treinar 100% modelo da classe 2
m2f = lgb.train(params=hp2, num_boost_round=nbr, train_set=ds2_train_valid, verbose_eval=False, categorical_feature=feats_cat)
print('Modelo da classe 2 finalizado')
# Treinar 100% modelo da classe 1
m1f = lgb.train(params=hp1, num_boost_round=nbr, train_set=ds1_train_valid, verbose_eval=False, categorical_feature=feats_cat)
print('Modelo da classe 1 finalizado')

Inicio de treinamento ...
Modelo da classe 5 finalizado
Modelo da classe 4 finalizado
Modelo da classe 3 finalizado
Modelo da classe 2 finalizado
Modelo da classe 1 finalizado


# Previsão

**Probabilidade Máxima**

In [303]:
# Carregar modelos
#m1_fl = lgb.Booster(model_file='modelos/m1.txt')
#m2_fl = lgb.Booster(model_file='modelos/m2.txt')
#m3_fl = lgb.Booster(model_file='modelos/m3.txt')
#m4_fl = lgb.Booster(model_file='modelos/m4.txt')
#m5_fl = lgb.Booster(model_file='modelos/m5.txt')

In [304]:
# Fazer previsões
print('Inicio de previsao ...')
y1_pred = m1f.predict(x_test)
print('Previsão da classe 1 finalizado')
y2_pred = m2f.predict(x_test)
print('Previsão da classe 2 finalizado')
y3_pred = m3f.predict(x_test)
print('Previsão da classe 3 finalizado')
y4_pred = m4f.predict(x_test)
print('Previsão da classe 4 finalizado')
y5_pred = m5f.predict(x_test)
print('Previsão da classe 5 finalizado')

Inicio de previsao ...
Previsão da classe 1 finalizado
Previsão da classe 2 finalizado
Previsão da classe 3 finalizado
Previsão da classe 4 finalizado
Previsão da classe 5 finalizado


In [305]:
# Unificar
y_pred = list()

# Loop em cada previsão
for y1,y2,y3,y4,y5 in zip(y1_pred, y2_pred, y3_pred, y4_pred, y5_pred):
    # Listar sequencialmente as previsoes
    l = [y1,y2,y3,y4,y5]
    # Puxar maior previsão
    y_pred.append(l.index(max(l)) + 1)

In [306]:
# Salvar previsão final
df_sub_desafio['CLASSI_FIN'] = y_pred

# Salvar probabilidades
df_sub_desafio['PROBA_1'] = y1_pred
df_sub_desafio['PROBA_2'] = y2_pred
df_sub_desafio['PROBA_3'] = y3_pred
df_sub_desafio['PROBA_4'] = y4_pred
df_sub_desafio['PROBA_5'] = y5_pred

In [307]:
# Viz
df_sub_desafio.head()

Unnamed: 0,ID,CLASSI_FIN,PROBA_1,PROBA_2,PROBA_3,PROBA_4,PROBA_5
0,c42342a4-cfca-497f-8ee6-a6c76ed4bc67,4,0.003598,0.001004,0.000171,0.937172,0.092696
1,6c86b555-8bf2-4306-8f2e-db809a98e3c3,4,7e-06,0.00023,4e-06,0.978485,0.02122
2,0ac9582e-2683-474b-8316-5e9c0ea6edd5,4,0.000556,6.3e-05,0.001715,0.535169,0.454708
3,7bc6de6f-041c-4866-8778-7542c230a498,4,0.0001,9.8e-05,0.000692,0.925828,0.084859
4,8eefb2e4-626e-4f40-9b93-5146906fe07b,4,7.7e-05,4e-06,0.000172,0.640977,0.478363


**Camada Adicional: Melhor modelo(s)**

In [308]:
# Adicionar previsão do melhor modelo com threshold optimizado
df_sub_desafio_to = df_sub_desafio.copy()
df_sub_desafio_to['THRESH_OPT5'] = (y5_pred >= 0.393939393939394).astype(int)
df_sub_desafio_to['THRESH_OPT4'] = (y4_pred >= 0.5858585858585859).astype(int)

# Checar contagem de previsoes considerando TO
df_sub_desafio_to.loc[df_sub_desafio_to['THRESH_OPT5'] == 1, 'CLASSI_FIN'].value_counts()

5    536098
4     47083
1       880
3       478
2       304
Name: CLASSI_FIN, dtype: int64

In [309]:
# Aplicar TO
df_sub_desafio_to.loc[(df_sub_desafio_to['THRESH_OPT5'] == 1), 'CLASSI_FIN'] = 5

In [310]:
# Salvar
df_sub_desafio_to[['ID','CLASSI_FIN']].to_csv('ml-olympiad-ensure-healthy-lives/submissao_try_9_pm_mais_to5.csv', index=False)

In [311]:
# Salvar localmente
df_sub_desafio[['ID','CLASSI_FIN']].to_csv('ml-olympiad-ensure-healthy-lives/submissao_try_9_pm.csv', index=False)

In [312]:
# Salvar localmente
df_sub_desafio.to_csv('ml-olympiad-ensure-healthy-lives/completo_try_9_pm.csv', index=False)
df_sub_desafio_to.to_csv('ml-olympiad-ensure-healthy-lives/completo_try_9_pm_mais_to5.csv', index=False)

# Salvar modelos treinados

In [313]:
# Salvar modelos localmente
m1f.save_model('modelos/m1_v9.txt')
m2f.save_model('modelos/m2_v9.txt')
m3f.save_model('modelos/m3_v9.txt')
m4f.save_model('modelos/m4_v9.txt')
m5f.save_model('modelos/m5_v9.txt')

<lightgbm.basic.Booster at 0x20944124b20>

# Checar Feature Importance de cada modelo

In [314]:
# Criar
df_fi = pd.DataFrame()
df_fi['FEAT'] = x_train.columns
df_fi['FI_M1'] = (100*m1f.feature_importance() / m1f.feature_importance().sum()).round(2)
df_fi['FI_M2'] = (100*m2f.feature_importance() / m2f.feature_importance().sum()).round(2)
df_fi['FI_M3'] = (100*m3f.feature_importance() / m3f.feature_importance().sum()).round(2)
df_fi['FI_M4'] = (100*m4f.feature_importance() / m4f.feature_importance().sum()).round(2)
df_fi['FI_M5'] = (100*m5f.feature_importance() / m5f.feature_importance().sum()).round(2)
df_fi['FI_ORDER'] = df_fi[[c for c in df_fi.columns if 'FI_M' in c]].sum(axis=1)

In [316]:
# Salvar localmente
df_fi.to_csv('ml-olympiad-ensure-healthy-lives/feature_importance_models_9.csv', index=False)