# **Instalações e Bibliotecas**

In [None]:
%%capture

!pip install shap
!pip install optuna

In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer

# **Funções**

O arquivo 'functions_pred_cruzada' contém as funções utilizadas para selecionar e pré-processar os dados e para treinar e validar os modelos de machine learning.

In [None]:
# Download the helper module from Google Drive by file id
# (presumably functions_pred_cruzada.py — confirm).
!gdown 1uVoArwRTJmvbyFLyFniwFvWGHMfiNIWO --quiet

# NOTE(review): the star import hides which names come from the module; data_prep and
# get_labels, used in the cells below, are presumably defined there.
from functions_pred_cruzada import *

# **Dados**

In [None]:
# Dados brutos com todos os tipos de câncer
!gdown 1P19f3kA5_s4sv5Pg6D6UKwwQvZvaAnQT --quiet

df_geral = pd.read_csv('pacigeral.csv')
print(df_geral.shape)
df_geral.head(3)

Columns (35,36,73,74,75,78,89,90) have mixed types. Specify dtype option on import or set low_memory=False.


(1134043, 100)


Unnamed: 0,ESCOLARI,IDADE,SEXO,UFNASC,UFRESID,IBGE,CIDADE,CATEATEND,DTCONSULT,CLINICA,...,REC03,REC04,IBGEATEN,CIDO,DSCCIDO,HABILIT,HABIT11,HABILIT1,HABILIT2,CIDADEH
0,2,100,2,SP,SP,3550308,SAO PAULO,9,2000-06-09,3,...,,,3550308,80703,CARCINOMA ESCAMOCELULAR SOE,7,CACON com Servio de Oncologia Pedi trica,3,2,SÆo Paulo
1,1,102,1,MG,MG,3133402,ITAPAGIPE,9,2002-04-29,24,...,,,3505500,80903,CARCINOMA BASOCELULAR SOE,7,CACON com Servio de Oncologia Pedi trica,3,2,Barretos
2,2,101,2,MG,SP,3543402,RIBEIRAO PRETO,9,2000-12-03,10,...,,,3543402,80703,CARCINOMA ESCAMOCELULAR SOE,7,CACON com Servio de Oncologia Pedi trica,3,2,RibeirÆo Preto


**Tipos com maior incidência**

In [None]:
# Relative frequency of each topography group, 15 most frequent first.
df_geral['TOPOGRUP'].value_counts(normalize=True).head(15)

C44    0.229273
C50    0.136342
C61    0.103496
C34    0.047272
C53    0.046387
C18    0.037844
C42    0.037599
C16    0.036821
C20    0.028988
C73    0.023747
C77    0.018756
C15    0.018305
C67    0.017882
C32    0.015979
C64    0.015855
Name: TOPOGRUP, dtype: float64

*   C44 - Pele 22,93%
*   C50 - Mama 13,63%
*   C61 - Próstata 10,35%
*   C34 - Pulmão 4,73%
*   C53 - Colo de Útero 4,64%
*   C18 - Cólon 3,78%
*   C42 - Sistemas hematopoético e reticuloendotelial 3,76%
*   C16 - Estômago 3,68%
*   C20 - Reto 2,90%
*   C73 - Tireoide 2,37%
*   C77 - Linfonodos 1,88%
*   C15 - Esôfago 1,83%
*   C67 - Bexiga 1,79%
*   C32 - Laringe 1,60%
*   C64 - Rim 1,59%



## **Sistema Digestório**

* Boca (C00, C01, C02, C03, C04, C05, C06)

In [None]:
# Mouth: topography groups C00 through C06.
boca = [f'C{n:02d}' for n in range(7)]

# Build the site subset with the imported data_prep helper
# (presumably keeps only rows whose TOPOGRUP is in the list — confirm).
df_boca = data_prep(df_geral, boca)

# Report the subset size, then preview its first rows.
print(df_boca.shape)
df_boca.iloc[:3]

(25413, 30)


Unnamed: 0,IDADE,SEXO,IBGE,CATEATEND,DIAGPREV,TOPO,TOPOGRUP,EC,TRATHOSP,NENHUM,...,DIAGTRAT,ANODIAG,DRS,RRAS,IBGEATEN,HABILIT2,ULTIDIAG,ESCOLARI_preench,IBGE_idem_IBGEATEN,presenca_rec
0,100,2,3550308,9,2,C031,C03,IVA,B,0,...,0,2000,1,6,3550308,2,171,2.0,1,0
29783,29,1,3507803,9,1,C001,C00,II,A,0,...,0,2005,13,13,3543402,2,482,3.0,0,0
29784,28,1,3520905,9,1,C001,C00,I,A,0,...,0,2004,9,10,3525300,2,5139,3.0,0,0


In [None]:
# Add the outcome label columns through the imported get_labels helper.
# NOTE(review): this rebinds df_boca to its own transformed version, so re-running the
# cell feeds an already-labelled frame back into get_labels — confirm that is safe.
df_boca = get_labels(df_boca)
df_boca.shape

(25413, 32)

* Orofaringe (C10)

In [None]:
# Oropharynx: a single topography group.
oro = ['C10']
df_oro = data_prep(df_geral, oro)

# Report the subset size, then preview its first rows.
print(df_oro.shape)
df_oro.iloc[:3]

(5121, 30)


Unnamed: 0,IDADE,SEXO,IBGE,CATEATEND,DIAGPREV,TOPO,TOPOGRUP,EC,TRATHOSP,NENHUM,...,DIAGTRAT,ANODIAG,DRS,RRAS,IBGEATEN,HABILIT2,ULTIDIAG,ESCOLARI_preench,IBGE_idem_IBGEATEN,presenca_rec
30156,27,1,3550308,2,1,C102,C10,IVB,F,0,...,2,2015,1,6,3550308,2,2423,3.0,1,0
30159,20,2,3526902,2,2,C103,C10,IVC,F,0,...,0,2014,10,14,3526902,1,731,2.0,1,0
30160,23,2,3503901,2,1,C108,C10,I,E,0,...,0,2013,1,2,3550308,1,2357,3.0,0,0


In [None]:
# Add the outcome label columns through the imported get_labels helper.
# NOTE(review): this rebinds df_oro to its own transformed version, so re-running the
# cell feeds an already-labelled frame back into get_labels — confirm that is safe.
df_oro = get_labels(df_oro)
df_oro.shape

(5121, 32)

* Esôfago (C15)

In [None]:
# Esophagus: a single topography group.
eso = ['C15']
df_eso = data_prep(df_geral, eso)

# Report the subset size, then preview its first rows.
print(df_eso.shape)
df_eso.iloc[:3]

(14379, 30)


Unnamed: 0,IDADE,SEXO,IBGE,CATEATEND,DIAGPREV,TOPO,TOPOGRUP,EC,TRATHOSP,NENHUM,...,DIAGTRAT,ANODIAG,DRS,RRAS,IBGEATEN,HABILIT2,ULTIDIAG,ESCOLARI_preench,IBGE_idem_IBGEATEN,presenca_rec
30392,28,2,3549904,2,2,C150,C15,IV,C,0,...,2,2019,17,17,3549904,1,309,3.0,1,0
30393,27,2,3509601,2,1,C150,C15,IIIC,F,0,...,2,2018,7,16,3550308,2,212,1.0,0,0
30394,28,1,3550308,2,1,C152,C15,III,A,0,...,0,2004,1,6,3550308,2,585,3.0,1,0


In [None]:
# Add the outcome label columns through the imported get_labels helper.
# NOTE(review): this rebinds df_eso to its own transformed version, so re-running the
# cell feeds an already-labelled frame back into get_labels — confirm that is safe.
df_eso = get_labels(df_eso)
df_eso.shape

(14379, 32)

* Estômago (C16)

In [None]:
# Stomach: a single topography group.
esto = ['C16']
df_esto = data_prep(df_geral, esto)

# Report the subset size, then preview its first rows.
print(df_esto.shape)
df_esto.iloc[:3]

(29043, 30)


Unnamed: 0,IDADE,SEXO,IBGE,CATEATEND,DIAGPREV,TOPO,TOPOGRUP,EC,TRATHOSP,NENHUM,...,DIAGTRAT,ANODIAG,DRS,RRAS,IBGEATEN,HABILIT2,ULTIDIAG,ESCOLARI_preench,IBGE_idem_IBGEATEN,presenca_rec
30427,28,2,3518800,9,1,C160,C16,IV,C,0,...,0,2002,1,2,3550308,2,170,4.0,0,0
30428,29,2,3550308,9,1,C160,C16,IV,J,1,...,99,2003,1,6,3550308,2,54,3.0,1,0
30429,29,1,3557105,9,1,C160,C16,II,F,0,...,0,2006,15,12,3550308,2,260,5.0,0,1


In [None]:
# Add the outcome label columns through the imported get_labels helper.
# NOTE(review): this rebinds df_esto to its own transformed version, so re-running the
# cell feeds an already-labelled frame back into get_labels — confirm that is safe.
df_esto = get_labels(df_esto)
df_esto.shape

(29043, 32)

* Intestino Delgado (C17)

In [None]:
# Small intestine: a single topography group.
delg = ['C17']
df_delg = data_prep(df_geral, delg)

# Report the subset size, then preview its first rows.
print(df_delg.shape)
df_delg.iloc[:3]

(1772, 30)


Unnamed: 0,IDADE,SEXO,IBGE,CATEATEND,DIAGPREV,TOPO,TOPOGRUP,EC,TRATHOSP,NENHUM,...,DIAGTRAT,ANODIAG,DRS,RRAS,IBGEATEN,HABILIT2,ULTIDIAG,ESCOLARI_preench,IBGE_idem_IBGEATEN,presenca_rec
30863,28,2,3550308,9,2,C170,C17,I,A,0,...,0,2008,1,6,3550308,2,1028,3.0,1,0
30865,22,1,3550308,2,1,C170,C17,IV,E,0,...,0,2005,1,6,3550308,2,109,1.0,1,0
30866,25,1,3504107,2,2,C170,C17,IV,J,1,...,99,2009,7,16,3550308,2,91,4.0,0,0


In [None]:
# Add the outcome label columns through the imported get_labels helper.
# NOTE(review): this rebinds df_delg to its own transformed version, so re-running the
# cell feeds an already-labelled frame back into get_labels — confirm that is safe.
df_delg = get_labels(df_delg)
df_delg.shape

(1772, 32)

* Colorretal (C18, C19, C20)

In [None]:
# Colorectal: topography groups C18 through C20.
colo = [f'C{n}' for n in range(18, 21)]
df_colo = data_prep(df_geral, colo)

# Report the subset size, then preview its first rows.
print(df_colo.shape)
df_colo.iloc[:3]

(60887, 30)


Unnamed: 0,IDADE,SEXO,IBGE,CATEATEND,DIAGPREV,TOPO,TOPOGRUP,EC,TRATHOSP,NENHUM,...,DIAGTRAT,ANODIAG,DRS,RRAS,IBGEATEN,HABILIT2,ULTIDIAG,ESCOLARI_preench,IBGE_idem_IBGEATEN,presenca_rec
30913,28,2,3548708,9,1,C180,C18,III,E,0,...,0,2003,1,1,3550308,1,415,3.0,0,0
30914,26,2,3550308,9,1,C180,C18,III,A,0,...,0,2001,1,6,3550308,2,2478,2.0,1,0
30916,22,2,3503208,9,2,C180,C18,IV,C,0,...,0,2001,3,13,3503208,1,255,4.0,1,0


In [None]:
# Add the outcome label columns through the imported get_labels helper.
# NOTE(review): this rebinds df_colo to its own transformed version, so re-running the
# cell feeds an already-labelled frame back into get_labels — confirm that is safe.
df_colo = get_labels(df_colo)
df_colo.shape

(60887, 32)

* Ânus (C21)

In [None]:
# Anus: a single topography group.
anus = ['C21']
df_anus = data_prep(df_geral, anus)

# Report the subset size, then preview its first rows.
print(df_anus.shape)
df_anus.iloc[:3]

(2466, 30)


Unnamed: 0,IDADE,SEXO,IBGE,CATEATEND,DIAGPREV,TOPO,TOPOGRUP,EC,TRATHOSP,NENHUM,...,DIAGTRAT,ANODIAG,DRS,RRAS,IBGEATEN,HABILIT2,ULTIDIAG,ESCOLARI_preench,IBGE_idem_IBGEATEN,presenca_rec
31830,29,1,3518800,9,2,C211,C21,IV,G,0,...,2,2008,1,2,3550308,2,672,3.0,0,0
31832,23,1,3550308,9,1,C211,C21,IIIB,B,0,...,2,2005,1,6,3550308,2,1099,2.0,1,0
31834,28,1,3507753,9,2,C211,C21,II,B,0,...,1,2010,2,12,3505500,2,649,5.0,0,0


In [None]:
# Add the outcome label columns through the imported get_labels helper.
# NOTE(review): this rebinds df_anus to its own transformed version, so re-running the
# cell feeds an already-labelled frame back into get_labels — confirm that is safe.
df_anus = get_labels(df_anus)
df_anus.shape

(2466, 32)

In [None]:
# NOTE(review): disabled cell — grouped bar chart of the ECGRUP distribution per site.
# ECGRUP is not among the columns reported by df_boca.columns in this notebook, so the
# code would presumably fail if re-enabled as-is; restore that column or delete the cell.
# import plotly.graph_objects as go

# grup_ec = np.sort(df_boca.ECGRUP.unique())

# fig = go.Figure(data=[
#     go.Bar(name='Boca', x=grup_ec, y=df_boca.ECGRUP.value_counts(normalize=True).sort_index()*100,
#            text=df_boca.ECGRUP.value_counts(normalize=True).sort_index()*100),
#     go.Bar(name='Orofaringe', x=grup_ec, y=df_oro.ECGRUP.value_counts(normalize=True).sort_index()*100,
#            text=df_oro.ECGRUP.value_counts(normalize=True).sort_index()*100),
#     go.Bar(name='Esôfago', x=grup_ec, y=df_eso.ECGRUP.value_counts(normalize=True).sort_index()*100,
#            text=df_eso.ECGRUP.value_counts(normalize=True).sort_index()*100),
#     go.Bar(name='Estômago', x=grup_ec, y=df_esto.ECGRUP.value_counts(normalize=True).sort_index()*100,
#            text=df_esto.ECGRUP.value_counts(normalize=True).sort_index()*100),
#     go.Bar(name='Intestino Delgado', x=grup_ec, y=df_delg.ECGRUP.value_counts(normalize=True).sort_index()*100,
#            text=df_delg.ECGRUP.value_counts(normalize=True).sort_index()*100),
#     go.Bar(name='Colorretal', x=grup_ec, y=df_colo.ECGRUP.value_counts(normalize=True).sort_index()*100,
#            text=df_colo.ECGRUP.value_counts(normalize=True).sort_index()*100),
#     go.Bar(name='Ânus', x=grup_ec, y=df_anus.ECGRUP.value_counts(normalize=True).sort_index()*100,
#            text=df_anus.ECGRUP.value_counts(normalize=True).sort_index()*100)
# ])
# # Change the bar mode
# fig.update_layout(barmode='group', yaxis_title='% Pacientes',
#                   xaxis_title='Estadiamento Clínico')
# fig.update_traces(texttemplate='%{text:.3s}', textposition='outside')
# fig.show()

In [None]:
# List the columns available after data_prep and get_labels, checked on the mouth subset.
df_boca.columns

Index(['IDADE', 'SEXO', 'IBGE', 'CATEATEND', 'DIAGPREV', 'TOPO', 'TOPOGRUP',
       'EC', 'TRATHOSP', 'NENHUM', 'CIRURGIA', 'RADIO', 'QUIMIO', 'HORMONIO',
       'TMO', 'IMUNO', 'OUTROS', 'CONSDIAG', 'TRATCONS', 'DIAGTRAT', 'ANODIAG',
       'DRS', 'RRAS', 'IBGEATEN', 'HABILIT2', 'ESCOLARI_preench',
       'IBGE_idem_IBGEATEN', 'presenca_rec', 'obito_geral', 'sobrevida_ano1',
       'sobrevida_ano3', 'sobrevida_ano5'],
      dtype='object')

In [None]:
# For each site, print the class balance of the 3-year survival label.
# Rows with obito_geral == 0 and sobrevida_ano3 == 0 are left out — presumably patients
# with no recorded death and less than three years of follow-up (TODO confirm).
dfs = [df_boca, df_oro, df_eso, df_esto, df_delg, df_colo, df_anus]
for df in dfs:
    keep = (df.obito_geral != 0) | (df.sobrevida_ano3 != 0)
    balance = df.loc[keep, 'sobrevida_ano3'].value_counts(normalize=True)
    print(balance.sort_index())

0    0.593974
1    0.406026
Name: sobrevida_ano3, dtype: float64
0    0.744509
1    0.255491
Name: sobrevida_ano3, dtype: float64
0    0.869225
1    0.130775
Name: sobrevida_ano3, dtype: float64
0    0.733289
1    0.266711
Name: sobrevida_ano3, dtype: float64
0    0.55527
1    0.44473
Name: sobrevida_ano3, dtype: float64
0    0.45493
1    0.54507
Name: sobrevida_ano3, dtype: float64
0    0.465253
1    0.534747
Name: sobrevida_ano3, dtype: float64


Salvando os dados no Drive

In [None]:
# Topography groups of every digestive-system site handled in this notebook,
# concatenated in the same order as the per-site sections.
todos = (
    ['C00', 'C01', 'C02', 'C03', 'C04', 'C05', 'C06']  # mouth
    + ['C10']                                          # oropharynx
    + ['C15']                                          # esophagus
    + ['C16']                                          # stomach
    + ['C17']                                          # small intestine
    + ['C18', 'C19', 'C20']                            # colorectal
    + ['C21']                                          # anus
)

# Same pipeline as the per-site cells: data_prep followed by get_labels.
df_todos = get_labels(data_prep(df_geral, todos))
df_todos.shape

(139081, 32)

In [None]:
# NOTE(review): the export is disabled. The target is an absolute Google Drive path under
# /content/drive, and no Drive mount is visible in this part of the notebook — mount it
# (or switch to a configurable output directory) before re-enabling.
# df_todos.to_csv('/content/drive/MyDrive/Trabalho/Cancer/Predição cruzada/Datasets/sist_digestorio.csv',
#                 encoding='utf-8', index=False)