# **Instalações e Bibliotecas**

In [1]:
%%capture

# Install the shap and optuna packages into the runtime; no versions are pinned.
# The %%capture magic at the top of this cell hides pip's console output.
!pip install shap
!pip install optuna

In [None]:
import pandas as pd
import numpy as np

# **Funções**

O arquivo 'functions_pred_cruzada' contém as funções que serão utilizadas para selecionar, pré-processar, treinar e validar os modelos de machine learning.

In [2]:
# Fetch a file by its id with gdown; --quiet silences the progress output.
# NOTE(review): presumably this is functions_pred_cruzada.py, imported below — confirm.
!gdown 1uVoArwRTJmvbyFLyFniwFvWGHMfiNIWO --quiet

# Wildcard import: every public name of functions_pred_cruzada lands in the notebook namespace.
from functions_pred_cruzada import *

# **Dados**

In [3]:
# Dados brutos com todos os tipos de câncer
!gdown 1P19f3kA5_s4sv5Pg6D6UKwwQvZvaAnQT --quiet

df_geral = pd.read_csv('pacigeral.csv')
print(df_geral.shape)
df_geral.head(3)

Columns (35,36,73,74,75,78,89,90) have mixed types. Specify dtype option on import or set low_memory=False.


(1134043, 100)


Unnamed: 0,ESCOLARI,IDADE,SEXO,UFNASC,UFRESID,IBGE,CIDADE,CATEATEND,DTCONSULT,CLINICA,...,REC03,REC04,IBGEATEN,CIDO,DSCCIDO,HABILIT,HABIT11,HABILIT1,HABILIT2,CIDADEH
0,2,100,2,SP,SP,3550308,SAO PAULO,9,2000-06-09,3,...,,,3550308,80703,CARCINOMA ESCAMOCELULAR SOE,7,CACON com Servio de Oncologia Pedi trica,3,2,SÆo Paulo
1,1,102,1,MG,MG,3133402,ITAPAGIPE,9,2002-04-29,24,...,,,3505500,80903,CARCINOMA BASOCELULAR SOE,7,CACON com Servio de Oncologia Pedi trica,3,2,Barretos
2,2,101,2,MG,SP,3543402,RIBEIRAO PRETO,9,2000-12-03,10,...,,,3543402,80703,CARCINOMA ESCAMOCELULAR SOE,7,CACON com Servio de Oncologia Pedi trica,3,2,RibeirÆo Preto


In [13]:
# Number of records per ULTINFO code.
df_geral['ULTINFO'].value_counts()

2    501971
3    331162
4    170886
1    130024
Name: ULTINFO, dtype: int64

In [14]:
# Binary outcome flag: 0 when ULTINFO is below 3, 1 otherwise.
# np.where applies the rule to the whole column at once instead of looping over every
# row in Python; the resulting values are the same (a missing ULTINFO still maps to 1).
df_geral['obito'] = np.where(df_geral['ULTINFO'] < 3, 0, 1)
df_geral['obito'].value_counts()

0    631995
1    502048
Name: obito, dtype: int64

In [58]:
# Work on a copy so the frame loaded from the CSV keeps its original columns.
df = df_geral.copy()

# Parse the diagnosis date and the last-information date as datetimes.
list_datas = ['DTDIAG', 'DTULTINFO']
df[list_datas] = df[list_datas].apply(pd.to_datetime)

# Days elapsed between diagnosis and the last recorded information.
df['ULTIDIAG'] = (df['DTULTINFO'] - df['DTDIAG']).dt.days

# One 0/1 flag per horizon: 1 when the follow-up exceeds N years counted as N * 365 days.
# Rows whose ULTIDIAG is missing compare as False and therefore stay 0.
for n_anos in (1, 3, 5):
    df[f'sobrevida_ano{n_anos}'] = (df['ULTIDIAG'] > n_anos * 365).astype('int64')

df.head(1)

Unnamed: 0,ESCOLARI,IDADE,SEXO,UFNASC,UFRESID,IBGE,CIDADE,CATEATEND,DTCONSULT,CLINICA,...,HABILIT,HABIT11,HABILIT1,HABILIT2,CIDADEH,obito,ULTIDIAG,sobrevida_ano1,sobrevida_ano3,sobrevida_ano5
0,2,100,2,SP,SP,3550308,SAO PAULO,9,2000-06-09,3,...,7,CACON com Servio de Oncologia Pedi trica,3,2,SÆo Paulo,1,171,0,0,0


**Tipos com maior incidência**

In [60]:
# Share of records per topography group, 15 most frequent groups first.
df['TOPOGRUP'].value_counts(normalize=True).iloc[:15]

C44    0.229273
C50    0.136342
C61    0.103496
C34    0.047272
C53    0.046387
C18    0.037844
C42    0.037599
C16    0.036821
C20    0.028988
C73    0.023747
C77    0.018756
C15    0.018305
C67    0.017882
C32    0.015979
C64    0.015855
Name: TOPOGRUP, dtype: float64

*   C44 - Pele 22,93%
*   C50 - Mama 13,63%
*   C61 - Próstata 10,35%
*   C34 - Pulmão 4,73%
*   C53 - Colo de Útero 4,64%
*   C18 - Cólon 3,78%
*   C42 - Sistemas Hematopoético e Reticuloendotelial 3,76%
*   C16 - Estômago 3,68%
*   C20 - Reto 2,90%
*   C73 - Tireoide 2,37%
*   C77 - Linfonodos 1,88%
*   C15 - Esôfago 1,83%
*   C67 - Bexiga 1,79%
*   C32 - Laringe 1,60%
*   C64 - Rim 1,59%



In [65]:
# Per topography group, the fraction of records with obito == 0 and with obito == 1.
# A single row-normalized cross-tabulation replaces the per-group loop, which grew the
# frame with pd.concat on every iteration and raised KeyError for any group that held
# only one of the two outcomes (counts[0] / counts[1] missing).
topogrup = (
    pd.crosstab(df['TOPOGRUP'], df['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)  # both outcome columns always present
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Groups ranked by the share of records with obito == 1.
topogrup_ordenado = topogrup.sort_values(by='obito_1', ascending=False)
print(topogrup_ordenado.shape)
topogrup_ordenado.head(10)

(70, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C80,0.113088,0.886912
C15,0.125199,0.874801
C23,0.137603,0.862397
C34,0.15757,0.84243
C13,0.167455,0.832545
C26,0.173502,0.826498
C25,0.187166,0.812834
C12,0.187172,0.812828
C14,0.211964,0.788036
C24,0.216409,0.783591


In [67]:
# Drop records that have neither the obito flag nor more than one year of follow-up
# (obito == 0 and sobrevida_ano1 == 0); everything else is kept.
df_ano1 = df[~((df.obito == 0) & (df.sobrevida_ano1 == 0))].reset_index(drop=True)

# Outcome shares per topography group within the filtered frame. The row-normalized
# crosstab avoids the repeated pd.concat of the former loop and cannot fail with
# KeyError when a group contains a single outcome: the absent column is filled with 0.
topogrup_ano1 = (
    pd.crosstab(df_ano1['TOPOGRUP'], df_ano1['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Highest obito == 1 share first.
topogrup_ano1_ordenado = topogrup_ano1.sort_values(by='obito_1', ascending=False)
print(topogrup_ano1_ordenado.shape)
topogrup_ano1_ordenado.head(10)

(70, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C80,0.08623,0.91377
C15,0.088628,0.911372
C23,0.095735,0.904265
C34,0.110356,0.889644
C25,0.119221,0.880779
C13,0.136718,0.863282
C26,0.138158,0.861842
C12,0.16332,0.83668
C24,0.163564,0.836436
C14,0.17663,0.82337


In [68]:
# Keep every record except those with obito == 0 whose follow-up did not exceed
# three years (sobrevida_ano3 == 0).
df_ano3 = df[~((df.obito == 0) & (df.sobrevida_ano3 == 0))].reset_index(drop=True)

# Fraction of each outcome per topography group. Computed in one pass with a
# row-normalized crosstab: no frame is grown inside a loop, and a group with only one
# outcome gets 0 for the other instead of raising KeyError on counts[0] / counts[1].
topogrup_ano3 = (
    pd.crosstab(df_ano3['TOPOGRUP'], df_ano3['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Order groups from the largest to the smallest obito == 1 share.
topogrup_ano3_ordenado = topogrup_ano3.sort_values(by='obito_1', ascending=False)
print(topogrup_ano3_ordenado.shape)
topogrup_ano3_ordenado.head(10)

(70, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C15,0.057162,0.942838
C80,0.058157,0.941843
C23,0.062118,0.937882
C39,0.0625,0.9375
C34,0.062564,0.937436
C25,0.067341,0.932659
C26,0.074205,0.925795
C13,0.10228,0.89772
C24,0.10571,0.89429
C22,0.115598,0.884402


In [69]:
# Exclude records with obito == 0 and sobrevida_ano5 == 0, i.e. no obito flag and a
# follow-up that did not exceed five years.
df_ano5 = df[~((df.obito == 0) & (df.sobrevida_ano5 == 0))].reset_index(drop=True)

# Outcome shares by topography group via a row-normalized crosstab. This removes the
# quadratic pd.concat-in-a-loop pattern and the KeyError that counts[0] / counts[1]
# raised for groups in which only one outcome occurs.
topogrup_ano5 = (
    pd.crosstab(df_ano5['TOPOGRUP'], df_ano5['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Sorted so the groups with the largest obito == 1 share come first.
topogrup_ano5_ordenado = topogrup_ano5.sort_values(by='obito_1', ascending=False)
print(topogrup_ano5_ordenado.shape)
topogrup_ano5_ordenado.head(10)

(70, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C15,0.036809,0.963191
C34,0.039373,0.960627
C25,0.040455,0.959545
C23,0.041827,0.958173
C80,0.042612,0.957388
C26,0.043796,0.956204
C39,0.0625,0.9375
C24,0.074299,0.925701
C13,0.077374,0.922626
C22,0.077724,0.922276


**Tipos com maior incidência por sexo**

In [71]:
# Split the records by the SEXO code: 1 goes to df_masc and 2 to df_fem.
df_masc = df.loc[df['SEXO'] == 1]
df_fem = df.loc[df['SEXO'] == 2]

# Report the size of each subset.
for subconjunto in (df_masc, df_fem):
    print(subconjunto.shape)

(564295, 105)
(569748, 105)


**Masculino**

In [72]:
# Share of records per topography group in df_masc, 10 most frequent groups first.
df_masc['TOPOGRUP'].value_counts(normalize=True).iloc[:10]

C44    0.242235
C61    0.207992
C34    0.057615
C16    0.047716
C42    0.040686
C18    0.037027
C20    0.032261
C15    0.030612
C32    0.028001
C67    0.026417
Name: TOPOGRUP, dtype: float64

*   C44 - Pele 24,22%
*   C61 - Próstata 20,80%
*   C34 - Pulmão 5,76%
*   C16 - Estômago 4,77%
*   C42 - Sistemas Hematopoético e Reticuloendotelial 4,07%
*   C18 - Cólon 3,70%
*   C20 - Reto 3,23%
*   C15 - Esôfago 3,06%
*   C32 - Laringe 2,80%
*   C67 - Bexiga 2,64%

In [93]:
# Fraction of obito == 0 and obito == 1 records per topography group in df_masc.
# The row-normalized crosstab does in one call what the former loop did group by group,
# without re-concatenating the frame each iteration and without the KeyError that
# counts[0] / counts[1] raised when a group had a single outcome.
topogrup_masc = (
    pd.crosstab(df_masc['TOPOGRUP'], df_masc['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)  # both outcome columns always present
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Groups ranked by the share of records with obito == 1.
topogrup_masc_ordenado = topogrup_masc.sort_values(by='obito_1', ascending=False)
print(topogrup_masc_ordenado.shape)
topogrup_masc_ordenado.head(10)

(62, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C80,0.115782,0.884218
C15,0.117865,0.882135
C34,0.126907,0.873093
C13,0.158477,0.841523
C25,0.166553,0.833447
C23,0.175911,0.824089
C14,0.1776,0.8224
C12,0.181744,0.818256
C26,0.184713,0.815287
C01,0.205101,0.794899


In [74]:
# From df_masc, drop records with obito == 0 whose follow-up did not exceed one year
# (sobrevida_ano1 == 0).
df_masc_ano1 = df_masc[~((df_masc.obito == 0) & (df_masc.sobrevida_ano1 == 0))].reset_index(drop=True)

# Outcome shares per topography group. A row-normalized crosstab replaces the loop that
# appended one row at a time with pd.concat and failed with KeyError for groups that
# contain only one outcome; the missing outcome is now reported as 0.
topogrup_masc_ano1 = (
    pd.crosstab(df_masc_ano1['TOPOGRUP'], df_masc_ano1['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Highest obito == 1 share first.
topogrup_masc_ano1_ordenado = topogrup_masc_ano1.sort_values(by='obito_1', ascending=False)
print(topogrup_masc_ano1_ordenado.shape)
topogrup_masc_ano1_ordenado.head(10)

(62, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C15,0.08155,0.91845
C34,0.085709,0.914291
C80,0.088047,0.911953
C25,0.099039,0.900961
C23,0.127517,0.872483
C26,0.129252,0.870748
C13,0.129606,0.870394
C14,0.140468,0.859532
C12,0.157829,0.842171
C01,0.169688,0.830312


In [75]:
# From df_masc, keep every record except those with obito == 0 and sobrevida_ano3 == 0
# (no obito flag and a follow-up that did not exceed three years).
df_masc_ano3 = df_masc[~((df_masc.obito == 0) & (df_masc.sobrevida_ano3 == 0))].reset_index(drop=True)

# Fraction of each outcome per topography group, computed with a row-normalized
# crosstab. Unlike the previous loop, it does not rebuild the frame with pd.concat on
# each iteration and it tolerates groups where only one outcome is present.
topogrup_masc_ano3 = (
    pd.crosstab(df_masc_ano3['TOPOGRUP'], df_masc_ano3['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Order groups from the largest to the smallest obito == 1 share.
topogrup_masc_ano3_ordenado = topogrup_masc_ano3.sort_values(by='obito_1', ascending=False)
print(topogrup_masc_ano3_ordenado.shape)
topogrup_masc_ano3_ordenado.head(10)

(62, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C34,0.04681,0.95319
C25,0.050475,0.949525
C15,0.051006,0.948994
C80,0.058397,0.941603
C23,0.084507,0.915493
C39,0.090909,0.909091
C13,0.096604,0.903396
C26,0.098592,0.901408
C12,0.114574,0.885426
C14,0.115318,0.884682


In [76]:
# From df_masc, exclude records with obito == 0 whose follow-up did not exceed five
# years (sobrevida_ano5 == 0).
df_masc_ano5 = df_masc[~((df_masc.obito == 0) & (df_masc.sobrevida_ano5 == 0))].reset_index(drop=True)

# Outcome shares by topography group through a row-normalized crosstab: one vectorized
# call instead of a loop that concatenated a one-row frame per group, and no KeyError
# from counts[0] / counts[1] when a group lacks one of the outcomes.
topogrup_masc_ano5 = (
    pd.crosstab(df_masc_ano5['TOPOGRUP'], df_masc_ano5['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Sorted so the groups with the largest obito == 1 share come first.
topogrup_masc_ano5_ordenado = topogrup_masc_ano5.sort_values(by='obito_1', ascending=False)
print(topogrup_masc_ano5_ordenado.shape)
topogrup_masc_ano5_ordenado.head(10)

(62, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C34,0.029339,0.970661
C25,0.029922,0.970078
C15,0.032815,0.967185
C80,0.042228,0.957772
C26,0.051852,0.948148
C23,0.052823,0.947177
C13,0.071816,0.928184
C22,0.078068,0.921932
C10,0.081883,0.918117
C12,0.083182,0.916818


**Feminino**

In [77]:
# Share of records per topography group in df_fem, 10 most frequent groups first.
df_fem['TOPOGRUP'].value_counts(normalize=True).iloc[:10]

C50    0.269551
C44    0.216434
C53    0.092330
C18    0.038654
C73    0.037580
C34    0.037027
C42    0.034542
C54    0.029424
C16    0.026031
C20    0.025746
Name: TOPOGRUP, dtype: float64

*   C50 - Mama 26,96%
*   C44 - Pele 21,64%
*   C53 - Colo de Útero 9,23%
*   C18 - Cólon 3,87%
*   C73 - Tireoide 3,76%
*   C34 - Pulmão 3,70%
*   C42 - Sistemas Hematopoético e Reticuloendotelial 3,45%
*   C54 - Corpo do Útero 2,94%
*   C16 - Estômago 2,60%
*   C20 - Reto 2,57%

In [79]:
# Fraction of obito == 0 and obito == 1 records per topography group in df_fem.
# A row-normalized crosstab replaces the per-group loop, which grew the result with
# pd.concat on every pass and raised KeyError on counts[0] / counts[1] for any group
# holding only one outcome.
topogrup_fem = (
    pd.crosstab(df_fem['TOPOGRUP'], df_fem['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)  # both outcome columns always present
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Groups ranked by the share of records with obito == 1.
topogrup_fem_ordenado = topogrup_fem.sort_values(by='obito_1', ascending=False)
print(topogrup_fem_ordenado.shape)
topogrup_fem_ordenado.head(10)

(66, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C80,0.109643,0.890357
C23,0.123214,0.876786
C15,0.161549,0.838451
C26,0.1625,0.8375
C24,0.204819,0.795181
C34,0.204826,0.795174
C25,0.208392,0.791608
C22,0.249012,0.750988
C13,0.253687,0.746313
C12,0.253731,0.746269


In [80]:
# From df_fem, drop records with obito == 0 whose follow-up did not exceed one year
# (sobrevida_ano1 == 0).
df_fem_ano1 = df_fem[~((df_fem.obito == 0) & (df_fem.sobrevida_ano1 == 0))].reset_index(drop=True)

# Outcome shares per topography group. The row-normalized crosstab removes the
# row-by-row pd.concat of the former loop and fills a missing outcome with 0 rather
# than failing with KeyError when a group contains a single outcome.
topogrup_fem_ano1 = (
    pd.crosstab(df_fem_ano1['TOPOGRUP'], df_fem_ano1['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Highest obito == 1 share first.
topogrup_fem_ano1_ordenado = topogrup_fem_ano1.sort_values(by='obito_1', ascending=False)
print(topogrup_fem_ano1_ordenado.shape)
topogrup_fem_ano1_ordenado.head(10)

(66, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C80,0.083913,0.916087
C23,0.083955,0.916045
C15,0.123838,0.876162
C25,0.140104,0.859896
C26,0.146497,0.853503
C34,0.149168,0.850832
C24,0.154831,0.845169
C22,0.181848,0.818152
C13,0.206897,0.793103
C12,0.230769,0.769231


In [91]:
# From df_fem, keep every record except those with obito == 0 and sobrevida_ano3 == 0
# (no obito flag and a follow-up that did not exceed three years).
df_fem_ano3 = df_fem[~((df_fem.obito == 0) & (df_fem.sobrevida_ano3 == 0))].reset_index(drop=True)

# Fraction of each outcome per topography group via a row-normalized crosstab.
# Reindexing on [0, 1] with fill_value=0.0 covers groups where an outcome never occurs.
# The earlier loop patched only the "all obito == 1" case by hand, skipped that patch
# for the first group, and still raised KeyError when a group had no obito == 1 record.
topogrup_fem_ano3 = (
    pd.crosstab(df_fem_ano3['TOPOGRUP'], df_fem_ano3['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Order groups from the largest to the smallest obito == 1 share.
topogrup_fem_ano3_ordenado = topogrup_fem_ano3.sort_values(by='obito_1', ascending=False)
print(topogrup_fem_ano3_ordenado.shape)
topogrup_fem_ano3_ordenado.head(10)

(66, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C39,0.0,1.0
C26,0.049645,0.950355
C23,0.05395,0.94605
C80,0.057851,0.942149
C25,0.084961,0.915039
C15,0.088015,0.911985
C34,0.088067,0.911933
C24,0.096453,0.903547
C22,0.115646,0.884354
C13,0.159468,0.840532


In [92]:
# From df_fem, exclude records with obito == 0 whose follow-up did not exceed five
# years (sobrevida_ano5 == 0).
df_fem_ano5 = df_fem[~((df_fem.obito == 0) & (df_fem.sobrevida_ano5 == 0))].reset_index(drop=True)

# Outcome shares by topography group through a row-normalized crosstab. Any outcome
# absent from a group is reported as 0.0 by the reindex step. This replaces the manual
# "if counts[1] == 1: counts[0] = 0" patch of the former loop, which handled just one
# of the two missing-outcome cases and was not applied to the first group.
topogrup_fem_ano5 = (
    pd.crosstab(df_fem_ano5['TOPOGRUP'], df_fem_ano5['obito'], normalize='index')
    .reindex(columns=[0, 1], fill_value=0.0)
    .rename(columns={0: 'obito_0', 1: 'obito_1'})
    .rename_axis(columns=None)
)

# Sorted so the groups with the largest obito == 1 share come first.
topogrup_fem_ano5_ordenado = topogrup_fem_ano5.sort_values(by='obito_1', ascending=False)
print(topogrup_fem_ano5_ordenado.shape)
topogrup_fem_ano5_ordenado.head(10)

(66, 2)


Unnamed: 0_level_0,obito_0,obito_1
TOPOGRUP,Unnamed: 1_level_1,Unnamed: 2_level_1
C39,0.0,1.0
C26,0.035971,0.964029
C23,0.037884,0.962116
C80,0.043099,0.956901
C25,0.051619,0.948381
C34,0.055887,0.944113
C15,0.057115,0.942885
C24,0.063226,0.936774
C22,0.076981,0.923019
C12,0.122807,0.877193
