## Pré-processamento de Dados
Fase de limpeza e tratamento de dados, para torna-los utilizáveis.

### Importação de bibliotecas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

### Importação dos dados
Já que os conjuntos de dados (Datasets) contém bastante dados, vou usar apenas um deles para auxiliar no tratamento e na limpeza, e depois será replicados no outros conjuntos. Assim reduzindo o custo computacional. 

In [2]:
path = "Dados_SUS/CSV_FILE" #Caminho da pasta
# print(os.listdir(path)) #Comando para ver os arquivos na pasta

In [3]:
file = f"{path}/DOEXT22.csv"
df_geral = pd.read_csv(file,encoding='latin-1')
display(df_geral.head(10))
#print(df_geral.columns.to_list()) #Ver todas as colunas do dataset

  df_geral = pd.read_csv(file,encoding='latin-1')


Unnamed: 0,ORIGEM,TIPOBITO,DTOBITO,HORAOBITO,NATURAL,CODMUNNATU,DTNASC,IDADE,SEXO,RACACOR,...,FONTES,TPRESGINFO,TPNIVELINV,NUDIASINF,DTCADINF,MORTEPARTO,DTCONCASO,FONTESINF,ALTCAUSA,CONTADOR
0,1,2,30032022,1840.0,850.0,500345.0,3011967.0,455,1,1.0,...,,,,,,,,,,15
1,1,2,6032022,400.0,841.0,412760.0,28101994.0,427,1,1.0,...,,,,,,,,,,35
2,1,2,5052022,2233.0,833.0,330490.0,21082011.0,410,2,1.0,...,,,,,,,,,,100
3,1,2,10042022,1357.0,851.0,510792.0,21042019.0,402,2,1.0,...,,,,,,,,,,108
4,1,2,26052022,954.0,829.0,291950.0,13061927.0,494,2,1.0,...,,,,,,,,,,131
5,1,2,23012022,600.0,835.0,351880.0,23051991.0,430,1,4.0,...,,,,,,,,,,167
6,1,2,22012022,2102.0,828.0,280590.0,23121977.0,444,1,4.0,...,,,,,,,,,,170
7,1,2,23012022,430.0,828.0,280140.0,22062001.0,420,1,4.0,...,,,,,,,,,,171
8,1,2,23012022,2340.0,828.0,280067.0,26032003.0,418,1,4.0,...,,,,,,,,,,172
9,1,2,22052022,1710.0,835.0,353240.0,30102020.0,401,1,1.0,...,SXXXSX,,,,2062022.0,3.0,2062022.0,,2.0,177


### Tramento de dados

Para não ser custoso computacionalmente, a seleção visa a redução da quantidade de colunas.

In [4]:
colunas_selecionadas = ['DTNASC','DTOBITO','IDADE',
                        'SEXO','RACACOR','ESTCIV','ESC',
                        'ESC2010','OCUP','CIRCOBITO','CODMUNRES']

In [5]:
df_geral = pd.read_csv(file,encoding='latin-1',usecols=colunas_selecionadas) 
df_geral.head(10)

Unnamed: 0,DTOBITO,DTNASC,IDADE,SEXO,RACACOR,ESTCIV,ESC,ESC2010,OCUP,CODMUNRES,CIRCOBITO
0,30032022,3011967.0,455,1,1.0,2.0,3.0,1.0,715210.0,500370,1.0
1,6032022,28101994.0,427,1,1.0,1.0,3.0,1.0,782510.0,412760,1.0
2,5052022,21082011.0,410,2,1.0,1.0,2.0,1.0,999991.0,330490,1.0
3,10042022,21042019.0,402,2,1.0,,,,,510830,1.0
4,26052022,13061927.0,494,2,1.0,3.0,9.0,9.0,999993.0,291950,1.0
5,23012022,23051991.0,430,1,4.0,5.0,4.0,3.0,782305.0,280480,2.0
6,22012022,23121977.0,444,1,4.0,1.0,3.0,2.0,768110.0,280480,3.0
7,23012022,22062001.0,420,1,4.0,1.0,4.0,3.0,622020.0,280140,3.0
8,23012022,26032003.0,418,1,4.0,1.0,4.0,2.0,519110.0,280067,1.0
9,22052022,30102020.0,401,1,1.0,,,,,350710,3.0


Selecionando apenas casos de suicídios, sabendo que o codigo para os casos é 2. 

In [6]:
df_s = df_geral.loc[(df_geral['CIRCOBITO'])==2]
display(df_s.head())

Unnamed: 0,DTOBITO,DTNASC,IDADE,SEXO,RACACOR,ESTCIV,ESC,ESC2010,OCUP,CODMUNRES,CIRCOBITO
5,23012022,23051991.0,430,1,4.0,5.0,4.0,3.0,782305.0,280480,2.0
14,27012022,20081968.0,453,1,2.0,2.0,2.0,1.0,715210.0,280330,2.0
19,8022022,25041969.0,452,1,1.0,1.0,2.0,1.0,512105.0,430600,2.0
20,16022022,19031970.0,451,2,1.0,1.0,2.0,1.0,517310.0,430600,2.0
31,23042022,21031980.0,442,1,4.0,4.0,9.0,9.0,252105.0,351620,2.0


In [7]:
idade_indefinida = df_s.loc[(df_s['IDADE'])==999] #Pesquisa por idade indefinida
display(idade_indefinida)

Unnamed: 0,DTOBITO,DTNASC,IDADE,SEXO,RACACOR,ESTCIV,ESC,ESC2010,OCUP,CODMUNRES,CIRCOBITO
26175,14062022,,999,1,4.0,9.0,9.0,9.0,,420000,2.0
27393,7022022,,999,1,2.0,,,,,150380,2.0
35370,2012022,,999,1,2.0,,,,998999.0,410690,2.0
45064,25012022,,999,1,3.0,,9.0,,,431490,2.0
53425,10042022,,999,1,4.0,,,,,130260,2.0
61709,15042022,,999,1,1.0,,,,,420750,2.0
69290,7032022,,999,1,4.0,,,,,353870,2.0
70526,24032022,,999,1,,9.0,9.0,9.0,998999.0,353870,2.0
96067,1072022,,999,1,4.0,,,,262105.0,230440,2.0
104431,6122022,,999,1,2.0,9.0,9.0,9.0,,330000,2.0


In [8]:
df_s.drop(index=idade_indefinida.index,inplace=True) #Drop idade indefinida

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_s.drop(index=idade_indefinida.index,inplace=True) #Drop idade indefinida


In [9]:
linha_auxiliar = df_s.loc[5]

In [10]:
pd.options.mode.copy_on_write = True
def ajustar_idade(linha):
    if(linha['IDADE']) > 400:
        linha['IDADE'] = linha['IDADE'] - 400
    return linha

In [11]:
df_s = df_s.apply(lambda x:ajustar_idade(x),axis=1)

In [12]:
df_s[df_s['IDADE']==6]

Unnamed: 0,DTOBITO,DTNASC,IDADE,SEXO,RACACOR,ESTCIV,ESC,ESC2010,OCUP,CODMUNRES,CIRCOBITO
14141,26062022.0,6092015.0,6.0,1.0,4.0,1.0,,,999991.0,350320.0,2.0


In [13]:
df_s.drop(df_s[df_s['IDADE']==6].index,inplace=True)

In [14]:
df_s.drop(columns='DTNASC',inplace=True)
df_s.drop(columns='CIRCOBITO',inplace=True)

In [15]:
df_s['OCUP'] = df_s['OCUP'].fillna(998999)
df_s.fillna(0,inplace=True)

In [16]:
df_s = df_s[:].astype('int64')

In [17]:
# plt.hist(df_s['IDADE'],bins=89)
# print(np.histogram(df_s['IDADE'],bins=89))
# plt.show()

In [18]:
linha_auxiliar = df_s.loc[5]
print(linha_auxiliar)

DTOBITO      23012022
IDADE              30
SEXO                1
RACACOR             4
ESTCIV              5
ESC                 4
ESC2010             3
OCUP           782305
CODMUNRES      280480
Name: 5, dtype: int64


In [19]:
df_s['CODMUNRES'].value_counts()

330455    269
530010    247
310620    199
355030    159
230440    150
         ... 
290330      1
431280      1
330395      1
430410      1
130020      1
Name: CODMUNRES, Length: 3452, dtype: int64

In [20]:
df_municipios = pd.read_csv('Dados_SUS/CODMUN/RELATORIO_DTB_BRASIL_MUNICIPIO.csv',
                usecols=['UF','Código Município Completo','Nome_Município'],
                delimiter=',')
df_municipios.rename({'Código Município Completo':'CODMUNRES'},axis='columns',inplace=True)

In [21]:
linha_auxiliar_municipios = df_municipios.loc[0]

In [22]:
def remover_ultimo_digito(linha):
    linha_str = str(linha['CODMUNRES'])
    if len(linha_str) == 7:
        linha['CODMUNRES'] = int(linha_str[:6])
    return linha

In [23]:
df_municipios = df_municipios.apply(lambda x:remover_ultimo_digito(x),axis=1)

In [24]:
df_merged = pd.merge(df_s,df_municipios,how='left',on='CODMUNRES')

In [25]:
df_merged.dropna(inplace=True)

In [26]:
coluna_municipios = df_merged['Nome_Município'] 

In [27]:
df_merged.drop(columns='Nome_Município',inplace=True)

In [28]:
df_merged['UF'] = df_merged['UF'].astype('int64')

In [29]:
display(df_merged)
display(coluna_municipios)

Unnamed: 0,DTOBITO,IDADE,SEXO,RACACOR,ESTCIV,ESC,ESC2010,OCUP,CODMUNRES,UF
0,23012022,30,1,4,5,4,3,782305,280480,28
1,27012022,53,1,2,2,2,1,715210,280330,28
2,8022022,52,1,1,1,2,1,512105,430600,43
3,16022022,51,2,1,1,2,1,517310,430600,43
4,23042022,42,1,4,4,9,9,252105,351620,35
...,...,...,...,...,...,...,...,...,...,...
15651,6052022,29,1,1,4,4,2,715615,420240,42
15652,24042022,15,1,5,0,1,0,998999,130140,13
15653,1052022,18,2,5,0,1,0,998999,130140,13
15654,24122022,26,1,5,0,3,2,998999,130230,13


0        Nossa Senhora do Socorro
1                      Japaratuba
2                      Crissiumal
3                      Crissiumal
4                          Franca
                   ...           
15651                    Blumenau
15652                    Eirunepé
15653                    Eirunepé
15654                       Jutaí
15655            Atalaia do Norte
Name: Nome_Município, Length: 15647, dtype: object