## **IMPORTS**

In [30]:
import pandas as pd

In [31]:
# Load the raw ';'-separated, UTF-8 encoded CSV with explicit column dtypes,
# then discard the "Unnamed: 0" column that is read along with the data
# (presumably an index written by an earlier export — TODO confirm).
column_types = {
    "year": "int",
    "state": "string",
    "month": "string",
    "number": "float",
    "date": "string",
}
brazil = pd.read_csv(
    "../data/timeSeriesBrazil_raw.csv",
    sep=";",
    encoding="utf-8",
    dtype=column_types,
).drop(columns=["Unnamed: 0"])

## **DATETIME TREATMENTS**

In [32]:
# Parse the "date" strings (format="mixed" infers the format per element) and
# store them back as MM/DD/YYYY text. The column therefore ends up holding
# strings again, not datetime64 values.
parsed_dates = pd.to_datetime(brazil["date"], format="mixed")
brazil["date"] = parsed_dates.dt.strftime("%m/%d/%Y")

## **DUPLICATED DATA**

In [33]:
# Show every row that repeats an earlier row across all columns; the first
# occurrence of each repeated row is not flagged.
repeated_rows = brazil.duplicated(keep="first")
brazil.loc[repeated_rows]

Unnamed: 0,year,state,month,number,date
3579,2013,Acre,Janeiro,0.000,01/01/2013
3580,2014,Acre,Janeiro,0.000,01/01/2014
3583,2017,Acre,Janeiro,0.000,01/01/2017
3598,2012,Acre,Fevereiro,0.000,01/01/2012
3599,2013,Acre,Fevereiro,0.000,01/01/2013
...,...,...,...,...,...
9775,2013,Sergipe,Dezembro,9.000,01/01/2013
9776,2014,Sergipe,Dezembro,9.000,01/01/2014
9778,2016,Sergipe,Dezembro,36.000,01/01/2016
9814,2013,Tocantins,Fevereiro,37.000,01/01/2013


In [34]:
# Remove fully repeated rows, keeping the LAST occurrence of each one (so the
# surviving rows carry the index labels of the final occurrences).
is_earlier_repeat = brazil.duplicated(keep="last")
brazil = brazil.loc[~is_earlier_repeat]

## **NAN VALUES**

In [35]:
# Missing-value report per column: "Abs" is the number of nulls, "Porcent" is
# that number divided by the row count (a fraction, not multiplied by 100).
null_counts = brazil.isnull().sum()
total_rows = brazil.shape[0]
pd.DataFrame(data={"Abs": null_counts, "Porcent": null_counts / total_rows})

Unnamed: 0,Abs,Porcent
year,0,0.0
state,0,0.0
month,0,0.0
number,0,0.0
date,0,0.0


## **CATEGORICAL DATA**

In [36]:
# TODO(review): one-hot encoding of "state" is currently disabled; enable it or remove this cell.
#pd.get_dummies(brazil, columns=["state"])

## **STATES DATA**

In [37]:
# Normalise abbreviated / unaccented state names to their full spelling.
#
# The previous version called `brazil.state.replace(..., inplace=True)` once
# per name. That mutates a Series obtained through attribute access, which is
# a chained in-place operation: recent pandas versions warn about it and,
# under Copy-on-Write, it leaves the DataFrame unchanged. Building the column
# once and rebinding `brazil` avoids relying on that behaviour and makes the
# cell safe to re-run.
#
# Only whole-value matches are replaced (e.g. exactly "Rio", not
# "Rio Grande do Sul"), the same as the original scalar-to-scalar calls.
# NOTE(review): "Piaui" is kept unaccented to preserve the existing output;
# confirm whether "Piauí" is the intended spelling.
state_name_fixes = {
    "Rio": "Rio de Janeiro",
    "Sao Paulo": "São Paulo",
    "Maranhao": "Maranhão",
    "Piau": "Piaui",
    "Ceara": "Ceará",
    "Paraiba": "Paraíba",
    "Rondonia": "Rondônia",
}
brazil = brazil.assign(state=brazil["state"].replace(state_name_fixes))

In [38]:
# List the distinct state names, ordered by how often each one occurs
# (value_counts sorts by descending frequency by default).
state_frequencies = brazil["state"].value_counts()
state_frequencies.index.tolist()

['Rio de Janeiro',
 'Mato Grosso',
 'Paraíba',
 'Minas Gerais',
 'São Paulo',
 'Amazonas',
 'Bahia',
 'Ceará',
 'Piaui',
 'Goias',
 'Maranhão',
 'Pará',
 'Tocantins',
 'Rondônia',
 'Santa Catarina',
 'Espirito Santo',
 'Roraima',
 'Pernambuco',
 'Amapa',
 'Acre',
 'Alagoas',
 'Sergipe',
 'Distrito Federal',
 'Mato Grosso do Sul',
 'Paraná',
 'Rio Grande do Norte',
 'Rio Grande do Sul']

## **SAVE DATASET**

In [39]:
# Persist the processed frame with pandas' default CSV settings, i.e.
# comma-separated and with the index written as the first (unnamed) column.
# NOTE(review): the raw file was read with sep=";" — confirm that downstream
# readers expect the comma-separated layout and the extra index column.
brazil.to_csv("../data/timeSeriesBrazil_process.csv")