## Imports

In [21]:
import sys
from pathlib import Path
# Parent of the current working directory.
PROJECT_DIR = Path.cwd().parent
# Add that directory to the import search path -- presumably so that the
# `ppm` package imported further down can be found (confirm project layout).
sys.path.append(str(PROJECT_DIR))

# basics
import pandas as pd
import geopandas as gpd

# utils
import os
from ppm.utils.readers import (
    reader,
    unzip_file
)

## Parameters

In [37]:
# State abbreviation used to build the names of the input folders and files.
uf = 'PB'

# Root of the data tree and the two layers read from / written to.
path_root_data = os.path.join('..', 'data')  # data or oos
path_raw = os.path.join(path_root_data, '01_raw')
path_intermediate = os.path.join(path_root_data, '02_intermediate')

# Census inputs: the extracted archive folder and the CSV folder inside it.
path_census_data = os.path.join(path_raw, uf + '_20171016')
path_census_data_csv = os.path.join(
    path_census_data,
    uf,
    'Base informaçoes setores2010 universo ' + uf,
    'CSV',
)

# Shapefile input: "<uf>_Setores_2021/<uf>_Setores_2021.shp" under the raw layer.
_shp_stem = uf + '_Setores_2021'
file_path_data_shp = os.path.join(path_raw, _shp_stem, _shp_stem + '.shp')

# Outputs written to the intermediate layer.
file_path_shp_processed = os.path.join(path_intermediate, "data_shp_processed")
file_path_census_processed = os.path.join(
    path_intermediate, "data_census_processed.csv"
)

## Read data

In [23]:
# unzip
# Extract the census archive only when the extracted folder does not exist yet.
if not os.path.exists(path_census_data):
    unzip_file(
        path_census_data + '.zip',
        path_census_data
    )

# Map each file's name (the text before its first ".") to its full path.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order of the tables -- and of the columns merged from them later --
# the same on every run and every machine.
file_path_census_data = {
    file_name.split(".")[0]: os.path.join(path_census_data_csv, file_name)
    for file_name in sorted(os.listdir(path_census_data_csv))
}

In [24]:
# Read every census file with `reader`, keeping the same keys as the path map.
data_census = {}
for table_name, table_path in file_path_census_data.items():
    data_census[table_name] = reader(table_path)

In [25]:
# Load the sector shapefile, then switch its column names to lower case.
data_shp = gpd.read_file(file_path_data_shp)
lowercase_columns = data_shp.columns.str.lower()
data_shp.columns = lowercase_columns

## Concatenate census data

In [26]:
# Merge all census tables into a single frame keyed by "cod_setor".
#
# For every table the first column is renamed to "cod_setor" (the join key)
# and every other column gets the table name appended, so columns coming from
# different files stay distinguishable after the merge.
if not data_census:
    raise ValueError("data_census is empty: no census tables to merge")

data_census_merged = None
for name, content in data_census.items():
    # Work on a copy so the frames stored in `data_census` are left untouched;
    # renaming them in place would append the tag again on every re-run of
    # this cell.
    table = content.copy()
    lowered = table.columns.str.lower()
    table.columns = ["cod_setor"] + [
        '{}_{}'.format(column, name) for column in lowered[1:]
    ]

    if data_census_merged is None:
        # The first table is the starting point of the merge.
        data_census_merged = table
        continue

    # Non-key columns were tagged with the table name above, so overlapping
    # names are not expected here; the suffixes are kept as in the original.
    suffix = "_{}".format(name).lower()
    data_census_merged = data_census_merged.merge(
        table,
        on=["cod_setor"],
        suffixes=(suffix, suffix)
    )
    data_census_merged.columns = data_census_merged.columns.str.lower()

# The name tags may contain upper-case letters; normalise once more at the end
# (this also covers the case of a single table, where no merge ran).
data_census_merged.columns = data_census_merged.columns.str.lower()

In [27]:
# Remove every column whose name contains "unnamed".
unnamed_columns = data_census_merged.columns[
    data_census_merged.columns.str.contains('unnamed')
]
data_census_merged.drop(columns=unnamed_columns, inplace=True)

# Keep only the first occurrence of each column name.
is_first_occurrence = ~data_census_merged.columns.duplicated()
data_census_merged = data_census_merged.loc[:, is_first_occurrence]

# Rename the key column to "cd_setor".
data_census_merged.rename(columns={"cod_setor": "cd_setor"}, inplace=True)

## Filter data

In [28]:
# filter_joao_pessoa
# Keep only the rows whose sector code starts with '2507507'.
# A prefix match replaces the former substring match, which would also keep
# any sector whose code merely contains these digits further along.
# NOTE(review): assumes the sector code begins with the municipality code --
# confirm against the IBGE code layout.
sector_codes = data_census_merged.cd_setor.astype(str)
data_census_merged = data_census_merged[
    sector_codes.str.startswith('2507507')
].copy()

In [29]:
# Normalise the key column name BEFORE it is used: the filter below reads
# `cd_setor`, so renaming afterwards (as before) could never help a frame
# that only had `cod_setor`. This is a no-op when the column is already
# called `cd_setor`.
data_shp.rename(
    columns = {
        "cod_setor": "cd_setor"
    }, inplace = True
)

# filter_joao_pessoa
# Keep only the rows whose sector code starts with '2507507'.
# A prefix match replaces the former substring match, which would also keep
# any sector whose code merely contains these digits further along.
# NOTE(review): assumes the sector code begins with the municipality code --
# confirm against the IBGE code layout.
shp_sector_codes = data_shp.cd_setor.astype(str)
data_shp = data_shp[
    shp_sector_codes.str.startswith('2507507')
].copy()

## Save data

In [38]:
# Write the filtered census table to CSV, leaving out the index column.
data_census_merged.to_csv(file_path_census_processed, index=False)

In [39]:
# Write the filtered sector geometries using the GeoPackage ('GPKG') driver.
data_shp.to_file(file_path_shp_processed, driver='GPKG')