In [1]:
import pandas as pd
import numpy as np

In [2]:
# Resolve the input/output locations. When no `snakemake` object is present
# in the namespace (e.g. the notebook is run by hand), fall back to the
# repository-relative defaults; otherwise take the first declared input and
# output from that object.
if "snakemake" not in locals():
    input_path = "../../../../resources/brussels/census/OPENDATA_SECTOREN_2023.zip"
    output_path = "../../../../results/brussels/census/sectors.parquet"
else:
    input_path = snakemake.input[0]
    output_path = snakemake.output[0]

In [3]:
# Read the pipe-delimited census table from input_path. read_csv's default
# compression = "infer" selects the decompression from the path's extension.
df_census = pd.read_csv(
    input_path,
    sep = "|",
)

In [4]:
# Clean structure: switch to the column names used in the rest of the
# notebook, then split the table into the (municipality_id, sector_id) pairs
# and the per-row sector weights.
column_names = {
    "CD_REFNIS": "municipality_id",
    "CD_SECTOR": "sector_id",
    "TOTAL": "weight",
}
df_census = df_census.rename(columns = column_names)

# Explicit copy so the pairs are independent of the frame reassigned below.
df_codes = df_census.loc[:, ["municipality_id", "sector_id"]].copy()
df_census = df_census.loc[:, ["sector_id", "weight"]]

In [5]:
# Aggregate: sum the weights of all rows sharing a sector_id, keeping
# "sector_id" as a regular column next to the summed "weight".
df_census = df_census.groupby("sector_id", as_index = False)["weight"].sum()

In [6]:
# Add municipality column.
# df_codes holds one (municipality_id, sector_id) row per raw input row. If a
# sector_id appears several times there, merging it as-is would fan the
# aggregated rows back out and repeat each summed weight. De-duplicate the
# pairs first so every sector is matched once per distinct municipality.
df_census = pd.merge(
    df_census,
    df_codes.drop_duplicates(),
    on = "sector_id",
    how = "left",
)

In [7]:
# Filter ZZZZ out: drop every row whose sector_id ends with "ZZZZ"
# (presumably placeholder / unknown sectors -- confirm against the data
# provider's documentation).
ends_with_zzzz = df_census["sector_id"].str.endswith("ZZZZ")
df_census = df_census[~ends_with_zzzz]

In [8]:
# Integerize: attach a 0-based positional index (following the current row
# order) and cast the municipality code to int.
# `assign` returns a new frame instead of writing columns into a frame that
# came out of a boolean filter, which is what makes pandas emit
# SettingWithCopyWarning. Values, column order and the row index are the
# same as with in-place column assignment.
df_census = df_census.assign(
    sector_index = np.arange(len(df_census)),
    municipality_id = df_census["municipality_id"].astype(int),
)

In [9]:
# Output: write the four result columns, in this order, to output_path as
# a Parquet file.
df_census.loc[
    :, ["municipality_id", "sector_id", "sector_index", "weight"]
].to_parquet(output_path)