In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
# Resolve the input and output locations. When no `snakemake` object is present
# in the namespace (e.g. the notebook is run by hand), fall back to the
# repository-relative default paths; otherwise use the first declared
# input and output of that object.
if "snakemake" not in locals():
    input_path = "../../../../resources/brussels/census/TF_SOC_POP_STRUCT_2023.zip"
    output_path = "../../../../results/brussels/census/municipalities.parquet"
else:
    input_path, output_path = snakemake.input[0], snakemake.output[0]

In [None]:
# Load census data (pipe-separated). Only the four columns consumed by the rest
# of the notebook are parsed, so a change in the source schema fails here with
# a clear error instead of surfacing later during aggregation.
# CD_REFNIS is read as a string; it is cast to an integer further down.
df_census = pd.read_csv(
    input_path,
    sep = "|",
    usecols = ["CD_REFNIS", "CD_SEX", "CD_AGE", "MS_POPULATION"],
    dtype = { "CD_REFNIS": "str" },
)

In [None]:
# Clean structure: give the source columns their internal names, then collapse
# every other attribute by summing the weights per municipality, sex and age
df_census = (
    df_census
    .rename(columns = {
        "CD_REFNIS": "municipality_id",
        "CD_SEX": "sex",
        "CD_AGE": "age",
        "MS_POPULATION": "weight",
    })
    .groupby(["municipality_id", "sex", "age"])["weight"]
    .sum()
    .reset_index()
)

In [None]:
# Formatting to make it integer-based
sex_codes = { "M": 1, "F": 2 }

# Translate with an element-wise lookup rather than Series.replace: replacing
# strings by integers relies on replace() silently downcasting the object
# column, which pandas 2.2 reports with a FutureWarning. Values without a code
# pass through unchanged (so re-running this cell is harmless) and any
# unexpected label still fails loudly in astype(int).
df_census["sex"] = df_census["sex"].map(lambda code: sex_codes.get(code, code)).astype(int)

# The municipality identifier is still textual at this point; store it as an integer
df_census["municipality_id"] = df_census["municipality_id"].astype(int)

In [None]:
# Plot age distribution: total weight for each age value, drawn as a line
df_plot = df_census.groupby("age", as_index = False)["weight"].sum()

px.line(
    data_frame = df_plot,
    x = "age",
    y = "weight",
    title = "Population by age",
)

In [None]:
# Plot sex distribution: total weight per sex code, with the integer codes
# swapped for readable labels before plotting
sex_labels = { 1: "Male", 2: "Female" }

df_plot = df_census.groupby("sex", as_index = False)["weight"].sum()
df_plot["sex"] = df_plot["sex"].replace(sex_labels)

px.bar(
    data_frame = df_plot,
    x = "sex",
    y = "weight",
    title = "Population by sex",
)

In [None]:
# Establish age classes (same as airport data)
# Each class is labelled by its lower bound: ages below the first bound fall
# into class 0, ages from the last bound upwards into the last class.
age_bounds = np.array([15, 25, 35, 45, 55, 65])
class_lower_bounds = np.concatenate(([0], age_bounds))

# digitize yields 0 for ages below the first bound and i once bound i-1 is reached,
# which is exactly the position of the matching label in class_lower_bounds
indices = np.digitize(df_census["age"], age_bounds)
df_census["age_class"] = class_lower_bounds[indices]

In [None]:
# Plot age class: total weight per class; the class labels are turned into
# strings before plotting (presumably so the bars are treated as categories)
df_plot = (
    df_census
    .groupby("age_class", as_index = False)["weight"]
    .sum()
    .astype({ "age_class": str })
)

px.bar(
    data_frame = df_plot,
    x = "age_class",
    y = "weight",
    title = "Population by age class",
)

In [None]:
# Output: write the prepared census table to the configured path as Parquet
df_census.to_parquet(path = output_path)