In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
# Default locations, used when no `snakemake` object is available
# (e.g. when the notebook is run by hand)
input_path = "../../../../../resources/madrid/census/68542.csv"
output_path = "../../../../results/madrid/census/municipalities.parquet"

# Prefer the first declared input / output of the `snakemake` object
# whenever one is present in the namespace
if "snakemake" in locals():
    input_path, output_path = snakemake.input[0], snakemake.output[0]

In [None]:
# Read input data (tab-separated)
# "Total" is parsed further down with string operations (removal of "."
# characters), so it is forced to be read as text. Left to type inference, the
# column can come back numeric or with mixed types, e.g. "1.234" read as the
# float 1.234, which would break or silently corrupt that later step.
df_census = pd.read_csv(input_path, sep = "\t", dtype = { "Total": str })

In [None]:
# Keep only the rows whose period is 2023
df_census = df_census.loc[df_census["Periodo"].eq(2023)].copy()

In [None]:
# Filter out aggregated rows: keep entries for one of the two sexes, with an
# age label containing "año", and with non-empty province and municipality
is_single_sex = df_census["Sexo"].isin(["Mujeres", "Hombres"])
is_yearly_age = df_census["Edad"].str.contains("año")
has_province = df_census["Provincias"].str.len().gt(0)
has_municipality = df_census["Municipios"].str.len().gt(0)

df_census = df_census.loc[
    is_single_sex & is_yearly_age & has_province & has_municipality
].copy()

In [None]:
# Format municipality identifier: the text before the first space of the label
df_census["municipality_id"] = df_census["Municipios"].map(
    lambda label: label.partition(" ")[0]
)

# Only keep identifiers of exactly five characters, then store them as integers
has_five_chars = df_census["municipality_id"].str.len().eq(5)
df_census = df_census.loc[has_five_chars].copy()
df_census["municipality_id"] = df_census["municipality_id"].astype(int)

In [None]:
# Format age: the leading number of the age label
df_census["age"] = df_census["Edad"].str.split(" ").str[0].astype(int)

# Format sex: 1 for "Hombres", 2 otherwise
df_census["sex"] = df_census["Sexo"].eq("Hombres").map({ True: 1, False: 2 })

# Derive the integer weight from the total
# The "." characters (presumably thousands separators) are removed as literal
# text; regex = False is explicit because the default of `regex` differs
# between pandas versions and "." is a wildcard when read as a pattern.
# Missing totals count as zero.
df_census["weight"] = (
    df_census["Total"]
    .str.replace(".", "", regex = False)
    .astype(float)
    .fillna(0.0)
    .astype(int)
)

In [None]:
# Filter for Madrid (identifiers whose two-digit province part is 28)
# The identifiers are five-digit codes that were cast to integers above, which
# drops leading zeros. A textual prefix test on the integer would therefore
# also match four-digit values such as 2801 (originally "02801"), so the
# numeric province part is compared instead.
df_census = df_census[
    df_census["municipality_id"].astype(int) // 1000 == 28
]

In [None]:
# Arrange columns: retain only the prepared attributes, in their final order
output_columns = ["municipality_id", "sex", "age", "weight"]
df_census = df_census[output_columns].copy()

In [None]:
# Total weight per sex, with the codes shown as M (1) and F (2)
df_plot = df_census.groupby("sex", as_index = False)["weight"].sum()
df_plot["sex"] = df_plot["sex"].replace({ 1: "M", 2: "F" })

px.bar(df_plot, x = "sex", y = "weight", title = "Population by sex")

In [None]:
# Total weight per single year of age
df_plot = df_census.groupby("age", as_index = False)["weight"].sum()

px.bar(df_plot, x = "age", y = "weight", title = "Population by age")

In [None]:
# Define age classes (in this case like Brussels)
age_bounds = np.array([15, 25, 35, 45, 55, 65])

# Every age is labelled with the lower bound of its class; ages below the
# first bound fall into class 0
class_labels = np.insert(age_bounds, 0, 0)
indices = np.digitize(df_census["age"], age_bounds)
df_census["age_class"] = class_labels[indices]

In [None]:
# Total weight per age class
df_plot = df_census.groupby("age_class")["weight"].sum().reset_index()

# Class labels are converted to text (presumably so that the chart treats them
# as categories rather than as a numeric axis)
df_plot["age_class"] = df_plot["age_class"].astype(str)

# The title names the age classes so that this chart is not confused with the
# per-year "Population by age" chart
px.bar(
    df_plot, x = "age_class", y = "weight",
    title = "Population by age class"
)

In [None]:
# Output: write the prepared table as a Parquet file
df_census.to_parquet(path = output_path)