In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import zipfile

In [None]:
# Resolve the input and output locations: fall back to the repository-relative
# defaults unless a `snakemake` object is defined, in which case its first
# input and first output are used
if "snakemake" not in locals():
    input_path = "../../../../resources/paris/census/base-ic-evol-struct-pop-2019_csv.zip"
    output_path = "../../../../results/paris/census/iris.parquet"

else:
    input_path, output_path = snakemake.input[0], snakemake.output[0]

In [None]:
# Load census data
# The semicolon-separated CSV is read straight out of the zip archive; the IRIS
# and COM columns are forced to strings so the codes are kept as text
with zipfile.ZipFile(input_path) as archive, \
        archive.open("base-ic-evol-struct-pop-2019.CSV") as f:
    df_census = pd.read_csv(
        f, sep = ";",
        dtype = dict.fromkeys(["IRIS", "COM"], str)
    )

In [None]:
# Filter for Île-de-France region
# A row is kept when the first two characters of its COM code are in the list
in_region = df_census["COM"].str.slice(0, 2).isin([
    "75", "77", "78", "91", "92", "93", "94", "95"
])

df_census = df_census.loc[in_region]

In [None]:
# Restructure data frame
# Wide to long: the IRIS and COM codes identify each row, every selected P19_*
# column becomes one row per zone with its name in "variable" and its value
# in "weight"
df_census = df_census.melt(
    id_vars = ["IRIS", "COM"],
    value_vars = [
        "P19_H0014", "P19_H1529", "P19_H3044", "P19_H4559", "P19_H6074", "P19_H75P",
        "P19_F0014", "P19_F1529", "P19_F3044", "P19_F4559", "P19_F6074", "P19_F75P",
    ],
    value_name = "weight"
)

In [None]:
# Formatting
# Switch to the output column names and convert both identifiers to integers
df_census = df_census.rename(columns = {
    "IRIS": "iris_id", "COM": "municipality_id"
}).astype({
    "iris_id": int, "municipality_id": int
})

In [None]:
# Add sex attribute: variables starting with "P19_H" are coded 1, all others 2.
# Vectorised string test instead of a per-row Python lambda; the dtype is
# pinned to int64 so the column type does not depend on platform or row count.
df_census["sex"] = np.where(
    df_census["variable"].str.startswith("P19_H"), 1, 2
).astype("int64")

# Add age class attribute: the two characters that follow the sex letter in the
# variable name, as an integer (e.g. "P19_H1529" -> 15, "P19_F75P" -> 75),
# i.e. the first number of the age range encoded in the name
df_census["age_class"] = df_census["variable"].str[5:7].astype("int64")

# The variable name is fully decoded into sex and age_class at this point
df_census = df_census.drop(columns = ["variable"])

In [None]:
# Sum the weights per sex and show the numeric codes as M / F labels
df_plot = (
    df_census
    .groupby("sex", as_index = False)["weight"].sum()
    .assign(sex = lambda totals: totals["sex"].replace({ 1: "M", 2: "F" }))
)

px.bar(df_plot, x = "sex", y = "weight", title = "Population by sex")

In [None]:
# Sum the weights per age class; the class is converted to a string before
# being used as the x axis of the chart
df_plot = (
    df_census
    .groupby("age_class", as_index = False)["weight"].sum()
    .assign(age_class = lambda totals: totals["age_class"].astype(str))
)

px.bar(df_plot, x = "age_class", y = "weight", title = "Population by age class")

In [None]:
# Output
# Write the long-format table to the configured parquet file
df_census.to_parquet(path = output_path)