In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import plotly.express as px

In [None]:
# Resolve the input/output locations. Without a `snakemake` object in the
# namespace the notebook falls back to hard-coded paths relative to this file;
# otherwise the paths are taken from the `snakemake` object.
if "snakemake" not in locals():
    input_seed_path = "../../../results/belgium/demand/initial_population.parquet"
    output_table_path = "../../../results/belgium/demand/weighted_population.parquet"

    marginal_paths = [
        "../../../results/belgium/marginals/municipalities.parquet",
        "../../../results/belgium/marginals/sectors.parquet",
        "../../../results/belgium/marginals/passengers.parquet",
        "../../../results/belgium/marginals/missing_locations.parquet"
    ]

else:
    input_seed_path = snakemake.input["seed"]
    output_table_path = snakemake.output[0]

    # Every named input whose key starts with "marginal" is one marginal table
    marginal_paths = [
        snakemake.input[key]
        for key in snakemake.input.keys()
        if key.startswith("marginal")
    ]

In [None]:
# Load the seed population sample from parquet. Later cells match its columns
# against the marginal tables and rescale its "weight" column in place.
df_population = pd.read_parquet(input_seed_path)

In [None]:
# For performance, pre-calculate the selectors and totals for the weighting process.
# Each entry of `constraints` is a tuple of
#   (positional row indices of the matching population rows, target weight total).
constraints = []

for path in marginal_paths:
    df_marginal = pd.read_parquet(path)
    print(df_marginal)

    marginal_columns = set(df_marginal.columns)
    population_columns = set(df_population.columns)

    # Every column of the marginal table must also exist in the population
    assert len(marginal_columns - population_columns) == 0

    # All shared columns except the weight itself identify a marginal cell
    attributes = list((marginal_columns & population_columns) - {"weight"})

    marginal_rows = df_marginal[attributes + ["weight"]].itertuples(index = False)

    for row in tqdm(marginal_rows, total = len(df_marginal)):
        # Start from an all-True mask so that a marginal without any attribute
        # columns selects the whole population
        masks = [np.ones((len(df_population),), dtype = bool)]

        for attribute, value in zip(attributes, row):
            masks.append(df_population[attribute] == value)

        population_selector = np.logical_and.reduce(masks)
        matching_rows = np.where(population_selector)[0]

        # A marginal cell without any matching population row cannot be fitted
        assert len(matching_rows) > 0

        # The last element of the row is the marginal's target weight
        constraints.append((matching_rows, row[-1]))

In [None]:
# Iteratively rescale the population weights: for every constraint, the weights
# of the selected rows are multiplied by (target total / current total). The
# sweep over all constraints is repeated until every factor applied in one
# sweep is within the tolerance of 1, or the iteration limit is reached.
maximum_iterations = 1000
convergence_tolerance = 1e-2

weight_column = df_population.columns.get_loc("weight")

# Scaling writes fractional values into the weight column. Convert a non-float
# column up front instead of relying on implicit upcasting during the `.iloc`
# assignment, which newer pandas versions warn about or reject.
if not pd.api.types.is_float_dtype(df_population["weight"]):
    df_population["weight"] = df_population["weight"].astype(float)

# One array of applied factors per iteration, kept for the progress report
overall_factors = []

converged = False
for iteration in range(maximum_iterations):
    print("Iteration", iteration + 1)
    iteration_factors = []

    for selector, target_value in tqdm(constraints):
        current_value = df_population.iloc[selector, weight_column].sum()

        # Rows whose weights sum to zero cannot be rescaled towards a target;
        # such constraints are skipped and do not take part in the convergence check
        if current_value > 0:
            factor = target_value / current_value

            df_population.iloc[selector, weight_column] *= factor
            iteration_factors.append(factor)

    # Without any applied factor the statistics below are undefined (numpy
    # raises an opaque error on empty reductions), so fail with a clear message
    if len(iteration_factors) == 0:
        raise ValueError(
            "No constraint could be applied: there are no constraints or all "
            "selected weight totals are zero")

    iteration_factors = np.array(iteration_factors)
    overall_factors.append(iteration_factors)

    print("Factors:", len(iteration_factors),
        "mean:", np.mean(iteration_factors),
        "min:", np.min(iteration_factors),
        "max:", np.max(iteration_factors))

    # Converged once every factor of this sweep lies within the tolerance of 1
    within_upper_bound = np.max(iteration_factors) - 1 < convergence_tolerance
    within_lower_bound = np.min(iteration_factors) > 1 - convergence_tolerance

    if within_upper_bound and within_lower_bound:
        converged = True
        break

assert converged, (
    "Weights did not converge within {} iterations (tolerance {})".format(
        maximum_iterations, convergence_tolerance))

In [None]:
# Summarise the scaling factors of every iteration (mean, min and max) into
# one table row per iteration, then plot the three series over the iterations.
factor_statistics = {"mean": np.mean, "min": np.min, "max": np.max}

progress_columns = {"iteration": np.arange(len(overall_factors))}

for statistic_name, reducer in factor_statistics.items():
    progress_columns[statistic_name] = [reducer(factors) for factors in overall_factors]

df_progress = pd.DataFrame(progress_columns)

figure = px.line(df_progress, x = "iteration", y = list(factor_statistics))

# Fix the visible y-range to [0.5, 1.5]; as the last expression of the cell,
# the returned figure is what the notebook displays
figure.update_layout(yaxis = {"range": [0.5, 1.5]})

In [None]:
# Output
# Write the reweighted population and, next to it, the per-iteration progress
# statistics. The progress path is derived by inserting ".progress" before a
# trailing ".parquet" only. A plain str.replace would also rewrite ".parquet"
# inside directory names, and would return the unchanged table path when the
# suffix is absent, so the progress table would overwrite the population file.
parquet_suffix = ".parquet"
table_path = str(output_table_path)

if table_path.endswith(parquet_suffix):
    progress_path = table_path[:-len(parquet_suffix)] + ".progress" + parquet_suffix
else:
    progress_path = table_path + ".progress" + parquet_suffix

df_population.to_parquet(output_table_path)
df_progress.to_parquet(progress_path)