Import dependencies and files from directory

In [117]:
import osmnx as ox
import networkx as nx

import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
import numpy as np

from shapely.geometry import LineString
from shapely.geometry import Point
from shapely.geometry import Polygon, MultiPolygon

from tqdm import tqdm  # Import tqdm for progress bar
from tqdm.auto import tqdm
from collections import defaultdict

from sklearn.neighbors import KDTree

import threading
from queue import Queue

import json
import re

from concurrent.futures import ThreadPoolExecutor
import os


Import square data and residential units

In [118]:
#import CBS square statistics and set to epsg 4326
squares = gpd.read_file("./root/Inputs/cbs100x100.gpkg")
squares = squares.to_crs(epsg=4326)

#import residential units from the BAG and set to epsg 4326
residential_units = gpd.read_file("./root/Inputs/residential_units.gpkg")
residential_units = residential_units.to_crs(epsg=4326)

In [119]:
#import buildings from the BAG and set to epsg 28992
buildings = gpd.read_file("./root/Inputs/buildings.gpkg") 
buildings = buildings.to_crs(epsg=4326)

Select only residential units from year 2022 and earlier with function 'residential'

In [120]:
residential_units = residential_units[residential_units['gebruiksdoel'].str.contains('woonfunctie')]
residential_units = residential_units[residential_units['pandstatus'] == 'Pand in gebruik']
residential_units = residential_units[residential_units['bouwjaar'] < 2023]
residential_units['pop'] = 0

Preprocessing of squares for error handling

Calculate number of buildings per square based on an overlay with the residential units and delete squares without residential units

In [121]:
def count_residential_units_in_squares(residential_units, squares):

    # Create a copy of the squares GeoDataFrame
    result = squares.copy()
    
    # Initialize the count column
    result['residential_unit_count'] = 0
    
    # Spatial join between points and polygons
    joined = gpd.sjoin(residential_units, squares, how='left', predicate='within')
    
    # Count points per polygon
    if not joined.empty:
        point_counts = joined.groupby(joined.index_right).size()
        result.loc[point_counts.index, 'residential_unit_count'] = point_counts
    
    return result

#apply function
squares = count_residential_units_in_squares(residential_units, squares)

#drop squares without registered residential units in them
squares_with_residential_units = squares[(squares['residential_unit_count'] > 0)]

#report amount and percentage dropped
total_squares = len(squares)
remaining_squares = len(squares_with_residential_units)
dropped_squares = total_squares - remaining_squares

# Calculate and print the results
print(f"Dropped {dropped_squares} squares")
print(f"This is {(dropped_squares/total_squares*100):.2f}% of the total")

Dropped 350 squares
This is 3.55% of the total


Clean up data

In [122]:
#drop unnecessary columns
columns_to_keep = ['aantal_inwoners', 'aantal_inwoners_0_tot_15_jaar', 'aantal_inwoners_65_jaar_en_ouder', 'aantal_eenpersoonshuishoudens', 'residential_unit_count', 'geometry']  # Add any other columns you want to keep
squares_with_residential_units = squares_with_residential_units[columns_to_keep]

columns_to_keep = ['identificatie', 'pand_identificatie', 'pop', 'geometry']  # Add any other columns you want to keep
residential_units = residential_units[columns_to_keep]


Categorize squares into groups for processing

Group squares based on average household size

In [123]:
#set negative values to 0
squares_with_residential_units.loc[squares_with_residential_units['aantal_inwoners'] <= 0, 'aantal_inwoners'] = 0

#calculate average household size for every square
squares_with_residential_units['pop_building'] = squares_with_residential_units['aantal_inwoners']/squares_with_residential_units['residential_unit_count']

squares_no_population = squares_with_residential_units[squares_with_residential_units['aantal_inwoners'] == 0]
squares_low_average = squares_with_residential_units[
    (squares_with_residential_units['pop_building'] > 0) & 
    (squares_with_residential_units['pop_building'] < 1)
    ].copy()
squares_high_average = squares_with_residential_units[squares_with_residential_units['pop_building'] > 6]
squares_normal = squares_with_residential_units[
   (squares_with_residential_units['aantal_inwoners'] != 0) & 
   (squares_with_residential_units['pop_building'] >= 1) & 
   (squares_with_residential_units['pop_building'] <= 6)
].copy()

print(f"Number of squares with expected values: {len(squares_normal)}")
print(f"This is {len(squares_normal)/len(squares_with_residential_units)*100:.2f} % of the total")

print(f"Number of squares with values lower than expected: {len(squares_low_average)}")
print(f"This is {len(squares_low_average)/len(squares_with_residential_units)*100:.2f} % of the total")

print(f"Number of squares with values higher than expected: {len(squares_high_average)}")
print(f"This is {len(squares_high_average)/len(squares_with_residential_units)*100:.2f} % of the total")

#squares_no_pop = squares_with_residential_units[squares_with_residential_units['aantal_inwoners'] <= 0].shape[0]
print(f"Number of squares with no stored population: {len(squares_no_population)}")
print(f"This is {len(squares_no_population)/len(squares_with_residential_units)*100:.2f} % of the total")

Number of squares with expected values: 9254
This is 97.19 % of the total
Number of squares with values lower than expected: 90
This is 0.95 % of the total
Number of squares with values higher than expected: 113
This is 1.19 % of the total
Number of squares with no stored population: 65
This is 0.68 % of the total


Check within the squares with expected values whether it is possible to take single households into account

In [124]:
#set negative values to zero
squares_normal.loc[squares_normal['aantal_eenpersoonshuishoudens'] < 0, 'aantal_eenpersoonshuishoudens'] = 0

#calculate average household size after deducting single households
squares_normal['remaining_pop'] = squares_normal['aantal_inwoners']-squares_normal['aantal_eenpersoonshuishoudens']
squares_normal['remaining_building'] = squares_normal['residential_unit_count']-squares_normal['aantal_eenpersoonshuishoudens']
squares_normal['householdsize_remaining'] = squares_normal['remaining_pop']/squares_normal['remaining_building']

# Create bins and labels
bins = [0, 1, 2, 6, 10, float('inf')]
labels = ['0-1', '1-2', '2-6', '6-10', '10+']

# Get value counts with bins
counts = pd.cut(squares_normal['householdsize_remaining'], 
                bins=bins, 
                labels=labels, 
                include_lowest=True).value_counts()

# Calculate percentages
percentages = counts / len(squares_normal) * 100

# Combine counts and percentages in a DataFrame for nice display
summary = pd.DataFrame({
    'Count': counts,
    'Percentage': percentages.round(2)
}).sort_index()

print("Distribution of household sizes:")
print(summary)
print("\nTotal squares:", len(squares_normal))

#For values between 1 and 2 it is not possible to assign two persons to the remaining households, therefore 1 person will be assigned. After which the remaining population will be allocated randomly.
#For values below 1 and above 6 random distribution will be applied due to high chance of error

Distribution of household sizes:
                         Count  Percentage
householdsize_remaining                   
0-1                         48        0.52
1-2                        847        9.15
2-6                       7875       85.10
6-10                       205        2.22
10+                        134        1.45

Total squares: 9254


Group squares for processing (informed distribution, random distribution, static number)

In [125]:
squares_informed = squares_normal[
   (squares_normal['householdsize_remaining'] >= 1) & 
   (squares_normal['householdsize_remaining'] <= 6)
].copy()

squares_random = squares_normal[
   (squares_normal['householdsize_remaining'] < 1) & 
   (squares_normal['householdsize_remaining'] > 6)
].copy()

squares_random = pd.concat([squares_random, squares_low_average], ignore_index=True)
squares_random = pd.concat([squares_random, squares_high_average], ignore_index=True)

squares_static = squares_no_population

print(f"Number of squares with informed distribution (1-6 persons per building): {len(squares_normal):}")
print(f"Number of squares with randomly distributed population: {len(squares_random):,}")
print(f"Number of squares with static population (set number): {len(squares_static):,}")

total_population_informed = squares_informed['aantal_inwoners'].sum()
total_population_random = squares_random['aantal_inwoners'].sum()
total_population_static = (squares_static['residential_unit_count'].sum())*2

total_population = total_population_informed + total_population_random + total_population_static

print(f"Percentage of population in informed distribution: {(total_population_informed/total_population)*100:.2f}%")
print(f"Percentage of population in random distribution: {(total_population_random/total_population)*100:.2f}%")
print(f"Percentage of population in static distribution: {(total_population_static/total_population)*100:.2f}%")



Number of squares with informed distribution (1-6 persons per building): 9254
Number of squares with randomly distributed population: 203
Number of squares with static population (set number): 65
Percentage of population in informed distribution: 98.18%
Percentage of population in random distribution: 1.58%
Percentage of population in static distribution: 0.24%


Load buildings with paths

In [None]:
# load split files

import pandas as pd
import ast

file_name = "./root/Outputs/building_shortest_path_to_cluster.xlsx"
buildings_path = pd.read_excel(file_name)

# Convert 'shortest_path' column from string to list
buildings_path['shortest_path'] = buildings_path['shortest_path'].apply(ast.literal_eval)
# Keep only 'building_id' and 'shortest_path'
buildings_path = buildings_path[['building_id', 'shortest_path']]

print(buildings_path)

            building_id                                      shortest_path
0       393100000000637                                                 []
1       393100000000409                                                 []
2       394100000215377                                                 []
3       393100000003321                                                 []
4       363100012246859  [5412076069, 46520421, 46518243, 949635614, 46...
...                 ...                                                ...
161243  437100000002928  [10167299809, 46039725, 4734319723, 4734319746...
161244  437100000005955  [569107921, 2297541079, 569107854, 46002289, 4...
161245  437100000001919  [46038892, 2297541071, 46040188, 46041472, 460...
161246  437100000005999  [2297541071, 46040188, 46041472, 46042097, 149...
161247  437100000001306  [569107921, 2297541079, 569107854, 46002289, 4...

[161248 rows x 2 columns]


Join buildings to add the geometry

In [127]:
# Ensure both columns have the same type
buildings['identificatie'] = buildings['identificatie'].astype(str).str.lstrip('0')
buildings_path['building_id'] = buildings_path['building_id'].astype(str)

# Merge to join geometry with shortest_path
buildings_with_geometry = buildings.merge(
    buildings_path, 
    left_on='identificatie', 
    right_on='building_id'
)

# Keep only 'identificatie', 'shortest_path', and 'geometry'
buildings_with_geometry = buildings_with_geometry[['identificatie', 'shortest_path', 'geometry']]

display(buildings_path)

Unnamed: 0,building_id,shortest_path
0,393100000000637,[]
1,393100000000409,[]
2,394100000215377,[]
3,393100000003321,[]
4,363100012246859,"[5412076069, 46520421, 46518243, 949635614, 46..."
...,...,...
161243,437100000002928,"[10167299809, 46039725, 4734319723, 4734319746..."
161244,437100000005955,"[569107921, 2297541079, 569107854, 46002289, 4..."
161245,437100000001919,"[46038892, 2297541071, 46040188, 46041472, 460..."
161246,437100000005999,"[2297541071, 46040188, 46041472, 46042097, 149..."


Delete buildings where the shortest path is more than 2500 m (not within walking distance)

In [128]:
north, south, east, west = 52.43, 52.28, 5.10, 4.74

#fetch network
cf = """
     ["area"!~"yes"]
     ["highway"]
     ["highway"!~"motor|proposed|construction|abandoned|platform|raceway"]
     ["foot"!~"no"]
     ["service"!~"private"]
     ["access"!~"private"]
     """

#fetch network
G = ox.graph_from_bbox(north, south, east, west, custom_filter=cf, network_type='walk', simplify= False, truncate_by_edge=True) 

In [129]:
# Function to calculate path length
def calculate_path_length(G, path):
    length = 0
    for u, v in zip(path[:-1], path[1:]):  # Iterate through pairs of consecutive nodes
        try:
            length += G[u][v][0]['length']  # Sum the lengths of the edges
        except KeyError:
            print(f"Edge ({u}, {v}) not found in the graph. Skipping.")
    return length

# Calculate distances for each path
buildings_with_geometry['path_length_meters'] = buildings_with_geometry['shortest_path'].apply(lambda path: calculate_path_length(G, path))

# Display the resulting dataframe
display(buildings_with_geometry)


Unnamed: 0,identificatie,shortest_path,geometry,path_length_meters
0,394100000214674,[],"POLYGON ((4.74280 52.38161, 4.74280 52.38161, ...",0.000
1,394100000214675,[],"POLYGON ((4.74287 52.38167, 4.74285 52.38165, ...",0.000
2,394100000214676,[],"POLYGON ((4.74286 52.38179, 4.74281 52.38175, ...",0.000
3,394100000214677,[],"POLYGON ((4.74292 52.38183, 4.74286 52.38179, ...",0.000
4,394100000214682,[],"POLYGON ((4.74300 52.38262, 4.74293 52.38266, ...",0.000
...,...,...,...,...
161243,437100000005999,"[2297541071, 46040188, 46041472, 46042097, 149...","POLYGON ((4.91538 52.29042, 4.91548 52.29036, ...",1670.735
161244,437100000007763,"[2297541071, 46040188, 46041472, 46042097, 149...","POLYGON ((4.91563 52.29049, 4.91562 52.29049, ...",1670.735
161245,437100000002928,"[10167299809, 46039725, 4734319723, 4734319746...","POLYGON ((4.91961 52.29066, 4.91966 52.29060, ...",1754.740
161246,437100000001306,"[569107921, 2297541079, 569107854, 46002289, 4...","POLYGON ((4.92225 52.28032, 4.92226 52.28025, ...",3012.374


In [130]:
#filter out buildings without shortest path or a path length of more than 2500 meters
buildings_with_geometry = buildings_with_geometry[
    (buildings_with_geometry['path_length_meters'] < 2500) &
    (buildings_with_geometry['shortest_path'].str.len() > 0)
]

Clean data

In [131]:
columns_to_keep = ['aantal_inwoners', 'aantal_inwoners_0_tot_15_jaar', 'aantal_inwoners_65_jaar_en_ouder', 'aantal_eenpersoonshuishoudens', 'residential_unit_count', 'geometry']  # Add any other columns you want to keep
squares_informed = squares_informed[columns_to_keep]
squares_random = squares_random[columns_to_keep]
squares_static = squares_static[columns_to_keep]

Allocate population to buildings

Function for random distribution

In [132]:
def run_random_population_distribution(squares_random, residential_units):
    # Initialize an empty GeoDataFrame to store population results
    residential_unit_population = gpd.GeoDataFrame()

    # Process each square with progress bar
    for i, squares_select in tqdm(squares_random.iterrows(), 
                                        desc=f"Processing squares", 
                                        total=len(squares_random)):
        squares_select = gpd.GeoDataFrame([squares_select], geometry='geometry', crs=squares_random.crs)

        # Get residential units within the current square
        residential_unit_in_square = gpd.overlay(residential_units, squares_select, how='intersection')

        # Reset index for proper sampling
        residential_unit_in_square = residential_unit_in_square.reset_index(drop=True)

        # Get total residents and age-specific counts for this square
        total_residents = squares_select.iloc[0]['aantal_inwoners']
        young_residents = squares_select.iloc[0]['aantal_inwoners_0_tot_15_jaar']
        old_residents = squares_select.iloc[0]['aantal_inwoners_65_jaar_en_ouder']

        # Initialize population for all residential units
        residential_unit_in_square['pop'] = 0
        residential_unit_in_square['young_pop'] = 0
        residential_unit_in_square['old_pop'] = 0
        residential_unit_in_square['remaining_capacity'] = 0

        # Distribute total population randomly among residential units
        remaining_residents = total_residents
        if remaining_residents > 0:
            while remaining_residents > 0:
                random_unit = residential_unit_in_square.sample(n=1)
                random_index = random_unit.index[0]
                residential_unit_in_square.loc[random_index, 'pop'] += 1
                residential_unit_in_square.loc[random_index, 'remaining_capacity'] += 1
                remaining_residents -= 1

        # Distribute young residents randomly among existing population
        remaining_young = young_residents
        if remaining_young > 0:
            while remaining_young > 0:
                eligible_units = residential_unit_in_square[
                    (residential_unit_in_square['remaining_capacity'] > residential_unit_in_square['young_pop']) &
                    (residential_unit_in_square['pop'] - residential_unit_in_square['young_pop'] > 1)
                ]
                if eligible_units.empty:
                    break
                random_unit = eligible_units.sample(n=1)
                random_index = random_unit.index[0]
                residential_unit_in_square.loc[random_index, 'young_pop'] += 1
                residential_unit_in_square.loc[random_index, 'remaining_capacity'] -= 1
                remaining_young -= 1

        # Distribute old residents randomly among existing population
        remaining_old = old_residents
        if remaining_old > 0:
            while remaining_old > 0:
                eligible_units = residential_unit_in_square[
                    residential_unit_in_square['remaining_capacity'] > residential_unit_in_square['old_pop']
                ]
                if eligible_units.empty:
                    break
                random_unit = eligible_units.sample(n=1)
                random_index = random_unit.index[0]
                residential_unit_in_square.loc[random_index, 'old_pop'] += 1
                residential_unit_in_square.loc[random_index, 'remaining_capacity'] -= 1
                remaining_old -= 1

        # Append the results for this square to the main GeoDataFrame
        residential_unit_population = pd.concat([residential_unit_population, residential_unit_in_square], ignore_index=True)

    # Aggregate results
    residential_unit_population_random = residential_unit_population[
        ['identificatie', 'geometry', 'pop', 'young_pop', 'old_pop']
    ]

    return residential_unit_population_random


Function for static distribution

In [133]:
def run_static_population_distribution(squares_static, residential_units):
    # Overlay operation for all squares
    residential_unit_in_squares = gpd.overlay(residential_units, squares_static, how='intersection')
    
    # Assign static population of 2 to each residential unit
    residential_unit_in_squares['pop_static'] = 2
    
    # Return only the residential units with static population
    return residential_unit_in_squares[['identificatie', 'geometry', 'pop_static']]


Function for informed distribution

In [134]:
# Constants
EPSG_CODE = 4326  # Default coordinate reference system

def prepare_spatial_index(thread_args):
    """Pre-process spatial data for each thread to reduce GIL contention"""
    squares_batch, residential_units, thread_number = thread_args

    # Create spatial index for the batch area
    batch_bounds = gpd.GeoDataFrame([square for square in squares_batch], 
                                  geometry='geometry').unary_union.bounds

    # Filter verblijfsobject that intersects with batch bounds
    mask = residential_units.bounds.apply(lambda row: (
        row.minx <= batch_bounds[2] and 
        row.maxx >= batch_bounds[0] and 
        row.miny <= batch_bounds[3] and 
        row.maxy >= batch_bounds[1]
    ), axis=1)
    batch_residential_units = residential_units[mask].copy()

    return squares_batch, batch_residential_units, thread_number

def process_squares_batch(thread_args):
    """Process a batch of squares with pre-filtered spatial data"""
    squares_batch, batch_residential_units, thread_number = thread_args
    residential_unit_population = gpd.GeoDataFrame()

    for square in tqdm(squares_batch, 
                      desc=f'Thread {thread_number}', 
                      position=thread_number):
        squares_select = gpd.GeoDataFrame([square], geometry='geometry')

        if squares_select.crs is None:
            squares_select.set_crs(epsg=EPSG_CODE, inplace=True)
        elif squares_select.crs != f'EPSG:{EPSG_CODE}':
            squares_select = squares_select.to_crs(epsg=EPSG_CODE)

        if batch_residential_units.crs is None:
            batch_residential_units.set_crs(epsg=EPSG_CODE, inplace=True)
        elif batch_residential_units.crs != f'EPSG:{EPSG_CODE}':
            batch_residential_units = batch_residential_units.to_crs(epsg=EPSG_CODE)

        residential_unit_in_square = gpd.overlay(batch_residential_units, squares_select, 
                                              how='intersection')

        if len(residential_unit_in_square) == 0:
            continue

        residential_unit_in_square = residential_unit_in_square.reset_index(drop=True)

        # Read input data
        total_residents = squares_select.iloc[0]['aantal_inwoners']
        oneperson_units = squares_select.iloc[0]['aantal_eenpersoonshuishoudens']
        young_residents = squares_select.iloc[0]['aantal_inwoners_0_tot_15_jaar']
        old_residents = squares_select.iloc[0]['aantal_inwoners_65_jaar_en_ouder']

        # Initialize population
        residential_unit_in_square['pop'] = 0
        residential_unit_in_square['young_pop'] = 0
        residential_unit_in_square['old_pop'] = 0
        residential_unit_in_square['remaining_capacity'] = 0

        # Handle one-person households
        if oneperson_units < 0:
            oneperson_units = 0

        oneperson_units = min(oneperson_units, len(residential_unit_in_square))
        remaining_residents = total_residents

        single_households = np.random.choice(residential_unit_in_square.index, 
                                          size=oneperson_units, 
                                          replace=False)

        residential_unit_in_square.loc[single_households, 'pop'] = 1
        residential_unit_in_square.loc[single_households, 'remaining_capacity'] = 5  # Max capacity assumption
        remaining_residents -= len(single_households)

        remaining_units = np.setdiff1d(residential_unit_in_square.index, single_households)

        if len(remaining_units) > 0:
            if (len(remaining_units) * 2) < remaining_residents:
                residential_unit_in_square.loc[remaining_units, 'pop'] = 2
            elif len(remaining_units) < remaining_residents:
                residential_unit_in_square.loc[remaining_units, 'pop'] = 1
            elif len(remaining_units) > remaining_residents:
                residential_unit_in_square.loc[remaining_units, 'pop'] = 0

            remaining_residents = total_residents - residential_unit_in_square['pop'].sum()

            if remaining_residents > 0:
                while remaining_residents > 0 and len(remaining_units) > 0:
                    random_index = np.random.choice(remaining_units)
                    residential_unit_in_square.loc[random_index, 'pop'] += 1

                    if residential_unit_in_square.loc[random_index, 'pop'] >= 6:
                        remaining_units = remaining_units[remaining_units != random_index]

                    remaining_residents -= 1

        # Distribute young residents
        remaining_young = young_residents
        if remaining_young > 0:
            while remaining_young > 0:
                eligible_units = residential_unit_in_square[
                    residential_unit_in_square['pop'] > residential_unit_in_square['young_pop']
                ]
                if eligible_units.empty:
                    break
                random_unit = eligible_units.sample(n=1)
                random_index = random_unit.index[0]
                residential_unit_in_square.loc[random_index, 'young_pop'] += 1
                remaining_young -= 1

        # Distribute old residents
        remaining_old = old_residents
        if remaining_old > 0:
            while remaining_old > 0:
                eligible_units = residential_unit_in_square[
                    residential_unit_in_square['pop'] > residential_unit_in_square['old_pop']
                ]
                if eligible_units.empty:
                    break
                random_unit = eligible_units.sample(n=1)
                random_index = random_unit.index[0]
                residential_unit_in_square.loc[random_index, 'old_pop'] += 1
                remaining_old -= 1

        residential_unit_population = pd.concat([residential_unit_population, residential_unit_in_square], 
                                              ignore_index=True)

    return residential_unit_population

def run_informed_population_distribution(squares_informed, residential_units): 
    num_threads = os.cpu_count()
    print(f"Using {num_threads} threads for parallel processing")

    residential_unit_population_all_simulations = gpd.GeoDataFrame()


    squares_list = list(squares_informed.itertuples())
    batch_size = max(1, len(squares_list) // num_threads)

    squares_list.sort(key=lambda x: x.geometry.centroid.x)
    batches = [squares_list[i:i + batch_size] for i in range(0, len(squares_list), batch_size)]

    thread_args = [
        (batch, residential_units, i + 1) 
        for i, batch in enumerate(batches)
    ]

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        thread_args = list(executor.map(prepare_spatial_index, thread_args))

    residential_unit_population = gpd.GeoDataFrame()
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_squares_batch, args) for args in thread_args]

        for future in futures:
            result = future.result()
            if not result.empty:
                residential_unit_population = pd.concat([residential_unit_population, result], 
                                                    ignore_index=True)

    for i in range(num_threads + 1):
        print('\033[F\033[K', end='')


    if residential_unit_population_all_simulations.empty:
        residential_unit_population_all_simulations = residential_unit_population[
            ['identificatie', 'geometry', 'pop', 'young_pop', 'old_pop']
        ]
    else:
        residential_unit_population_all_simulations = residential_unit_population_all_simulations.merge(
            residential_unit_population[['identificatie', 'pop', 'young_pop', 'old_pop']],
            on='identificatie',
            how='outer'
        )

    return residential_unit_population_all_simulations


Run simulations

In [135]:
#import buurten
buurten = gpd.read_file("./root/Inputs/neighbourhoods.gpkg", layer='buurten')
buurten = buurten.to_crs(buildings.crs)

In [136]:
#set number of simulations
#Please note that this is a time-intensive operation. For reference, one simulation round for the city of Amsterdam takes about 30 minutes

number_of_simulations = 1 #default = 1

This part calculates the pedestrian intensity per edge:
- residents are allocated to a residential unit
- residential units are spatially overlayed with buildings to retreive the shortest path
- the number of residents using a specific path is calculated
- results are stored per simulation

In [137]:
#run simulations
for sim in range(number_of_simulations):

    print(f"start simulation: {sim + 1}/{number_of_simulations}")
    #run static allocation
    building_population_static = run_static_population_distribution(squares_static, residential_units)
    #run random allocation
    building_population_random = run_random_population_distribution(squares_random, residential_units)
    #run informed random allocation
    building_population_informed = run_informed_population_distribution(squares_informed, residential_units)

    #fix columns of static population
    building_population_static['pop'] = building_population_static['pop_static']
    building_population_static = building_population_static.rename(columns={'pop_static': 'pop_unknown'})

    #combine into one dataframe
    combined_gdf = gpd.GeoDataFrame(pd.concat([building_population_informed, building_population_random, building_population_static], ignore_index=True))
    combined_gdf.fillna(0, inplace=True)

    # Convert numerical columns to integers
    numeric_columns = combined_gdf.select_dtypes(include=[np.number]).columns
    combined_gdf[numeric_columns] = combined_gdf[numeric_columns].astype(int)

    #join with geometry
    buildings_population_all_simulations = gpd.sjoin(
        combined_gdf, 
        buildings_with_geometry[['geometry', 'shortest_path']], 
        how='left', 
        predicate='intersects'
    )

    # Perform a spatial join to add 'buurtcode' to buildings_population_all_simulations

    #drop buildings with no path
    buildings_population_all_simulations = buildings_population_all_simulations[
        ~buildings_population_all_simulations['shortest_path'].isna()
    ]
    # Drop index_right
    buildings_population_all_simulations = buildings_population_all_simulations.drop(columns=['index_right'], errors='ignore')

    buildings_population_with_buurtcode = gpd.sjoin(
        buildings_population_all_simulations, 
        buurten[['geometry', 'buurtcode']], 
        how='left', 
        predicate='intersects'
    )

    # Drop unnecessary columns added by the spatial join
    buildings_population_with_buurtcode = buildings_population_with_buurtcode.drop(columns=['index_right'], errors='ignore')

    # Function to convert nodes into edges
    def nodes_to_edges(path):
        if isinstance(path, list) and len(path) > 1:
            return [(path[i], path[i + 1]) for i in range(len(path) - 1)]
        return []

    # Convert shortest_path into edges
    buildings_population_with_buurtcode['edges'] = buildings_population_with_buurtcode['shortest_path'].apply(nodes_to_edges)

    # Explode edges into individual rows
    exploded_edges = buildings_population_with_buurtcode.explode('edges', ignore_index=True)

    # Split edges into 'u' and 'v'
    exploded_edges[['u', 'v']] = pd.DataFrame(
        exploded_edges['edges'].tolist(), index=exploded_edges.index
    )

    # Precompute population sums per buurtcode, u, and v
    precomputed = exploded_edges.groupby(['u', 'v', 'buurtcode']).agg({
        'pop': 'sum',
        'young_pop': 'sum',
        'old_pop': 'sum',
        'pop_unknown': 'sum'
    }).reset_index()

    # Group by edges and aggregate using tqdm for progress
    edge_population_data = []
    grouped_edges = precomputed.groupby(['u', 'v'])

    for (u, v), group in tqdm(grouped_edges, desc="Processing edges", total=len(grouped_edges)):
        total_pop = group['pop'].sum()
        total_pop_young = group['young_pop'].sum()
        total_pop_old = group['old_pop'].sum()
        total_pop_unknown = group['pop_unknown'].sum()
        
        buurtcode_dict = dict(zip(group['buurtcode'], group['pop']))
        edge_population_data.append({
            'u': u,
            'v': v,
            'pop': total_pop,
            'young_pop': total_pop_young,
            'old_pop': total_pop_old,
            'pop_unknown': total_pop_unknown,        
            'buurtcode': buurtcode_dict
        })

    # Create a DataFrame from the aggregated data
    edge_population = pd.DataFrame(edge_population_data)

    edge_population.rename(columns={
        'pop_unknown': 'age-unknown_pop'
    }, inplace=True)

    # Ensure u and v are integers
    edge_population['u'] = edge_population['u'].astype('int64')
    edge_population['v'] = edge_population['v'].astype('int64')

    #combine edges in both direction into one
    edge_population['edge'] = edge_population.apply(lambda row: tuple(sorted([row['u'], row['v']])), axis=1)

    # Function to combine count of buurtcodes
    def combine_dicts(rows):
        combined_dict = {}
        for row in rows:
            if isinstance(row, dict):
                for key, value in row.items():
                    combined_dict[key] = combined_dict.get(key, 0) + value
        return combined_dict

    
    edge_population_combined = edge_population.groupby('edge').agg({
        'pop': 'sum',
        'buurtcode': lambda x: combine_dicts(x),  # Combine into a single dictionary
        'young_pop': 'sum',  
        'old_pop': 'sum',  
        'age-unknown_pop': 'sum'  
    }).reset_index()

    # Restore `u` and `v` columns for clarity
    edge_population_combined[['u', 'v']] = pd.DataFrame(edge_population_combined['edge'].tolist(), index=edge_population_combined.index)
    edge_population_combined.drop(columns=['edge'], inplace=True)

    #Export
    edge_population_combined.to_excel(f'./root/Outputs/sim_results/edge_population_simulation_{sim + 1}.xlsx', index=False)

print("\033[2J\033[H", end="")  # Clear terminal output
print(f"Simulation {sim + 1} done")


start simulation: 1/1


Processing squares:   0%|          | 0/203 [00:00<?, ?it/s]

Using 12 threads for parallel processing


Thread 1:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 2:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 5:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 3:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 4:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 8:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 10:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 6:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 7:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 9:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 11:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 12:   0%|          | 0/729 [00:00<?, ?it/s]

Thread 13:   0%|          | 0/1 [00:00<?, ?it/s]

[F[K[F[K[F[K[F[K[F[K[F[K[F[K[F[K[F[K[F[K[F[K[F[K[F[K

Processing edges:   0%|          | 0/89963 [00:00<?, ?it/s]

[2J[HSimulation 1 done
