In [1]:
import gc
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import clear_output
from scipy.stats import entropy
from spatialentropy import altieri_entropy, leibovici_entropy
from tqdm import tqdm

import classes.entropycalculator as ec
# enable tqdm's pandas integration (registers the progress_* methods on pandas objects)
tqdm.pandas()

# select the part to handle: `part` picks which gemeenten_{part}.parquet file
# is read as input and also names the output file written by the final cell
part = 0

In [2]:
# load the gemeenten partition selected by `part`; this frame is updated in
# place by the entropy loop further down and then written back out
gemeenten = gpd.read_parquet(f"data/gemeenten_parts/gemeenten_{part}.parquet")
# preview the first ten rows as the cell output
gemeenten.head(10)


Unnamed: 0,gemeentecode,gemeentenaam,H2O,OAD,STED,BEV_DICHTH,AANT_INW,AANT_MAN,AANT_VROUW,P_00_14_JR,...,L1_shannon_1,total_amenities_2,L0_shannon_2,L1_shannon_2,RE_L0_0,RE_L1_0,RE_L0_1,RE_L1_1,RE_L0_2,RE_L1_2
0,GM0014,Groningen,NEE,3427.0,1.0,1284.0,238147.0,118198.0,119949.0,12.0,...,4.488051,3919.0,2.923374,4.206563,0.145529,0.135552,0.083836,0.092403,0.081681,0.088819
1,GM0034,Almere,NEE,1629.0,2.0,1725.0,222825.0,110589.0,112236.0,18.0,...,4.107096,2388.0,2.896139,3.858373,0.18826,0.206235,0.227224,0.241199,0.215799,0.236978
2,GM0037,Stadskanaal,NEE,840.0,4.0,273.0,32135.0,15844.0,16291.0,14.0,...,3.382694,392.0,2.517498,3.117741,0.045353,0.057663,0.030424,0.053134,0.033571,0.052019
3,GM0047,Veendam,NEE,1013.0,3.0,364.0,27616.0,13863.0,13753.0,14.0,...,3.909604,414.0,2.716352,3.606224,0.036289,0.035318,0.062704,0.054879,0.058006,0.055661
4,GM0050,Zeewolde,NEE,842.0,4.0,96.0,23692.0,12003.0,11689.0,17.0,...,3.726865,377.0,2.615493,3.455533,0.0,0.0,0.0,0.0,0.0,0.0
5,GM0059,Achtkarspelen,NEE,437.0,5.0,275.0,28149.0,14352.0,13797.0,17.0,...,4.077286,436.0,2.864065,3.746635,0.106324,0.100335,0.039629,0.041303,0.030611,0.038185
6,GM0060,Ameland,NEE,266.0,5.0,68.0,3840.0,1938.0,1902.0,14.0,...,3.545886,311.0,2.457352,3.250715,0.0,0.0,0.0,0.0,0.0,0.0
7,GM0072,Harlingen,NEE,1092.0,3.0,649.0,16188.0,8115.0,8073.0,14.0,...,3.966635,213.0,2.857904,3.750821,0.148797,0.137499,0.038399,0.036313,0.034814,0.036799
8,GM0074,Heerenveen,NEE,1109.0,3.0,272.0,51637.0,25706.0,25931.0,15.0,...,4.248196,941.0,2.878862,3.914626,0.047937,0.050492,0.067014,0.062689,0.061904,0.062309
9,GM0080,Leeuwarden,NEE,2196.0,2.0,535.0,127073.0,63248.0,63825.0,15.0,...,4.333112,2322.0,2.925403,4.058613,0.10515,0.112482,0.108554,0.133673,0.10354,0.124867


In [3]:
def _get_shannon_entropy(labels, base=2):
    # get the total count of the labels
    total_count = len(labels)
    # get the unique labels and their counts
    _, label_counts = np.unique(labels, return_counts=True)

    probs = label_counts / total_count
    # get the entropy
    return entropy(probs, base=base)

def gm_total_amenities_entropy(gm_name, filter_i):
    """Count one gemeente's amenities and compute their Altieri entropies.

    Reads ``data/gm_amenities/amenities_{gm_name}.parquet``, removes the rows
    excluded by the two blacklists returned by ``ec.getfilter(filter_i)`` and
    evaluates ``altieri_entropy`` (base 2) on the remaining points, once with
    the ``L0_category`` labels and once with the ``L1_category`` labels.

    Parameters
    ----------
    gm_name
        Gemeente name, interpolated into the amenities file name.
    filter_i
        Filter identifier, passed unchanged to ``ec.getfilter``.

    Returns
    -------
    tuple
        ``(total_amenities, L0_entropy, L1_entropy)``: the number of rows left
        after filtering and the ``.entropy`` attribute of the two
        ``altieri_entropy`` results.
    """
    l0_blacklist, l1_blacklist = ec.getfilter(filter_i)
    amenities = gpd.read_parquet(f"data/gm_amenities/amenities_{gm_name}.parquet")

    # Build a single keep-mask: drop blacklisted L0 categories outright, then
    # drop the blacklisted L1 categories within each listed L0 category.
    keep = ~amenities.L0_category.isin(l0_blacklist)
    # a falsy L1 blacklist (e.g. None or empty) contributes no extra exclusions
    for l0_key, l1_values in (l1_blacklist or {}).items():
        excluded = (amenities.L0_category == l0_key) & amenities.L1_category.isin(l1_values)
        keep &= ~excluded
    amenities = amenities[keep]

    # number of amenities that survive the filters
    total_amenities = len(amenities)

    # [x, y] coordinate pair for every remaining geometry
    coords = [[geom.x, geom.y] for geom in amenities.geometry]

    # one Altieri entropy per category level, both in base 2
    l0_labels = amenities["L0_category"].values
    l1_labels = amenities["L1_category"].values
    l0_entropy = altieri_entropy(coords, l0_labels, base=2).entropy
    l1_entropy = altieri_entropy(coords, l1_labels, base=2).entropy

    # release the coordinate list and force a collection before returning
    del coords
    gc.collect()

    return total_amenities, l0_entropy, l1_entropy

In [4]:
# Compute the amenity count and the Altieri entropies for every gemeente in
# this part, once per filter configuration, then persist the enriched frame.
output_dir = Path("data/gemeenten_parts/calculated")
# create the target directory up front so a missing folder cannot make the
# final write fail after all entropies have already been computed
output_dir.mkdir(parents=True, exist_ok=True)

# `filter_i` rather than `filter`: a top-level loop variable stays in the
# kernel namespace after the cell runs, so the old name shadowed the builtin
for filter_i in [0, 1, 2]:
    print(f"Filter {filter_i}")
    # only the name column is read, so iterate over (index label, name) pairs
    # instead of materialising every full row with iterrows()
    for i, gm_name in tqdm(gemeenten["gemeentenaam"].items(), total=len(gemeenten)):
        total_amenities, L0_entropy_alt, L1_entropy_alt = gm_total_amenities_entropy(gm_name, filter_i)
        # results go into per-filter columns on the row of this gemeente
        gemeenten.at[i, f"total_amenities_{filter_i}"] = total_amenities
        gemeenten.at[i, f"L0_altieri_{filter_i}"] = L0_entropy_alt
        gemeenten.at[i, f"L1_altieri_{filter_i}"] = L1_entropy_alt

        del total_amenities, L0_entropy_alt, L1_entropy_alt
        gc.collect()

gemeenten.to_parquet(output_dir / f"gemeenten_{part}.parquet")


Filter 0


100%|██████████| 57/57 [03:42<00:00,  3.90s/it]


Filter 1


100%|██████████| 57/57 [00:50<00:00,  1.13it/s]


Filter 2


100%|██████████| 57/57 [00:43<00:00,  1.32it/s]
