In [1]:
import geopandas as gpd
import pandas as pd
import classes.entropycalculator as ec
from scipy.stats import entropy
import numpy as np
import gc
import shapely
from tqdm import tqdm
import warnings

# Silence pandas' PerformanceWarning for the rest of the session.
warnings.simplefilter('ignore', pd.errors.PerformanceWarning)

# Register tqdm's pandas integration.
tqdm.pandas()

# Index of the neighbourhood partition handled by this run; it is part of the
# input file name below and is reused when the results are written.
part = 4
bu_path = f'data/buurten_parts/buurten_c_{part}.parquet'
bu = gpd.read_parquet(bu_path)

# Distinct category labels at both levels of the categorisation table.
categorisation = pd.read_excel('data/categorisation.xlsx')
L0_cats, L1_cats = (
    categorisation[level].unique() for level in ('L0 category', 'L1 category')
)

In [2]:
def _get_shannon_entropy(labels, base=2):
    # get the total count of the labels
    total_count = len(labels)
    # get the unique labels and their counts
    _, label_counts = np.unique(labels, return_counts=True)

    probs = label_counts / total_count
    # get the entropy
    return entropy(probs, base=base)

In [3]:
# Preview the first three rows of the loaded neighbourhood partition.
bu.head(n=3)

Unnamed: 0,buurtcode,buurtnaam,wijkcode,gemeentecode,gemeentenaam,IND_WBI,H2O,POSTCODE,DEK_PERC,OAD,...,total_amenities_1,L0_leibovici_1,L1_leibovici_1,L0_altieri_1,L1_altieri_1,total_amenities_2,L0_leibovici_2,L1_leibovici_2,L0_altieri_2,L1_altieri_2
5579,BU04730205,Kostverloren e.o.,WK047302,GM0473,Zandvoort,1.0,NEE,2042,1.0,2215.0,...,17.0,2.829857,2.829857,3.606689,3.606689,16.0,2.404504,2.404504,3.183686,3.183686
5580,BU04730206,Centrum,WK047302,GM0473,Zandvoort,1.0,NEE,2042,1.0,2408.0,...,161.0,3.263137,6.909909,4.162867,8.043334,127.0,3.373928,6.531107,4.290373,7.650763
5581,BU04730301,Bentveld Noord,WK047303,GM0473,Zandvoort,1.0,NEE,2116,3.0,322.0,...,12.0,2.642147,2.642147,3.057764,3.057764,12.0,2.642147,2.642147,3.057764,3.057764


In [4]:
def _apply_category_filter(amenities, L0_filter, L1_filter):
    """Return `amenities` without the rows excluded by one filter definition.

    Rows whose `L0_category` is in `L0_filter` are dropped. If `L1_filter` is
    truthy it is used as a mapping of L0 category -> collection of L1
    categories, and rows matching such an (L0, L1) combination are dropped too.
    """
    kept = amenities[~amenities.L0_category.isin(L0_filter)]
    if L1_filter:
        for l0_cat, l1_cats in L1_filter.items():
            excluded = (kept.L0_category == l0_cat) & kept.L1_category.isin(l1_cats)
            kept = kept[~excluded]
    return kept


# For every neighbourhood: select the amenities lying within its geometry and,
# for each of the three filter variants, compute the Shannon entropy of the
# L0/L1 category labels plus a count per known category.
#
# Results are gathered as one dict per neighbourhood and attached to `bu` in a
# single join afterwards, instead of creating each column with a scalar write.

metric_rows = []          # one dict of metrics per row of `bu`, in row order
loaded_gemeente = None    # municipality whose amenities file is currently in memory
gemeente_amenities = None

for idx, row in tqdm(bu.iterrows(), total=bu.shape[0]):
    # Re-read the municipality file only when the municipality changes, so
    # consecutive neighbourhoods of the same municipality share one read.
    # Only one municipality is kept in memory at a time.
    gemeente = row['gemeentenaam']
    if gemeente_amenities is None or gemeente != loaded_gemeente:
        gemeente_amenities = gpd.read_parquet(f"data/gm_amenities/amenities_{gemeente}.parquet")
        loaded_gemeente = gemeente

    amenities = gemeente_amenities[gemeente_amenities.geometry.within(row.geometry)]

    metrics = {}
    for filter_i in [0, 1, 2]:
        # get and apply filter
        L0_filter, L1_filter = ec.getfilter(filter_i)
        amenities_f = _apply_category_filter(amenities, L0_filter, L1_filter)

        metrics[f'L0_shannon_{filter_i}'] = _get_shannon_entropy(amenities_f.L0_category.values)
        metrics[f'L1_shannon_{filter_i}'] = _get_shannon_entropy(amenities_f.L1_category.values)

        # One pass per level instead of one boolean mask per category.
        # Missing labels are not counted, matching an `== cat` comparison.
        L0_counts = amenities_f.L0_category.value_counts().to_dict()
        L1_counts = amenities_f.L1_category.value_counts().to_dict()

        # NOTE(review): the 'L1_' prefix is used for counts of L0 categories and
        # the 'L0_' prefix for counts of L1 categories. The names are kept
        # unchanged because consumers of the output file may rely on them --
        # confirm whether the prefixes were meant to be the other way round.
        for cat in L0_cats:
            metrics[f'L1_{filter_i}_count_{cat}'] = L0_counts.get(cat, 0)

        for cat in L1_cats:
            metrics[f'L0_{filter_i}_count_{cat}'] = L1_counts.get(cat, 0)

    metric_rows.append(metrics)

# Cast to float64 so the count columns keep the float dtype that results from
# filling a freshly created column one cell at a time.
metrics_df = pd.DataFrame(metric_rows, index=bu.index).astype('float64')

# Replace metric columns left over from an earlier run, then attach everything
# at once. Assumes `bu.index` is unique -- TODO confirm.
bu = bu.drop(columns=metrics_df.columns.intersection(bu.columns)).join(metrics_df)

bu.to_parquet(f'data/buurten_parts/calculated/buurten_c_{part}.parquet')

100%|██████████| 1394/1394 [27:34<00:00,  1.19s/it] 
