In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [None]:
# Project root: the directory that contains the current working directory.
parent_dir = os.path.dirname(os.getcwd())

In [None]:
# Load the input geodata. The paths are assembled with os.path.join so that they
# contain no backslash escape sequences (the former '\Data\PC4 2022\c...' literal
# relied on invalid escapes) and resolve on any operating system.
zones = gpd.read_file(os.path.join(parent_dir, 'Data', 'New', 'lms_zone_du_new.shp'))  # LMS Zone data
pc4 = gpd.read_file(os.path.join(parent_dir, 'Data', 'PC4 2022', 'cbs_pc4_2019_vol.gpkg'))  # PC4 data

In [None]:
# Table matching PC4 postal codes with LMS zones. os.path.join is used instead of
# hard-coded '\\' separators so the path also resolves outside Windows.
lms_pc4_match = pd.read_csv(os.path.join(parent_dir, 'Data', 'New', 'lms_pc4_match_v2.csv'))

In [None]:
# Store the postal code as an integer. Plain column assignment replaces the column
# (and its dtype); `pc4.loc[:, 'postcode4'] = ...` writes into the existing column
# in recent pandas versions and can leave the original dtype in place.
pc4['postcode4'] = pc4['postcode4'].astype(int)

# Replace every negative value in the numeric columns with NaN.
# NOTE(review): negative numbers are presumably codes for censored/missing data — confirm.
pc4_numeric = pc4.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64'])
pc4[pc4_numeric < 0] = np.nan

In [None]:
# Start the output table from the zone identifiers. The explicit .copy() makes
# hist_data an independent DataFrame, so the columns assigned to it further down
# do not raise SettingWithCopyWarning or depend on view/copy semantics.
hist_data = zones[['ZONE_ID']].copy()

## Historical development

This notebook derives data about the historical development of the built environment for each zone.

In [None]:
# Preview the first column together with the columns at positions 19-27.
pc4.iloc[:, [0, *range(19, 28)]]

In [None]:
# Share of dwellings per construction period for every zone, expressed as a
# percentage of the zone's total dwelling count (stays 0 for zones without dwellings).
# The arrays are later assigned as columns of a table built from `zones`, so they
# need exactly len(zones) elements; this replaces the hard-coded zone count.
# NOTE(review): zone z is written to position z - 1, which assumes the ids in
# lms_pc4_match.LMS run from 1 to len(zones) in the row order of `zones` — confirm.
n_zones = len(zones)

w45min = np.zeros(n_zones)
w45_75 = np.zeros(n_zones)
w75_05 = np.zeros(n_zones)
w05plus = np.zeros(n_zones)

# The column selection does not depend on the zone, so it is done once up front.
# NOTE(review): positions 0 and 19-27 presumably hold the postal code, the total
# dwelling count and the counts per construction period — verify against the file.
pc4_dwellings = pc4.iloc[:, np.r_[0, 19:28]]

for z in range(1, n_zones + 1):
    # Rows of the PC4 table whose postal code is matched to zone z.
    postal_codes = lms_pc4_match[lms_pc4_match.LMS == z].PC4
    x = pc4_dwellings[pc4.postcode4.isin(postal_codes)]

    tot_won = x['aantal_woningen'].sum()

    # Skip zones without dwellings to avoid dividing by zero; they keep the initial 0.
    if tot_won > 0:

        w45min[z - 1] = np.round(x.iloc[:, 2].sum() / tot_won * 100, 2)
        w45_75[z - 1] = np.round(x.iloc[:, 3:5].sum().sum() / tot_won * 100, 2)
        w75_05[z - 1] = np.round(x.iloc[:, 5:8].sum().sum() / tot_won * 100, 2)
        w05plus[z - 1] = np.round(x.iloc[:, 8:10].sum().sum() / tot_won * 100, 2)

In [None]:
# Display the four per-zone share arrays for a quick visual check.
w45min, w45_75, w75_05, w05plus

In [None]:
# Attach the four construction-period shares to the zone table, one column each.
period_shares = {
    'House_45_less': w45min,
    'House_45_75': w45_75,
    'House_75_05': w75_05,
    'House_05_more': w05plus,
}
for column_name, share in period_shares.items():
    hist_data.loc[:, column_name] = share

In [None]:
# Show the zone table with the newly added share columns.
hist_data

Because the data is censored in many cases, we need to check whether the percentages add up to 100 (or at least come close to it).

In [None]:
# Element-wise total of the four period shares for each zone. The built-in sum
# starts from w45min and adds the remaining arrays in order.
tot = sum([w45_75, w75_05, w05plus], w45min)
tot

In [None]:
# Distribution of the summed shares per zone. Title and axis labels are set so
# the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(tot, bins=50)
ax.set(
    title='Sum of construction-period shares per zone',
    xlabel='Sum of shares (%)',
    ylabel='Number of zones',
);

In [None]:
# List the zone totals that fall below each of the three cut-offs.
for threshold in (95, 90, 85):
    print(f'Percentages below {threshold}%:\n', tot[tot < threshold])


In [None]:
# Percentage of zones whose shares sum to less than 95%.
np.count_nonzero(tot < 95) / tot.size * 100

When all rows whose percentages sum to less than 95% are removed, only 2% of the data is lost and no clearly unreliable numbers remain.

In [None]:
# Blank out the construction-period shares of zones whose shares sum to less
# than 95%. The columns are addressed by name instead of by position (1:), so
# re-running this cell after further columns have been added to hist_data
# cannot overwrite those columns as well.
period_columns = ['House_45_less', 'House_45_75', 'House_75_05', 'House_05_more']
hist_data.loc[tot < 95, period_columns] = np.nan

In [None]:
# Show the zone table after the low-coverage zones have been set to NaN.
hist_data

## Job - workers ratio

Use the working population (people who work or want to work) and divide it by the number of jobs.

In [None]:
# Working population (BBV_MAN + BBV_VROUW) divided by the total number of jobs
# (BANENTOT), rounded to three decimals.
# NOTE(review): zones with BANENTOT == 0 would yield inf/NaN here — confirm whether
# that needs handling downstream.
working_population = zones['BBV_MAN'] + zones['BBV_VROUW']
hist_data.loc[:, 'Job-workers ratio'] = (working_population / zones['BANENTOT']).round(3)

In [None]:
# Show the zone table including the ratio column.
hist_data

## Save to csv

In [None]:
# Write the zone-level table to disk. os.path.join is used instead of hard-coded
# '\\' separators so the output path is valid on any operating system.
hist_data.to_csv(os.path.join(parent_dir, 'Data', 'New', 'lms_zone_historical.csv'))