In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [None]:
# All data paths below are built from the parent of the current working directory.
parent_dir = os.path.dirname(os.getcwd())

In [None]:
# Spatial layers. Every backslash in the Windows paths is escaped so that Python
# never interprets sequences such as '\D', '\P' or '\c' as (invalid) string escapes.
zones = gpd.read_file(parent_dir + '\\Data\\New\\lms_zone_du_new.shp') # LMS Zone data
pc4 = gpd.read_file(parent_dir + '\\Data\\PC4 2022\\cbs_pc4_2019_vol.gpkg') # PC4 data
# buurt = gpd.read_file(parent_dir + '\\Data\\Wijk buurt\\WijkBuurtkaart_2017_v3\\buurt_2017_v3.shp') # Buurt data

# Lookup table that links each PC4 area to an LMS zone (columns 'PC4' and 'LMS' are used below).
lms_pc4_match = pd.read_csv(parent_dir + '\\Data\\New\\lms_pc4_match_v2.csv') # df matching PC4 with LMS zones
# lms_buurt_match = gpd.read_file(parent_dir + '\\Data\\New\\buurt_lms_match.shp') # df matching buurt with LMS zones

In [None]:
# Driver-licence workbook: sixth sheet (index 5), skipping 8 leading rows and 2 footer rows.
licence_workbook = parent_dir + '\\Data\\PC4 2022\\auto_atlas_pc4_levering3.xlsx'
rijbewijs = pd.read_excel(
    licence_workbook,
    sheet_name=5,
    skiprows=8,
    skipfooter=2,
)

In [None]:
# Keep only the postcode column and the column that is used below as the licence count.
# Take an explicit copy: a new column is assigned to this frame later on, and writing
# to a slice of the full sheet would trigger pandas' SettingWithCopyWarning.
rijbewijs = rijbewijs[['Postcode 4 gebied', 'Unnamed: 2']].copy()

In [None]:
# Replace the column outright. With `pc4.loc[:, 'postcode4'] = ...` pandas writes the
# values into the existing column and may keep its previous dtype, so the integer
# cast would not reliably take effect.
pc4['postcode4'] = pc4['postcode4'].astype(int)

Unknown values are stored as negative numbers in the PC4 data; replace them with NaN.

In [None]:
# Flag every negative entry in the numeric columns and overwrite it with NaN.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
pc4_numeric = pc4.select_dtypes(include=numeric_dtypes)
negative_mask = pc4_numeric < 0
pc4[negative_mask] = np.nan

## Create dataframe for results

In [None]:
# Result table with one row per LMS zone, in the same row order as `zones`.
# Copy the selection so the many column assignments below write to an independent
# frame instead of a slice of `zones` (avoids SettingWithCopyWarning).
zone_data = zones[['ZONE_ID']].copy()

## Age & gender

In [None]:
# Total population is read by position (column 33) from the zone layer.
total_population = zones.iloc[:, 33]
zone_data.loc[:, 'Tot_population'] = total_population

In [None]:
# Positional selection of zone-layer columns 15 up to and including 33.
# Below, the first nine are summed as men and the next nine as women.
age_column_range = slice(15, 34)
ages = zones.iloc[:, age_column_range]
ages

In [None]:
# Output column names: the average age first, then one share per merged age category.
labels = [
    'Age_average',
    'Age_0_11',
    'Age_12_17',
    'Age_18_34',
    'Age_35_54',
    'Age_55_74',
    'Age_75+',
]

# Assumed mean age of each of the nine source age groups.
# Assumption: the 75+ group has a mean age of 80.
mean_age = [
    2.5, 8.5, 13, 16, 26,
    44.5, 59.5, 69.5, 80,
]

Average age

In [None]:
# Weight each age group (column i plus its counterpart nine columns further on)
# by the assumed mean age of that group, then divide by the total population.
age_tot = sum(
    group_mean * (ages.iloc[:, i] + ages.iloc[:, i + 9])
    for i, group_mean in enumerate(mean_age)
)

zone_data.loc[:, labels[0]] = np.round(age_tot / zones.iloc[:, 33], 2)

Age per category

In [None]:
# Each entry lists the source age-group columns that are merged into one output category.
index_list = [[0, 1], [2, 3], [4], [5], [6, 7], [8]]

for position, member_columns in enumerate(index_list):

    age_group = 0

    for column in member_columns:
        # Add the column itself and its counterpart nine columns further on.
        age_group = age_group + ages.iloc[:, column] + ages.iloc[:, column + 9]

    # Share of the total population, as a percentage.
    share = age_group / zones.iloc[:, 33] * 100
    zone_data.loc[:, labels[position + 1]] = np.round(share, 2)
    


Gender

In [None]:
# The first nine age columns are summed as men, the next nine as women.
m_tot = ages.iloc[:, :9].sum(axis=1)
v_tot = ages.iloc[:, 9:18].sum(axis=1)

# Express both totals as a percentage of the total population (column 33).
population = zones.iloc[:, 33]
for column_name, gender_total in (('Man', m_tot), ('Woman', v_tot)):
    zone_data.loc[:, column_name] = np.round(gender_total / population * 100, 1)


In [None]:
# Display the result table with the age and gender columns added so far.
zone_data

## Income

In [None]:
# Copy the INK_GEM column of the zone layer into the result table unchanged.
income_average = zones['INK_GEM']
zone_data.loc[:, 'Income_hh_average'] = income_average

Zones for which the PC4 data contains no usable household figures get NaN.

In [None]:
# Household-weighted share of high- and low-income households per LMS zone.
#
# NOTE(review): results are stored at position z - 1, which assumes that the rows of
# `zones` are ordered by zone number 1..n and that lms_pc4_match.LMS uses the same
# numbering -- confirm.
n_zones = len(zones)

# Start from NaN so zones without usable data stay NaN.
high_inc = np.full(n_zones, np.nan)
low_inc = np.full(n_zones, np.nan)

income_targets = (
    ('percentage_hoog_inkomen_huishouden', high_inc),
    ('percentage_laag_inkomen_huishouden', low_inc),
)

for z in range(1, n_zones + 1): # Loop over all LMS zones
    postal_codes = lms_pc4_match[lms_pc4_match.LMS == z].PC4
    x = pc4[pc4.postcode4.isin(postal_codes)][['postcode4', 'aantal_part_huishoudens',
                                               'percentage_laag_inkomen_huishouden',
                                               'percentage_hoog_inkomen_huishouden']]

    for column, result in income_targets:
        # Only PC4 areas with a known percentage AND a known household count may
        # contribute. Otherwise an area with an unknown (NaN) percentage is skipped
        # in the numerator but still counted in the denominator, which pulls the
        # weighted average towards zero.
        known = x[column].notna() & x['aantal_part_huishoudens'].notna()
        households = x.loc[known, 'aantal_part_huishoudens']

        if households.sum() > 0:
            result[z - 1] = (households * x.loc[known, column]).sum() / households.sum()

In [None]:
# Store both income shares, rounded to two decimals.
for column_name, shares in (('High_income', high_inc), ('Low_income', low_inc)):
    zone_data.loc[:, column_name] = np.round(shares, 2)

In [None]:
# Show the zones for which at least one of the two income shares is missing.
no_income_data = zone_data['High_income'].isnull() | zone_data['Low_income'].isnull()
zone_data[no_income_data]

In [None]:
# Number of zones for which at least one of the two income shares is missing.
no_income_data = zone_data['High_income'].isnull() | zone_data['Low_income'].isnull()
len(zone_data[no_income_data])

So 11 LMS zones have no income data. According to the LMS data, most of them have only a few inhabitants; three do not. Their corresponding PC4 areas have presumably been assigned to nearby zones.

In [None]:
# Display the result table with the income columns added so far.
zone_data

## Household

In [None]:
# Household composition per LMS zone, aggregated from the PC4 areas linked to it.
#
# NOTE(review): results are stored at position z - 1, which assumes that the rows of
# `zones` are ordered by zone number 1..n and that lms_pc4_match.LMS uses the same
# numbering -- confirm.
n_zones = len(zones)

# The first four columns are household counts (converted to a percentage of all
# private households); the last one is an average size and is used as-is.
hh_columns = ['aantal_eenpersoonshuishoudens',
              'aantal_meerpersoonshuishoudens_zonder_kind',
              'aantal_eenouderhuishoudens',
              'aantal_tweeouderhuishoudens',
              'gemiddelde_huishoudensgrootte']

# Start from NaN so zones without usable data stay NaN.
hh_types = np.full((len(hh_columns), n_zones), np.nan)

for z in range(1, n_zones + 1):
    postal_codes = lms_pc4_match[lms_pc4_match.LMS == z].PC4
    x = pc4[pc4.postcode4.isin(postal_codes)][['postcode4', 'aantal_part_huishoudens'] + hh_columns]
    households = x['aantal_part_huishoudens']

    for h, column in enumerate(hh_columns):

        if column == 'gemiddelde_huishoudensgrootte':
            value = x[column] # Average household size is absolute
        else:
            value = x[column] / households * 100 # Percentage for household type

        # Weight only by PC4 areas whose value is known. An area with an unknown
        # (NaN) value would otherwise be skipped in the numerator but still counted
        # in the denominator, pulling the weighted average towards zero.
        known = value.notna() & (households > 0)
        weights = households[known]

        if weights.sum() > 0:
            hh_types[h, z - 1] = (weights * value[known]).sum() / weights.sum() # Weighted average

hh_types = np.round(hh_types, 2)
        

In [None]:
# Use a dedicated name for the household column labels. Re-using `labels` would
# overwrite the age labels, so re-running the age cells afterwards would write
# age results into household columns.
hh_labels = ['1PersonHousehold', '2+PersonHousehold', '1ParentHousehold', '2ParentHousehold', 'Household_Size']

# One row of hh_types per label, in the same order.
for label, values in zip(hh_labels, hh_types):
    zone_data.loc[:, label] = values

In [None]:
# Display the result table with the household columns added so far.
zone_data

## Working population

The working population is split by gender, so first determine the total number of men and women in each zone:

In [None]:
# Positional column ranges of the zone layer: 15-23 are summed as men, 24-32 as women.
male_columns = slice(15, 24)
female_columns = slice(24, 33)

tot_man = zones.iloc[:, male_columns].sum(axis=1)
tot_vrouw = zones.iloc[:, female_columns].sum(axis=1)

Total labour force (*beroepsbevolking*: people who work or want to work)

In [None]:
# Labour force as a percentage of all women / all men in the zone.
bbv_woman = (zones['BBV_VROUW'] / tot_vrouw * 100).round(2)
bbv_man = (zones['BBV_MAN'] / tot_man * 100).round(2)

Part time workers (12-30 h) and Fulltime workers (30+ h)

In [None]:
# Part-time workers as a percentage of the labour force, per gender.
pt_woman = (zones['PT_VROUW'] / zones['BBV_VROUW'] * 100).round(2)
pt_man = (zones['PT_MAN'] / zones['BBV_MAN'] * 100).round(2)


In [None]:
# Full-time workers are taken as WERKZ minus the part-time workers, expressed as a
# percentage of the labour force, per gender.
full_time_women = zones['WERKZ_V'] - zones['PT_VROUW']
full_time_men = zones['WERKZ_M'] - zones['PT_MAN']

ft_woman = (full_time_women / zones['BBV_VROUW'] * 100).round(2)
ft_man = (full_time_men / zones['BBV_MAN'] * 100).round(2)


In [None]:
# Column name -> values, in the order in which the columns are added to the result table.
working_population_columns = {
    'Work_Pop_M': bbv_man,
    'Work_Pop_V': bbv_woman,
    'PT_M': pt_man,
    'PT_V': pt_woman,
    'FT_M': ft_man,
    'FT_V': ft_woman,
}

for column_name, values in working_population_columns.items():
    zone_data.loc[:, column_name] = values

In [None]:
# Display the result table with the working-population columns added so far.
zone_data

## Car ownership

In [None]:
# ACT_WAG divided by HUISH, rounded to two decimals.
cars_per_household = zones['ACT_WAG'] / zones['HUISH']
zone_data.loc[:, 'Cars_HH'] = cars_per_household.round(2)

In [None]:
# Display the result table with the car-ownership column added.
zone_data

## Drivers Licence

First, correct the number of licences, assuming that the 'niet in te delen' (unassignable) licences are distributed proportionally over the other postal codes

In [None]:
# Relative increase needed to spread the unassignable licences over the postal codes.
# The last row is taken to be the 'niet in te delen' row -- TODO confirm against the sheet.
licence_counts = rijbewijs['Unnamed: 2']
unassigned_licences = licence_counts.iloc[-1]

# Divide by the licences that ARE assigned to a postal code. The plain column sum
# also contains the unassigned row itself, which makes the factor too small and
# loses part of the licences when they are redistributed.
assigned_licences = licence_counts.sum() - unassigned_licences
p_increase_rb = unassigned_licences / assigned_licences

In [None]:
# Scale every licence count by the same correction factor and show the result.
scale_factor = 1 + p_increase_rb
rijbewijs.loc[:, 'Rijbewijs'] = rijbewijs['Unnamed: 2'] * scale_factor
rijbewijs

In [None]:
# Driver licences per 100 inhabitants for every LMS zone.
#
# NOTE(review): results are stored at position z - 1, which assumes that the rows of
# `zones` are ordered by zone number 1..n and that lms_pc4_match.LMS uses the same
# numbering -- confirm.
n_zones = len(zones)

# Start from NaN so zones without usable data stay NaN.
rb = np.full(n_zones, np.nan)

for z in range(1, n_zones + 1):
    postal_codes = lms_pc4_match[lms_pc4_match.LMS == z].PC4

    x = pc4[pc4.postcode4.isin(postal_codes)][['postcode4', 'aantal_inwoners']]
    x_r = rijbewijs[rijbewijs['Postcode 4 gebied'].isin(postal_codes)]

    # Inner join: only postal codes present in both sources remain.
    x_r = x_r.merge(x, left_on='Postcode 4 gebied', right_on='postcode4')

    # Use only postal codes with a known licence count and a positive number of
    # inhabitants. A postal code with an unknown licence count would otherwise add
    # inhabitants to the denominator without adding licences to the numerator.
    known = x_r.Rijbewijs.notna() & (x_r.aantal_inwoners > 0)
    inhabitants = x_r.loc[known, 'aantal_inwoners']

    if inhabitants.sum() > 0:
        # The inhabitant-weighted mean of licences / inhabitants reduces to
        # total licences / total inhabitants.
        rb[z - 1] = np.round(x_r.loc[known, 'Rijbewijs'].sum() / inhabitants.sum() * 100, 2)

In [None]:
# Distribution of the unfiltered licence rate. NaN entries (zones without data) are
# dropped explicitly, and the figure is labelled so it can be read on its own.
fig, ax = plt.subplots()
ax.hist(rb[~np.isnan(rb)], bins=50)
ax.set(title='Driver licences per LMS zone (unfiltered)',
       xlabel='Driver licences per 100 inhabitants',
       ylabel='Number of LMS zones');

Filter very high and very low values.

In [None]:
# More than 100 licences per 100 inhabitants is treated as unknown.
above_maximum = rb > 100
rb[above_maximum] = np.nan

In [None]:
# Fewer than 20 licences per 100 inhabitants is treated as unknown.
below_minimum = rb < 20
rb[below_minimum] = np.nan

In [None]:
# Distribution of the licence rate as it stands at this point in the notebook.
# NaN entries are dropped explicitly, and the figure is labelled so it can be read on its own.
fig, ax = plt.subplots()
ax.hist(rb[~np.isnan(rb)], bins=50)
ax.set(title='Driver licences per LMS zone (after filtering)',
       xlabel='Driver licences per 100 inhabitants',
       ylabel='Number of LMS zones');

In [None]:
# Store the licence rate; values are matched to rows by position.
licence_column = 'Driver_licence'
zone_data.loc[:, licence_column] = rb

In [None]:
# Display the result table with the driver-licence column added.
zone_data

## Student OV

Do not make a distinction between weekend/week and MBO/HO

In [None]:
# Add up the four student public-transport columns (HO/MBO, week/weekend) per zone
# and express the total as a percentage of the inhabitants.
student_ov_columns = ['HOWEEK', 'MBOWEEK', 'HOWKND', 'MBOWKND']
tot_ov = zones[student_ov_columns].sum(axis=1)
p_ov = (tot_ov / zones['INWONERS'] * 100).round(2)

In [None]:
# Store the student public-transport share in the result table.
student_ov_column = 'Student_OV'
zone_data.loc[:, student_ov_column] = p_ov

In [None]:
# Display the complete result table before it is written to disk.
zone_data

## Save to csv

In [None]:
# Write the result table, including its index, to the shared data folder.
output_path = parent_dir + '\\Data\\New\\zone_demographics.csv'
zone_data.to_csv(output_path)

In [None]:
# Read the file back, using the first column as the index, to check what was written.
saved_path = parent_dir + '\\Data\\New\\zone_demographics.csv'
pd.read_csv(saved_path, index_col=0)