# Process census 2022 data (100 m x 100 m)

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [40]:
# Load libs
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import workers
import geopandas as gpd
import rasterio
import sqlalchemy
import numpy as np
from tqdm import tqdm

In [3]:
# Visualization packages
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import folium

In [4]:
# Data location
user = workers.keys_manager['database']['user']
password = workers.keys_manager['database']['password']
port = workers.keys_manager['database']['port']
db_name = workers.keys_manager['database']['name']
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable')

## 1. Derived home
The included devices should have at least 30 records for being at home.

In [7]:
df_h = pd.merge(pd.read_sql("""SELECT device_aid, latitude, longitude
                               FROM home_r
                               WHERE count >= 30;""", con=engine),
                pd.read_sql("""SELECT * FROM data_desc.poi_visitation_indi;""", con=engine),
                on='device_aid', how='left')
gdf_h = workers.df2gdf_point(df=df_h, x_field='longitude', y_field='latitude', crs=4326, drop=True).to_crs(3035)
gdf_h.head()

Unnamed: 0,device_aid,num_unique_poi,num_visits,Water Sports,Tourist attractions,2019,2022,2023,geometry
0,000659ea-1522-4ea4-8c0e-e633a40bc0c7,,,,,,,,POINT (4149444.505 3037339.675)
1,00080961-f0ed-642e-aec2-e9903f704320,18.0,133.0,0.0,0.0,0.0,0.0,100.0,POINT (4242822.825 3228997.873)
2,0009cb64-31a3-4d22-90fb-cbe92b31405e,14.0,169.0,0.0,84.615385,0.0,0.0,100.0,POINT (4258271.494 3082049.121)
3,000bd331-5858-49b2-9014-1e2cfcec6d1c,9.0,13.0,0.0,0.0,100.0,0.0,0.0,POINT (4130599.405 3156443.712)
4,0015d168-d168-c17f-b82e-a4486067296f,5.0,10.0,0.0,0.0,0.0,100.0,0.0,POINT (4525692.206 3087123.152)


In [8]:
print(f"Number of devices: {len(gdf_h)}")

Number of devices: 4551183


## 2. Census data 2022
Data [location](https://www.zensus2022.de/DE/Ergebnisse-des-Zensus/_inhalt.html#toc-1).
### 2.1 Population count (Einwohner=Resident)

In [19]:
df_c = pd.read_csv('dbs/geo/census_2022/Zensus2022_Bevoelkerungszahl/Zensus2022_Bevoelkerungszahl_100m-Gitter.csv',
                   sep=';', encoding='latin-1')
df_c.columns = ['grid', 'x_mp_100m', 'y_mp_100m', 'pop']
# Apply function to create geometry column
df_c['geometry'] = df_c.apply(lambda row: workers.create_square(row['x_mp_100m'], row['y_mp_100m']), axis=1)

# Convert to GeoDataFrame
gdf_c = gpd.GeoDataFrame(df_c, geometry='geometry', crs="EPSG:3035")

In [20]:
home_100 = gdf_h.sjoin(gdf_c[['grid', 'pop', 'geometry']])
home_100 = home_100[['device_aid', 'grid', 'pop']].rename(columns={'grid': 'grid_100m', 'pop': 'pop_100m'})
home_100.head()

Unnamed: 0,device_aid,grid_100m,pop_100m
1,00080961-f0ed-642e-aec2-e9903f704320,CRS3035RES100mN3228900E4242800,57
2,0009cb64-31a3-4d22-90fb-cbe92b31405e,CRS3035RES100mN3082000E4258200,49
3,000bd331-5858-49b2-9014-1e2cfcec6d1c,CRS3035RES100mN3156400E4130500,62
4,0015d168-d168-c17f-b82e-a4486067296f,CRS3035RES100mN3087100E4525600,11
5,00185302-a463-4667-9fce-f8152a253959,CRS3035RES100mN3203500E4105100,41


#### Population count at 1 km grid for weighting

In [21]:
df_c = pd.read_csv('dbs/geo/census_2022/Zensus2022_Bevoelkerungszahl/Zensus2022_Bevoelkerungszahl_1km-Gitter.csv',
                   sep=';', encoding='latin-1')
df_c.head()

Unnamed: 0,GITTER_ID_1km,x_mp_1km,y_mp_1km,Einwohner
0,CRS3035RES1000mN2689000E4337000,4337500,2689500,4
1,CRS3035RES1000mN2689000E4341000,4341500,2689500,11
2,CRS3035RES1000mN2690000E4341000,4341500,2690500,4
3,CRS3035RES1000mN2691000E4340000,4340500,2691500,3
4,CRS3035RES1000mN2691000E4341000,4341500,2691500,22


In [22]:
df_c.columns = ['grid', 'x_mp_1km', 'y_mp_1km', 'pop']
# Apply function to create geometry column
df_c['geometry'] = df_c.apply(lambda row: workers.create_square(row['x_mp_1km'], row['y_mp_1km'], size=1000), axis=1)

# Convert to GeoDataFrame
gdf_c = gpd.GeoDataFrame(df_c, geometry='geometry', crs="EPSG:3035")

In [23]:
home_1k = gdf_h.sjoin(gdf_c[['grid', 'pop', 'geometry']])
home_1k = home_1k[['device_aid', 'grid', 'pop']].rename(columns={'grid': 'grid_1km', 'pop': 'pop_1km'})
home_1k.head()

Unnamed: 0,device_aid,grid_1km,pop_1km
0,000659ea-1522-4ea4-8c0e-e633a40bc0c7,CRS3035RES1000mN3037000E4149000,1276
1,00080961-f0ed-642e-aec2-e9903f704320,CRS3035RES1000mN3228000E4242000,2305
2,0009cb64-31a3-4d22-90fb-cbe92b31405e,CRS3035RES1000mN3082000E4258000,2441
3,000bd331-5858-49b2-9014-1e2cfcec6d1c,CRS3035RES1000mN3156000E4130000,2106
4,0015d168-d168-c17f-b82e-a4486067296f,CRS3035RES1000mN3087000E4525000,550


### 2.2 Average age

In [26]:
df_a = pd.read_csv('dbs/geo/census_2022/Durchschnittsalter_in_Gitterzellen/Zensus2022_Durchschnittsalter_100m-Gitter.csv',
                   sep=';', encoding='latin-1')
df_a = df_a.iloc[:, :4]
df_a.columns = ['grid', 'x_mp_100m', 'y_mp_100m', 'age']
df_a.loc[:, 'age'] = df_a.loc[:, 'age'].apply(lambda x: int(x.split(',')[0]))
# Apply function to create geometry column
df_a['geometry'] = df_a.apply(lambda row: workers.create_square(row['x_mp_100m'], row['y_mp_100m']), axis=1)

# Convert to GeoDataFrame
gdf_a = gpd.GeoDataFrame(df_a, geometry='geometry', crs="EPSG:3035")

  df_a = pd.read_csv('dbs/geo/census_2022/Durchschnittsalter_in_Gitterzellen/Zensus2022_Durchschnittsalter_100m-Gitter.csv',


In [29]:
age_100 = gdf_h.sjoin(gdf_a[['grid', 'age', 'geometry']])
age_100 = age_100[['device_aid', 'grid', 'age']].rename(columns={'grid': 'grid_100m', 'age': 'age_100m'})
age_100.head()

Unnamed: 0,device_aid,grid_100m,age_100m
1,00080961-f0ed-642e-aec2-e9903f704320,CRS3035RES100mN3228900E4242800,39
2,0009cb64-31a3-4d22-90fb-cbe92b31405e,CRS3035RES100mN3082000E4258200,35
3,000bd331-5858-49b2-9014-1e2cfcec6d1c,CRS3035RES100mN3156400E4130500,46
4,0015d168-d168-c17f-b82e-a4486067296f,CRS3035RES100mN3087100E4525600,43
5,00185302-a463-4667-9fce-f8152a253959,CRS3035RES100mN3203500E4105100,31


### 2.3 Living space per resident

In [30]:
df_s = pd.read_csv('dbs/geo/census_2022/Durchschnittliche_Wohnflaeche_je_Bewohner_in_Gitterzellen/Zensus2022_Durchschn_Flaeche_je_Bewohner_100m-Gitter.csv',
                   sep=';', encoding='latin-1')
df_s = df_s.iloc[:, :4]
df_s.columns = ['grid', 'x_mp_100m', 'y_mp_100m', 'space']
df_s.loc[:, 'space'] = df_s.loc[:, 'space'].apply(lambda x: int(x.split(',')[0]))
# Apply function to create geometry column
df_s['geometry'] = df_s.apply(lambda row: workers.create_square(row['x_mp_100m'], row['y_mp_100m']), axis=1)

# Convert to GeoDataFrame
gdf_s = gpd.GeoDataFrame(df_s, geometry='geometry', crs="EPSG:3035")

  df_s = pd.read_csv('dbs/geo/census_2022/Durchschnittliche_Wohnflaeche_je_Bewohner_in_Gitterzellen/Zensus2022_Durchschn_Flaeche_je_Bewohner_100m-Gitter.csv',


In [32]:
space_100 = gdf_h.sjoin(gdf_s[['grid', 'space', 'geometry']])
space_100 = space_100[['device_aid', 'grid', 'space']].rename(columns={'grid': 'grid_100m', 'space': 'space_100m'})
space_100.head()

Unnamed: 0,device_aid,grid_100m,space_100m
1,00080961-f0ed-642e-aec2-e9903f704320,CRS3035RES100mN3228900E4242800,65
2,0009cb64-31a3-4d22-90fb-cbe92b31405e,CRS3035RES100mN3082000E4258200,65
3,000bd331-5858-49b2-9014-1e2cfcec6d1c,CRS3035RES100mN3156400E4130500,46
4,0015d168-d168-c17f-b82e-a4486067296f,CRS3035RES100mN3087100E4525600,33
5,00185302-a463-4667-9fce-f8152a253959,CRS3035RES100mN3203500E4105100,41


### 2.4 Net rent

In [33]:
df_n = pd.read_csv('dbs/geo/census_2022/Zensus2022_Durchschn_Nettokaltmiete/Zensus2022_Durchschn_Nettokaltmiete_100m-Gitter.csv',
                   sep=';', encoding='latin-1')
df_n = df_n.iloc[:, :4]
df_n.columns = ['grid', 'x_mp_100m', 'y_mp_100m', 'net_rent']
df_n.loc[:, 'net_rent'] = df_n.loc[:, 'net_rent'].apply(lambda x: int(x.split(',')[0]))
# Apply function to create geometry column
df_n['geometry'] = df_n.apply(lambda row: workers.create_square(row['x_mp_100m'], row['y_mp_100m']), axis=1)

# Convert to GeoDataFrame
gdf_n = gpd.GeoDataFrame(df_n, geometry='geometry', crs="EPSG:3035")

In [34]:
rent_100 = gdf_h.sjoin(gdf_n[['grid', 'net_rent', 'geometry']])
rent_100 = rent_100[['device_aid', 'grid', 'net_rent']].rename(columns={'grid': 'grid_100m', 'net_rent': 'net_rent_100m'})
rent_100.head()

Unnamed: 0,device_aid,grid_100m,net_rent_100m
1,00080961-f0ed-642e-aec2-e9903f704320,CRS3035RES100mN3228900E4242800,5
2,0009cb64-31a3-4d22-90fb-cbe92b31405e,CRS3035RES100mN3082000E4258200,5
3,000bd331-5858-49b2-9014-1e2cfcec6d1c,CRS3035RES100mN3156400E4130500,5
6,001f24b1-26df-4949-a61c-abef1185ea8f,CRS3035RES100mN2908700E4145400,8
7,0028aea1-6ed5-406a-aa9f-2145ac49adee,CRS3035RES100mN3162700E4135900,5


## 3. Deprivation index

In [36]:
raster = rasterio.open('dbs/geo/povmap-grdi-v1-geotiff/povmap-grdi-v1.tif')

In [48]:
# Get the sampled values
tqdm.pandas()
gdf_h['grdi'] = gdf_h.to_crs(4326).geometry.progress_apply(lambda s: [x for x in raster.sample([(s.x, s.y)])][0])

100%|██████████| 4551183/4551183 [16:47<00:00, 4516.56it/s]


In [50]:
gdf_h['grdi'] = gdf_h['grdi'].apply(lambda x: x[0])

## 4. Combine attributes

In [54]:
dfs = [
       home_100[['device_aid', 'pop_100m']],
       home_1k[['device_aid', 'pop_1km', 'grid_1km']],
       age_100[['device_aid', 'age_100m']],
       space_100[['device_aid', 'space_100m']],
       rent_100[['device_aid', 'net_rent_100m']]
       ]
result = gdf_h[['device_aid', 'num_unique_poi', 'num_visits', 'Water Sports', 'Tourist attractions', 
              '2019', '2022', '2023', 'grdi']].copy()
for i in dfs:
    result = pd.merge(result, i, on='device_aid', how='left')
result.head()

Unnamed: 0,device_aid,num_unique_poi,num_visits,Water Sports,Tourist attractions,2019,2022,2023,grdi,pop_100m,pop_1km,grid_1km,age_100m,space_100m,net_rent_100m
0,000659ea-1522-4ea4-8c0e-e633a40bc0c7,,,,,,,,18.826588,,1276.0,CRS3035RES1000mN3037000E4149000,,,
1,00080961-f0ed-642e-aec2-e9903f704320,18.0,133.0,0.0,0.0,0.0,0.0,100.0,24.957132,57.0,2305.0,CRS3035RES1000mN3228000E4242000,39.0,65.0,5.0
2,0009cb64-31a3-4d22-90fb-cbe92b31405e,14.0,169.0,0.0,84.615385,0.0,0.0,100.0,21.832773,49.0,2441.0,CRS3035RES1000mN3082000E4258000,35.0,65.0,5.0
3,000bd331-5858-49b2-9014-1e2cfcec6d1c,9.0,13.0,0.0,0.0,100.0,0.0,0.0,2.794721,62.0,2106.0,CRS3035RES1000mN3156000E4130000,46.0,46.0,5.0
4,0015d168-d168-c17f-b82e-a4486067296f,5.0,10.0,0.0,0.0,0.0,100.0,0.0,4.414918,11.0,550.0,CRS3035RES1000mN3087000E4525000,43.0,33.0,


In [56]:
result.dropna(subset=['num_unique_poi'], inplace=True)
print(f"No. of the individual devices: {len(result)}")

No. of the individual devices: 3683502


In [None]:
result.to_sql('home_rgn', engine, schema='public', index=False, method='multi', if_exists='replace', chunksize=10000)