# Process census 2022 data

In [1]:
# Enable the autoreload extension (mode 2) so edits to imported project modules are picked up.
%load_ext autoreload
%autoreload 2
# NOTE(review): hard-coded absolute Windows path. The relative `dbs/...` paths used
# further down resolve against this working directory.
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [2]:
# Load libs
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import workers
import geopandas as gpd
import rasterio
import sqlalchemy
from tqdm import tqdm

In [3]:
# Data location
# Connection settings are read from the project's key store (`workers.keys_manager`).
db_cfg = workers.keys_manager['database']
user = db_cfg['user']
password = db_cfg['password']
port = db_cfg['port']
db_name = db_cfg['name']

# Build the URL from its components instead of an f-string: URL.create() escapes
# special characters in the user name / password (e.g. '@', ':', '/'), which would
# otherwise corrupt the connection string.
db_url = sqlalchemy.engine.URL.create(
    drivername='postgresql',
    username=user,
    password=password,
    host='localhost',
    port=port,
    database=db_name,
    query={'gssencmode': 'disable'},
)
engine = sqlalchemy.create_engine(db_url)

## 1. Derived home
Included devices should have at least 30 records at their home location.

In [None]:
# Home coordinates per device, joined with the per-device POI visitation table.
home_sql = """SELECT device_aid, latitude, longitude FROM home;"""
poi_sql = """SELECT * FROM data_desc.poi_visitation_indi;"""

# Left join on device_aid: every row of `home` is kept even without a POI match.
df_h = pd.read_sql(home_sql, con=engine).merge(
    pd.read_sql(poi_sql, con=engine),
    on='device_aid',
    how='left',
)

# Build point geometries from longitude/latitude (declared as EPSG:4326),
# then reproject to EPSG:3035, the CRS used for the census grids below.
gdf_h = workers.df2gdf_point(df=df_h, x_field='longitude', y_field='latitude', crs=4326, drop=True)
gdf_h = gdf_h.to_crs(3035)
gdf_h.head()

In [None]:
# One row per home record, so the row count is reported as the device count.
print(f"Number of devices: {gdf_h.shape[0]}")

## 2. Census data 2022
Data source: [Zensus 2022 results](https://www.zensus2022.de/DE/Ergebnisse-des-Zensus/_inhalt.html#toc-1).
### 2.1 Population count (Einwohner = residents)

In [None]:
# 100 m census grid: population count per cell.
pop_100m_csv = 'dbs/geo/census_2022/Zensus2022_Bevoelkerungszahl/Zensus2022_Bevoelkerungszahl_100m-Gitter.csv'
df_c = pd.read_csv(pop_100m_csv, sep=';', encoding='latin-1')
df_c = df_c.set_axis(['grid', 'x_mp_100m', 'y_mp_100m', 'pop'], axis=1)

# One polygon per grid cell, built by `workers.create_square` from the cell's x/y columns
# (helper called with its default size).
df_c['geometry'] = df_c.apply(
    lambda cell: workers.create_square(cell['x_mp_100m'], cell['y_mp_100m']),
    axis=1,
)

# Wrap as a GeoDataFrame in EPSG:3035 (same CRS as `gdf_h`).
gdf_c = gpd.GeoDataFrame(df_c, geometry='geometry', crs="EPSG:3035")

In [None]:
# Attach the 100 m grid id and population to each home point (spatial join, geopandas defaults).
# NOTE(review): relies on `gdf_c` still holding the 100 m grid; the 1 km cell further down
# rebinds the same name, so this cell must run before it.
cells_100m = gdf_c[['grid', 'pop', 'geometry']]
home_100 = gpd.sjoin(gdf_h, cells_100m)
home_100 = home_100[['device_aid', 'grid', 'pop']]
home_100 = home_100.rename(columns={'grid': 'grid_100m', 'pop': 'pop_100m'})
home_100.head()

#### Population count at 1 km grid for weighting

In [None]:
# 1 km census grid: population count per cell (rebinds `df_c` / `gdf_c` from the 100 m step).
pop_1km_csv = 'dbs/geo/census_2022/Zensus2022_Bevoelkerungszahl/Zensus2022_Bevoelkerungszahl_1km-Gitter.csv'
df_c = pd.read_csv(pop_1km_csv, sep=';', encoding='latin-1')
df_c = df_c.set_axis(['grid', 'x_mp_1km', 'y_mp_1km', 'pop'], axis=1)

# One polygon per grid cell; `size=1000` is passed for the 1 km cells.
df_c['geometry'] = df_c.apply(
    lambda cell: workers.create_square(cell['x_mp_1km'], cell['y_mp_1km'], size=1000),
    axis=1,
)

# Wrap as a GeoDataFrame in EPSG:3035 (same CRS as `gdf_h`).
gdf_c = gpd.GeoDataFrame(df_c, geometry='geometry', crs="EPSG:3035")

In [None]:
# Attach the 1 km grid id and population to each home point (spatial join, geopandas defaults).
# `gdf_c` here is the 1 km grid built in the preceding cell.
cells_1km = gdf_c[['grid', 'pop', 'geometry']]
home_1k = gpd.sjoin(gdf_h, cells_1km)
home_1k = home_1k[['device_aid', 'grid', 'pop']]
home_1k = home_1k.rename(columns={'grid': 'grid_1km', 'pop': 'pop_1km'})
home_1k.head()

### 2.2 Average age

In [None]:
# 100 m census grid: average age per cell.
age_csv = 'dbs/geo/census_2022/Durchschnittsalter_in_Gitterzellen/Zensus2022_Durchschnittsalter_100m-Gitter.csv'
df_a = pd.read_csv(age_csv, sep=';', encoding='latin-1')

# Keep the first four columns only; `.copy()` detaches the slice so the column
# assignments below write to an independent frame.
df_a = df_a.iloc[:, :4].copy()
df_a.columns = ['grid', 'x_mp_100m', 'y_mp_100m', 'age']

# Values are read as text containing a comma (presumably a decimal comma); as before,
# only the part in front of the comma is kept, as an integer.
# Assign with `df_a['age'] = ...` instead of `df_a.loc[:, 'age'] = ...`: under pandas >= 2.0
# the `.loc` form writes in place and leaves the column as object dtype. The vectorised
# `.str` accessor also replaces the per-row Python lambda.
df_a['age'] = df_a['age'].str.split(',').str[0].astype('int64')

# One polygon per grid cell, built by `workers.create_square` from the cell's x/y columns.
df_a['geometry'] = df_a.apply(lambda row: workers.create_square(row['x_mp_100m'], row['y_mp_100m']), axis=1)

# Convert to GeoDataFrame (EPSG:3035, same CRS as `gdf_h`)
gdf_a = gpd.GeoDataFrame(df_a, geometry='geometry', crs="EPSG:3035")

In [None]:
# Attach the 100 m grid id and average age to each home point (spatial join, geopandas defaults).
age_cells = gdf_a[['grid', 'age', 'geometry']]
age_100 = gpd.sjoin(gdf_h, age_cells)
age_100 = age_100[['device_aid', 'grid', 'age']]
age_100 = age_100.rename(columns={'grid': 'grid_100m', 'age': 'age_100m'})
age_100.head()

### 2.3 Living space per resident

In [None]:
# 100 m census grid: average living space per resident.
space_csv = 'dbs/geo/census_2022/Durchschnittliche_Wohnflaeche_je_Bewohner_in_Gitterzellen/Zensus2022_Durchschn_Flaeche_je_Bewohner_100m-Gitter.csv'
df_s = pd.read_csv(space_csv, sep=';', encoding='latin-1')

# Keep the first four columns only; `.copy()` detaches the slice so the column
# assignments below write to an independent frame.
df_s = df_s.iloc[:, :4].copy()
df_s.columns = ['grid', 'x_mp_100m', 'y_mp_100m', 'space']

# Values are read as text containing a comma (presumably a decimal comma); as before,
# only the part in front of the comma is kept, as an integer.
# Assign with `df_s['space'] = ...` instead of `df_s.loc[:, 'space'] = ...`: under pandas >= 2.0
# the `.loc` form writes in place and leaves the column as object dtype. The vectorised
# `.str` accessor also replaces the per-row Python lambda.
df_s['space'] = df_s['space'].str.split(',').str[0].astype('int64')

# One polygon per grid cell, built by `workers.create_square` from the cell's x/y columns.
df_s['geometry'] = df_s.apply(lambda row: workers.create_square(row['x_mp_100m'], row['y_mp_100m']), axis=1)

# Convert to GeoDataFrame (EPSG:3035, same CRS as `gdf_h`)
gdf_s = gpd.GeoDataFrame(df_s, geometry='geometry', crs="EPSG:3035")

In [None]:
# Attach the 100 m grid id and living space to each home point (spatial join, geopandas defaults).
space_cells = gdf_s[['grid', 'space', 'geometry']]
space_100 = gpd.sjoin(gdf_h, space_cells)
space_100 = space_100[['device_aid', 'grid', 'space']]
space_100 = space_100.rename(columns={'grid': 'grid_100m', 'space': 'space_100m'})
space_100.head()

### 2.4 Net rent

In [None]:
# 100 m census grid: average net rent (Nettokaltmiete) per cell.
rent_csv = 'dbs/geo/census_2022/Zensus2022_Durchschn_Nettokaltmiete/Zensus2022_Durchschn_Nettokaltmiete_100m-Gitter.csv'
df_n = pd.read_csv(rent_csv, sep=';', encoding='latin-1')

# Keep the first four columns only; `.copy()` detaches the slice so the column
# assignments below write to an independent frame.
df_n = df_n.iloc[:, :4].copy()
df_n.columns = ['grid', 'x_mp_100m', 'y_mp_100m', 'net_rent']

# Values are read as text containing a comma (presumably a decimal comma); as before,
# only the part in front of the comma is kept, as an integer.
# Assign with `df_n['net_rent'] = ...` instead of `df_n.loc[:, 'net_rent'] = ...`: under
# pandas >= 2.0 the `.loc` form writes in place and leaves the column as object dtype.
# The vectorised `.str` accessor also replaces the per-row Python lambda.
df_n['net_rent'] = df_n['net_rent'].str.split(',').str[0].astype('int64')

# One polygon per grid cell, built by `workers.create_square` from the cell's x/y columns.
df_n['geometry'] = df_n.apply(lambda row: workers.create_square(row['x_mp_100m'], row['y_mp_100m']), axis=1)

# Convert to GeoDataFrame (EPSG:3035, same CRS as `gdf_h`)
gdf_n = gpd.GeoDataFrame(df_n, geometry='geometry', crs="EPSG:3035")

In [None]:
# Attach the 100 m grid id and net rent to each home point (spatial join, geopandas defaults).
rent_cells = gdf_n[['grid', 'net_rent', 'geometry']]
rent_100 = gpd.sjoin(gdf_h, rent_cells)
rent_100 = rent_100[['device_aid', 'grid', 'net_rent']]
rent_100 = rent_100.rename(columns={'grid': 'grid_100m', 'net_rent': 'net_rent_100m'})
rent_100.head()

## 3. Deprivation index

In [18]:
# Open the deprivation-index GeoTIFF. The handle is not closed here because the
# following cell samples from it.
grdi_tif = 'dbs/geo/povmap-grdi-v1-geotiff/povmap-grdi-v1.tif'
raster = rasterio.open(grdi_tif)

In [19]:
# Get the sampled values
tqdm.pandas()  # kept so `progress_apply` stays registered for any other cell that uses it

# The raster is sampled with lon/lat coordinates, so the home points are reprojected
# to EPSG:4326 first.
# NOTE(review): assumes the GeoTIFF itself is in EPSG:4326 - confirm via `raster.crs`.
home_lonlat = gdf_h.to_crs(4326).geometry

# Issue ONE `raster.sample` call over all coordinates instead of one call (and one
# generator) per point; the per-point version took ~1h20 for ~23.8M devices.
# Each sampled entry is still an array with one value per band, so the next cell
# (which takes element [0]) works unchanged.
sampled = raster.sample(zip(home_lonlat.x, home_lonlat.y))
gdf_h['grdi'] = pd.Series(
    list(tqdm(sampled, total=len(home_lonlat))),
    index=gdf_h.index,
    dtype=object,
)

100%|██████████| 23792532/23792532 [1:21:59<00:00, 4836.19it/s]


In [20]:
# Each sampled entry is an array of band values; keep the first one.
# The `ndim` guard leaves already-unpacked scalars untouched, so re-running this cell
# is a no-op instead of raising on values that are no longer indexable.
gdf_h['grdi'] = gdf_h['grdi'].apply(lambda v: v[0] if getattr(v, 'ndim', 0) else v)

## 4. Combine attributes

In [None]:
# Per-device attribute tables to attach; each one is keyed by `device_aid`.
dfs = [
    home_100[['device_aid', 'pop_100m']],
    home_1k[['device_aid', 'pop_1km', 'grid_1km']],
    age_100[['device_aid', 'age_100m']],
    space_100[['device_aid', 'space_100m']],
    rent_100[['device_aid', 'net_rent_100m']],
]

# Start from the device-level columns of the home table (geometry is left out).
base_cols = ['device_aid', 'num_unique_poi', 'num_visits', 'Water Sports', 'Tourist attractions',
             '2019', '2022', '2023', 'grdi']
result = gdf_h[base_cols].copy()

# Left-join each attribute table in turn so devices without a match keep NaN for that attribute.
for attrs in dfs:
    result = result.merge(attrs, on='device_aid', how='left')
result.head()

In [22]:
# Keep only devices that have a value for `num_unique_poi`.
result = result.dropna(subset=['num_unique_poi'])
print(f"No. of the individual devices: {result.shape[0]}")

No. of the individual devices: 22738298


In [23]:
# Write the combined table to the database as public.home_g, replacing any existing table.
# Rows are sent in batches of 10000 using multi-row INSERT statements (method='multi').
result.to_sql(
    name='home_g',
    con=engine,
    schema='public',
    if_exists='replace',
    index=False,
    chunksize=10000,
    method='multi',
)

22738298