# Time-shifted DiD place filtering (hexagons-h3-7)
Data: daily visitation statistics stored under `dbs/combined_visits_day_did_hex/` categorized by area.

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): hard-coded absolute Windows path; every relative path used
# below (`dbs/...`, `results/...`) is resolved against this directory.
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [2]:
# Load libs
import pandas as pd
import numpy as np
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
from shapely.geometry import Point
import rasterio
from tqdm import tqdm
import h3
import workers
import tdid
import sqlalchemy
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Data location
# Connection settings come from the project's key store
# (`workers.keys_manager['database']`), so no credentials appear in the notebook.
user = workers.keys_manager['database']['user']
password = workers.keys_manager['database']['password']
port = workers.keys_manager['database']['port']
db_name = workers.keys_manager['database']['name']
# Build the URL from its components instead of formatting a string: a password
# containing URL-reserved characters (e.g. '@', ':', '/') is then escaped
# correctly rather than corrupting the connection string.
db_url = sqlalchemy.engine.URL.create(
    drivername='postgresql',
    username=user,
    password=password,
    host='localhost',
    port=port,
    database=db_name,
    query={'gssencmode': 'disable'},
)
engine = sqlalchemy.create_engine(db_url)

In [4]:
# Hexagon data
data_folder = 'dbs/combined_visits_day_did_hex/'
# Only the files directly under `data_folder` are needed, so take the first
# item yielded by os.walk instead of materialising a walk of the whole tree.
hex_file_names = next(os.walk(data_folder))[2]
# Map the file stem (text before the first '.') to the file path.
paths2hex = {fname.split('.')[0]: os.path.join(data_folder, fname)
             for fname in hex_file_names}
paths2hex_list = list(paths2hex.values())
print(paths2hex_list[0])

# Target folder to save the data for DiD modeling
target_folder = 'dbs/combined_did_data/'

dbs/combined_visits_day_did_hex/h_831e26fffffffff.parquet


In [50]:
# Subset processed in this run: only rows with `group == grp` and `level == lv`
# are kept when loading, and both values are embedded in the output file names.
grp, lv = 'age', 'q4'

## 1. Load hexagon visit patterns

In [51]:
# Columns read from each hexagon parquet file
cols = ['h3_id', 'date', 'year', 'month', 'weekday', 'precipitation',
        'pt_station_num', 'num_visits_wt', 'num_unique_device', 'd_ha_wt',
        'group', 'level']

# Load hexagons file by file and filter each one before concatenating,
# so only the retained rows are accumulated.
df_list = []
for lb in tqdm(paths2hex_list, desc='Load hexagons'):
    df_hex = pd.read_parquet(lb, columns=cols)
    # Keep rows with more than 3 unique devices, outside month 9, that belong
    # to the selected group/level.
    # NOTE(review): the device-count test used to appear twice in this mask;
    # if a second column was meant to be thresholded, add that condition here.
    keep = ((df_hex['num_unique_device'] > 3) & (df_hex['month'] != 9) &
            (df_hex['group'] == grp) & (df_hex['level'] == lv))
    df_list.append(df_hex.loc[keep, :].drop(columns=['group', 'level']))
df = pd.concat(df_list)
# Release the per-file pieces; `df` now holds everything that is needed.
del df_list, df_hex

print('By group', grp, 'Level', lv)
# The 9ET: months 5-8 of 2019 and 2022
df1 = df.loc[(df['year'].isin([2019, 2022])) & (df['month'].isin([5, 6, 7, 8])), :].copy()
print(f"No. of unique hexagons included for analysis - 9ET: {df1['h3_id'].nunique()}")
# Dates are kept as strings from here on (they are later used as a merge key).
df1['date'] = df1['date'].astype(str)

# The D-ticket: months 2-5 of 2022 and 2023
df2 = df.loc[(df['year'].isin([2022, 2023])) & (df['month'].isin([2, 3, 4, 5])), :].copy()
print(f"No. of unique hexagons included for analysis - DT: {df2['h3_id'].nunique()}")
df2['date'] = df2['date'].astype(str)

Load hexagons: 100%|██████████| 49/49 [02:11<00:00,  2.69s/it]


By group age Level q4
No. of unique hexagons included for analysis - 9ET: 30430
No. of unique hexagons included for analysis - DT: 34169


In [52]:
# Hexagon ids that occur in at least one of the two policy windows
h3_id_list = list(set(df1['h3_id']).union(df2['h3_id']))
print(len(h3_id_list))

37384


## 2. Connect hexagon (centroids) with state

In [53]:
tqdm.pandas()
df_h3 = pd.DataFrame({'h3_id': h3_id_list})

# Step 1: centroid of every H3 cell as a (lat, lon) tuple
df_h3['centroid'] = df_h3['h3_id'].progress_apply(h3.h3_to_geo)

# Step 2: unpack the tuples into separate latitude / longitude columns
df_h3[['lat', 'lon']] = pd.DataFrame(df_h3['centroid'].tolist(), index=df_h3.index)

# Step 3: point geometries (x = lon, y = lat), declared as WGS84 (EPSG:4326)
geometry = [Point(lon, lat) for lat, lon in zip(df_h3['lat'], df_h3['lon'])]
gdf = gpd.GeoDataFrame(df_h3, geometry=geometry).set_crs(epsg=4326)

100%|██████████| 37384/37384 [00:00<00:00, 341807.00it/s]


In [54]:
# Find h3_id: state
# State polygons reprojected to EPSG:4326; only features with GF == 9 are kept
# and the name column `GEN` is exposed as `state`.
gdf_state = (
    gpd.read_file("dbs/geo/vg2500_12-31.utm32s.shape/vg2500/vg2500_LAN.shp")
    .to_crs(4326)
    .loc[lambda g: g['GF'] == 9, :]
    .rename(columns={'GEN': 'state'})
)
# Spatial join of the hexagon centroids with the state polygons, reduced to a
# two-column lookup after dropping rows that contain any missing value.
states = gdf.sjoin(gdf_state[['state', 'geometry']]).dropna()[['h3_id', 'state']]

In [55]:
# Attach the state name to every hexagon centroid. The left join keeps
# hexagons without a match in `states`; their `state` stays missing.
# NOTE(review): `gdf` is overwritten, so re-running this cell merges `state`
# a second time on top of the already merged frame.
gdf = pd.merge(gdf, states, on='h3_id', how='left')
print(f'No. of locations: {gdf.h3_id.nunique()}')
gdf.head()

No. of locations: 37384


Unnamed: 0,h3_id,centroid,lat,lon,geometry,state
0,881fab8117fffff,"(49.58520779939761, 10.619123420680415)",49.585208,10.619123,POINT (10.61912 49.58521),Bayern
1,881f1386adfffff,"(52.164242614196034, 9.784649037487563)",52.164243,9.784649,POINT (9.78465 52.16424),Niedersachsen
2,881fa5add1fffff,"(51.57032761291947, 6.962343593247691)",51.570328,6.962344,POINT (6.96234 51.57033),Nordrhein-Westfalen
3,881fac3b37fffff,"(50.74194802044497, 9.259039158849113)",50.741948,9.259039,POINT (9.25904 50.74195),Hessen
4,881faa70e9fffff,"(48.863326460510926, 9.258221332697397)",48.863326,9.258221,POINT (9.25822 48.86333),Baden-Württemberg


In [56]:
# Add state to both policy frames and discard incomplete rows.
# dropna() removes rows with a missing value in ANY column, not only `state`.
df1 = df1.merge(gdf[['h3_id', 'state']], on='h3_id', how='left').dropna()
df2 = df2.merge(gdf[['h3_id', 'state']], on='h3_id', how='left').dropna()
print(df1.h3_id.nunique(), df2.h3_id.nunique())

30343 34056


## 3. Add fuel price (time-based)

In [57]:
# Read the whole `fuel_price` table through the database engine; the next cell
# uses its `date` and `gasoline` columns.
df_f = pd.read_sql("""SELECT * FROM fuel_price;""", con=engine)

In [58]:
# Attach the gasoline price by date; the left join keeps every visit row.
# NOTE(review): `date` in df1/df2 was cast to str on load - confirm that
# `fuel_price.date` is stored in the same string format, otherwise nothing matches.
df1 = df1.merge(df_f[['date', 'gasoline']], on='date', how='left')
df2 = df2.merge(df_f[['date', 'gasoline']], on='date', how='left')
print(df1.h3_id.nunique(), df2.h3_id.nunique())

30343 34056


## 4. Select complete data and save

In [59]:
def h3_stats_ym(data):
    """Count the distinct years present in `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of one hexagon; must contain a `year` column.

    Returns
    -------
    pandas.Series
        Single entry `comp_y` holding the number of distinct years.
        The callers treat `comp_y == 2` as "complete".
    """
    return pd.Series({'comp_y': data['year'].nunique()})

# A hexagon counts as complete when it has observations in both years of its
# policy window, i.e. comp_y == 2. The built-in grouped `nunique` produces the
# same `h3_id` / `comp_y` table as applying `h3_stats_ym` group by group, but
# without one Python-level call per hexagon.
df1_r = df1.groupby('h3_id')['year'].nunique().rename('comp_y').reset_index()
print("No. of h3 grids complete for the 9ET", len(df1_r.loc[df1_r.comp_y==2, :]))

df2_r = df2.groupby('h3_id')['year'].nunique().rename('comp_y').reset_index()
print("No. of h3 grids complete for the D-Ticket", len(df2_r.loc[df2_r.comp_y==2, :]))

100%|██████████| 30343/30343 [00:10<00:00, 2945.20it/s]


No. of h3 grids complete for the 9ET 10392


100%|██████████| 34056/34056 [00:10<00:00, 3200.25it/s]

No. of h3 grids complete for the D-Ticket 22742





In [60]:
# Restrict each policy frame to the hexagons observed in both years (comp_y == 2).
df1_rh = df1[df1['h3_id'].isin(df1_r.loc[df1_r['comp_y'] == 2, 'h3_id'])]
df2_rh = df2[df2['h3_id'].isin(df2_r.loc[df2_r['comp_y'] == 2, 'h3_id'])]
print(f"No. of h3 grids included for analysis - 9ET: {df1_rh['h3_id'].nunique()}")
print(f"No. of h3 grids included for analysis - DT: {df2_rh['h3_id'].nunique()}")

No. of h3 grids included for analysis - 9ET: 10392
No. of h3 grids included for analysis - DT: 22742


In [61]:
# Create the output folder on first use so the writes below do not fail on a
# fresh checkout where it does not exist yet.
os.makedirs(target_folder, exist_ok=True)
# One file per policy window, tagged with the selected group and level.
df1_rh.to_parquet(os.path.join(target_folder, f'h3_grids_9et_{grp}_{lv}.parquet'), index=False)
df2_rh.to_parquet(os.path.join(target_folder, f'h3_grids_dt_{grp}_{lv}.parquet'), index=False)

## 5. Time series

In [62]:
# Read the filtered frames back from disk for the selected group and level.
df1_rh = pd.read_parquet(os.path.join(target_folder, f'h3_grids_9et_{grp}_{lv}.parquet'))
df2_rh = pd.read_parquet(os.path.join(target_folder, f'h3_grids_dt_{grp}_{lv}.parquet'))

In [63]:
def visit_patterns_hex_date(data):
    """Summarise a group of hexagon rows (one date) into a single record.

    The input frame is only read, never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `date`, `year`, `month`, `weekday`, `pt_station_num`,
        `num_visits_wt` and `d_ha_wt`.

    Returns
    -------
    pandas.Series
        `date` (as string), `year`, `month`, `weekday`, `pt_station_num`
        taken from the first row, plus the median and the 25% / 75% quantiles
        of `num_visits_wt` (`visit_50`, `visit_25`, `visit_75`) and of
        `d_ha_wt` (`d_50`, `d_25`, `d_75`). The quantiles are computed on the
        log10 scale, ignoring NaN, and transformed back with 10 ** x.
    """
    metrics_dict = dict()
    # Descriptors of the group, read from its first row. The date is converted
    # to a string on a copy rather than written back into the caller's frame.
    metrics_dict['date'] = data['date'].astype(str).values[0]
    for var in ('year', 'month', 'weekday', 'pt_station_num'):
        metrics_dict[var] = data[var].values[0]

    # Visits: log-transform once and reuse it for all three quantiles
    log_visits = np.log10(data['num_visits_wt'])
    metrics_dict['visit_50'] = 10 ** (log_visits.median())
    metrics_dict['visit_25'] = 10 ** (np.nanquantile(log_visits, 0.25))
    metrics_dict['visit_75'] = 10 ** (np.nanquantile(log_visits, 0.75))

    # Distance
    log_dist = np.log10(data['d_ha_wt'])
    metrics_dict['d_50'] = 10 ** (log_dist.median())
    metrics_dict['d_25'] = 10 ** (np.nanquantile(log_dist, 0.25))
    metrics_dict['d_75'] = 10 ** (np.nanquantile(log_dist, 0.75))
    return pd.Series(metrics_dict)

In [64]:
tqdm.pandas()
# One summary row per date for each policy window, stacked and labelled with `policy`.
df_v = pd.concat([
    df1_rh.groupby('date').progress_apply(visit_patterns_hex_date).reset_index(drop=True).assign(policy='9et'),
    df2_rh.groupby('date').progress_apply(visit_patterns_hex_date).reset_index(drop=True).assign(policy='dt'),
])
# Make sure the results folder exists before writing (the single-argument
# os.path.join that used to wrap the path returned it unchanged).
os.makedirs("results/hex_time_series", exist_ok=True)
df_v.to_parquet(f"results/hex_time_series/{grp}_{lv}.parquet", index=False)

100%|██████████| 240/240 [00:00<00:00, 322.44it/s]
100%|██████████| 231/231 [00:01<00:00, 162.70it/s]


## 6. Entropy balancing
### 6.1 The 9ET data

In [28]:
# 9ET, outcome = natural log of weighted visits.
# Control: 2019 / month 5; treatment: 2022 / months 6-8; covariate: pt_station_num.
var = 'num_visits_wt'
df1_rh[f'ln_{var}'] = np.log(df1_rh[var])
df1_w = tdid.data_filtering_and_weighting(
    data=df1_rh,
    unit='h3',
    var=f'ln_{var}',
    covar='pt_station_num',
    control_y=2019, control_m=[5],
    treatment_y=2022, treatment_m=[6, 7, 8],
)
df1_w.to_parquet(target_folder + f'h3_grids_9et_{grp}_{lv}_wt_v.parquet', index=False)

                                     CVXPY                                     
                                     v1.5.3                                    
(CVXPY) Nov 13 09:32:53 AM: Your problem has 719182 variables, 719185 constraints, and 0 parameters.
(CVXPY) Nov 13 09:32:53 AM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Nov 13 09:32:53 AM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Nov 13 09:32:53 AM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Nov 13 09:32:53 AM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Nov 13 09:32:53 AM: Compiling problem (target solver=SC

In [29]:
# 9ET, outcome = natural log of `d_ha_wt`.
# Control: 2019 / month 5; treatment: 2022 / months 6-8; covariate: pt_station_num.
var = 'd_ha_wt'
df1_rh[f'ln_{var}'] = np.log(df1_rh[var])
df1_w = tdid.data_filtering_and_weighting(
    data=df1_rh,
    unit='h3',
    var=f'ln_{var}',
    covar='pt_station_num',
    control_y=2019, control_m=[5],
    treatment_y=2022, treatment_m=[6, 7, 8],
)
df1_w.to_parquet(target_folder + f'h3_grids_9et_{grp}_{lv}_wt_d.parquet', index=False)

                                     CVXPY                                     
                                     v1.5.3                                    
(CVXPY) Nov 13 09:34:01 AM: Your problem has 719182 variables, 719185 constraints, and 0 parameters.
(CVXPY) Nov 13 09:34:01 AM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Nov 13 09:34:01 AM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Nov 13 09:34:01 AM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Nov 13 09:34:01 AM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Nov 13 09:34:01 AM: Compiling problem (target solver=SC

### 6.2 The DT

In [None]:
# D-Ticket, outcome = natural log of weighted visits.
# Control: 2022 / months 2-4; treatment: 2023 / month 5; covariate: pt_station_num.
var = 'num_visits_wt'
df2_rh[f'ln_{var}'] = np.log(df2_rh[var])
df2_w = tdid.data_filtering_and_weighting(
    data=df2_rh,
    unit='h3',
    var=f'ln_{var}',
    covar='pt_station_num',
    control_y=2022, control_m=[2, 3, 4],
    treatment_y=2023, treatment_m=[5],
)
df2_w.to_parquet(target_folder + f'h3_grids_dt_{grp}_{lv}_wt_v.parquet', index=False)

                                     CVXPY                                     
                                     v1.5.3                                    
(CVXPY) Nov 13 09:37:00 AM: Your problem has 2126060 variables, 2126063 constraints, and 0 parameters.
(CVXPY) Nov 13 09:37:00 AM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Nov 13 09:37:00 AM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Nov 13 09:37:00 AM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
(CVXPY) Nov 13 09:37:00 AM: Your problem is compiled with the CPP canonicalization backend.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Nov 13 09:37:00 AM: Compiling problem (target solver=

In [None]:
# D-Ticket, outcome = natural log of `d_ha_wt`.
# Control: 2022 / months 2-4; treatment: 2023 / month 5; covariate: pt_station_num.
var = 'd_ha_wt'
df2_rh[f'ln_{var}'] = np.log(df2_rh[var])
df2_w = tdid.data_filtering_and_weighting(
    data=df2_rh,
    unit='h3',
    var=f'ln_{var}',
    covar='pt_station_num',
    control_y=2022, control_m=[2, 3, 4],
    treatment_y=2023, treatment_m=[5],
)
df2_w.to_parquet(target_folder + f'h3_grids_dt_{grp}_{lv}_wt_d.parquet', index=False)