# Time-shifted DiD modeling - hexagons
Data: daily visitation statistics stored under `dbs/combined_did_data/h3_grids_9et_X_X.parquet`.

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [2]:
# Load libs
import pandas as pd
import numpy as np
import os
os.environ['USE_PYGEOS'] = '0'
from tqdm import tqdm
import workers
import tdid
import random
import sqlalchemy
from linearmodels.panel import PanelOLS
from statsmodels.stats.weightstats import DescrStatsW
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Data location
user = workers.keys_manager['database']['user']
password = workers.keys_manager['database']['password']
port = workers.keys_manager['database']['port']
db_name = workers.keys_manager['database']['name']
engine = sqlalchemy.create_engine(f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable')

## 1. Load data

In [158]:
# Load hexagons
data_folder = 'dbs/combined_did_data/'
grp, lv = 'all', 'all'
file1 = data_folder + f'h3_grids_9et_{grp}_{lv}.parquet'
file2 = data_folder + f'h3_grids_dt_{grp}_{lv}.parquet'

In [162]:
df1 = pd.read_parquet(file1)
df2 = pd.read_parquet(file2)
# Deal weigh negative weights
if 'weight' in df1.columns:
    df1 = df1[df1['weight'] > 0]
if 'weight' in df2.columns:
    df2 = df2[df2['weight'] > 0]
print(f"Number of unique hexagons for the 9ET: {df1['h3_id'].nunique()}")
print(f"Number of unique hexagons for the DT: {df2['h3_id'].nunique()}")

Number of unique hexagons for the 9ET: 38985
Number of unique hexagons for the DT: 57824


### 1.1 Optional random sampling 50% hexagons

In [163]:
hex_ids_1 = list(df1.h3_id.unique())
hex_ids_2 = list(df2.h3_id.unique())
random.seed(42)
share = 0.2
# Calculate 20% of the list length
sample_size1 = int(len(hex_ids_1) * share)
sample_size2 = int(len(hex_ids_2) * share)

# Randomly sample 20% of items
sampled_hex_ids1 = random.sample(hex_ids_1, sample_size1)
sampled_hex_ids2 = random.sample(hex_ids_2, sample_size2)

df1 = df1.loc[df1.h3_id.isin(sampled_hex_ids1), :]
df2 = df2.loc[df2.h3_id.isin(sampled_hex_ids2), :]
print(f"Number of unique hexagons for the 9ET: {df1['h3_id'].nunique()}")
print(f"Number of unique hexagons for the DT: {df2['h3_id'].nunique()}")

Number of unique hexagons for the 9ET: 7797
Number of unique hexagons for the DT: 11564


## 2. Main model - Time-shifted DiD
$$y_{i,d}=\delta \cdot \text{9ET}_d \cdot \text{post}_d +\beta_1 \cdot \text{9ET}_d + \beta_2 \cdot \text{precipitation}_d + \beta_3 \cdot \text{precipitation}_d \cdot \text{post}_d + \beta_4 \cdot f_d + \beta_5 \cdot f_d \cdot \text{post}_d + \zeta_{s,y(d),m(d)} + \epsilon_{i, d}$$

- $y_{i,d}$ is the dependent variable for place $i$ at daily basis. 
- $\text{9ET}_d$ is a dummy variable indicating whether it is during June, July, or Aug.
- $\text{post}_d$ is a dummy variable indicating the treatment is on.
- $\delta$ is the coefficient of the effect of the 9ET.
- $\beta_1$ is the coefficient of the effect of being in June, July, or Aug.
- $f_d$ is the average gasoline price of the day $d$.
- $\zeta_{s,ym}$ is the state-year-month fixed effect.
- $\epsilon_{i, d}$ is the error term, clustered at the state level.

### 2.1 9ET

In [164]:
tvar = 'num_visits_wt'  # num_visits_wt, d_ha_wt
print(tvar, grp, lv, 'Jun-Aug vs. May')
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, 
                           treatment_months = [6,7,8], control_months=[5,], unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'Jun-Aug vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

print('The 9ET placebo...')
df = tdid.data_prep_placebo(data=df1, treatment_month=5, policy_t='20220516', treatment_yr=2022, 
                            p_9et=True, unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'placebo'
res.loc[:, 'policy'] = '9et'
print(res)

num_visits_wt all all Jun-Aug vs. May
   variable  coefficient    pvalue  std_error            tc_id policy
0       P_m     0.029958  0.064874   0.016228  Jun-Aug vs. May    9et
1    rain_m     0.014827  0.103113   0.009097  Jun-Aug vs. May    9et
2      rain    -0.006742  0.374955   0.007598  Jun-Aug vs. May    9et
3       9et     0.318310  0.000000   0.013342  Jun-Aug vs. May    9et
4  gasoline     0.869448  0.000000   0.035575  Jun-Aug vs. May    9et
The 9ET placebo...
   variable  coefficient    pvalue  std_error    tc_id policy
0       P_m    -0.130836  0.000000   0.009377  placebo    9et
1    rain_m     0.052005  0.000101   0.013377  placebo    9et
2      rain    -0.040801  0.000585   0.011866  placebo    9et
3  gasoline     3.266377  0.000000   0.120580  placebo    9et


In [165]:
tvar = 'd_ha_wt'  # num_visits_wt, d_ha_wt
print(tvar, 'Jun vs. May')
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, 
                           treatment_months = [6,7,8], control_months=[5,], unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

print('The 9ET placebo...')
df = tdid.data_prep_placebo(data=df1, treatment_month=5, policy_t='20220516', treatment_yr=2022, 
                            p_9et=True, unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'placebo'
res.loc[:, 'policy'] = '9et'
print(res)

d_ha_wt Jun vs. May
   variable  coefficient    pvalue  std_error        tc_id policy
0       P_m    -0.283467  0.000000   0.010627  Jun vs. May    9et
1    rain_m     0.001009  0.856257   0.005573  Jun vs. May    9et
2      rain    -0.004047  0.314860   0.004027  Jun vs. May    9et
3       9et     0.113722  0.000000   0.013716  Jun vs. May    9et
4  gasoline    -0.417129  0.000000   0.047378  Jun vs. May    9et
The 9ET placebo...
   variable  coefficient    pvalue  std_error    tc_id policy
0       P_m     0.033372  0.000435   0.009486  placebo    9et
1    rain_m    -0.003274  0.857175   0.018190  placebo    9et
2      rain     0.003405  0.796293   0.013191  placebo    9et
3  gasoline    -1.954800  0.000000   0.107567  placebo    9et


### 2.2 DT

In [166]:
tvar = 'num_visits_wt'  # num_visits_wt, d_ha_wt
print(tvar, grp, lv, 'Feb-Apr vs. May')
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, 
                           treatment_months = [5,], control_months=[3,], unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'Feb-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

num_visits_wt all all Feb-Apr vs. May
   variable  coefficient        pvalue  std_error            tc_id policy
0       P_m     0.208498  0.000000e+00   0.013603  Feb-Apr vs. May     dt
1    rain_m    -0.058744  0.000000e+00   0.006373  Feb-Apr vs. May     dt
2      rain     0.020887  3.202710e-03   0.007086  Feb-Apr vs. May     dt
3       9et     0.054160  3.502976e-12   0.007786  Feb-Apr vs. May     dt
4  gasoline     0.503341  0.000000e+00   0.013814  Feb-Apr vs. May     dt


In [167]:
print('The DT placebo...')
df = tdid.data_prep_placebo(data=df2, treatment_month=3, policy_t='20230313', treatment_yr=2023, 
                            p_9et=True, unit='h3', unit_time='time', control_months=[3,])
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'placebo'
res.loc[:, 'policy'] = 'dt'
print(res)

The DT placebo...
   variable  coefficient        pvalue  std_error    tc_id policy
0       P_m    -0.031274  8.437695e-15   0.004030  placebo     dt
1    rain_m    -0.036550  5.075935e-09   0.006253  placebo     dt
2      rain     0.014729  2.160602e-02   0.006412  placebo     dt
3  gasoline     0.244224  0.000000e+00   0.015502  placebo     dt


In [168]:
tvar = 'd_ha_wt'  # num_visits_wt, d_ha_wt
print(tvar, grp, lv, 'Apr vs. May')
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, 
                           treatment_months = [5,], control_months=[3,], unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
summary, res = tdid.time_shifted_did(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'Feb-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

d_ha_wt all all Apr vs. May
   variable  coefficient    pvalue  std_error            tc_id policy
0       P_m     0.119331  0.000017   0.027729  Feb-Apr vs. May     dt
1    rain_m     0.034275  0.006505   0.012595  Feb-Apr vs. May     dt
2      rain    -0.016261  0.035350   0.007728  Feb-Apr vs. May     dt
3       9et    -0.229681  0.000000   0.013683  Feb-Apr vs. May     dt
4  gasoline    -0.056382  0.303336   0.054776  Feb-Apr vs. May     dt


In [169]:
print('The DT placebo...')
df = tdid.data_prep_placebo(data=df2, treatment_month=3, policy_t='20230313', treatment_yr=2023, 
                            p_9et=True, unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'placebo'
res.loc[:, 'policy'] = 'dt'
print(res)

The DT placebo...
   variable  coefficient    pvalue  std_error    tc_id policy
0       P_m    -0.001067  0.878121   0.006955  placebo     dt
1    rain_m    -0.011661  0.193532   0.008968  placebo     dt
2      rain     0.021153  0.005975   0.007694  placebo     dt
3  gasoline     0.210006  0.000323   0.058400  placebo     dt


In [87]:
with open("dbs/panel_ols_summary.txt", "w") as f:
    f.write(summary.as_csv())

In [53]:
# Calculate the percentage change
coef = 0.09
percentage_change = (np.exp(coef) - 1) * 100
print(f"The treatment led to an estimated {percentage_change:.2f}% change in travel distance.")

The treatment led to an estimated 9.42% change in travel distance.


## 3. By public transit access

In [None]:
df1_pt = df1.groupby('h3_id')['pt_station_num'].mean().reset_index()
print(df1_pt['pt_station_num'].quantile([0.25, 0.75]))
df1_pt.loc[:, 'pt_access'] = pd.cut(df1_pt['pt_station_num'], [0, 10, 27, 10000], labels=['L', 'M', 'H'])
df1 = pd.merge(df1, df1_pt[['h3_id', 'pt_access']], on='h3_id', how='left')

In [173]:
df2_pt = df2.groupby('h3_id')['pt_station_num'].mean().reset_index()
print(df2_pt['pt_station_num'].quantile([0.25, 0.75]))
df2_pt.loc[:, 'pt_access'] = pd.cut(df2_pt['pt_station_num'], [0, 8, 23, 10000], labels=['L', 'M', 'H'])
df2 = pd.merge(df2, df2_pt[['h3_id', 'pt_access']], on='h3_id', how='left')

0.25     7.565891
0.75    22.955424
Name: pt_station_num, dtype: float64


In [176]:
tvar = 'd_ha_wt'  # num_visits_wt, d_ha_wt
print(tvar, 'Jun vs. May')
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, break_pt=True,
                           treatment_months = [6,7,8], control_months=[5,], unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', break_pt=True)
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

d_ha_wt Jun vs. May
   variable  coefficient        pvalue  std_error        tc_id policy
0      P_m1    -0.306993  0.000000e+00   0.023177  Jun vs. May    9et
1      P_m2    -0.284498  0.000000e+00   0.012497  Jun vs. May    9et
2      P_m3    -0.248947  0.000000e+00   0.019891  Jun vs. May    9et
3    rain_m     0.001241  8.204615e-01   0.005469  Jun vs. May    9et
4      rain    -0.004469  2.625138e-01   0.003988  Jun vs. May    9et
5      9et1     0.293038  4.056227e-08   0.053392  Jun vs. May    9et
6      9et2     0.271377  1.392048e-05   0.062455  Jun vs. May    9et
7      9et3     0.270487  4.793775e-06   0.059140  Jun vs. May    9et
8  gasoline    -0.392584  2.220446e-16   0.048006  Jun vs. May    9et


In [177]:
tvar = 'd_ha_wt'  # num_visits_wt, d_ha_wt
print(tvar, grp, lv, 'Apr vs. May')
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, break_pt=True,
                           treatment_months = [5,], control_months=[2, 3, 4], unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
summary, res = tdid.time_shifted_did(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', break_pt=True)
res.loc[:, 'tc_id'] = 'Feb-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

d_ha_wt all all Apr vs. May
   variable  coefficient        pvalue  std_error            tc_id policy
0      P_m1     0.002355  9.106969e-01   0.020994  Feb-Apr vs. May     dt
1      P_m2     0.078544  3.446879e-03   0.026854  Feb-Apr vs. May     dt
2      P_m3     0.139443  8.427058e-09   0.024210  Feb-Apr vs. May     dt
3    rain_m     0.034358  6.259708e-04   0.010046  Feb-Apr vs. May     dt
4      rain    -0.026868  1.679481e-05   0.006243  Feb-Apr vs. May     dt
5      9et1    -0.075080  1.572864e-01   0.053088  Feb-Apr vs. May     dt
6      9et2    -0.144178  7.849697e-03   0.054233  Feb-Apr vs. May     dt
7      9et3    -0.168780  1.696206e-03   0.053771  Feb-Apr vs. May     dt
8  gasoline     0.380657  0.000000e+00   0.037122  Feb-Apr vs. May     dt
