# Time-shifted DiD modeling - hexagons
Data: daily visitation statistics stored under `dbs/combined_did_data/`, in `h3_grids_9et_X_X_c.parquet` (9ET) and `h3_grids_dt_X_X_c.parquet` (DT).

In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [2]:
# Load libs
import pandas as pd
import numpy as np
import os
os.environ['USE_PYGEOS'] = '0'
from tqdm import tqdm
import workers
import tdid
import random
import sqlalchemy
from linearmodels.panel import PanelOLS
import statsmodels.formula.api as smf
from statsmodels.stats.weightstats import DescrStatsW
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Database connection: credentials are read from the project's key manager
_db = workers.keys_manager['database']
user, password, port, db_name = _db['user'], _db['password'], _db['port'], _db['name']
engine = sqlalchemy.create_engine(
    f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable'
)

In [4]:
def percent_convert(x):
    """Print the percentage change implied by a log-point value ``x``.

    The value printed is ``(exp(x) - 1) * 100``. Nothing is returned.
    """
    ratio = np.exp(x)
    print((ratio - 1) * 100)

## 1. Load data

In [5]:
# Parquet inputs for the hexagon panels: file1 is the 9ET panel, file2 the DT panel
data_folder = 'dbs/combined_did_data/'
grp = lv = 'all'
file1, file2 = (
    data_folder + f'h3_grids_{policy}_{grp}_{lv}_c.parquet'
    for policy in ('9et', 'dt')
)

In [6]:
# Read the 9ET (df1) and DT (df2) hexagon panels from parquet
df1, df2 = pd.read_parquet(file1), pd.read_parquet(file2)
# Keep only rows with a strictly positive weight (zero and negative weights are dropped);
# a panel without a 'weight' column is left untouched
if 'weight' in df1.columns:
    df1 = df1.loc[df1['weight'] > 0]
if 'weight' in df2.columns:
    df2 = df2.loc[df2['weight'] > 0]

In [7]:
# Restrict each panel to its study months: Jun-Sep for the 9ET panel, Mar-May for the DT panel.
# NOTE(review): later cells request control month 5 for the 9ET and month 2 for the DT placebo,
# which this filter removes - confirm the intended windows.
df1 = df1[df1['month'].isin([6, 7, 8, 9])]
df2 = df2[df2['month'].isin([3, 4, 5])]
# Report how many distinct hexagons remain in each panel
print(f"Number of unique hexagons for the 9ET: {df1['h3_id'].nunique()}")
print(f"Number of unique hexagons for the DT: {df2['h3_id'].nunique()}")

Number of unique hexagons for the 9ET: 19753
Number of unique hexagons for the DT: 35682


### 1.1 Optional complete sampling

In [9]:
# Filter each panel with the project helper `tdid.place_filter_complete`, pairing a control
# year with a treatment year at the hexagon ('h3') level.
# NOTE(review): presumably this keeps only hexagons observed in both years - confirm in `tdid`.
df1 = tdid.place_filter_complete(data=df1, control_y=2019, treatment_y=2022, unit='h3')
df2 = tdid.place_filter_complete(data=df2, control_y=2022, treatment_y=2023, unit='h3')

100%|██████████| 38981/38981 [00:48<00:00, 800.16it/s]
The 9ET searching: 100%|██████████| 38981/38981 [00:28<00:00, 1370.27it/s]
100%|██████████| 57797/57797 [01:16<00:00, 753.00it/s]
The 9ET searching: 100%|██████████| 57797/57797 [00:43<00:00, 1327.11it/s]


In [10]:
# Re-count the distinct hexagons per panel after the filtering step.
# NOTE(review): the saved outputs report more hexagons here than before the filter, so they
# appear to come from different runs - re-run the notebook top to bottom to confirm.
print(f"Number of unique hexagons for the 9ET: {df1['h3_id'].nunique()}")
print(f"Number of unique hexagons for the DT: {df2['h3_id'].nunique()}")

Number of unique hexagons for the 9ET: 22299
Number of unique hexagons for the DT: 35740


### 1.2 Optional: random sampling of a proportion of hexagons

In [6]:
# Draw a random subset of hexagons from each panel and keep only the rows of those hexagons.
hex_ids_1 = list(df1.h3_id.unique())
hex_ids_2 = list(df2.h3_id.unique())
random.seed(42)  # fixed seed: the same ids are drawn again as long as the id lists are unchanged
share = 0.1  # fraction of hexagons to keep (10%)
# Number of hexagons to draw: `share` (10%) of the list length, rounded down
sample_size1 = int(len(hex_ids_1) * share)
sample_size2 = int(len(hex_ids_2) * share)

# Randomly sample `share` (10%) of the hexagon ids, without replacement
sampled_hex_ids1 = random.sample(hex_ids_1, sample_size1)
sampled_hex_ids2 = random.sample(hex_ids_2, sample_size2)

# Overwrite the panels with the sampled subset and report the remaining hexagon counts
df1 = df1.loc[df1.h3_id.isin(sampled_hex_ids1), :]
df2 = df2.loc[df2.h3_id.isin(sampled_hex_ids2), :]
print(f"Number of unique hexagons for the 9ET: {df1['h3_id'].nunique()}")
print(f"Number of unique hexagons for the DT: {df2['h3_id'].nunique()}")

Number of unique hexagons for the 9ET: 3898
Number of unique hexagons for the DT: 5782


## 2. Main model - Time-shifted DiD
$$y_{i,d}=\delta \cdot \text{9ET}_d \cdot \text{post}_d +\beta_1 \cdot \text{9ET}_d + \beta_2 \cdot \text{precipitation}_d + \beta_3 \cdot \text{precipitation}_d \cdot \text{post}_d + \beta_4 \cdot f_d + \beta_5 \cdot f_d \cdot \text{post}_d + \zeta_{s,y(d),m(d)} + \epsilon_{i, d}$$

- $y_{i,d}$ is the dependent variable for place $i$ on day $d$.
- $\text{9ET}_d$ is a dummy variable indicating whether day $d$ falls in June, July, or August.
- $\text{post}_d$ is a dummy variable indicating that the treatment is on.
- $\delta$ is the coefficient of the effect of the 9ET.
- $\beta_1$ is the coefficient of the effect of being in June, July, or August.
- $f_d$ is the average gasoline price on day $d$.
- $\zeta_{s,y(d),m(d)}$ is the state-year-month fixed effect (state $s$, year $y(d)$ and month $m(d)$ of day $d$).
- $\epsilon_{i, d}$ is the error term, clustered at the state level.

### 2.1 9ET

In [9]:
# Row counts of the 9ET (df1) and DT (df2) panels, one per line.
# Two separate statements: the former `print(...), print(...)` built a tuple whose value
# `(None, None)` was echoed as the cell output.
print(len(df1))
print(len(df2))

3421661
4100652


(None, None)

In [10]:
# 9ET main model with the log of `num_visits_wt` as the outcome (years 2019 vs. 2022).
# NOTE(review): the printed label and tc_id say 'Jun-Aug vs. May', but control_months is [9,]
# (September) - confirm which one is intended.
tvar = 'num_visits_wt'  # num_visits_wt, d_ha_wt
print(tvar, grp, lv, 'Jun-Aug vs. May')
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, 
                           treatment_months = [6,7,8], control_months=[9,], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
# Returns a pair: an object whose `.summary` is printed in the next cell, and a coefficient
# table with columns variable / coefficient / pvalue / std_error
sum_et_v, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
# Tag the coefficient table with the comparison label and the policy
res.loc[:, 'tc_id'] = 'Jun-Aug vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

num_visits_wt all all Jun-Aug vs. May
     variable  coefficient    pvalue  std_error            tc_id policy
0         P_m     0.186527  0.000000   0.005967  Jun-Aug vs. May    9et
1      rain_m     0.018864  0.000607   0.005502  Jun-Aug vs. May    9et
2        rain    -0.013396  0.035145   0.006359  Jun-Aug vs. May    9et
3  fuel_price     1.411198  0.000000   0.034620  Jun-Aug vs. May    9et


In [12]:
# Show the full estimation summary of the most recently fitted model (`sum_et_v`)
print(sum_et_v.summary)

                         Absorbing LS Estimation Summary                          
Dep. Variable:       ln_num_visits_wt   R-squared:                          0.1699
Estimator:               Absorbing LS   Adj. R-squared:                     0.1699
No. Observations:             3421661   F-statistic:                        4083.9
Date:                Thu, Dec 05 2024   P-value (F-stat):                   0.0000
Time:                        13:17:11   Distribution:                      chi2(4)
Cov. Estimator:             clustered   R-squared (No Effects):             0.1605
                                        Variables Absorbed:                 71.000
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
P_m            0.1865     0.0060     31.261     0.0000      0.1748      0.1982
rain_m         0.018

In [13]:
# Express the P_m coefficient and its standard error (copied by hand from the table above) in percent.
# NOTE(review): (exp(se) - 1) * 100 is not the standard error of the percentage effect; the
# delta method gives roughly 100 * exp(coef) * se - confirm what is meant to be reported.
percent_convert(0.186527)
percent_convert(0.005967)

20.505715818972803
0.5984838006645843


In [None]:
# 9ET placebo test; reuses `tvar` from whichever model cell ran last.
# NOTE(review): treatment_month=5 needs May rows, but df1 was restricted to months 6-9 when
# the data was loaded - confirm the data window before running. This cell has no saved output.
print('The 9ET placebo...')
df = tdid.data_prep_placebo(data=df1, treatment_month=5, policy_t='20220516', treatment_yr=2022, 
                            p_9et=True, unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
# Only the coefficient table is kept; the first return value is discarded
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', drop_month=False)
res.loc[:, 'tc_id'] = 'placebo'
res.loc[:, 'policy'] = '9et'
print(res)

In [16]:
# 9ET main model with the log of `d_ha_wt` as the outcome (years 2019 vs. 2022).
# NOTE(review): the label says 'Jun vs. May', but treatment_months is [7,8] and control_months
# is [9,]; the visit-count model above uses [6,7,8] - confirm the intended months and label.
tvar = 'd_ha_wt'  # num_visits_wt, d_ha_wt
print(tvar, 'Jun vs. May')
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, 
                           treatment_months = [7,8], control_months=[9,], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
# Overwrites `sum_et_v` from the visit-count model
sum_et_v, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

d_ha_wt Jun vs. May
     variable  coefficient    pvalue  std_error        tc_id policy
0         P_m     0.037257  0.336092   0.038732  Jun vs. May    9et
1      rain_m    -0.018428  0.455521   0.024695  Jun vs. May    9et
2        rain     0.024715  0.104486   0.015223  Jun vs. May    9et
3  fuel_price    -0.178805  0.179420   0.133184  Jun vs. May    9et


In [17]:
# Show the full estimation summary of the most recently fitted model (`sum_et_v`)
print(sum_et_v.summary)

                         Absorbing LS Estimation Summary                          
Dep. Variable:             ln_d_ha_wt   R-squared:                          0.0240
Estimator:               Absorbing LS   Adj. R-squared:                     0.0240
No. Observations:             2573641   F-statistic:                        28.376
Date:                Thu, Dec 05 2024   P-value (F-stat):                   0.0000
Time:                        13:23:17   Distribution:                      chi2(4)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0006
                                        Variables Absorbed:                 55.000
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
P_m            0.0373     0.0387     0.9619     0.3361     -0.0387      0.1132
rain_m        -0.018

In [13]:
# Convert a coefficient and a standard error to percent.
# NOTE(review): these hand-typed inputs do not match the P_m estimate printed above
# (0.037257, std. error 0.038732); they look like leftovers from an earlier run - update or remove.
percent_convert(-0.057084)
percent_convert(0.0207)

-5.823546641575128
2.09157309724346


In [None]:
# 9ET placebo test; reuses `tvar` from whichever model cell ran last. The code is the same as
# the placebo cell after the visit-count model.
# NOTE(review): treatment_month=5 needs May rows, but df1 was restricted to months 6-9 when
# the data was loaded - confirm the data window before running. This cell has no saved output.
print('The 9ET placebo...')
df = tdid.data_prep_placebo(data=df1, treatment_month=5, policy_t='20220516', treatment_yr=2022, 
                            p_9et=True, unit='h3', unit_time='time')
df[f"ln_{tvar}"] = np.log(df[tvar])
# Only the coefficient table is kept; the first return value is discarded
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', drop_month=False)
res.loc[:, 'tc_id'] = 'placebo'
res.loc[:, 'policy'] = '9et'
print(res)

### Parallel trend - 9et

In [9]:
# Visual parallel-trend check: the daily mean of the logged outcome, back-transformed with
# exp (i.e. a geometric mean), plotted by day of year for 2019 and 2022.
tvar = 'd_ha_wt'
df1_c = df1.copy()  # work on a copy so df1 is not modified
df1_c[f"ln_{tvar}"] = np.log(df1_c[tvar])
df1_c['time'] = pd.to_datetime(df1_c['date'])
df1_c['Time'] = df1_c['time'].dt.dayofyear  # day of year, 1-based
# Calculate the daily average of the logged outcome for each year
daily_avg = df1_c.groupby(['Time', 'year'])[f"ln_{tvar}"].mean().reset_index()
daily_avg[tvar] = np.exp(daily_avg[f"ln_{tvar}"])
# Pivot to one column per year (the column keys are the float years 2019.0 / 2022.0)
daily_pivot = daily_avg.pivot(index='Time', columns='year', values=tvar)

# Plot the trends
plt.figure(figsize=(12, 6))
plt.plot(daily_pivot.index, daily_pivot[2019.0], label='Control Group')
plt.plot(daily_pivot.index, daily_pivot[2022.0], label='Treatment Group')
# NOTE(review): 122 + 31 = 153 is 2 June in a non-leap year (1 June is day 152) - confirm the
# intended treatment start.
plt.axvline(122+31, color='red', linestyle='--', label='Treatment Start')
plt.title('Daily Trends of Target Variable by Group')
# NOTE(review): the x axis is day of year, not a calendar date - consider relabelling
plt.xlabel('Date')
plt.ylabel('Average Target Variable')
plt.legend()
plt.show()

In [8]:
# Treatment start expressed as a day of year.
# NOTE(review): 122 + 31 = 153 is 2 June in a non-leap year (1 June is day 152) - confirm.
treatment_start_date = 122 + 31
# Calculate days relative to treatment start (requires `df1_c` from the plotting cell above)
df1_c['days_since_treatment'] = df1_c['Time'] - treatment_start_date

# Copy the relative day into its own integer column (no dummy variables are created here)
df1_c['relative_day'] = df1_c['days_since_treatment']
df1_c['relative_day'] = df1_c['relative_day'].astype(int)
# Helper that turns a signed relative day into a label usable as a column/term name
def rename_day(day):
    """Return 'day_0' for zero, 'day_m<n>' for negative days and 'day_p<n>' for positive days."""
    if day == 0:
        return 'day_0'
    if day < 0:
        # 'm' stands for minus; the sign is dropped from the number
        return f'day_m{abs(day)}'
    # 'p' stands for plus
    return f'day_p{day}'

# Label every row with its relative day
df1_c['relative_day_label'] = df1_c['relative_day'].apply(rename_day)
# Remove the rows observed on relative day 0.
# NOTE(review): this removes observations; it does not set day 0 as the reference category of
# the regression below - confirm this is intended.
df1_c = df1_c.loc[df1_c['relative_day_label']!='day_0']
df1_c.head()

Unnamed: 0,h3_id,date,year,month,weekday,precipitation,pt_station_num,num_visits_wt,num_unique_device,d_ha_wt,state,gasoline,ln_d_ha_wt,time,Time,days_since_treatment,relative_day,relative_day_label
3206,881e3202abfffff,2019-05-08,2019.0,5.0,2.0,5.2,32.64467,18.211092,4.0,14.203828,Bayern,1.505617,2.653512,2019-05-08,128,-25,-25,day_m25
3207,881e3202abfffff,2019-05-09,2019.0,5.0,3.0,1.5,25.92758,34.587244,6.0,7.331139,Bayern,1.505105,1.992131,2019-05-09,129,-24,-24,day_m24
3208,881e3202abfffff,2019-05-23,2019.0,5.0,3.0,0.0,23.347887,21.045556,4.0,2.54476,Bayern,1.542476,0.934036,2019-05-23,143,-10,-10,day_m10
3209,881e3202abfffff,2019-06-18,2019.0,6.0,1.0,0.0,36.051379,31.263494,4.0,5.836806,Bayern,1.479828,1.764184,2019-06-18,169,16,16,day_p16
3210,881e3202abfffff,2019-08-13,2019.0,8.0,1.0,6.1,32.505753,28.158443,4.0,2.386877,Bayern,1.422899,0.869986,2019-08-13,225,72,72,day_p72


In [None]:
# Regression of the logged outcome on year, relative-day labels and hexagon dummies.
# NOTE(review): the formula has no year x relative-day interaction, so the day coefficients are
# common to both years - confirm that this tests parallel trends as intended.
# NOTE(review): C(h3_id) adds one dummy per hexagon, which can be very large in memory with
# thousands of hexagons; the outcome name is hard-coded rather than taken from `tvar`.
formula = "ln_d_ha_wt ~ year + relative_day_label + C(h3_id)"

# Fit the model with standard errors clustered by state
model = smf.ols(formula, data=df1_c).fit(    
    cov_type='cluster',
    cov_kwds={'groups': df1_c['state']})

# Display the results
print(model.summary())

### 2.2 DT

In [14]:
# DT main model with the log of `num_visits_wt` as the outcome (years 2022 vs. 2023).
# NOTE(review): the label says 'Feb-Apr vs. May', but control_months is [3,4] (March-April) -
# confirm the label.
tvar = 'num_visits_wt'  # num_visits_wt, d_ha_wt
print(tvar, grp, lv, 'Feb-Apr vs. May')
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, 
                           treatment_months = [5,], control_months=[3,4], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
# Overwrites `sum_et_v` from the 9ET models
sum_et_v, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'Feb-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

num_visits_wt all all Feb-Apr vs. May
     variable  coefficient        pvalue  std_error            tc_id policy
0         P_m     0.169532  0.000000e+00   0.009520  Feb-Apr vs. May     dt
1      rain_m    -0.035151  2.140101e-07   0.006777  Feb-Apr vs. May     dt
2        rain     0.017313  1.793798e-02   0.007314  Feb-Apr vs. May     dt
3  fuel_price     0.110948  4.160655e-08   0.020232  Feb-Apr vs. May     dt


In [15]:
# Show the estimation summary of the most recently fitted model (the 9ET cells print
# `sum_et_v.summary` instead; the saved output has the same layout)
print(sum_et_v)

                         Absorbing LS Estimation Summary                          
Dep. Variable:       ln_num_visits_wt   R-squared:                          0.0245
Estimator:               Absorbing LS   Adj. R-squared:                     0.0245
No. Observations:             4100652   F-statistic:                        519.59
Date:                Mon, Dec 02 2024   P-value (F-stat):                   0.0000
Time:                        12:37:22   Distribution:                      chi2(4)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0024
                                        Variables Absorbed:                 55.000
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
P_m            0.1695     0.0095     17.807     0.0000      0.1509      0.1882
rain_m        -0.035

In [16]:
# Express the P_m coefficient and its standard error (copied by hand from the summary above) in percent.
# NOTE(review): (exp(se) - 1) * 100 is not the standard error of the percentage effect; the
# delta method gives roughly 100 * exp(coef) * se - confirm what is meant to be reported.
percent_convert(0.1695)
percent_convert(0.0095)

18.47123470331209
0.9545268235856774


In [None]:
# DT placebo test; reuses `tvar` from whichever model cell ran last.
# NOTE(review): control_months=[2,3,] needs February rows, but df2 was restricted to months 3-5
# when the data was loaded - confirm the data window before running. No saved output.
print('The DT placebo...')
df = tdid.data_prep_placebo(data=df2, treatment_month=4, policy_t='20230401', treatment_yr=2023, 
                            p_9et=False, unit='h3', unit_time='time', control_months=[2,3,])
df[f"ln_{tvar}"] = np.log(df[tvar])
# Only the coefficient table is kept; the first return value is discarded
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'placebo'
res.loc[:, 'policy'] = 'dt'
print(res)

In [17]:
# DT main model with the log of `d_ha_wt` as the outcome (years 2022 vs. 2023).
# NOTE(review): the printed label says 'Apr vs. May' while tc_id says 'Feb-Apr vs. May', and
# control_months is [3,4] (March-April) - confirm and align the labels.
tvar = 'd_ha_wt'  # num_visits_wt, d_ha_wt
print(tvar, grp, lv, 'Apr vs. May')
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, 
                           treatment_months = [5,], control_months=[3,4], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
# Overwrites `sum_et_v` from the previous model
sum_et_v, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'Feb-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

d_ha_wt all all Apr vs. May
     variable  coefficient    pvalue  std_error            tc_id policy
0         P_m     0.107262  0.008300   0.040636  Feb-Apr vs. May     dt
1      rain_m     0.057261  0.024458   0.025451  Feb-Apr vs. May     dt
2        rain    -0.027108  0.080473   0.015509  Feb-Apr vs. May     dt
3  fuel_price    -0.216803  0.000344   0.060563  Feb-Apr vs. May     dt


In [18]:
# Show the estimation summary of the most recently fitted model
print(sum_et_v)

                         Absorbing LS Estimation Summary                          
Dep. Variable:             ln_d_ha_wt   R-squared:                          0.0228
Estimator:               Absorbing LS   Adj. R-squared:                     0.0228
No. Observations:             4100652   F-statistic:                        138.39
Date:                Mon, Dec 02 2024   P-value (F-stat):                   0.0000
Time:                        12:39:35   Distribution:                      chi2(4)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0029
                                        Variables Absorbed:                 55.000
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
P_m            0.1073     0.0406     2.6396     0.0083      0.0276      0.1869
rain_m         0.057

In [20]:
# Express the P_m coefficient and its standard error (copied by hand from the summary above) in percent.
# NOTE(review): (exp(se) - 1) * 100 is not the standard error of the percentage effect; the
# delta method gives roughly 100 * exp(coef) * se - confirm what is meant to be reported.
percent_convert(0.1073)
percent_convert(0.0406)

11.32681848427195
4.143544804031785


In [None]:
# DT placebo test; reuses `tvar` from whichever model cell ran last. The code is the same as
# the placebo cell after the visit-count model.
# NOTE(review): control_months=[2,3,] needs February rows, but df2 was restricted to months 3-5
# when the data was loaded - confirm the data window before running. No saved output.
print('The DT placebo...')
df = tdid.data_prep_placebo(data=df2, treatment_month=4, policy_t='20230401', treatment_yr=2023, 
                            p_9et=False, unit='h3', unit_time='time', control_months=[2,3,])
df[f"ln_{tvar}"] = np.log(df[tvar])
# Only the coefficient table is kept; the first return value is discarded
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue')
res.loc[:, 'tc_id'] = 'placebo'
res.loc[:, 'policy'] = 'dt'
print(res)

In [87]:
# Write a model summary to disk as CSV text (note the .txt extension of the target file).
# NOTE(review): `summary` is only assigned by later cells in file order, so this cell fails on a
# fresh top-to-bottom run. If `summary` is the same kind of object as `sum_et_v`, the call may
# need to be `summary.summary.as_csv()` - confirm against what `tdid` returns.
with open("dbs/panel_ols_summary.txt", "w") as f:
    f.write(summary.as_csv())

In [15]:
# Calculate the percentage change implied by a log-point coefficient: (exp(coef) - 1) * 100.
# This is the same transform as `percent_convert`, with a hand-typed coefficient.
# NOTE(review): 0.19 is close to the visit-count estimate (0.1865), yet the message speaks of
# travel distance - confirm which outcome this sentence describes.
coef = 0.19
percentage_change = (np.exp(coef) - 1) * 100
print(f"The treatment led to an estimated {percentage_change:.2f}% change in travel distance.")

The treatment led to an estimated 20.92% change in travel distance.


## 3. By hexagon attributes: public transit access and visitors' attributes
### 3.1 Public transit access
#### 3.1.1 The 9ET

In [21]:
# 9ET effect on log `num_visits_wt` by public-transit access group (`pt_grp`); the printed
# table has one P_m<k> row per group.
# NOTE(review): this rebinds the global `grp` (previously 'all'). tc_id says 'Jun vs. May' while
# treatment_months is [6,7,8] and control_months is [5,9] - confirm the label.
tvar, grp = 'num_visits_wt', 'pt_grp'
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, grp=grp,
                           treatment_months = [6,7,8], control_months=[5,9], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

     variable  coefficient        pvalue  std_error        tc_id policy
0        P_m1    -0.112697  2.052374e-09   0.018803  Jun vs. May    9et
1        P_m2     0.041404  4.231278e-02   0.020392  Jun vs. May    9et
2        P_m3     0.193485  2.153389e-12   0.027546  Jun vs. May    9et
3        P_m4     0.558433  0.000000e+00   0.030399  Jun vs. May    9et
4      rain_m     0.020015  3.799444e-04   0.005632  Jun vs. May    9et
5        rain    -0.016440  1.747446e-02   0.006918  Jun vs. May    9et
6  fuel_price     1.419073  0.000000e+00   0.032553  Jun vs. May    9et


In [22]:
# 9ET effect on log `d_ha_wt` by public-transit access group (`pt_grp`).
# NOTE(review): tc_id says 'Jun vs. May' while treatment_months is [6,7,8] and control_months
# is [5,9] - confirm the label.
tvar, grp = 'd_ha_wt', 'pt_grp'
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, grp=grp,
                           treatment_months = [6,7,8], control_months=[5,9], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

     variable  coefficient        pvalue  std_error        tc_id policy
0        P_m1     0.322618  2.794096e-08   0.058089  Jun vs. May    9et
1        P_m2    -0.196959  1.509748e-11   0.029192  Jun vs. May    9et
2        P_m3    -0.279026  1.265654e-14   0.036189  Jun vs. May    9et
3        P_m4    -0.119061  2.080868e-03   0.038676  Jun vs. May    9et
4      rain_m    -0.008910  6.269274e-01   0.018332  Jun vs. May    9et
5        rain    -0.001926  8.648467e-01   0.011317  Jun vs. May    9et
6  fuel_price     0.159962  1.401074e-02   0.065105  Jun vs. May    9et


#### 3.1.2 The DT

In [23]:
# DT effect on log `num_visits_wt` by public-transit access group (`pt_grp`):
# May (treatment) vs. March-April (control), 2023 vs. 2022.
tvar, grp = 'num_visits_wt', 'pt_grp'
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, grp=grp,
                           treatment_months = [5,], control_months=[3,4], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Mar-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

     variable  coefficient        pvalue  std_error            tc_id policy
0        P_m1    -0.232745  0.000000e+00   0.022297  Mar-Apr vs. May     dt
1        P_m2    -0.061755  1.086725e-02   0.024247  Mar-Apr vs. May     dt
2        P_m3     0.166430  0.000000e+00   0.018554  Mar-Apr vs. May     dt
3        P_m4     0.634824  0.000000e+00   0.027448  Mar-Apr vs. May     dt
4      rain_m    -0.034309  2.441721e-05   0.008130  Mar-Apr vs. May     dt
5        rain     0.016632  2.714718e-02   0.007528  Mar-Apr vs. May     dt
6  fuel_price     0.124078  4.803872e-07   0.024648  Mar-Apr vs. May     dt


In [24]:
# DT effect on log `d_ha_wt` by public-transit access group (`pt_grp`):
# May (treatment) vs. March-April (control), 2023 vs. 2022.
tvar, grp = 'd_ha_wt', 'pt_grp'  # num_visits_wt, d_ha_wt
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, grp=grp,
                           treatment_months = [5,], control_months=[3,4], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
# Unlike the 9ET cells, the first return value is kept here under the global name `summary`
summary, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Mar-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

     variable  coefficient        pvalue  std_error            tc_id policy
0        P_m1     0.271368  4.707881e-08   0.049683  Mar-Apr vs. May     dt
1        P_m2     0.063992  5.898318e-02   0.033889  Mar-Apr vs. May     dt
2        P_m3    -0.015434  8.039977e-01   0.062189  Mar-Apr vs. May     dt
3        P_m4     0.108969  7.748949e-02   0.061724  Mar-Apr vs. May     dt
4      rain_m     0.056518  2.577810e-02   0.025350  Mar-Apr vs. May     dt
5        rain    -0.027812  7.068023e-02   0.015387  Mar-Apr vs. May     dt
6  fuel_price    -0.236294  1.163031e-04   0.061315  Mar-Apr vs. May     dt


### 3.2 Foreign-share groups
#### 3.2.1 The 9ET

In [25]:
# 9ET effect on log `num_visits_wt` by foreign-share group (`f_grp`).
# NOTE(review): tc_id says 'Jun vs. May' while treatment_months is [6,7,8] and control_months
# is [5,9] - confirm the label.
tvar, grp = 'num_visits_wt', 'f_grp'
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, grp=grp,
                           treatment_months = [6,7,8], control_months=[5,9], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

     variable  coefficient    pvalue  std_error        tc_id policy
0        P_m1     0.019011  0.577892   0.034164  Jun vs. May    9et
1        P_m2     0.152993  0.000027   0.036481  Jun vs. May    9et
2        P_m3     0.260253  0.000000   0.027315  Jun vs. May    9et
3        P_m4     0.276609  0.000000   0.021927  Jun vs. May    9et
4      rain_m     0.019800  0.003708   0.006823  Jun vs. May    9et
5        rain    -0.015562  0.024625   0.006925  Jun vs. May    9et
6  fuel_price     1.423361  0.000000   0.033062  Jun vs. May    9et


In [26]:
# 9ET effect on log `d_ha_wt` by foreign-share group (`f_grp`).
# NOTE(review): tc_id says 'Jun vs. May' while treatment_months is [6,7,8] and control_months
# is [5,9] - confirm the label.
tvar, grp = 'd_ha_wt', 'f_grp'
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, grp=grp,
                           treatment_months = [6,7,8], control_months=[5,9], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

     variable  coefficient        pvalue  std_error        tc_id policy
0        P_m1    -0.272828  3.702265e-09   0.046266  Jun vs. May    9et
1        P_m2     0.097858  1.286000e-01   0.064395  Jun vs. May    9et
2        P_m3     0.129589  7.280044e-06   0.028892  Jun vs. May    9et
3        P_m4    -0.206023  2.716494e-12   0.029467  Jun vs. May    9et
4      rain_m    -0.011610  5.078011e-01   0.017530  Jun vs. May    9et
5        rain     0.000892  9.341800e-01   0.010803  Jun vs. May    9et
6  fuel_price     0.137313  4.104823e-02   0.067210  Jun vs. May    9et


#### 3.2.2 The DT

In [27]:
# DT effect on log `num_visits_wt` by foreign-share group (`f_grp`).
# NOTE(review): tc_id says 'Feb-Apr vs. May' but control_months is [3, 4]; the public-transit
# DT cells label the same design 'Mar-Apr vs. May' - confirm and align.
tvar, grp = 'num_visits_wt', 'f_grp'  # num_visits_wt, d_ha_wt
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, grp=grp,
                           treatment_months = [5,], control_months=[3, 4], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
# Rebinds the global `summary` with this model's first return value
summary, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Feb-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

     variable  coefficient        pvalue  std_error            tc_id policy
0        P_m1    -0.069961  2.398788e-02   0.030993  Feb-Apr vs. May     dt
1        P_m2     0.095805  1.862801e-02   0.040718  Feb-Apr vs. May     dt
2        P_m3     0.243927  1.639533e-11   0.036218  Feb-Apr vs. May     dt
3        P_m4     0.352148  0.000000e+00   0.024070  Feb-Apr vs. May     dt
4      rain_m    -0.040573  1.546097e-12   0.005739  Feb-Apr vs. May     dt
5        rain     0.017903  9.134238e-03   0.006867  Feb-Apr vs. May     dt
6  fuel_price     0.094169  1.362210e-09   0.015540  Feb-Apr vs. May     dt


In [28]:
# DT effect on log `d_ha_wt` by foreign-share group (`f_grp`).
# NOTE(review): tc_id says 'Feb-Apr vs. May' but control_months is [3, 4]; the public-transit
# DT cells label the same design 'Mar-Apr vs. May' - confirm and align.
tvar, grp = 'd_ha_wt', 'f_grp'  # num_visits_wt, d_ha_wt
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, grp=grp,
                           treatment_months = [5,], control_months=[3, 4], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
# Rebinds the global `summary` with this model's first return value
summary, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Feb-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

     variable  coefficient        pvalue  std_error            tc_id policy
0        P_m1    -0.177844  9.016385e-08   0.033270  Feb-Apr vs. May     dt
1        P_m2     0.145903  1.172825e-05   0.033292  Feb-Apr vs. May     dt
2        P_m3     0.281516  7.570899e-09   0.048724  Feb-Apr vs. May     dt
3        P_m4     0.129024  4.045280e-02   0.062966  Feb-Apr vs. May     dt
4      rain_m     0.052752  4.318394e-02   0.026090  Feb-Apr vs. May     dt
5        rain    -0.026757  9.651217e-02   0.016099  Feb-Apr vs. May     dt
6  fuel_price    -0.228372  2.583848e-04   0.062503  Feb-Apr vs. May     dt


### 3.3 Deprivation level
#### 3.3.1 The 9ET

In [29]:
# 9ET effect on log `num_visits_wt` by deprivation-level group (`g_grp`).
# NOTE(review): tc_id says 'Jun vs. May' while treatment_months is [6,7,8] and control_months
# is [5,9] - confirm the label.
tvar, grp = 'num_visits_wt', 'g_grp'
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, grp=grp,
                           treatment_months = [6,7,8], control_months=[5,9], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

     variable  coefficient    pvalue  std_error        tc_id policy
0        P_m1     0.182617  0.000000   0.015340  Jun vs. May    9et
1        P_m2     0.291006  0.000000   0.010954  Jun vs. May    9et
2        P_m3     0.183197  0.000000   0.018886  Jun vs. May    9et
3        P_m4     0.054559  0.000055   0.013523  Jun vs. May    9et
4      rain_m     0.020635  0.000822   0.006168  Jun vs. May    9et
5        rain    -0.015774  0.024470   0.007012  Jun vs. May    9et
6  fuel_price     1.425231  0.000000   0.032317  Jun vs. May    9et


In [30]:
# 9ET effect on log `d_ha_wt` by deprivation-level group (`g_grp`).
# NOTE(review): tc_id says 'Jun vs. May' while treatment_months is [6,7,8] and control_months
# is [5,9] - confirm the label.
tvar, grp = 'd_ha_wt', 'g_grp'
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, grp=grp,
                           treatment_months = [6,7,8], control_months=[5,9], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

     variable  coefficient        pvalue  std_error        tc_id policy
0        P_m1    -0.434539  4.007593e-08   0.079144  Jun vs. May    9et
1        P_m2     0.003794  9.442140e-01   0.054221  Jun vs. May    9et
2        P_m3     0.178260  3.719701e-06   0.038531  Jun vs. May    9et
3        P_m4     0.058906  2.478430e-01   0.050974  Jun vs. May    9et
4      rain_m    -0.011154  5.460755e-01   0.018477  Jun vs. May    9et
5        rain    -0.000454  9.690220e-01   0.011688  Jun vs. May    9et
6  fuel_price     0.128615  5.855365e-02   0.067995  Jun vs. May    9et


#### 3.3.2 The DT

In [31]:
# DT effect on log `num_visits_wt` by deprivation-level group (`g_grp`).
# NOTE(review): tc_id says 'Feb-Apr vs. May' but control_months is [3, 4]; the public-transit
# DT cells label the same design 'Mar-Apr vs. May' - confirm and align.
tvar, grp = 'num_visits_wt', 'g_grp'  # num_visits_wt, d_ha_wt
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, grp=grp,
                           treatment_months = [5,], control_months=[3, 4], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
# Rebinds the global `summary` with this model's first return value
summary, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Feb-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

     variable  coefficient        pvalue  std_error            tc_id policy
0        P_m1     0.314488  0.000000e+00   0.022389  Feb-Apr vs. May     dt
1        P_m2     0.300022  0.000000e+00   0.016630  Feb-Apr vs. May     dt
2        P_m3     0.081183  1.154802e-05   0.018510  Feb-Apr vs. May     dt
3        P_m4    -0.082256  8.704149e-11   0.012678  Feb-Apr vs. May     dt
4      rain_m    -0.031674  3.261904e-05   0.007624  Feb-Apr vs. May     dt
5        rain     0.017177  1.809073e-02   0.007267  Feb-Apr vs. May     dt
6  fuel_price     0.120295  5.319556e-08   0.022112  Feb-Apr vs. May     dt


In [32]:
# DT effect on log `d_ha_wt` by deprivation-level group (`g_grp`).
# NOTE(review): tc_id says 'Feb-Apr vs. May' but control_months is [3, 4]; the public-transit
# DT cells label the same design 'Mar-Apr vs. May' - confirm and align.
tvar, grp = 'd_ha_wt', 'g_grp'  # num_visits_wt, d_ha_wt
df = tdid.data_preparation(data=df2, year_list=[2022, 2023], treatment_yr=2023, grp=grp,
                           treatment_months = [5,], control_months=[3, 4], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
# Rebinds the global `summary` with this model's first return value
summary, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Feb-Apr vs. May'
res.loc[:, 'policy'] = 'dt'
print(res)

     variable  coefficient    pvalue  std_error            tc_id policy
0        P_m1    -0.001990  0.982874   0.092718  Feb-Apr vs. May     dt
1        P_m2     0.161196  0.007243   0.060025  Feb-Apr vs. May     dt
2        P_m3     0.242344  0.000000   0.025860  Feb-Apr vs. May     dt
3        P_m4     0.021940  0.359166   0.023927  Feb-Apr vs. May     dt
4      rain_m     0.055887  0.032974   0.026208  Feb-Apr vs. May     dt
5        rain    -0.026764  0.084487   0.015513  Feb-Apr vs. May     dt
6  fuel_price    -0.222296  0.000325   0.061849  Feb-Apr vs. May     dt


### 3.4 Net rent
#### 3.4.1 The 9ET

In [33]:
# 9ET effect on log `num_visits_wt` by net-rent group (`r_grp`).
# NOTE(review): tc_id says 'Jun vs. May' while treatment_months is [6,7,8] and control_months
# is [5,9] - confirm the label.
tvar, grp = 'num_visits_wt', 'r_grp'
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, grp=grp,
                           treatment_months = [6,7,8], control_months=[5,9], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

     variable  coefficient    pvalue  std_error        tc_id policy
0        P_m1     0.055881  0.104210   0.034393  Jun vs. May    9et
1        P_m2     0.199860  0.000000   0.023895  Jun vs. May    9et
2        P_m3     0.218207  0.000000   0.021774  Jun vs. May    9et
3        P_m4     0.244421  0.000000   0.013696  Jun vs. May    9et
4      rain_m     0.019537  0.002093   0.006350  Jun vs. May    9et
5        rain    -0.014921  0.021257   0.006478  Jun vs. May    9et
6  fuel_price     1.423897  0.000000   0.033105  Jun vs. May    9et


In [34]:
# 9ET effect on log `d_ha_wt` by net-rent group (`r_grp`).
# NOTE(review): tc_id says 'Jun vs. May' while treatment_months is [6,7,8] and control_months
# is [5,9] - confirm the label.
tvar, grp = 'd_ha_wt', 'r_grp'
df = tdid.data_preparation(data=df1, year_list=[2019, 2022], treatment_yr=2022, grp=grp,
                           treatment_months = [6,7,8], control_months=[5,9], unit='h3', unit_time='time')
# np.log yields -inf / NaN for non-positive values; assumes `tvar` is strictly positive - TODO confirm
df[f"ln_{tvar}"] = np.log(df[tvar])
_, res = tdid.time_shifted_did_absorbing(df=df, target_var=f"ln_{tvar}", weight=False, time_effect='jue', grp=grp)
res.loc[:, 'tc_id'] = 'Jun vs. May'
res.loc[:, 'policy'] = '9et'
print(res)

     variable  coefficient        pvalue  std_error        tc_id policy
0        P_m1    -0.448318  0.000000e+00   0.044046  Jun vs. May    9et
1        P_m2     0.152259  5.581063e-04   0.044118  Jun vs. May    9et
2        P_m3     0.166020  3.074126e-07   0.032433  Jun vs. May    9et
3        P_m4    -0.122354  5.164701e-05   0.030225  Jun vs. May    9et
4      rain_m    -0.012491  4.843803e-01   0.017862  Jun vs. May    9et
5        rain     0.002369  8.339111e-01   0.011298  Jun vs. May    9et
6  fuel_price     0.138894  3.975512e-02   0.067546  Jun vs. May    9et


#### 3.4.2 The DT

In [35]:
# Net rent x DT: outcome is the log of 'num_visits_wt', grouping column is 'r_grp'.
# Outcome columns used in this notebook: num_visits_wt, d_ha_wt.
tvar = 'num_visits_wt'
grp = 'r_grp'
df = tdid.data_preparation(
    data=df2,
    year_list=[2022, 2023],
    treatment_yr=2023,
    grp=grp,
    treatment_months=[5],
    control_months=[3, 4],
    unit='h3',
    unit_time='time',
)
df[f"ln_{tvar}"] = np.log(df[tvar])  # the model is fitted on the log-transformed outcome
# Both return values are kept: the first as `summary`, the coefficient table as `res`.
summary, res = tdid.time_shifted_did_absorbing(
    df=df,
    target_var=f"ln_{tvar}",
    weight=False,
    time_effect='jue',
    grp=grp,
)
# Tag the result rows with the comparison label and the policy.
# NOTE(review): the label reads 'Feb-Apr vs. May' while control_months=[3, 4] --
# confirm the label is intended.
res['tc_id'] = 'Feb-Apr vs. May'
res['policy'] = 'dt'
print(res)

     variable  coefficient        pvalue  std_error            tc_id policy
0        P_m1     0.032611  1.113296e-01   0.020481  Feb-Apr vs. May     dt
1        P_m2     0.151225  1.156016e-05   0.034481  Feb-Apr vs. May     dt
2        P_m3     0.174040  3.166280e-08   0.031461  Feb-Apr vs. May     dt
3        P_m4     0.295982  0.000000e+00   0.021307  Feb-Apr vs. May     dt
4      rain_m    -0.038353  1.006972e-12   0.005379  Feb-Apr vs. May     dt
5        rain     0.019319  7.130130e-03   0.007180  Feb-Apr vs. May     dt
6  fuel_price     0.099482  7.100391e-09   0.017186  Feb-Apr vs. May     dt


In [36]:
# Net rent x DT: outcome is the log of 'd_ha_wt', grouping column is 'r_grp'.
# Outcome columns used in this notebook: num_visits_wt, d_ha_wt.
tvar = 'd_ha_wt'
grp = 'r_grp'
df = tdid.data_preparation(
    data=df2,
    year_list=[2022, 2023],
    treatment_yr=2023,
    grp=grp,
    treatment_months=[5],
    control_months=[3, 4],
    unit='h3',
    unit_time='time',
)
df[f"ln_{tvar}"] = np.log(df[tvar])  # the model is fitted on the log-transformed outcome
# Both return values are kept: the first as `summary`, the coefficient table as `res`.
summary, res = tdid.time_shifted_did_absorbing(
    df=df,
    target_var=f"ln_{tvar}",
    weight=False,
    time_effect='jue',
    grp=grp,
)
# Tag the result rows with the comparison label and the policy.
# NOTE(review): the label reads 'Feb-Apr vs. May' while control_months=[3, 4] --
# confirm the label is intended.
res['tc_id'] = 'Feb-Apr vs. May'
res['policy'] = 'dt'
print(res)

     variable  coefficient        pvalue  std_error            tc_id policy
0        P_m1    -0.215669  3.526707e-04   0.060358  Feb-Apr vs. May     dt
1        P_m2     0.231639  1.816536e-04   0.061882  Feb-Apr vs. May     dt
2        P_m3     0.260458  7.191636e-10   0.042270  Feb-Apr vs. May     dt
3        P_m4     0.131928  1.080749e-02   0.051759  Feb-Apr vs. May     dt
4      rain_m     0.054767  4.432575e-02   0.027234  Feb-Apr vs. May     dt
5        rain    -0.024705  1.173999e-01   0.015778  Feb-Apr vs. May     dt
6  fuel_price    -0.224914  3.843335e-04   0.063345  Feb-Apr vs. May     dt


### 3.5 POI clusters

In [8]:
# Pull the full hexagon -> POI cluster table from the project database
# (connection `engine` is created near the top of the notebook) and preview it.
df_poi = pd.read_sql(
    sql="SELECT * FROM h3_poi_cluster_grp;",
    con=engine,
)
df_poi.head()

Unnamed: 0,h3_id,cluster,cluster_name
0,881e265325fffff,0,Tourism-Life cluster
1,881e26532dfffff,4,Tourism-focused sparse cluster
2,881e265367fffff,1,Sparse activity cluster
3,881e26ca39fffff,1,Sparse activity cluster
4,881e26cb65fffff,1,Sparse activity cluster


In [9]:
# Attach each hexagon's POI cluster label to both panels. The left join keeps every
# panel row; hexagons absent from df_poi get a missing 'cluster_name'.
#
# A 'cluster_name' column left over from an earlier run of this cell is dropped first.
# Without that, re-running the cell yields 'cluster_name_x' / 'cluster_name_y' and the
# later lookup of df1['cluster_name'] fails with KeyError.
#
# validate='many_to_one' makes pandas raise MergeError if an h3_id appears more than
# once in the lookup, instead of silently duplicating panel rows.
df1 = pd.merge(
    df1.drop(columns=['cluster_name'], errors='ignore'),
    df_poi[['h3_id', 'cluster_name']],
    on='h3_id',
    how='left',
    validate='many_to_one',
)
df2 = pd.merge(
    df2.drop(columns=['cluster_name'], errors='ignore'),
    df_poi[['h3_id', 'cluster_name']],
    on='h3_id',
    how='left',
    validate='many_to_one',
)

In [12]:
# Lookup from the POI cluster label (column 'cluster_name') to a short group code.
# The codes follow the q1..q5 naming that the other grouping columns use (e.g. pt_grp).
cluster_name = {
    'Tourism-Life cluster': 'q3',
    'Sparse activity cluster': 'q1',
    'Residential and dining cluster': 'q4',
    'High-activity hub': 'q5',
    'Tourism-focused sparse cluster': 'q2',
}

In [13]:
# Translate the cluster labels into their q-codes and store them in a 'cluster' column.
# Series.map leaves NaN wherever the label is missing or not a key of the lookup dict.
df1['cluster'] = df1['cluster_name'].map(cluster_name)
df2['cluster'] = df2['cluster_name'].map(cluster_name)

#### 3.5.1 The 9ET

In [17]:
# POI clusters x 9ET: outcome is the log of 'num_visits_wt', grouping column is 'cluster'.
tvar = 'num_visits_wt'
grp = 'cluster'
df = tdid.data_preparation(
    data=df1,
    year_list=[2019, 2022],
    treatment_yr=2022,
    grp=grp,
    treatment_months=[6, 7, 8],
    control_months=[5, 9],
    unit='h3',
    unit_time='time',
)
df[f"ln_{tvar}"] = np.log(df[tvar])  # the model is fitted on the log-transformed outcome
# Only the second return value (the coefficient table) is used here.
_, res = tdid.time_shifted_did_absorbing(
    df=df,
    target_var=f"ln_{tvar}",
    weight=False,
    time_effect='jue',
    grp=grp,
)
# Tag the result rows with the comparison label and the policy.
# NOTE(review): the label reads 'Jun vs. May' while treatment_months=[6,7,8] and
# control_months=[5,9] -- confirm the label is intended.
res['tc_id'] = 'Jun vs. May'
res['policy'] = '9et'
print(res)

     variable  coefficient    pvalue  std_error        tc_id policy
0        P_m1     0.043499  0.001099   0.013327  Jun vs. May    9et
1        P_m2     0.337890  0.000000   0.016914  Jun vs. May    9et
2        P_m3     0.644922  0.000000   0.020402  Jun vs. May    9et
3        P_m4     1.199549  0.000000   0.024787  Jun vs. May    9et
4        P_m5     1.943307  0.000000   0.072094  Jun vs. May    9et
5      rain_m     0.016534  0.002022   0.005356  Jun vs. May    9et
6        rain    -0.012657  0.052405   0.006525  Jun vs. May    9et
7  fuel_price     1.375518  0.000000   0.036313  Jun vs. May    9et


In [18]:
# POI clusters x 9ET: outcome is the log of 'd_ha_wt', grouping column is 'cluster'.
tvar = 'd_ha_wt'
grp = 'cluster'
df = tdid.data_preparation(
    data=df1,
    year_list=[2019, 2022],
    treatment_yr=2022,
    grp=grp,
    treatment_months=[6, 7, 8],
    control_months=[5, 9],
    unit='h3',
    unit_time='time',
)
df[f"ln_{tvar}"] = np.log(df[tvar])  # the model is fitted on the log-transformed outcome
# Only the second return value (the coefficient table) is used here.
_, res = tdid.time_shifted_did_absorbing(
    df=df,
    target_var=f"ln_{tvar}",
    weight=False,
    time_effect='jue',
    grp=grp,
)
# Tag the result rows with the comparison label and the policy.
# NOTE(review): the label reads 'Jun vs. May' while treatment_months=[6,7,8] and
# control_months=[5,9] -- confirm the label is intended.
res['tc_id'] = 'Jun vs. May'
res['policy'] = '9et'
print(res)

     variable  coefficient        pvalue  std_error        tc_id policy
0        P_m1    -0.033307  4.811614e-01   0.047282  Jun vs. May    9et
1        P_m2    -0.154584  2.036488e-05   0.036280  Jun vs. May    9et
2        P_m3     0.142741  3.879042e-06   0.030911  Jun vs. May    9et
3        P_m4     0.345630  1.320617e-10   0.053796  Jun vs. May    9et
4        P_m5     0.736895  0.000000e+00   0.079036  Jun vs. May    9et
5      rain_m    -0.002483  8.745386e-01   0.015724  Jun vs. May    9et
6        rain     0.005733  5.386855e-01   0.009325  Jun vs. May    9et
7  fuel_price    -0.037813  7.141731e-01   0.103242  Jun vs. May    9et


#### 3.5.2 The DT

In [19]:
# POI clusters x DT: outcome is the log of 'num_visits_wt', grouping column is 'cluster'.
# Outcome columns used in this notebook: num_visits_wt, d_ha_wt.
tvar = 'num_visits_wt'
grp = 'cluster'
df = tdid.data_preparation(
    data=df2,
    year_list=[2022, 2023],
    treatment_yr=2023,
    grp=grp,
    treatment_months=[5],
    control_months=[3, 4],
    unit='h3',
    unit_time='time',
)
df[f"ln_{tvar}"] = np.log(df[tvar])  # the model is fitted on the log-transformed outcome
# Both return values are kept: the first as `summary`, the coefficient table as `res`.
summary, res = tdid.time_shifted_did_absorbing(
    df=df,
    target_var=f"ln_{tvar}",
    weight=False,
    time_effect='jue',
    grp=grp,
)
# Tag the result rows with the comparison label and the policy.
# NOTE(review): the label reads 'Feb-Apr vs. May' while control_months=[3, 4] --
# confirm the label is intended.
res['tc_id'] = 'Feb-Apr vs. May'
res['policy'] = 'dt'
print(res)

     variable  coefficient        pvalue  std_error            tc_id policy
0        P_m1     0.030516  5.447034e-02   0.015868  Feb-Apr vs. May     dt
1        P_m2     0.450616  0.000000e+00   0.013181  Feb-Apr vs. May     dt
2        P_m3     0.904131  0.000000e+00   0.027906  Feb-Apr vs. May     dt
3        P_m4     1.536740  0.000000e+00   0.029480  Feb-Apr vs. May     dt
4        P_m5     2.275999  0.000000e+00   0.078089  Feb-Apr vs. May     dt
5      rain_m    -0.036111  2.126344e-11   0.005392  Feb-Apr vs. May     dt
6        rain     0.020790  3.834041e-03   0.007190  Feb-Apr vs. May     dt
7  fuel_price     0.145490  0.000000e+00   0.017002  Feb-Apr vs. May     dt


In [20]:
# POI clusters x DT: outcome is the log of 'd_ha_wt', grouping column is 'cluster'.
# Outcome columns used in this notebook: num_visits_wt, d_ha_wt.
tvar = 'd_ha_wt'
grp = 'cluster'
df = tdid.data_preparation(
    data=df2,
    year_list=[2022, 2023],
    treatment_yr=2023,
    grp=grp,
    treatment_months=[5],
    control_months=[3, 4],
    unit='h3',
    unit_time='time',
)
df[f"ln_{tvar}"] = np.log(df[tvar])  # the model is fitted on the log-transformed outcome
# Both return values are kept: the first as `summary`, the coefficient table as `res`.
summary, res = tdid.time_shifted_did_absorbing(
    df=df,
    target_var=f"ln_{tvar}",
    weight=False,
    time_effect='jue',
    grp=grp,
)
# Tag the result rows with the comparison label and the policy.
# NOTE(review): the label reads 'Feb-Apr vs. May' while control_months=[3, 4] --
# confirm the label is intended.
res['tc_id'] = 'Feb-Apr vs. May'
res['policy'] = 'dt'
print(res)

     variable  coefficient        pvalue  std_error            tc_id policy
0        P_m1     0.044556  2.143972e-01   0.035887  Feb-Apr vs. May     dt
1        P_m2     0.043181  4.440170e-01   0.056414  Feb-Apr vs. May     dt
2        P_m3     0.352843  7.302122e-08   0.065541  Feb-Apr vs. May     dt
3        P_m4     0.514750  3.023435e-10   0.081738  Feb-Apr vs. May     dt
4        P_m5     0.886328  9.570122e-14   0.119024  Feb-Apr vs. May     dt
5      rain_m     0.056389  2.139463e-02   0.024507  Feb-Apr vs. May     dt
6        rain    -0.029860  4.897713e-02   0.015167  Feb-Apr vs. May     dt
7  fuel_price    -0.266291  4.326444e-06   0.057952  Feb-Apr vs. May     dt


In [25]:
# Count the distinct hexagons in the 9ET panel for every (POI cluster, pt_grp) pair.
(
    df1
    .groupby(['cluster_name', 'pt_grp'])['h3_id']
    .nunique()
)

cluster_name                    pt_grp
High-activity hub               q1           0
                                q2           0
                                q3           3
                                q4          89
Residential and dining cluster  q1           6
                                q2          10
                                q3          39
                                q4         375
Sparse activity cluster         q1        3793
                                q2        3459
                                q3        3002
                                q4        1975
Tourism-Life cluster            q1         119
                                q2         202
                                q3         353
                                q4         956
Tourism-focused sparse cluster  q1         714
                                q2        1166
                                q3        1474
                                q4        1527
Name: h3_id, dtype: int64

In [22]:
# List the columns currently on the 9ET panel (df1), including the 'cluster_name'
# and 'cluster' columns added in the POI-cluster section.
df1.columns

Index(['h3_id', 'date', 'year', 'month', 'weekday', 'precipitation',
       'fuel_price', 'pt_station_num', 'num_visits_wt', 'num_unique_device',
       'd_ha_wt', 'f_share', 'grdi', 'net_rent_100m', 'state', 'state_holiday',
       'pt_grp', 'f_grp', 'g_grp', 'r_grp', 'cluster_name', 'cluster'],
      dtype='object')