# Difference-in-Difference prototypes


In [1]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# NOTE(review): machine-specific absolute path; every relative path used below
# (dbs/..., results/...) is resolved against this working directory.
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [2]:
# Load libs
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
from tqdm import tqdm
import linearmodels as lm
import workers
import sqlalchemy
from linearmodels.panel import PanelOLS
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Data location: PostgreSQL engine built from the credentials held by `workers.keys_manager`
db_keys = workers.keys_manager['database']
user = db_keys['user']
password = db_keys['password']
port = db_keys['port']
db_name = db_keys['name']
# NOTE(review): the credentials are interpolated verbatim, so a password containing
# URL-reserved characters such as '@' or '/' would presumably break the URL — confirm.
db_url = f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable'
engine = sqlalchemy.create_engine(db_url)

## 1. Load data and preparation
Supermarkets, retail, and restaurant

In [158]:
# Prototype sample: Lidl stores, May and June of 2019 and 2022, weekday 6 excluded
poi_visits = pd.read_parquet('results/poi_cases/supermarket.parquet')
keep_cols = ['osm_id', 'year', 'month', 'weekday', 'num_visits', 'num_visits_wt', 'd_ha', 'd_ha_wt']
keep_rows = (
    (poi_visits.name == 'Lidl')
    & poi_visits.month.isin([5, 6])
    & (poi_visits.weekday != 6)
    & poi_visits.year.isin([2019, 2022])
)
df_v = poi_visits.loc[keep_rows, keep_cols]
df_v.head()

Unnamed: 0,osm_id,year,month,weekday,num_visits,num_visits_wt,d_ha,d_ha_wt
0,4989605,2019,6,2,10,35.519424,80.065626,11.584209
5,25029454,2019,6,2,7,154.952918,2.668695,2.909428
6,25029454,2019,6,3,8,156.767312,3.539516,3.796602
24,26606798,2019,5,0,3,51.819585,6.239097,1.969708
25,26606798,2019,6,3,13,150.501035,4.240029,2.56545


### 1.1 Prepare data

In [159]:
# Detach from the filtered parquet frame so the assignments below write to an
# independent object rather than to a slice.
df_v = df_v.copy()
# Treated group = observations from 2022; post period = June
df_v['variant_places'] = df_v['year'] == 2022
df_v['after'] = df_v['month'] == 6
# DiD interaction term as 0/1
df_v['treated'] = 1*(df_v['variant_places'] & df_v['after'])
# Panel entity: one id per (year, place). Vectorised string concatenation builds the
# same '<year>_<osm_id>' label as a row-wise apply, without a Python call per row.
df_v['year_place'] = df_v['year'].astype(str) + '_' + df_v['osm_id'].astype(str)

In [160]:
# Index the panel by entity (`year_place`) and time (`weekday`), the two levels
# PanelOLS uses for EntityEffects and TimeEffects respectively.
ols_df = df_v.set_index(['year_place', 'weekday'])
ols_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,osm_id,year,month,num_visits,num_visits_wt,d_ha,d_ha_wt,variant_places,after,treated
year_place,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019_4989605,2,4989605,2019,6,10,35.519424,80.065626,11.584209,False,True,0
2019_25029454,2,25029454,2019,6,7,154.952918,2.668695,2.909428,False,True,0
2019_25029454,3,25029454,2019,6,8,156.767312,3.539516,3.796602,False,True,0
2019_26606798,0,26606798,2019,5,3,51.819585,6.239097,1.969708,False,False,0
2019_26606798,3,26606798,2019,6,13,150.501035,4.240029,2.56545,False,True,0


In [161]:
# Regress d_ha_wt on the `treated` dummy with entity and time effects; given the
# index of ols_df these are effects for `year_place` and `weekday`.
# NOTE(review): `after` (the June indicator) is not a regressor and is not absorbed
# by either effect, so `treated` may also pick up the ordinary May->June change —
# confirm this is the intended specification.
mod = lm.PanelOLS.from_formula('''d_ha_wt ~ treated + EntityEffects + TimeEffects''', ols_df)

# Specify clustering when we fit the model (standard errors clustered by entity)
clfe = mod.fit(cov_type = 'clustered', cluster_entity = True)
print(clfe)

                          PanelOLS Estimation Summary                           
Dep. Variable:                d_ha_wt   R-squared:                        0.0075
Estimator:                   PanelOLS   R-squared (Between):             -0.0528
No. Observations:                1440   R-squared (Within):               0.0074
Date:                Tue, Jun 04 2024   R-squared (Overall):             -0.0296
Time:                        21:11:32   Log-likelihood                   -6522.3
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      9.9045
Entities:                         120   P-value                           0.0017
Avg Obs:                       12.000   Distribution:                  F(1,1314)
Min Obs:                       12.000                                           
Max Obs:                       12.000   F-statistic (robust):             6.3684
                            

### 1.2 Parallel trend test (Placebo test)

In [162]:
# Placebo sample: May observations only
placebo_df = df_v.loc[df_v.month == 5].copy()
placebo_df['variant_places'] = placebo_df['year'] == 2022
is_2022 = placebo_df['variant_places']

# Two fake cut-offs inside the week stand in for the treatment start
placebo_df['fake_after1'] = placebo_df['weekday'] > 2
placebo_df['fake_after2'] = placebo_df['weekday'] > 3
placebo_df['fake_treated1'] = 1*(is_2022 & placebo_df['fake_after1'])
placebo_df['fake_treated2'] = 1*(is_2022 & placebo_df['fake_after2'])
placebo_ols = placebo_df.set_index(['year_place', 'weekday'])

# Same specification as the main prototype, with the fake treatment dummies
fit_opts = {'cov_type': 'clustered', 'cluster_entity': True}
mod1 = lm.PanelOLS.from_formula('''d_ha_wt ~ fake_treated1 + EntityEffects + TimeEffects''', placebo_ols)
mod2 = lm.PanelOLS.from_formula('''d_ha_wt ~ fake_treated2 + EntityEffects + TimeEffects''', placebo_ols)

clfe1 = mod1.fit(**fit_opts)
clfe2 = mod2.fit(**fit_opts)

print(clfe1)
print(clfe2)

                          PanelOLS Estimation Summary                           
Dep. Variable:                d_ha_wt   R-squared:                     1.537e-06
Estimator:                   PanelOLS   R-squared (Between):              0.0010
No. Observations:                 720   R-squared (Within):            -6.25e-05
Date:                Tue, Jun 04 2024   R-squared (Overall):              0.0006
Time:                        21:11:32   Log-likelihood                   -3356.3
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      0.0009
Entities:                         120   P-value                           0.9759
Avg Obs:                       6.0000   Distribution:                   F(1,594)
Min Obs:                       6.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             0.0009
                            

## 2. Different POI types

In [163]:
def did_test(fname=None, place=None, name_filter=None, compare_year=2019,
             treatment_month=6, remove_sun=False, target_var='num_visits_wt'):
    """Run the basic two-way fixed-effects DiD for one POI file.

    Parameters
    ----------
    fname : str
        Parquet file with the visit records.
    place : str
        Label written to the `place_type` column of the result.
    name_filter : str, optional
        If given, keep only rows whose `name` column equals this value.
    compare_year : int
        Control year; 2022 is always the treated year.
    treatment_month : int
        Post-period month; May (5) is always the pre-period.
    remove_sun : bool
        Drop rows with weekday == 6.
    target_var : str
        Dependent variable of the regression.

    Returns
    -------
    pandas.DataFrame
        Parameter, standard error, p-value and 1.96 * standard error (`ci`) of
        the fitted model, plus the run settings as extra columns.
    """
    df_v = pd.read_parquet(fname)
    cols = ['osm_id', 'year', 'month', 'weekday', 'num_visits', 'num_visits_wt', 'd_ha', 'd_ha_wt']
    keep = df_v.month.isin([5, treatment_month]) & df_v.year.isin([compare_year, 2022])
    if remove_sun:
        keep &= df_v.weekday != 6
    if name_filter is not None:
        # Must happen before the column selection: `name` is not part of `cols`,
        # so filtering afterwards could never find the column.
        keep &= df_v['name'] == name_filter
    # .copy() so the new columns are written to an independent frame
    df_v = df_v.loc[keep, cols].copy()

    df_v['variant_places'] = df_v['year'] == 2022
    df_v['after'] = df_v['month'] == treatment_month
    df_v['treated'] = 1*(df_v['variant_places'] & df_v['after'])
    # Entity id per (year, place); vectorised instead of a row-wise apply
    df_v['year_place'] = df_v['year'].astype(str) + '_' + df_v['osm_id'].astype(str)
    ols_df = df_v.set_index(['year_place', 'weekday'])

    # NOTE(review): `after` is neither a regressor nor absorbed by the entity
    # (year_place) or time (weekday) effects — confirm this is intended.
    mod = lm.PanelOLS.from_formula(f'''{target_var} ~ treated + EntityEffects + TimeEffects''', ols_df)
    clfe = mod.fit(cov_type = 'clustered', cluster_entity = True)

    # Summary
    df = pd.concat([clfe.params, clfe.std_errors, clfe.pvalues], axis = 1)
    # Scale standard error to CI (1.96 = normal quantile for a 95 % interval)
    df['ci'] = df['std_error']*1.96
    df = df.reset_index(drop=True)
    df['target_var'] = target_var
    df['place_type'] = place
    df['treatment_month'] = treatment_month
    df['compare_year'] = compare_year
    return df

In [164]:
# Single trial run of did_test: supermarkets, August vs. May, 2022 vs. 2019,
# weekday 6 removed, dependent variable d_ha_wt.
lb = 'Supermarket'
res = did_test(fname=f"dbs/visits_day_sg/{lb}.parquet", place=lb, 
               name_filter=None, treatment_month=8, 
               remove_sun=True, target_var='d_ha_wt',
               compare_year=2019)

KeyboardInterrupt: 

In [None]:
# Grid: POI type x treatment month x comparison year x dependent variable
lbs = ['Restaurant', 'Supermarket', 'Recreation & Sports Centres', 'Retail stores']
para_set = [(lb, tr_m, c_yr, t_var) for lb in lbs for tr_m in (6, 7, 8) for c_yr in (2019, 2023) for t_var in ('d_ha_wt', 'num_visits_wt')]
res_df_list = []
for lb, tr_m, c_yr, t_var in tqdm(para_set, desc='Basic DiD test'):
    # weekday 6 is dropped for supermarkets only
    res = did_test(fname=f"dbs/visits_day_sg/{lb}.parquet",
                   place=lb,
                   name_filter=None,
                   compare_year=c_yr,
                   treatment_month=tr_m,
                   remove_sun=(lb == 'Supermarket'),
                   target_var=t_var)
    res_df_list.append(res)

df_res = pd.concat(res_df_list)
df_res.head()

In [None]:
# Persist the basic DiD estimates.
# NOTE(review): assumes the results/did directory already exists.
df_res.to_parquet('results/did/basic_did.parquet', index=False)

## 3. Additional control variables
$$y_{i,ymd}=\alpha_i + \gamma_{yf} + \eta_{mf} +\zeta_{d} +\beta P_m + \epsilon_{i, ymd}$$

- $y_{i,ymd}$ is the dependent variable for place $i$ at year $y$, month $m$, and day of the week $d$. 
- $\alpha_i$ is the place-specific fixed effect.
- $\gamma_{yf}$ is the state-specific fixed effect depending on the year $y$.
- $\eta_{mf}$ is the state-specific fixed effect depending on the month $m$.
- $\zeta_{d}$ is the day-of-the-week fixed effect depending on the day of the week $d$.
- $\beta$ is the coefficient of the effect of the 9ET.
- $P_m$ is a dummy variable indicating the treatment.
- $\epsilon_{i, ymd}$ is the error term.

In [207]:
# Visit records for one POI category, restricted to May-August of 2019 and 2022
# and to the columns used by the models below.
df_v = pd.read_parquet('dbs/visits_day_sg/Recreation & Sports Centres.parquet')
df_v = df_v.loc[(df_v.month.isin([5, 6, 7, 8])) & (df_v.year.isin([2019, 2022])), 
                ['osm_id', 'year', 'month', 'weekday', 'num_visits', 'num_visits_wt', 'd_ha', 'd_ha_wt']]
df_v.head()

Unnamed: 0,osm_id,year,month,weekday,num_visits,num_visits_wt,d_ha,d_ha_wt
0,4407207,2022,8,0,4,10.909091,2.805708,2.805708
3,4701854,2019,8,3,8,99.777778,9.648704,9.661539
4,4701854,2022,7,6,8,65.277778,14.252445,8.492695
6,4797423,2019,7,3,1,2.205128,19.461274,19.461274
7,4797423,2022,7,1,2,20.0,3.608743,3.608743


### 3.1 Add state information

In [183]:
# Fetch the geometry of every POI that occurs in the visit table.
osms = df_v['osm_id'].unique()
# Render the ids as a quoted, comma-separated SQL list: ('id1','id2',...)
osms_sql = ','.join(["'" + str(x) + "'" for x in osms])
osms_sql = "(" + osms_sql + ")"
# NOTE(review): the id list is interpolated into the SQL text; an empty `osms`
# would yield `IN ()`, which is presumably rejected by the database — confirm.
gdf_poi_c = gpd.read_postgis(f"""SELECT osm_id, geom FROM poi 
                                WHERE osm_id IN {osms_sql};""", con=engine)

In [184]:
# Polygons from the VG2500 `LAN` layer, reprojected to EPSG:4326.
gdf_state = gpd.read_file("dbs/geo/vg2500_12-31.utm32s.shape/vg2500/vg2500_LAN.shp").to_crs(4326)
# Keep rows with GF == 9 only.
# NOTE(review): presumably this leaves one polygon record per state — confirm against the VG2500 attribute documentation.
gdf_state = gdf_state.loc[gdf_state['GF'] == 9, :]

In [185]:
# Spatial join of POIs with the state polygons (sjoin defaults); GEN is the
# polygon attribute that becomes the `state` column below.
gdf_poi_c = gdf_poi_c.sjoin(gdf_state[['GEN', 'geometry']])
# NOTE(review): this cell rebinds both gdf_poi_c and df_v, so it is not idempotent;
# running it twice on the same df_v would add a second `state` column. A POI matched
# by more than one polygon would also duplicate its visit rows in the merge.
df_v = pd.merge(df_v, gdf_poi_c[['osm_id', 'GEN']], on='osm_id', how='left').rename(columns={'GEN': 'state'})

### 3.2 Data preparation

In [196]:
# Ordinal time index: one consecutive integer per 'year-month-weekday' label,
# ordered by year (2019, 2022, 2023), then month (5-9), then weekday (0-6).
time_seq_list = [
    f'{yr}-{mo}-{wd}'
    for yr in (2019, 2022, 2023)
    for mo in range(5, 10)
    for wd in range(7)
]
time_seq_dict = {label: pos for pos, label in enumerate(time_seq_list)}

In [210]:
# (A single-state subset such as Berlin can be taken here for quick experiments.)
df = df_v.copy()
treatment_month = [6, 7, 8]

# Categorization: ordinal time index first, then categorical dtypes
time_label = df['year'].astype(str) + '-' + df['month'].astype(str)+ '-' + df['weekday'].astype(str)
df['time'] = time_label.map(time_seq_dict)
for cat_col in ('osm_id', 'year', 'month', 'weekday'):
    df[cat_col] = df[cat_col].astype('category')

# Treatment: treated group is 2022, post period is any of the treatment months
df['variant_places'] = df['year'] == 2022
df['after'] = df['month'].isin(treatment_month)
# Dummy variable for treatment (P_m)
df['P_m'] = df['variant_places'] & df['after']

# State-year / state-month fixed effects are not built in this prototype
# (df_v carries no `state` column in the run shown below).

# Panel index: entity = osm_id, time = ordinal time index
df = df.set_index(['osm_id', 'time'])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,month,weekday,num_visits,num_visits_wt,d_ha,d_ha_wt,variant_places,after,P_m
osm_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4407207,56,2022,8,0,4,10.909091,2.805708,2.805708,True,True,True
4701854,24,2019,8,3,8,99.777778,9.648704,9.661539,False,True,False
4701854,55,2022,7,6,8,65.277778,14.252445,8.492695,True,True,True
4797423,17,2019,7,3,1,2.205128,19.461274,19.461274,False,True,False
4797423,50,2022,7,1,2,20.0,3.608743,3.608743,True,True,True


### 3.3 Modeling

In [211]:
# Dependent variable and the treatment regressor
target_var = 'd_ha_wt'
dependent = df[target_var]

# Fixed-effect dummies for year, month and weekday (first level of each dropped).
# They are built on a separate frame so that `df` is not overwritten: the cell can
# then be re-run without a KeyError on the already-consumed columns.
fe_dummies = pd.get_dummies(df[['year', 'month', 'weekday']],
                            columns=['year', 'month', 'weekday'], drop_first=True)

# Collect all exogenous variables including the fixed effects
exog = pd.concat([df[['P_m']], fe_dummies], axis=1)

# Model specification: place (entity) effects are absorbed by PanelOLS.
# check_rank=False skips the rank check of `exog`; drop_absorbed=True drops
# regressors that the entity effects absorb.
model = PanelOLS(dependent, exog, entity_effects=True, check_rank=False, drop_absorbed=True)

# Fit the model (default, unadjusted covariance)
results = model.fit()

# Print the summary of the model
print(results.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:                d_ha_wt   R-squared:                        0.0030
Estimator:                   PanelOLS   R-squared (Between):              0.0058
No. Observations:              516751   R-squared (Within):               0.0030
Date:                Wed, Jun 05 2024   R-squared (Overall):              0.0047
Time:                        10:05:13   Log-likelihood                  -2.8e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      133.83
Entities:                       25038   P-value                           0.0000
Avg Obs:                       20.639   Distribution:               F(11,491702)
Min Obs:                       1.0000                                           
Max Obs:                       56.000   F-statistic (robust):             133.83
                            

## 4. Time-shifted DiD - multiple POI types

In [8]:
# State polygons (VG2500 `LAN` layer) in EPSG:4326, rows with GF == 9 only
state_shapefile = "dbs/geo/vg2500_12-31.utm32s.shape/vg2500/vg2500_LAN.shp"
gdf_state = gpd.read_file(state_shapefile).to_crs(4326)
gdf_state = gdf_state.loc[gdf_state['GF'] == 9, :]

# Ordinal time index: one consecutive integer per 'year-month-weekday' label
time_seq_list = [
    f'{yr}-{mo}-{wd}'
    for yr in (2019, 2022, 2023)
    for mo in range(5, 10)
    for wd in range(7)
]
time_seq_dict = {label: pos for pos, label in enumerate(time_seq_list)}

In [11]:
def load_data(fname = None, gdf_state=None, threshold_v=25):
    """Load visit records and attach the state each POI lies in.

    Parameters
    ----------
    fname : str
        Parquet file with the visit records.
    gdf_state : geopandas.GeoDataFrame
        Polygons with a `GEN` attribute, which becomes the `state` column.
    threshold_v : int
        Minimum `num_visits` a record needs to be kept.

    Returns
    -------
    pandas.DataFrame
        The selected visit columns plus `state`; `state` is missing for POIs that
        match no polygon. Uses the module-level `engine` for the database query.
    """
    df_v = pd.read_parquet(fname)
    cols = ['osm_id', 'year', 'month', 'weekday', 'num_visits', 'num_visits_wt', 'd_ha', 'd_ha_wt']
    df_v = df_v.loc[df_v.num_visits >= threshold_v, cols]

    osms = df_v['osm_id'].unique()
    if len(osms) == 0:
        # Nothing passed the threshold: `IN ()` is not valid SQL, so skip the
        # database round-trip and return the empty frame with the same columns.
        return df_v.assign(state=pd.Series(dtype='object'))

    # Add state: ids rendered as a quoted, comma-separated SQL list
    osms_sql = "(" + ','.join("'" + str(x) + "'" for x in osms) + ")"
    gdf_poi_c = gpd.read_postgis(f"""SELECT osm_id, geom FROM poi
                                    WHERE osm_id IN {osms_sql};""", con=engine)
    gdf_poi_c = gdf_poi_c.sjoin(gdf_state[['GEN', 'geometry']])
    # One state per POI: a geometry matched by several polygons (or listed more
    # than once in `poi`) would otherwise duplicate its visit rows in the merge.
    poi_state = gdf_poi_c[['osm_id', 'GEN']].drop_duplicates(subset='osm_id')
    df_v = pd.merge(df_v, poi_state, on='osm_id', how='left').rename(columns={'GEN': 'state'})
    return df_v

In [None]:
def data_prep(df=None, treatment_month=6, time_seq_dict=None):
    """Build the panel frame for the time-shifted DiD model.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit records with `osm_id`, `year`, `month`, `weekday` and `state`.
    treatment_month : int
        Month that counts as the post period.
    time_seq_dict : dict
        Maps 'year-month-weekday' labels to the ordinal time index.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by (`osm_id`, `time`) with the treatment flags
        (`variant_places`, `after`, `P_m`) and the `state_year` / `state_month`
        labels added. The input frame is left untouched.
    """
    # Work on a private copy so the caller's frame keeps its original columns and dtypes
    df = df.copy()

    # Categorization
    time_label = df['year'].astype(str) + '-' + df['month'].astype(str)+ '-' + df['weekday'].astype(str)
    df['time'] = time_label.map(time_seq_dict)
    for cat_col in ('osm_id', 'state', 'year', 'month', 'weekday'):
        df[cat_col] = df[cat_col].astype('category')

    # Treatment: treated group is 2022, post period is the treatment month
    df['variant_places'] = df['year'] == 2022
    df['after'] = df['month'] == treatment_month
    # Add the dummy variable for treatment (P_m)
    df['P_m'] = df['variant_places'] & df['after']

    # Create a state-year and state-month fixed effect
    df['state_year'] = df['state'].astype(str) + '_' + df['year'].astype(str)
    df['state_month'] = df['state'].astype(str) + '_' + df['month'].astype(str)

    # Set the multiindex
    return df.set_index(['osm_id', 'time'])

def did_model(df=None, target_var=None):
    """Fit the DiD regression of `target_var` on `P_m` with dummy-coded controls.

    `df` is the panel frame indexed by (entity, time). The `state_year`,
    `state_month` and `weekday` columns enter as dummy variables (first level
    dropped); entity effects are handled by PanelOLS itself. Returns the fitted
    results object (default covariance estimator).
    """
    # Expand the three categorical controls into indicator columns
    encoded = pd.get_dummies(df, columns=['state_year', 'state_month', 'weekday'], drop_first=True)
    fe_blocks = [encoded.filter(like=stem) for stem in ('state_year_', 'state_month_', 'weekday_')]

    # Treatment dummy first, then the fixed-effect blocks
    design = pd.concat([df[['P_m']]] + fe_blocks, axis=1)

    # check_rank=False skips the rank check of the design matrix;
    # drop_absorbed=True drops regressors absorbed by the entity effects.
    panel = PanelOLS(df[target_var], design, entity_effects=True, check_rank=False, drop_absorbed=True)
    return panel.fit()

def did_model_execution(df=None, place=None, name_filter=None, compare_year=2019,
                        treatment_month=6, remove_sun=False, target_var='num_visits_wt', time_seq_dict=None):
    """Subset the visits, fit the time-shifted DiD model and summarise `P_m`.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit records including a `state` column; not modified.
    place : str
        Label written to the `place_type` column of the result.
    name_filter : str, optional
        Keep only rows whose `name` column equals this value.
    compare_year : int
        Control year; 2022 is always the treated year.
    treatment_month : int
        Post-period month; May (5) is always the pre-period.
    remove_sun : bool
        Drop rows with weekday == 6.
    target_var : str
        Dependent variable.
    time_seq_dict : dict
        Label-to-ordinal mapping forwarded to data_prep.

    Returns
    -------
    pandas.DataFrame
        One row with parameter, std_error, pvalue and `ci` (1.96 * std_error) of
        `P_m`, plus the run settings.

    Raises
    ------
    KeyError
        If `name_filter` is given but `df` has no `name` column.
    """
    keep = df.month.isin([5, treatment_month]) & df.year.isin([compare_year, 2022])
    if remove_sun:
        keep &= df.weekday != 6
    if name_filter is not None:
        if 'name' not in df.columns:
            raise KeyError("name_filter was given but `df` has no 'name' column")
        keep &= df['name'] == name_filter
    # Filter first, then copy: only the needed rows are duplicated and the result
    # is an independent frame for the preparation step.
    df_v = df.loc[keep].copy()

    res = did_model(df=data_prep(df=df_v, treatment_month=treatment_month, time_seq_dict=time_seq_dict), target_var=target_var)
    # Summary: list-based .loc keeps a one-row frame with numeric dtypes
    df_r = pd.concat([res.params, res.std_errors, res.pvalues], axis = 1).\
             loc[['P_m'], :].reset_index(drop=True)
    # Scale standard error to CI (1.96 = normal quantile for a 95 % interval)
    df_r['ci'] = df_r['std_error']*1.96
    df_r['target_var'] = target_var
    df_r['place_type'] = place
    df_r['treatment_month'] = treatment_month
    df_r['compare_year'] = compare_year
    return df_r

In [180]:
lbs = ['Restaurant', 'Supermarket', 'Recreation & Sports Centres', 'Retail stores']
para_set = [(tr_m, c_yr, t_var) for tr_m in (6, 7, 8) for c_yr in (2019, 2023) for t_var in ('d_ha_wt', 'num_visits_wt')]
res_df_list = []
for lb in lbs:
    print(f'Prepare data for {lb}.')
    df_visits = load_data(fname = f"dbs/visits_day_sg/{lb}.parquet", gdf_state=gdf_state, threshold_v=25)
    # Weekday 6 is dropped for supermarkets only. The flag depends on the POI label,
    # not on the parameter tuple (whose first element is the treatment month).
    remove_sun = lb == 'Supermarket'
    for tr_m, c_yr, t_var in tqdm(para_set, desc=lb):
        res = did_model_execution(df=df_visits,
                                  place=lb,
                                  name_filter=None,
                                  compare_year=c_yr,
                                  treatment_month=tr_m,
                                  remove_sun=remove_sun,
                                  target_var=t_var,
                                  time_seq_dict=time_seq_dict)
        res_df_list.append(res)
df_res = pd.concat(res_df_list)
df_res.head()

Prepare data for Restaurant.


Restaurant: 100%|██████████| 12/12 [00:02<00:00,  4.98it/s]


Prepare data for Supermarket.


Supermarket: 100%|██████████| 12/12 [00:02<00:00,  5.84it/s]


Prepare data for Recreation & Sports Centres.


Recreation & Sports Centres: 100%|██████████| 12/12 [00:06<00:00,  1.76it/s]


Prepare data for Retail stores.


Retail stores: 100%|██████████| 12/12 [00:01<00:00,  7.37it/s]


Unnamed: 0,parameter,std_error,pvalue,ci,target_var,place_type,treatment_month,compare_year
0,-3.473497,2.395873,0.1471711,4.69591,d_ha_wt,Restaurant,6,2019
0,-145.35585,14.653308,0.0,28.720484,num_visits_wt,Restaurant,6,2019
0,0.259231,0.877719,0.7677359,1.720328,d_ha_wt,Restaurant,6,2023
0,-48.156818,6.630386,4.08118e-13,12.995556,num_visits_wt,Restaurant,6,2023
0,0.189118,2.48309,0.9392927,4.866857,d_ha_wt,Restaurant,7,2019


In [181]:
# Persist the time-shifted DiD estimates.
# NOTE(review): the file name says 30 while the loop above loads data with threshold_v=25 — confirm which is meant.
df_res.to_parquet('results/did/did_models_30.parquet', index=False)

## 5. Model 2
$$y_{i,d}=\beta_0 \cdot \text{9ET}_d \cdot \text{post}_d + \beta_1 \cdot \text{9ET}_d + \zeta_{s,ymd} + \epsilon_{i, d}$$

- $y_{i,d}$ is the dependent variable for place $i$ at the day of the week $d$. 
- $\text{9ET}_d$ is a dummy variable indicating whether it is during June, July, or Aug.
- $\text{post}_d$ is a dummy variable indicating the treatment is on.
- $\beta_0$ is the coefficient of the effect of the 9ET.
- $\beta_1$ is the coefficient of the effect of being in June, July, or Aug.
- $\zeta_{s,ymd}$ is the state-year-month-day-of-the-week fixed effect.
- $\epsilon_{i, d}$ is the error term.

**To be added: precipitation, holiday.**

In [10]:
# Single-category load for experimenting with Model 2 (minimum of 5 visits per record).
lb = "Recreation & Sports Centres"
df_visits = load_data(fname = f"dbs/visits_day_sg/{lb}.parquet", gdf_state=gdf_state, threshold_v=5)
# Drop records with any missing value, e.g. POIs for which no state was matched
df_visits = df_visits.dropna()

In [7]:
def data_preparation(data=None, year_list=(2019, 2022), remove_sep=True,
                     treatment_months=(6, 7, 8), treated_year=2022):
    """Build the panel frame for Model 2.

    Parameters
    ----------
    data : pandas.DataFrame
        Visit records with `osm_id`, `year`, `month`, `weekday` and `state`;
        not modified.
    year_list : sequence of int
        Years to keep.
    remove_sep : bool
        Drop month 9.
    treatment_months : sequence of int
        Months flagged as `after` (default 6, 7, 8).
    treated_year : int
        Year flagged as `variant_places` (default 2022).

    Returns
    -------
    pandas.DataFrame
        Frame indexed by (`osm_id`, `time`), where `time` is the weekday, with
        `time_fe` (state-year-month-weekday label) and the treatment flags
        `variant_places`, `after` and `P_m` added.
    """
    # One record per (place, year, month, weekday) within the requested years
    df = data.loc[data.year.isin(list(year_list)), :].\
        drop_duplicates(subset=['osm_id', 'year', 'month', 'weekday'])
    if remove_sep:
        df = df.loc[df.month != 9, :]
    # Independent copy: the assignments below must not write into a slice of `data`
    df = df.copy()

    # Categorization
    df['time_fe'] = df['state'].astype(str) + '-' + df['year'].astype(str) + '-' + df['month'].astype(str)+ '-' + df['weekday'].astype(str)
    df['time_fe'] = df['time_fe'].astype('category')
    df['time'] = df['weekday']
    for cat_col in ('osm_id', 'year', 'month', 'weekday'):
        df[cat_col] = df[cat_col].astype('category')

    # Treatment
    df['variant_places'] = df['year'] == treated_year
    df['after'] = df['month'].isin(list(treatment_months))
    # Add the dummy variable for treatment (P_m)
    df['P_m'] = df['variant_places'] & df['after']

    # Set the multiindex
    return df.set_index(['osm_id', 'time'])

def model_results(res=None, placebo=False, treatment_month='all', target_var=None):
    """Tabulate the coefficients of interest from a fitted model.

    Reads `params`, `std_errors` and `pvalues` from `res` and keeps the `after`
    row (placebo run) or the `P_m` and `after` rows (regular run). The regressor
    name goes to column `y`; `ci` is 1.96 times the standard error; `target_var`
    and `treatment_month` are appended as constant columns.
    """
    wanted = ['after'] if placebo else ['P_m', 'after']
    estimates = pd.concat([res.params, res.std_errors, res.pvalues], axis = 1)
    picked = estimates.loc[wanted, :].reset_index().rename(columns={'index': 'y'})
    # Scale standard error to CI, then tag the rows with the run settings
    return picked.assign(
        ci=picked['std_error']*1.96,
        target_var=target_var,
        treatment_month=treatment_month,
    )

In [8]:
def did_model2(data=None, target_var=None):
    """Fit Model 2 and return the summary rows for `P_m` and `after`.

    `data` is the panel frame indexed by (entity, time). Regressors are `P_m`,
    `after` and the dummy-coded `time_fe` column (first level dropped); entity
    effects are handled by PanelOLS. The fit uses the default covariance
    estimator and is summarised by model_results.
    """
    # Indicator columns named 'time_fe_<level>'
    fe_dummies = pd.get_dummies(data['time_fe'], prefix='time_fe', drop_first=True)
    design = pd.concat([data[['P_m', 'after']], fe_dummies], axis=1)

    # NOTE(review): check_rank=False skips the rank check of the design matrix. If
    # `P_m` / `after` are constant within each `time_fe` cell — which appears to be
    # the case when `data` comes from data_preparation (time_fe encodes
    # state-year-month-weekday; both flags depend on year and month only) — they are
    # linear combinations of these dummies and their coefficients would not be
    # identified. Confirm before interpreting the estimates.
    fitted = PanelOLS(data[target_var], design, entity_effects=True,
                      check_rank=False, drop_absorbed=True).fit()
    return model_results(res=fitted, placebo=False, treatment_month='all', target_var=target_var)

### 5.1 Placebo test - pre-treatment period

In [9]:
# Pre-treatment placebo: the Model 2 regression on 2019 only, without P_m
def placebo_test_pre(df_visits=None, target_var=None, remove_sep=False):
    """Fit the placebo regression on 2019 data and return the `after` row.

    The visits are prepared with data_preparation for year 2019 only. The
    regressors are `after` and the dummy-coded `time_fe` column (first level
    dropped); entity effects are handled by PanelOLS.
    """
    panel = data_preparation(data=df_visits, year_list=[2019, ], remove_sep=remove_sep)

    # Indicator columns named 'time_fe_<level>'
    fe_dummies = pd.get_dummies(panel['time_fe'], prefix='time_fe', drop_first=True)
    design = pd.concat([panel[['after']], fe_dummies], axis=1)

    # NOTE(review): check_rank=False skips the rank check. `after` depends on the
    # month only while `time_fe` encodes state-year-month-weekday, so `after` looks
    # collinear with the dummies — confirm that its coefficient is identified.
    fitted = PanelOLS(panel[target_var], design, entity_effects=True,
                      check_rank=False, drop_absorbed=True).fit()

    return model_results(res=fitted, placebo=True, treatment_month='all', target_var=target_var)

### 5.2 Multiple POI types

In [4]:
# State polygons (VG2500 `LAN` layer) in EPSG:4326, rows with GF == 9 only.
# The same two statements also appear in sections 3.1 and 4.
gdf_state = gpd.read_file("dbs/geo/vg2500_12-31.utm32s.shape/vg2500/vg2500_LAN.shp").to_crs(4326)
gdf_state = gdf_state.loc[gdf_state['GF'] == 9, :]

In [5]:
# POI taxonomy: `category` -> theme, `subcategory` -> label.
# The unique labels drive the loop over POI types below.
df_cat = pd.read_excel('dbs/poi/categories.xlsx').rename(columns={'category': 'theme', 'subcategory': 'label'})
label_list = df_cat['label'].unique()

In [None]:
# Accumulators filled by the loop below: Model 2 estimates and placebo estimates.
# Re-running this cell discards everything collected so far.
res_df_list = []
res_p_df_list = []

In [18]:
# lbs = ['Restaurant', 'Supermarket', 'Recreation & Sports Centres', 'Retail stores']
para_set = [(c_yr, t_var) for c_yr in (2019, 2023) for t_var in ('d_ha_wt', 'num_visits_wt')]

# Resume support: combinations already present in the accumulators (e.g. from an
# interrupted run) are skipped, so the cell can simply be re-run over the whole
# label list instead of slicing it by hand.
done_models = {(r['place'].iloc[0], int(r['compare_year'].iloc[0]), r['target_var'].iloc[0])
               for r in res_df_list if len(r) > 0}
done_placebos = {(r['place'].iloc[0], r['target_var'].iloc[0])
                 for r in res_p_df_list if len(r) > 0}

for lb in label_list:
    pending = [p for p in para_set if (lb, p[0], p[1]) not in done_models]
    if not pending:
        continue
    print(f'Prepare data for {lb}.')
    df_visits = load_data(fname = f"dbs/visits_day_sg/{lb}.parquet", gdf_state=gdf_state, threshold_v=5)
    if len(df_visits) == 0:
        continue
    df_visits = df_visits.dropna()
    # Weekday 6 is dropped for supermarkets, as in the earlier sections
    if lb == 'Supermarket':
        df_visits = df_visits.loc[df_visits.weekday != 6]
    for c_yr, t_var in tqdm(pending, desc=lb):
        df = data_preparation(data=df_visits, year_list=[c_yr, 2022], remove_sep=True)
        try:
            res = did_model2(data=df, target_var=t_var)
            res.loc[:, 'place'] = lb
            res.loc[:, 'compare_year'] = c_yr
            res_df_list.append(res)
            done_models.add((lb, c_yr, t_var))
            # Pre-treatment placebo test: it uses 2019 only, so it does not depend on
            # the comparison year and is fitted once per (label, target variable).
            if (lb, t_var) not in done_placebos:
                res_p = placebo_test_pre(df_visits=df_visits, target_var=t_var, remove_sep=True)
                res_p.loc[:, 'place'] = lb
                res_p_df_list.append(res_p)
                done_placebos.add((lb, t_var))
        except Exception as err:
            # Keep going, but report what was skipped instead of failing silently;
            # KeyboardInterrupt is no longer swallowed.
            tqdm.write(f'Skipped {lb} (compare_year={c_yr}, target_var={t_var}): {err!r}')
            continue

Prepare data for Extreme & Adventure Sports.


Extreme & Adventure Sports: 100%|██████████| 4/4 [00:06<00:00,  1.60s/it]


Prepare data for Fashion and clothing.


Fashion and clothing: 100%|██████████| 4/4 [00:50<00:00, 12.56s/it]


Prepare data for Financial services.


Financial services: 100%|██████████| 4/4 [01:31<00:00, 22.82s/it]


Prepare data for Food shop.


Food shop: 100%|██████████| 4/4 [03:08<00:00, 47.06s/it]


Prepare data for Kindergarten and childcare.


Kindergarten and childcare: 100%|██████████| 4/4 [03:55<00:00, 58.77s/it]


Prepare data for Library.


Library: 100%|██████████| 4/4 [00:23<00:00,  5.89s/it]


Prepare data for Medical supplies.


Medical supplies: 100%|██████████| 4/4 [00:18<00:00,  4.72s/it]


Prepare data for Other facilities.


Other facilities: 100%|██████████| 4/4 [00:31<00:00,  7.84s/it]


Prepare data for Place of worship.


Place of worship: 100%|██████████| 4/4 [07:23<00:00, 110.78s/it]


Prepare data for Pub.


Pub: 100%|██████████| 4/4 [01:52<00:00, 28.13s/it]


Prepare data for Public services.


Public services: 100%|██████████| 4/4 [03:51<00:00, 57.80s/it]


Prepare data for Racket & Martial Arts.


Racket & Martial Arts: 100%|██████████| 4/4 [00:07<00:00,  1.83s/it]


Prepare data for Rentals and sharing.


Rentals and sharing: 100%|██████████| 4/4 [00:43<00:00, 10.98s/it]


Prepare data for Restaurant.


Restaurant: 100%|██████████| 4/4 [08:37<00:00, 129.31s/it]


Prepare data for School.


School: 100%|██████████| 4/4 [02:53<00:00, 43.26s/it]


Prepare data for Services & Repairs.


Services & Repairs: 100%|██████████| 4/4 [00:45<00:00, 11.28s/it]


Prepare data for Social facilities.


Social facilities: 100%|██████████| 4/4 [03:02<00:00, 45.54s/it]


Prepare data for Technology & Hobbies.


Technology & Hobbies: 100%|██████████| 4/4 [00:49<00:00, 12.36s/it]


Prepare data for Training center.


Training center: 100%|██████████| 4/4 [00:15<00:00,  4.00s/it]


Prepare data for Wellness and fitness.


Wellness and fitness: 100%|██████████| 4/4 [00:06<00:00,  1.61s/it]


Prepare data for Winter Sports.


Winter Sports: 100%|██████████| 4/4 [00:00<00:00, 11.82it/s]


In [19]:
df_res = pd.concat(res_df_list)

# Placebo flag: (target_var, place) pairs whose placebo coefficient is not
# significant at the 5 % level get placebo = 1.
df_res_p = pd.concat(res_p_df_list)
df_res_p = (
    df_res_p.loc[df_res_p.pvalue >= 0.05, ['target_var', 'place']]
    # one row per key, otherwise the left merge below multiplies the rows of df_res
    .drop_duplicates()
    .assign(placebo=1)
)
df_res = pd.merge(df_res, df_res_p, on=['target_var', 'place'], how='left')
# Only the flag is filled: a frame-wide fillna(0) would also turn a missing
# estimate or p-value into 0.
df_res['placebo'] = df_res['placebo'].fillna(0)
df_res.to_parquet('results/did/did_model_2.parquet', index=False)

In [22]:
# Keep one row per (coefficient, target variable, place, comparison year) and
# overwrite the file written by the previous cell with the de-duplicated table.
df_res = df_res.drop_duplicates(subset=['y', 'target_var', 'place', 'compare_year'])
df_res.to_parquet('results/did/did_model_2.parquet', index=False)

In [25]:
# Places whose P_m estimate is significant at 5 % and whose placebo flag is set
is_flagged_effect = (
    (df_res.placebo == 1)
    & (df_res.pvalue < 0.05)
    & (df_res.y == 'P_m')
)
df_res.loc[is_flagged_effect, 'place'].unique()

array(['Home & Lifestyle', 'Office', 'Accomodations', 'Fast food',
       'Health care and services', 'Recreation & Sports Centres',
       'Tourist attractions', 'Viewing and observation',
       'Adventure & Wildlife', 'Animal', 'Beverages', 'College',
       'Community center', 'Cosmetics and beauty', 'Equestrian & Riding',
       'Extreme & Adventure Sports', 'Fashion and clothing',
       'Kindergarten and childcare', 'Medical supplies',
       'Other facilities', 'Public services', 'Rentals and sharing',
       'Restaurant'], dtype=object)

In [29]:
# Same selection, restricted to the 2019 comparison year; full rows are shown
is_flagged_effect_2019 = (
    (df_res.placebo == 1)
    & (df_res.pvalue < 0.05)
    & (df_res.y == 'P_m')
    & (df_res.compare_year == 2019)
)
df_res.loc[is_flagged_effect_2019, :]

Unnamed: 0,y,parameter,std_error,pvalue,ci,target_var,treatment_month,place,compare_year,placebo
16,P_m,-79.446468,34.995558,0.02320014,68.591294,d_ha_wt,all,Home & Lifestyle,2019,1.0
32,P_m,-31.700095,11.872057,0.007583154,23.269233,d_ha_wt,all,Office,2019,1.0
60,P_m,-44.185395,21.929247,0.04391738,42.981325,d_ha_wt,all,Accomodations,2019,1.0
120,P_m,28.507421,12.62793,0.0239805,24.750742,d_ha_wt,all,Fast food,2019,1.0
156,P_m,60.85698,25.47417,0.01689757,49.929373,num_visits_wt,all,Health care and services,2019,1.0
232,P_m,-18.146552,7.165229,0.01132345,14.043848,d_ha_wt,all,Recreation & Sports Centres,2019,1.0
236,P_m,84.097883,39.161278,0.03175698,76.756106,num_visits_wt,all,Recreation & Sports Centres,2019,1.0
276,P_m,-122.126741,51.575257,0.01791179,101.087503,d_ha_wt,all,Tourist attractions,2019,1.0
292,P_m,-132.622558,61.117462,0.03004783,119.790226,d_ha_wt,all,Viewing and observation,2019,1.0
340,P_m,-224.106258,37.917096,0.0002261311,74.317508,d_ha_wt,all,Adventure & Wildlife,2019,1.0
