# Difference-in-Differences prototypes


In [1]:
%load_ext autoreload
%autoreload 2
%cd D:\nine-euro-ticket-de

D:\nine-euro-ticket-de


In [2]:
# Load libs
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
from tqdm import tqdm
import linearmodels as lm
import workers
import sqlalchemy
from linearmodels.panel import PanelOLS
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Database connection: credentials are read from the project's key manager
db_cfg = workers.keys_manager['database']
user = db_cfg['user']
password = db_cfg['password']
port = db_cfg['port']
db_name = db_cfg['name']
# Local PostgreSQL instance; GSS encryption negotiation is switched off via the URL query
engine = sqlalchemy.create_engine(
    f'postgresql://{user}:{password}@localhost:{port}/{db_name}?gssencmode=disable'
)

## 1. Load data and preparation
Supermarkets, retail, and restaurant

In [158]:
# Case study: visits to Lidl stores in May and June of 2019 and 2022, weekday 6 excluded.
# The 'name' column is used for filtering only and is not kept in the selected columns.
df_v = pd.read_parquet('results/poi_cases/supermarket.parquet')
df_v = df_v.loc[(df_v.name == 'Lidl') & (df_v.month.isin([5, 6])) &\
                (df_v.weekday != 6) & (df_v.year.isin([2019, 2022])), 
                ['osm_id', 'year', 'month', 'weekday', 'num_visits', 'num_visits_wt', 'd_ha', 'd_ha_wt']]
df_v.head()

Unnamed: 0,osm_id,year,month,weekday,num_visits,num_visits_wt,d_ha,d_ha_wt
0,4989605,2019,6,2,10,35.519424,80.065626,11.584209
5,25029454,2019,6,2,7,154.952918,2.668695,2.909428
6,25029454,2019,6,3,8,156.767312,3.539516,3.796602
24,26606798,2019,5,0,3,51.819585,6.239097,1.969708
25,26606798,2019,6,3,13,150.501035,4.240029,2.56545


### 1.1 Prepare data

In [159]:
# DiD indicators: 2022 is the treated year, June the post-treatment month
df_v['variant_places'] = df_v['year'] == 2022
df_v['after'] = df_v['month'] == 6
# Interaction term (0/1): observation falls in the treated year AND the post-treatment month
df_v['treated'] = 1*(df_v['variant_places'] & df_v['after'])
# Entity key = one place in one year. Built with vectorised string concatenation:
# a row-wise apply is much slower and its formatting depends on the dtype each row
# is coerced to (integers could be rendered as floats).
df_v['year_place'] = df_v['year'].astype(str) + '_' + df_v['osm_id'].astype(str)

In [160]:
# Index the panel by entity (year_place = place within a year) and time (weekday)
ols_df = df_v.set_index(['year_place', 'weekday'])
ols_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,osm_id,year,month,num_visits,num_visits_wt,d_ha,d_ha_wt,variant_places,after,treated
year_place,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019_4989605,2,4989605,2019,6,10,35.519424,80.065626,11.584209,False,True,0
2019_25029454,2,25029454,2019,6,7,154.952918,2.668695,2.909428,False,True,0
2019_25029454,3,25029454,2019,6,8,156.767312,3.539516,3.796602,False,True,0
2019_26606798,0,26606798,2019,5,3,51.819585,6.239097,1.969708,False,False,0
2019_26606798,3,26606798,2019,6,13,150.501035,4.240029,2.56545,False,True,0


In [161]:
# Two-way fixed-effects DiD: regress d_ha_wt on the `treated` dummy with
# entity effects and time effects taken from the two index levels of ols_df
mod = lm.PanelOLS.from_formula('''d_ha_wt ~ treated + EntityEffects + TimeEffects''', ols_df)

# Specify clustering when we fit the model (standard errors clustered by entity)
clfe = mod.fit(cov_type = 'clustered', cluster_entity = True)
print(clfe)

                          PanelOLS Estimation Summary                           
Dep. Variable:                d_ha_wt   R-squared:                        0.0075
Estimator:                   PanelOLS   R-squared (Between):             -0.0528
No. Observations:                1440   R-squared (Within):               0.0074
Date:                Tue, Jun 04 2024   R-squared (Overall):             -0.0296
Time:                        21:11:32   Log-likelihood                   -6522.3
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      9.9045
Entities:                         120   P-value                           0.0017
Avg Obs:                       12.000   Distribution:                  F(1,1314)
Min Obs:                       12.000                                           
Max Obs:                       12.000   F-statistic (robust):             6.3684
                            

### 1.2 Parallel trend test (Placebo test)

In [162]:
# Placebo test on the pre-treatment month only (May): pretend the treatment
# started part-way through the week and check that no "effect" is detected.
placebo_df = df_v.loc[df_v.month == 5].copy()
placebo_df['variant_places'] = placebo_df['year'] == 2022
# Two alternative fake cut-offs: weekday > 2 and weekday > 3
placebo_df['fake_after1'] = placebo_df['weekday'] > 2
placebo_df['fake_after2'] = placebo_df['weekday'] > 3
placebo_df['fake_treated1'] = 1*(placebo_df['variant_places'] & placebo_df['fake_after1'])
placebo_df['fake_treated2'] = 1*(placebo_df['variant_places'] & placebo_df['fake_after2'])
placebo_ols = placebo_df.set_index(['year_place', 'weekday'])
# Run the same model as before
# but with our fake treatment variables
mod1 = lm.PanelOLS.from_formula('''d_ha_wt ~ fake_treated1 + EntityEffects + TimeEffects''', placebo_ols)
mod2 = lm.PanelOLS.from_formula('''d_ha_wt ~ fake_treated2 + EntityEffects + TimeEffects''', placebo_ols)

clfe1 = mod1.fit(cov_type = 'clustered', cluster_entity = True) 
clfe2 = mod2.fit(cov_type = 'clustered', cluster_entity = True)

print(clfe1)
print(clfe2)

                          PanelOLS Estimation Summary                           
Dep. Variable:                d_ha_wt   R-squared:                     1.537e-06
Estimator:                   PanelOLS   R-squared (Between):              0.0010
No. Observations:                 720   R-squared (Within):            -6.25e-05
Date:                Tue, Jun 04 2024   R-squared (Overall):              0.0006
Time:                        21:11:32   Log-likelihood                   -3356.3
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      0.0009
Entities:                         120   P-value                           0.9759
Avg Obs:                       6.0000   Distribution:                   F(1,594)
Min Obs:                       6.0000                                           
Max Obs:                       6.0000   F-statistic (robust):             0.0009
                            

## 2. Different POI types

In [163]:
def did_test(fname=None, place=None, name_filter=None, compare_year=2019,
             treatment_month=6, remove_sun=False, target_var='num_visits_wt'):
    """Run the basic two-way fixed-effects DiD for one POI file.

    May is the pre-treatment month and `treatment_month` the post-treatment
    month; 2022 is the treated year and `compare_year` the control year.

    Parameters
    ----------
    fname : str
        Parquet file with the visit records.
    place : str
        Label stored in the `place_type` column of the result.
    name_filter : str or None
        If given, keep only rows whose 'name' column equals this value.
    compare_year : int
        Control year compared against 2022.
    treatment_month : int
        Month treated as the post-treatment period.
    remove_sun : bool
        If True, rows with weekday == 6 are dropped.
    target_var : str
        Dependent variable of the regression.

    Returns
    -------
    pandas.DataFrame
        One row per estimated coefficient with the columns produced by
        linearmodels (parameter, std_error, pvalue) plus `ci`, `target_var`,
        `place_type`, `treatment_month` and `compare_year`.
    """
    df_v = pd.read_parquet(fname)
    # Filter on the name BEFORE narrowing the columns: 'name' is not part of
    # `cols`, so filtering afterwards fails on the missing column.
    if name_filter is not None:
        df_v = df_v.loc[df_v['name'] == name_filter]
    cols = ['osm_id', 'year', 'month', 'weekday', 'num_visits', 'num_visits_wt', 'd_ha', 'd_ha_wt']
    keep = df_v['month'].isin([5, treatment_month]) & df_v['year'].isin([compare_year, 2022])
    if remove_sun:
        keep &= df_v['weekday'] != 6
    # Explicit copy so the column assignments below act on an independent frame
    df_v = df_v.loc[keep, cols].copy()

    # DiD indicators
    df_v['variant_places'] = df_v['year'] == 2022
    df_v['after'] = df_v['month'] == treatment_month
    df_v['treated'] = 1*(df_v['variant_places'] & df_v['after'])
    # Entity key = one place in one year (vectorised instead of a row-wise apply)
    df_v['year_place'] = df_v['year'].astype(str) + '_' + df_v['osm_id'].astype(str)

    ols_df = df_v.set_index(['year_place', 'weekday'])
    mod = lm.PanelOLS.from_formula(f'''{target_var} ~ treated + EntityEffects + TimeEffects''', ols_df)
    clfe = mod.fit(cov_type = 'clustered', cluster_entity = True)

    # Summary table
    df = pd.concat([clfe.params, clfe.std_errors, clfe.pvalues], axis = 1)
    # Scale standard error to the half-width of a 95% confidence interval
    df['ci'] = df['std_error']*1.96
    df = df.reset_index(drop=True)
    df['target_var'] = target_var
    df['place_type'] = place
    df['treatment_month'] = treatment_month
    df['compare_year'] = compare_year
    return df

In [164]:
# Single trial run: supermarkets, May vs. August, 2019 vs. 2022, weekday 6 removed
# (the recorded output shows this run was interrupted)
lb = 'Supermarket'
res = did_test(fname=f"dbs/visits_day_sg/{lb}.parquet", place=lb, 
               name_filter=None, treatment_month=8, 
               remove_sun=True, target_var='d_ha_wt',
               compare_year=2019)

KeyboardInterrupt: 

In [None]:
# Grid of basic DiD runs; each tuple is (label, treatment month, comparison year, target variable)
lbs = ['Restaurant', 'Supermarket', 'Recreation & Sports Centres', 'Retail stores']
para_set = [(lb, tr_m, c_yr, t_var) for lb in lbs for tr_m in (6, 7, 8) for c_yr in (2019, 2023) for t_var in ('d_ha_wt', 'num_visits_wt')]
res_df_list = []
for paras in tqdm(para_set, desc='Basic DiD test'):
    # weekday 6 is dropped for supermarkets only
    remove_sun = False
    if paras[0] == 'Supermarket':
        remove_sun = True 
    res = did_test(fname=f"dbs/visits_day_sg/{paras[0]}.parquet", 
                   place=paras[0], 
                   name_filter=None, 
                   compare_year=paras[2],
                   treatment_month=paras[1], 
                   remove_sun=remove_sun, 
                   target_var=paras[3])
    res_df_list.append(res)
    
# Stack the per-run summaries into one table
df_res = pd.concat(res_df_list)
df_res.head()

In [None]:
# Persist the basic DiD summary table
df_res.to_parquet('results/did/basic_did.parquet', index=False)

## 3. Additional control variables
$$y_{i,ymd}=\alpha_i + \gamma_{yf} + \eta_{mf} +\zeta_{d} +\beta P_m + \epsilon_{i, ymd}$$

- $y_{i,ymd}$ is the dependent variable for place $i$ at year $y$, month $m$, and day of the week $d$. 
- $\alpha_i$ is the place-specific fixed effect.
- $\gamma_{yf}$ is the state-by-year fixed effect for state $f$ and year $y$.
- $\eta_{mf}$ is the state-by-month fixed effect for state $f$ and month $m$.
- $\zeta_{d}$ is the day-of-the-week fixed effect depending on the day of the week $d$.
- $\beta$ is the coefficient of the effect of the 9ET.
- $P_m$ is a dummy variable indicating the treatment.
- $\epsilon_{i, ymd}$ is the error term.

In [207]:
# Recreation & sports centres: months May-August of 2019 and 2022
df_v = pd.read_parquet('dbs/visits_day_sg/Recreation & Sports Centres.parquet')
df_v = df_v.loc[(df_v.month.isin([5, 6, 7, 8])) & (df_v.year.isin([2019, 2022])), 
                ['osm_id', 'year', 'month', 'weekday', 'num_visits', 'num_visits_wt', 'd_ha', 'd_ha_wt']]
df_v.head()

Unnamed: 0,osm_id,year,month,weekday,num_visits,num_visits_wt,d_ha,d_ha_wt
0,4407207,2022,8,0,4,10.909091,2.805708,2.805708
3,4701854,2019,8,3,8,99.777778,9.648704,9.661539
4,4701854,2022,7,6,8,65.277778,14.252445,8.492695
6,4797423,2019,7,3,1,2.205128,19.461274,19.461274
7,4797423,2022,7,1,2,20.0,3.608743,3.608743


### 3.1 Add state information

In [183]:
# Fetch the geometries of the POIs present in df_v from the `poi` table.
# The ids are interpolated into the SQL text as a quoted, comma-separated list.
# NOTE(review): an empty id list would produce "IN ()" — confirm df_v is never empty here.
osms = df_v['osm_id'].unique()
osms_sql = ','.join(["'" + str(x) + "'" for x in osms])
osms_sql = "(" + osms_sql + ")"
gdf_poi_c = gpd.read_postgis(f"""SELECT osm_id, geom FROM poi 
                                WHERE osm_id IN {osms_sql};""", con=engine)

In [184]:
# State (LAN) polygons from the VG2500 shapefile, reprojected to EPSG:4326
gdf_state = gpd.read_file("dbs/geo/vg2500_12-31.utm32s.shape/vg2500/vg2500_LAN.shp").to_crs(4326)
# Keep rows with GF == 9 — presumably the land-area polygons; TODO confirm against the VG2500 documentation
gdf_state = gdf_state.loc[gdf_state['GF'] == 9, :]

In [185]:
# Spatial join: attach the state name (GEN) to every POI, then merge it onto the visits as `state`.
# NOTE(review): a POI matching more than one state polygon would appear twice after the
# join and duplicate its visit rows in the merge — verify that osm_id stays unique.
gdf_poi_c = gdf_poi_c.sjoin(gdf_state[['GEN', 'geometry']])
df_v = pd.merge(df_v, gdf_poi_c[['osm_id', 'GEN']], on='osm_id', how='left').rename(columns={'GEN': 'state'})

### 3.2 Data preparation

In [196]:
# Ordered 'year-month-weekday' labels (years 2019/2022/2023, months 5-9, weekdays 0-6)
time_seq_list = [
    f'{y}-{m}-{d}'
    for y in (2019, 2022, 2023)
    for m in range(5, 10)
    for d in range(7)
]
# Label -> ordinal position, used as the integer time index of the panel
time_seq_dict = {label: pos for pos, label in enumerate(time_seq_list)}

In [210]:
#df = df_v.loc[df_v.state=='Berlin', :].copy()
df = df_v.copy()
# June, July and August together form the post-treatment period
treatment_month = [6, 7, 8]
# Categorization
# Integer time index: position of the 'year-month-weekday' label in time_seq_dict
df['time'] = df['year'].astype(str) + '-' + df['month'].astype(str)+ '-' + df['weekday'].astype(str)
df['time'] = df['time'].map(time_seq_dict)
df['osm_id'] = df['osm_id'].astype('category')
# df['state'] = df['state'].astype('category')
df['year'] = df['year'].astype('category')
df['month'] = df['month'].astype('category')
df['weekday'] = df['weekday'].astype('category')

# Treatment
# 2022 is the treated year; `after` marks the treatment months
df['variant_places'] = df['year'] == 2022
df['after'] = (df['month'] == treatment_month[0]) | (df['month'] == treatment_month[1]) | (df['month'] == treatment_month[2])
# Add the dummy variable for treatment (P_m)
df['P_m'] = df['variant_places'] & df['after'] # 1*(df['variant_places'] & df['after']), 

# Create a state-year and state-month fixed effect
# (disabled: the state columns are not used in this cell)
#df['state_year'] = df['state'].astype(str) + '_' + df['year'].astype(str)
#df['state_month'] = df['state'].astype(str) + '_' + df['month'].astype(str)

# Set the multiindex (entity = osm_id, time = ordinal time index)
df = df.set_index(['osm_id', 'time'])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,month,weekday,num_visits,num_visits_wt,d_ha,d_ha_wt,variant_places,after,P_m
osm_id,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
4407207,56,2022,8,0,4,10.909091,2.805708,2.805708,True,True,True
4701854,24,2019,8,3,8,99.777778,9.648704,9.661539,False,True,False
4701854,55,2022,7,6,8,65.277778,14.252445,8.492695,True,True,True
4797423,17,2019,7,3,1,2.205128,19.461274,19.461274,False,True,False
4797423,50,2022,7,1,2,20.0,3.608743,3.608743,True,True,True


### 3.3 Modeling

In [211]:
# Define the dependent variable and the independent variables
target_var = 'd_ha_wt'
dependent = df[target_var]
exog = df[['P_m']]

# Add fixed effects dummies (first level of each factor dropped as the reference)
df = pd.get_dummies(df, columns=['year', 'month', 'weekday'], drop_first=True)  #'state_year', 'state_month', 'weekday'

# Collect all exogenous variables including the fixed effects
# (dummy columns are picked up by their 'year_' / 'month_' / 'weekday_' name prefix)
exog = pd.concat([exog, df.filter(like='year_'), df.filter(like='month_'), df.filter(like='weekday_')], axis=1)

# Model specification: place (entity) fixed effects; the rank check is skipped and
# regressors absorbed by the effects are dropped
model = PanelOLS(dependent, exog, entity_effects=True, check_rank=False, drop_absorbed=True)    # check_rank ? drop_absorbed ?

# Fit the model (default covariance; the recorded summary reports "Unadjusted")
results = model.fit()

# Print the summary of the model
print(results.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:                d_ha_wt   R-squared:                        0.0030
Estimator:                   PanelOLS   R-squared (Between):              0.0058
No. Observations:              516751   R-squared (Within):               0.0030
Date:                Wed, Jun 05 2024   R-squared (Overall):              0.0047
Time:                        10:05:13   Log-likelihood                  -2.8e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      133.83
Entities:                       25038   P-value                           0.0000
Avg Obs:                       20.639   Distribution:               F(11,491702)
Min Obs:                       1.0000                                           
Max Obs:                       56.000   F-statistic (robust):             133.83
                            

## 4. Time-shifted DiD - multiple POI types

In [7]:
# State (LAN) polygons from the VG2500 shapefile, reprojected to EPSG:4326
gdf_state = gpd.read_file("dbs/geo/vg2500_12-31.utm32s.shape/vg2500/vg2500_LAN.shp").to_crs(4326)
# Keep rows with GF == 9 — presumably the land-area polygons; TODO confirm against the VG2500 documentation
gdf_state = gdf_state.loc[gdf_state['GF'] == 9, :]

# Ordered 'year-month-weekday' labels (years 2019/2022/2023, months 5-9, weekdays 0-6)
time_seq_list = [
    f'{y}-{m}-{d}'
    for y in (2019, 2022, 2023)
    for m in range(5, 10)
    for d in range(7)
]
# Label -> ordinal position, used as the integer time index of the panel
time_seq_dict = {label: pos for pos, label in enumerate(time_seq_list)}

In [13]:
def load_data(fname = None, gdf_state=None, threshold_v=25):
    """Load visit records for one POI type and attach the state of each POI.

    Parameters
    ----------
    fname : str
        Parquet file with the visit records.
    gdf_state : geopandas.GeoDataFrame
        State polygons with the columns 'GEN' (state name) and 'geometry'.
    threshold_v : int
        Minimum `num_visits` a record needs to be kept.

    Returns
    -------
    pandas.DataFrame
        The filtered records with an additional `state` column. POIs that do
        not fall inside any state polygon get a missing `state`.

    Notes
    -----
    Uses the module-level SQLAlchemy `engine` to read POI geometries from the
    `poi` table.
    """
    df_v = pd.read_parquet(fname)
    cols = ['osm_id', 'year', 'month', 'weekday', 'num_visits', 'num_visits_wt', 'd_ha', 'd_ha_wt', 'precipitation', 'pt_station_num']
    df_v = df_v.loc[df_v.num_visits >= threshold_v, cols]

    # Add state
    osms = df_v['osm_id'].unique()
    if len(osms) == 0:
        # Nothing passed the threshold: "IN ()" is not valid SQL, so skip the
        # database round trip and return the empty frame with the expected schema.
        df_v = df_v.copy()
        df_v['state'] = None
        return df_v

    # The ids come from our own parquet file and are interpolated as quoted literals
    osms_sql = "(" + ','.join("'" + str(x) + "'" for x in osms) + ")"
    gdf_poi_c = gpd.read_postgis(f"""SELECT osm_id, geom FROM poi 
                                    WHERE osm_id IN {osms_sql};""", con=engine)
    gdf_poi_c = gdf_poi_c.sjoin(gdf_state[['GEN', 'geometry']])
    # Keep one state per POI: a POI matching several polygons (or stored twice in
    # `poi`) would otherwise multiply its visit rows in the merge below.
    poi_state = gdf_poi_c[['osm_id', 'GEN']].drop_duplicates(subset='osm_id')
    df_v = pd.merge(df_v, poi_state, on='osm_id', how='left').rename(columns={'GEN': 'state'})
    return df_v

In [9]:
def data_prep(df=None, treatment_month=6, time_seq_dict=None):
    """Build the indexed panel consumed by `did_model`.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit records with at least the columns osm_id, year, month, weekday
        and state. The input frame is not modified.
    treatment_month : int
        Month treated as the post-treatment period.
    time_seq_dict : dict
        Maps 'year-month-weekday' labels to the integer used as time index.
        Labels missing from the mapping yield NaN in the time index.

    Returns
    -------
    pandas.DataFrame
        Copy of the input indexed by (osm_id, time) with the added columns
        variant_places, after, P_m, state_year and state_month.
    """
    # Work on a copy: the steps below add columns and recast dtypes, which would
    # otherwise leak into the caller's frame.
    df = df.copy()

    # Categorization
    time_label = df['year'].astype(str) + '-' + df['month'].astype(str) + '-' + df['weekday'].astype(str)
    df['time'] = time_label.map(time_seq_dict)
    for col in ('osm_id', 'state', 'year', 'month', 'weekday'):
        df[col] = df[col].astype('category')

    # Treatment: 2022 is the treated year, `treatment_month` the post period
    df['variant_places'] = df['year'] == 2022
    df['after'] = df['month'] == treatment_month
    # Add the dummy variable for treatment (P_m)
    df['P_m'] = df['variant_places'] & df['after']

    # Create a state-year and state-month fixed effect
    df['state_year'] = df['state'].astype(str) + '_' + df['year'].astype(str)
    df['state_month'] = df['state'].astype(str) + '_' + df['month'].astype(str)

    # Set the multiindex (entity = osm_id, time = ordinal time index)
    return df.set_index(['osm_id', 'time'])

def did_model(df=None, target_var=None):
    """Fit the DiD panel regression with state-year, state-month and weekday dummies.

    `df` is the indexed panel produced by `data_prep`; `target_var` names the
    dependent variable. Returns the fitted linearmodels results object.
    """
    fe_columns = ['state_year', 'state_month', 'weekday']

    # Outcome and treatment dummy are taken before the frame is one-hot encoded
    y = df[target_var]
    treatment = df[['P_m']]

    # One-hot encode the fixed-effect factors, dropping the first level of each
    encoded = pd.get_dummies(df, columns=fe_columns, drop_first=True)

    # Regressors = treatment dummy + every dummy column, selected by name prefix
    dummy_blocks = [encoded.filter(like=f'{col}_') for col in fe_columns]
    X = pd.concat([treatment, *dummy_blocks], axis=1)

    # Place (entity) fixed effects; rank check skipped, absorbed regressors dropped
    return PanelOLS(y, X, entity_effects=True, check_rank=False, drop_absorbed=True).fit()

def did_model_execution(df=None, place=None, name_filter=None, compare_year=2019,
                        treatment_month=6, remove_sun=False, target_var='num_visits_wt', time_seq_dict=None):
    """Filter the records, fit the DiD model and return the treatment estimate.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit records including a `state` column. Not modified.
    place : str
        Label stored in the `place_type` column of the result.
    name_filter : str or None
        If given, keep only rows whose 'name' column equals this value.
    compare_year : int
        Control year compared against 2022.
    treatment_month : int
        Post-treatment month; May is the pre-treatment month.
    remove_sun : bool
        If True, rows with weekday == 6 are dropped.
    target_var : str
        Dependent variable of the regression.
    time_seq_dict : dict
        Mapping passed through to `data_prep`.

    Returns
    -------
    pandas.DataFrame
        A single row for the `P_m` coefficient (parameter, std_error, pvalue)
        plus `ci`, `target_var`, `place_type`, `treatment_month`, `compare_year`.

    Raises
    ------
    KeyError
        If `name_filter` is given but the frame has no 'name' column.
    """
    keep = df['month'].isin([5, treatment_month]) & df['year'].isin([compare_year, 2022])
    if remove_sun:
        keep &= df['weekday'] != 6
    if name_filter is not None:
        # Attribute access on a missing column fails with an opaque AttributeError;
        # report the actual problem instead.
        if 'name' not in df.columns:
            raise KeyError("name_filter was given but the input frame has no 'name' column")
        keep &= df['name'] == name_filter
    df_v = df.loc[keep].copy()

    res = did_model(df=data_prep(df=df_v, treatment_month=treatment_month, time_seq_dict=time_seq_dict), target_var=target_var)
    # Summary: keep only the treatment coefficient as a one-row frame
    df_r = pd.concat([res.params, res.std_errors, res.pvalues], axis = 1).\
             loc[['P_m'], :].reset_index(drop=True)
    # Scale standard error to the half-width of a 95% confidence interval
    df_r['ci'] = df_r['std_error']*1.96
    df_r['target_var'] = target_var
    df_r['place_type'] = place
    df_r['treatment_month'] = treatment_month
    df_r['compare_year'] = compare_year
    return df_r

In [10]:
lbs = ['Restaurant', 'Supermarket', 'Recreation & Sports Centres', 'Retail stores']
# Each tuple is (treatment month, comparison year, target variable)
para_set = [(tr_m, c_yr, t_var) for tr_m in (6, 7, 8) for c_yr in (2019, 2023) for t_var in ('d_ha_wt', 'num_visits_wt')]
res_df_list = []
for lb in lbs:
    print(f'Prepare data for {lb}.')
    df_visits = load_data(fname = f"dbs/visits_day_sg/{lb}.parquet", gdf_state=gdf_state, threshold_v=25)
    # weekday 6 is dropped for supermarkets only. The label is `lb`, not paras[0]
    # (paras[0] is the treatment month in this grid, so comparing it with
    # 'Supermarket' never matched).
    remove_sun = lb == 'Supermarket'
    for paras in tqdm(para_set, desc=lb):
        res = did_model_execution(df=df_visits, 
                                  place=lb, 
                                  name_filter=None, 
                                  compare_year=paras[1],
                                  treatment_month=paras[0], 
                                  remove_sun=remove_sun, 
                                  target_var=paras[2], 
                                  time_seq_dict=time_seq_dict)
        res_df_list.append(res)
# Stack the per-run summaries into one table
df_res = pd.concat(res_df_list)
df_res.head()

Prepare data for Restaurant.


KeyboardInterrupt: 

In [181]:
# Persist the DiD summary table.
# NOTE(review): the '_30' suffix does not match threshold_v=25 used in the loop above — confirm which threshold produced this file.
df_res.to_parquet('results/did/did_models_30.parquet', index=False)

## 5. Model 2
$$y_{i,d}=\delta \cdot \text{9ET}_d \cdot \text{post}_d +\beta_1 \cdot \text{9ET}_d + \beta_2 \cdot \text{precipitation}_d + \beta_3 \cdot \text{precipitation}_d \cdot \text{post}_d + \zeta_{s,y(d),m(d),\text{dow}(d)} + \epsilon_{i, d}$$

- $y_{i,d}$ is the dependent variable for place $i$ at the day of the week $d$. 
- $\text{9ET}_d$ is a dummy variable indicating whether it is during June, July, or Aug.
- $\text{post}_d$ is a dummy variable indicating the treatment is on.
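- $\delta$ is the coefficient of the effect of the 9ET (the DiD estimate).
- $\beta_1$ is the coefficient of the effect of being in June, July, or Aug.
- $\beta_2$ is the coefficient of precipitation, and $\beta_3$ is the coefficient of its interaction with $\text{post}_d$.
- $\zeta_{s,y(d),m(d),\text{dow}(d)}$ is the state-year-month-day-of-the-week fixed effect.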
- $\epsilon_{i, d}$ is the error term.


In [14]:
# Load one POI type (records with at least 5 visits) and drop rows with any missing value
lb = "Recreation & Sports Centres"
df_visits = load_data(fname = f"dbs/visits_day_did/{lb}.parquet", gdf_state=gdf_state, threshold_v=5)
df_visits = df_visits.dropna()
len(df_visits)

346306

In [15]:
# Preview the loaded records (note the precipitation, pt_station_num and state columns)
df_visits.head()

Unnamed: 0,osm_id,year,month,weekday,num_visits,num_visits_wt,d_ha,d_ha_wt,precipitation,pt_station_num,state
0,5938322,2022,7,1,6,26.759259,3.700771,5.520459,0.0,25,Hessen
1,8767587,2022,5,0,6,26.833333,0.137972,0.137573,0.0,12,Hessen
2,8767587,2022,5,4,6,75.547619,1.739159,2.046274,0.0,12,Hessen
3,8767587,2022,6,6,5,48.071429,24.913963,98.605921,6.6,12,Hessen
4,8767587,2023,9,0,5,18.094118,0.202001,0.127251,0.0,12,Hessen


In [170]:
def data_preparation(data=None, year_list=[2019, 2022], treatment_yr=2022, 
                     treatment_month=6, remove_sep=True, treat_all=False):
    """Prepare the indexed panel for model 2 (`did_model2` / `placebo2`).

    Parameters
    ----------
    data : pandas.DataFrame
        Visit records with the columns osm_id, year, month, weekday, state and
        precipitation. The input frame is not modified.
    year_list : list of int
        Years kept in the sample (the default list is only read, never mutated).
    treatment_yr : int
        Year flagged as treated (`variant_places`).
    treatment_month : int
        Post-treatment month when `treat_all` is False.
    remove_sep : bool
        If True, rows of month 9 are dropped (the month filter already keeps
        only months 5-8 or May plus `treatment_month`).
    treat_all : bool
        If True, keep May-August and treat June, July and August as the post
        period; otherwise keep May and `treatment_month` only.

    Returns
    -------
    pandas.DataFrame
        Filtered copy indexed by (osm_id, time) where time is the weekday, with
        the added columns time_fe, variant_places, rain, after, P_m and rain_m.
    """
    # Row selection: sample years, then the pre/post months
    months_kept = [5, 6, 7, 8] if treat_all else [5, treatment_month]
    keep = data['year'].isin(year_list) & data['month'].isin(months_kept)
    if remove_sep:
        keep &= data['month'] != 9
    df = data.loc[keep, :].copy()

    # Categorization
    # state-year-month-weekday cell used as the time fixed effect
    fe_label = (df['state'].astype(str) + '-' + df['year'].astype(str) + '-'
                + df['month'].astype(str) + '-' + df['weekday'].astype(str))
    df['time_fe'] = fe_label.astype('category')
    # the weekday doubles as the time level of the panel index
    df['time'] = df['weekday']
    for col in ('osm_id', 'year', 'month', 'weekday'):
        df[col] = df[col].astype('category')

    # Treatment
    df['variant_places'] = df['year'] == treatment_yr
    df['rain'] = df['precipitation'] > 0
    if treat_all:
        df['after'] = df['month'].isin([6, 7, 8])
    else:
        df['after'] = df['month'] == treatment_month
    # Add the dummy variable for treatment (P_m) and the rain x post interaction
    df['P_m'] = df['variant_places'] & df['after']
    df['rain_m'] = df['rain'] & df['after']

    # Set the multiindex
    return df.set_index(['osm_id', 'time'])

def model_results(res=None, placebo=False, treatment_month='all', target_var=None):
    """Summarise the coefficients of interest from a fitted model.

    Parameters
    ----------
    res : fitted results object
        Must expose `params`, `std_errors` and `pvalues` as Series indexed by
        regressor name.
    placebo : bool
        Accepted for backward compatibility; the same coefficients are
        reported for regular and placebo runs.
    treatment_month : int or str
        Stored in the `treatment_month` column of the result.
    target_var : str
        Stored in the `target_var` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row each for P_m, rain_m, after and rain with the regressor name in
        `y`, the three statistics, `ci`, `target_var` and `treatment_month`.

    Raises
    ------
    KeyError
        If one of the four regressors is missing from the fitted model.
    """
    # Both branches of the former `if placebo` selected the same regressors, so a
    # single list is used (named `terms` to avoid shadowing the builtin `vars`).
    terms = ['P_m', 'rain_m', 'after', 'rain']
    coef_table = pd.concat([res.params, res.std_errors, res.pvalues], axis = 1)
    df_r = coef_table.loc[terms, :].reset_index().rename(columns={'index': 'y'})
    # Scale standard error to the half-width of a 95% confidence interval
    df_r['ci'] = df_r['std_error']*1.96
    df_r['target_var'] = target_var
    df_r['treatment_month'] = treatment_month
    return df_r

In [171]:
def _fit_time_fe_model(data, target_var):
    """Fit model 2: target_var on P_m, rain_m, rain, after and the time_fe dummies, with place fixed effects."""
    dependent = data[target_var]
    exog = data[['P_m', 'rain_m', 'rain', 'after']]

    # Add fixed effects dummies (get_dummies returns a new frame, `data` is untouched)
    encoded = pd.get_dummies(data, columns=['time_fe'], drop_first=True)

    # Collect all exogenous variables including the fixed effects
    exog = pd.concat([exog, encoded.filter(like='time_fe_')], axis=1)

    # Model specification: entity effects; rank check skipped, absorbed regressors dropped
    model = PanelOLS(dependent, exog, entity_effects=True, check_rank=False, drop_absorbed=True)    # check_rank ? drop_absorbed ?

    # Fit the model
    return model.fit()


def did_model2(data=None, target_var=None, treatment_month=None):
    """Fit model 2 on a panel from `data_preparation` and return its coefficient summary.

    `treatment_month` is only recorded in the summary (it is passed on as given,
    so the default None is stored as-is). Returns the frame built by `model_results`.
    """
    results = _fit_time_fe_model(data, target_var)
    return model_results(res=results, treatment_month=treatment_month, target_var=target_var)


def placebo2(data=None, target_var=None, treatment_month=None):
    """Same estimation as `did_model2`, summarised with `placebo=True`.

    The caller decides what makes the run a placebo through the panel it passes
    in (e.g. a year without treatment flagged as treated).
    """
    results = _fit_time_fe_model(data, target_var)
    return model_results(res=results, placebo=True, treatment_month=treatment_month, target_var=target_var)

### 5.1 Multiple POI types

In [18]:
# State (LAN) polygons from the VG2500 shapefile, reprojected to EPSG:4326
gdf_state = gpd.read_file("dbs/geo/vg2500_12-31.utm32s.shape/vg2500/vg2500_LAN.shp").to_crs(4326)
# Keep rows with GF == 9 — presumably the land-area polygons; TODO confirm against the VG2500 documentation
gdf_state = gdf_state.loc[gdf_state['GF'] == 9, :]

In [19]:
# POI category table: 'category' is renamed to theme and 'subcategory' to label
df_cat = pd.read_excel('dbs/poi/categories.xlsx').rename(columns={'category': 'theme', 'subcategory': 'label'})
# All distinct labels
label_list = df_cat['label'].unique()

In [20]:
# Accumulators for the DiD results and the placebo results, filled by the loop in the next cell
res_df_list = []
res_p_df_list = []
# POI types included in the test
labels4test = ['Restaurant', 'Supermarket', 'Recreation & Sports Centres', 'Retail stores']

In [21]:
# lbs = ['Restaurant', 'Supermarket', 'Recreation & Sports Centres', 'Retail stores']
# Each tuple is (comparison year, target variable)
para_set = [(c_yr, t_var) for c_yr in (2019, 2023) for t_var in ('d_ha_wt', 'num_visits_wt')]

for lb in labels4test:
    print(f'Prepare data for {lb}.')
    df_visits = load_data(fname = f"dbs/visits_day_did/{lb}.parquet", gdf_state=gdf_state, threshold_v=5)
    if len(df_visits) > 0:
        df_visits = df_visits.dropna()
        # weekday 6 is excluded for supermarkets (the flag was previously computed
        # but never applied); same rule as in the overall model of section 5.2
        if lb == 'Supermarket':
            df_visits = df_visits.loc[df_visits.weekday != 6, :]
        for paras in tqdm(para_set, desc=lb):
            df = data_preparation(data=df_visits, year_list=[paras[0], 2022], 
                                  treatment_yr=2022, remove_sep=True)
            res = did_model2(data=df, target_var=paras[1])
            res.loc[:, 'compare_year'] = paras[0]
            # The POI type is needed downstream to merge and de-duplicate by 'place'
            res.loc[:, 'place'] = lb
            res_df_list.append(res)
            # Placebo test: compare the two years without treatment (2019 vs. 2023)
            # and pretend 2023 was treated, as in sections 5.2 and 5.3. With
            # year_list=[paras[0], 2022] the sample either contains no 2023 rows
            # or compares 2023 against the treated year.
            df = data_preparation(data=df_visits, year_list=[2019, 2023], 
                                  treatment_yr=2023, remove_sep=True)
            res_p = did_model2(data=df, target_var=paras[1])
            res_p.loc[:, 'place'] = lb
            res_p_df_list.append(res_p)

Prepare data for Restaurant.


Restaurant: 100%|██████████| 4/4 [04:59<00:00, 74.79s/it]


Prepare data for Supermarket.


Supermarket: 100%|██████████| 4/4 [03:43<00:00, 55.85s/it]


Prepare data for Recreation & Sports Centres.


Recreation & Sports Centres: 100%|██████████| 4/4 [06:20<00:00, 95.09s/it] 


Prepare data for Retail stores.


Retail stores: 100%|██████████| 4/4 [02:16<00:00, 34.17s/it]


In [22]:
# Combine the per-POI-type estimates and flag those whose placebo run was
# not significant (p-value >= 0.05) with placebo = 1; everything else gets 0.
# NOTE(review): the recorded ValueError means res_df_list was empty when this cell last ran.
# NOTE(review): the merge needs a 'place' column in both frames; model_results as defined
# in this notebook does not add one — confirm the loop that fills the lists sets it.
# NOTE(review): the placebo filter keeps every coefficient row (P_m, rain_m, after, rain),
# not only P_m — confirm this is intended.
df_res = pd.concat(res_df_list)
df_res_p = pd.concat(res_p_df_list)
df_res_p = df_res_p.loc[df_res_p.pvalue >= 0.05, :]
df_res_p.loc[:, 'placebo'] = 1
df_res_p = df_res_p[['target_var', 'place', 'placebo']]
df_res = pd.merge(df_res, df_res_p, on=['target_var', 'place'], how='left')
# Rows without a matching placebo entry get 0 (fillna applies to every column)
df_res.fillna(0, inplace=True)

ValueError: No objects to concatenate

In [22]:
# The merge above can repeat rows; keep one per coefficient / target / place / comparison year, then persist
df_res = df_res.drop_duplicates(subset=['y', 'target_var', 'place', 'compare_year'])
df_res.to_parquet('results/did/did_model_2.parquet', index=False)

### 5.2 Overall effect

In [48]:
# Overall model
# Pool the records of all tested POI types into one frame
# (df_visits_total starts as a list of frames and is replaced by their concatenation).
df_visits_total = []
for lb in labels4test:
    print(f'Prepare data for {lb}.')
    df_visits = load_data(fname = f"dbs/visits_day_did/{lb}.parquet", gdf_state=gdf_state, threshold_v=5)
    # weekday 6 is excluded for supermarkets only
    if lb == 'Supermarket':
        df_visits = df_visits.loc[df_visits.weekday != 6, :]
    df_visits_total.append(df_visits)
df_visits_total = pd.concat(df_visits_total)
# Drop rows with any missing value
df_visits_total = df_visits_total.dropna()

Prepare data for Restaurant.
Prepare data for Supermarket.
Prepare data for Recreation & Sports Centres.
Prepare data for Retail stores.


In [111]:
# Size of the pooled sample: number of rows and number of distinct POIs
print(f'{len(df_visits_total)} visits on {df_visits_total.osm_id.nunique()} locations')

964753 visits on 43465 locations


In [108]:
# Subsample: Berlin POIs with pt_station_num > 5
df_vs = df_visits_total.loc[(df_visits_total.pt_station_num > 5) &\
                            (df_visits_total.state=='Berlin'), :]
print(f'{len(df_vs)} visits on {df_vs.osm_id.nunique()} locations')

62425 visits on 2209 locations


In [81]:
# Replace df_vs with a single POI type: records with pt_station_num > 0 and no missing values
lb = 'Recreation & Sports Centres'
df_vs = load_data(fname = f"dbs/visits_day_did/{lb}.parquet", gdf_state=gdf_state, threshold_v=5)
df_vs = df_vs.loc[df_vs.pt_station_num > 0, :].dropna()
print(f'{len(df_vs)} visits on {df_vs.osm_id.nunique()} locations')

345463 visits on 9985 locations


#### 5.2.1 2019 May-June/July/Aug vs. 2022 May-June/July/Aug

In [120]:
# Model 2 on df_vs for every (comparison year, target variable) pair in para_set.
# NOTE(review): treatment_month=8 with the default treat_all=False compares May with August
# only, whereas the section title mentions June/July/Aug — confirm the intended design.
# NOTE(review): did_model2 is called without treatment_month, so it passes its default
# (None) on to the summary; the 'all' shown in the recorded output needs confirming.
res_df_list = []
for paras in tqdm(para_set):
    # remove_sun is assigned but not used in this loop
    remove_sun = False
    rs = True
    df = data_preparation(data=df_vs, year_list=[paras[0], 2022], 
                          treatment_yr=2022, treatment_month=8, remove_sep=rs)
    res = did_model2(data=df, target_var=paras[1])
    res.loc[:, 'compare_year'] = paras[0]
    res_df_list.append(res)

100%|██████████| 4/4 [00:00<00:00,  4.10it/s]


In [121]:
# Stack the per-run summaries and display them
df_res = pd.concat(res_df_list)
df_res

Unnamed: 0,y,parameter,std_error,pvalue,ci,target_var,treatment_month,compare_year
0,P_m,3.22829,3.730834,0.3868892,7.312435,d_ha_wt,all,2019
1,rain_m,1.491259,1.326417,0.2609154,2.599778,d_ha_wt,all,2019
2,after,8.30502,3.310527,0.01212992,6.488632,d_ha_wt,all,2019
3,rain,-0.673986,0.861886,0.4342336,1.689296,d_ha_wt,all,2019
0,P_m,-3.417934,6.630533,0.606223,12.995844,num_visits_wt,all,2019
1,rain_m,-3.737912,2.357343,0.1128422,4.620392,num_visits_wt,all,2019
2,after,82.001372,5.883552,0.0,11.531762,num_visits_wt,all,2019
3,rain,2.920509,1.531765,0.05658917,3.00226,num_visits_wt,all,2019
0,P_m,-1.072696,1.952335,0.5827076,3.826578,d_ha_wt,all,2023
1,rain_m,-2.346368,0.909488,0.009890048,1.782596,d_ha_wt,all,2023


In [122]:
# Placebo test
res_p_df_list = []
for tv in ['d_ha_wt', 'num_visits_wt']:
    df = data_preparation(data=df_vs, year_list=[2019, 2023], 
                          treatment_yr=2023, treatment_month=7, remove_sep=True)
    res_p = placebo2(data=df, target_var=tv)
    res_p_df_list.append(res_p)

In [123]:
# Stack the placebo summaries and display them
df_res_p = pd.concat(res_p_df_list)
df_res_p

Unnamed: 0,y,parameter,std_error,pvalue,ci,target_var,treatment_month
0,P_m,-24.176802,4.118004,4.470645e-09,8.071288,d_ha_wt,all
1,rain_m,-1.151845,1.852716,0.5341494,3.631323,d_ha_wt,all
2,after,37.984668,3.726291,0.0,7.30353,d_ha_wt,all
3,rain,0.247501,1.58366,0.8758122,3.103974,d_ha_wt,all
0,P_m,-6.419209,6.381605,0.3144918,12.507946,num_visits_wt,all
1,rain_m,5.963998,2.871124,0.03780556,5.627403,num_visits_wt,all
2,after,75.856947,5.774573,0.0,11.318162,num_visits_wt,all
3,rain,4.573449,2.454173,0.06241582,4.810179,num_visits_wt,all


### 5.3 Model by state

In [4]:
# State polygons again, this time with GEN renamed to `state`.
# NOTE(review): load_data selects gdf_state[['GEN', 'geometry']]; after this rename it
# can no longer be called with this gdf_state — re-run an earlier cell first if needed.
gdf_state = gpd.read_file("dbs/geo/vg2500_12-31.utm32s.shape/vg2500/vg2500_LAN.shp").to_crs(4326)
gdf_state = gdf_state.loc[gdf_state['GF'] == 9, :].rename(columns={'GEN': 'state'})
# Distinct state names, used to locate the per-state parquet files
state_list = list(gdf_state.state.unique())

In [5]:
# Each tuple is (comparison year, target variable)
para_set = [(c_yr, t_var) for c_yr in (2019, 2023) for t_var in ('d_ha_wt', 'num_visits_wt')]

In [6]:
# POI labels included in the by-state models (matched against the `label` column of the visit records)
lbs2include = ['Home & Lifestyle', 'Accomodations', 'Art & Culture', 'Café', 'Entertainment venues', 
               'Fast food', 'Games and activities', 'Historic', 'Information and services', 'Nightclub', 'Parks and gardens',
               'Recreation & Sports Centres', 'Recreational facilities', 'Retail stores', 'Tourist attractions', 
               'Viewing and observation', 'Water Sports', 'Wellness & Relaxation', 'Adventure & Wildlife', 
               'Ball Sports', 'Beverages', 'Cosmetics and beauty', 'Equestrian & Riding', 'Events and fairs', 
               'Extreme & Adventure Sports', 'Fashion and clothing', 'Library', 'Place of worship', 'Pub', 
               'Racket & Martial Arts', 'Rentals and sharing', 'Restaurant', 'Technology & Hobbies', 'Training center',
               'Wellness and fitness']
# Helper to print all available labels in list-literal form:
# print(', '.join(["'" + x + "'" for x in label_list]))

In [161]:
# By state
df_state_list =[]
for st in tqdm(state_list, desc='Modeling by state'):
    df_visits = pd.read_parquet(f'dbs/visits_day_did_states/{st}.parquet')
    # df_visits = df_visits.loc[df_visits.pt_station_num >= 3, :]
    df_visits = df_visits.loc[(df_visits.pt_station_num >= 3) & \
                              (df_visits.label.isin(lbs2include)) & \
                              (df_visits.num_visits >= 5), :]
    # DiD modeling
    res_df_list = []
    for paras in para_set:
        for tm in [6, 7, 8]:
            remove_sun = False
            rs = True
            df = data_preparation(data=df_visits, year_list=[paras[0], 2022], 
                                  treatment_yr=2022, treatment_month=tm, remove_sep=rs)
            res = did_model2(data=df, target_var=paras[1], treatment_month=tm)
            res.loc[:, 'compare_year'] = paras[0]
            res_df_list.append(res)
    
    # Placebo test
    res_p_df_list = []
    for tv in ['d_ha_wt', 'num_visits_wt']:
        for tm in [6, 7, 8]:
            df = data_preparation(data=df_visits, year_list=[2019, 2023], 
                                  treatment_yr=2023, treatment_month=tm, remove_sep=True)
            res_p = placebo2(data=df, target_var=tv, treatment_month=tm)
            res_p.loc[:, 'compare_year'] = 999  # placebo
            res_p_df_list.append(res_p)
    
    # Put results together
    df_state = pd.concat(res_df_list + res_p_df_list)
    df_state.loc[:, 'state'] = st
    df_state_list.append(df_state)
df_state = pd.concat(df_state_list)

Modeling by state: 100%|██████████| 16/16 [02:12<00:00,  8.26s/it]


In [162]:
# Placebo screen: keep significant P_m estimates (p < 0.05) only where the placebo
# model (compare_year == 999) for the same outcome / month / state is NOT significant (p > 0.05).
is_effect = df_state['y'] == 'P_m'
key_cols = ['target_var', 'treatment_month', 'state']

fs = df_state.loc[
    is_effect & (df_state['compare_year'] == 999) & (df_state['pvalue'] > 0.05),
    key_cols,
].copy()
fs.loc[:, 'placebo'] = 1

# Left merge, then filter: rows without a passing placebo get NaN in `placebo` and are dropped
df_r = pd.merge(
    df_state.loc[is_effect & (df_state['pvalue'] < 0.05)],
    fs,
    on=key_cols,
    how='left',
)
df_r = df_r.loc[df_r['placebo'] == 1]

In [163]:
# Display the significant P_m estimates whose matching placebo test was not significant
df_r

Unnamed: 0,y,parameter,std_error,pvalue,ci,target_var,treatment_month,compare_year,state,placebo
1,P_m,45.651257,4.336432,0.0,8.499407,num_visits_wt,7,2019,Schleswig-Holstein,1.0
2,P_m,49.085267,4.146169,0.0,8.126491,num_visits_wt,8,2019,Schleswig-Holstein,1.0
3,P_m,-8.493638,3.394709,0.01235521,6.65363,d_ha_wt,6,2023,Schleswig-Holstein,1.0
7,P_m,20.623969,2.242023,0.0,4.394366,num_visits_wt,7,2023,Schleswig-Holstein,1.0
8,P_m,19.144207,2.46965,9.325873e-15,4.840513,num_visits_wt,8,2023,Schleswig-Holstein,1.0
40,P_m,84.578005,11.794105,8.570922e-13,23.116446,num_visits_wt,7,2019,Bremen,1.0
41,P_m,42.141283,10.185009,3.5632e-05,19.962617,num_visits_wt,8,2019,Bremen,1.0
45,P_m,21.046498,4.995903,2.549276e-05,9.79197,num_visits_wt,7,2023,Bremen,1.0
127,P_m,82.753936,8.227006,0.0,16.124932,num_visits_wt,7,2019,Saarland,1.0
128,P_m,77.456291,9.644346,1.110223e-15,18.902917,num_visits_wt,8,2019,Saarland,1.0


In [164]:
# Save the screened by-state estimates (the DataFrame index is not written)
df_r.to_parquet('results/did/state_model_v5_pt3_pois.parquet', index=False)

### 5.4 Model selected location types (all states)

In [7]:
# Load every state's visits, apply the same row filter as in the by-state models, and stack them
df_visits_list = []
for st in tqdm(state_list, desc='Loading by state'):
    df_visits = pd.read_parquet(f'dbs/visits_day_did_states/{st}.parquet')
    # Keep rows with pt_station_num >= 3, a label from lbs2include and at least 5 visits
    keep_rows = (
        (df_visits['pt_station_num'] >= 3)
        & df_visits['label'].isin(lbs2include)
        & (df_visits['num_visits'] >= 5)
    )
    df_visits_list.append(df_visits.loc[keep_rows, :])
df_visits = pd.concat(df_visits_list)
print(f'{df_visits.num_visits.sum()} visits on {df_visits.osm_id.nunique()} locations')

Loading by state: 100%|██████████| 16/16 [00:23<00:00,  1.47s/it]

22089013 visits on 112926 locations





In [167]:
# DiD modeling on the pooled data: 2022 against each comparison year, one model per treatment month
res_df_list = []
for paras in para_set:
    compare_yr, outcome = paras
    for tm in [6, 7, 8]:
        remove_sun = False  # set but not passed to any call in this cell
        rs = True
        df = data_preparation(
            data=df_visits,
            year_list=[compare_yr, 2022],
            treatment_yr=2022,
            treatment_month=tm,
            remove_sep=rs,
        )
        res = did_model2(data=df, target_var=outcome, treatment_month=tm)
        res.loc[:, 'compare_year'] = compare_yr
        res_df_list.append(res)

# Placebo test: 2023 is treated as the "treatment" year against 2019
res_p_df_list = []
for tv in ['d_ha_wt', 'num_visits_wt']:
    for tm in [6, 7, 8]:
        df = data_preparation(
            data=df_visits,
            year_list=[2019, 2023],
            treatment_yr=2023,
            treatment_month=tm,
            remove_sep=True,
        )
        res_p = placebo2(data=df, target_var=tv, treatment_month=tm)
        res_p.loc[:, 'compare_year'] = 999  # placebo
        res_p_df_list.append(res_p)

# Put results together; 'All' marks the pooled (all-state) model
df_state = pd.concat([*res_df_list, *res_p_df_list])
df_state.loc[:, 'state'] = 'All'

In [168]:
# Placebo screen: keep significant P_m estimates (p < 0.05) only where the placebo
# model (compare_year == 999) for the same outcome / month / state is NOT significant (p > 0.05).
is_effect = df_state['y'] == 'P_m'
key_cols = ['target_var', 'treatment_month', 'state']

fs = df_state.loc[
    is_effect & (df_state['compare_year'] == 999) & (df_state['pvalue'] > 0.05),
    key_cols,
].copy()
fs.loc[:, 'placebo'] = 1

# Left merge, then filter: rows without a passing placebo get NaN in `placebo` and are dropped
df_r = pd.merge(
    df_state.loc[is_effect & (df_state['pvalue'] < 0.05)],
    fs,
    on=key_cols,
    how='left',
)
df_r = df_r.loc[df_r['placebo'] == 1]

In [172]:
# Save the screened all-state, per-month estimates (index not written), then display them
df_r.to_parquet('results/did/all_states_model_v5_pt3_pois.parquet', index=False)
df_r

Unnamed: 0,y,parameter,std_error,pvalue,ci,target_var,treatment_month,compare_year,state,placebo
0,P_m,23.288303,7.448435,0.0017685,14.598932,d_ha_wt,7,2019,All,1
1,P_m,-8.042942,3.074992,0.008907339,6.026985,d_ha_wt,6,2023,All,1
2,P_m,11.921968,3.039952,8.791215e-05,5.958306,d_ha_wt,7,2023,All,1
3,P_m,16.725999,3.377362,7.332558e-07,6.619629,num_visits_wt,6,2023,All,1
4,P_m,15.93941,3.601677,9.621047e-06,7.059288,num_visits_wt,7,2023,All,1
5,P_m,15.039892,4.757776,0.001571714,9.325241,num_visits_wt,8,2023,All,1


In [173]:
# DiD modeling with June-August merged into a single post-treatment period (treat_all=True)
res_df_list = []
for paras in para_set:
    compare_yr, outcome = paras
    remove_sun = False  # set but not passed to any call in this cell
    rs = True
    df = data_preparation(
        data=df_visits,
        year_list=[compare_yr, 2022],
        treatment_yr=2022,
        treatment_month=6,
        remove_sep=rs,
        treat_all=True,
    )
    res = did_model2(data=df, target_var=outcome, treatment_month='all')
    res.loc[:, 'compare_year'] = compare_yr
    res_df_list.append(res)

# Placebo test: 2023 is treated as the "treatment" year against 2019
res_p_df_list = []
for tv in ['d_ha_wt', 'num_visits_wt']:
    df = data_preparation(
        data=df_visits,
        year_list=[2019, 2023],
        treatment_yr=2023,
        treatment_month=6,
        remove_sep=True,
        treat_all=True,
    )
    res_p = placebo2(data=df, target_var=tv, treatment_month='all')
    res_p.loc[:, 'compare_year'] = 999  # placebo
    res_p_df_list.append(res_p)

# Put results together; 'All' marks the pooled (all-state) model
df_state = pd.concat([*res_df_list, *res_p_df_list])
df_state.loc[:, 'state'] = 'All'

In [175]:
# Placebo screen: keep significant P_m estimates (p < 0.05) only where the placebo
# model (compare_year == 999) for the same outcome / month / state is NOT significant (p > 0.05).
is_effect = df_state['y'] == 'P_m'
key_cols = ['target_var', 'treatment_month', 'state']

fs = df_state.loc[
    is_effect & (df_state['compare_year'] == 999) & (df_state['pvalue'] > 0.05),
    key_cols,
].copy()
fs.loc[:, 'placebo'] = 1

# Left merge, then filter: rows without a passing placebo get NaN in `placebo` and are dropped
df_r = pd.merge(
    df_state.loc[is_effect & (df_state['pvalue'] < 0.05)],
    fs,
    on=key_cols,
    how='left',
)
df_r = df_r.loc[df_r['placebo'] == 1]

In [178]:
# Save the screened all-state estimates for the merged June-August period (index not written), then display them
df_r.to_parquet('results/did/all_states_model_v5_pt3_pois_6_8.parquet', index=False)
df_r

Unnamed: 0,y,parameter,std_error,pvalue,ci,target_var,treatment_month,compare_year,state,placebo
0,P_m,14.610567,4.171503,0.000461,8.176146,num_visits_wt,all,2023,All,1.0


## 6. Model 2 - by public transit access

In [8]:
def data_preparation_by_pt(data=None, year_list=[2019, 2022], treatment_yr=2022, 
                     treatment_month=6, remove_sep=True, treat_all=False):
    """Build the DiD panel with treatment dummies split by `pt_station_num` category.

    Parameters
    ----------
    data : pandas.DataFrame
        Visit records. Columns read here: ``osm_id``, ``state``, ``year``, ``month``,
        ``weekday``, ``pt_station_num`` and ``precipitation``. The input is not modified.
    year_list : list
        Years to keep. Rows from other years are dropped.
    treatment_yr : int
        Year whose rows are flagged in ``variant_places``.
    treatment_month : int
        Post-treatment month compared against May (month 5). Ignored when ``treat_all`` is True.
    remove_sep : bool
        If True, rows with ``month == 9`` are dropped. This only has an effect when
        ``treatment_month == 9``, because all other paths never keep September.
    treat_all : bool
        If True, months 5-8 are kept and months 6, 7 and 8 all count as post-treatment.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(osm_id, time)`` where ``time`` is the weekday. Added columns:
        ``pt_cat`` (1-4), ``time_fe`` (state-year-month-weekday category),
        ``variant_places``, ``rain``, ``after``, ``P_m1``..``P_m4``, ``after1``..``after4``
        and ``rain_m``.

    Notes
    -----
    ``time`` is the weekday only, so the ``(osm_id, time)`` index repeats whenever a place has
    rows for the same weekday in several months or years.
    NOTE(review): confirm that downstream panel code is meant to work with repeated index pairs.
    """
    # Filter first and copy once, instead of copying the full frame before discarding most rows
    kept_months = [5, 6, 7, 8] if treat_all else [5, treatment_month]
    keep = data['year'].isin(year_list) & data['month'].isin(kept_months)
    if remove_sep:
        keep &= data['month'] != 9
    df = data.loc[keep, :].copy()

    # Categorise pt_station_num: <=14 -> 1, <=24 -> 2, <=37 -> 3, otherwise 4.
    # Counting how many upper bounds a value satisfies is the vectorised equivalent of the
    # if/elif ladder; a value that satisfies none of them (including NaN) falls in category 4.
    n_stations = df['pt_station_num']
    df['pt_cat'] = (
        4
        - (n_stations <= 14).astype('int64')
        - (n_stations <= 24).astype('int64')
        - (n_stations <= 37).astype('int64')
    )

    # State x year x month x weekday key, later expanded into fixed-effect dummies
    df['time_fe'] = (
        df['state'].astype(str) + '-' + df['year'].astype(str) + '-'
        + df['month'].astype(str) + '-' + df['weekday'].astype(str)
    ).astype('category')
    # Plain (non-categorical) weekday copy used as the second index level
    df['time'] = df['weekday']
    for col in ('osm_id', 'year', 'month', 'weekday'):
        df[col] = df[col].astype('category')

    # Treatment
    df['variant_places'] = df['year'] == treatment_yr
    df['rain'] = df['precipitation'] > 0
    if treat_all:
        df['after'] = df['month'].isin([6, 7, 8])
    else:
        df['after'] = df['month'] == treatment_month

    # Treatment dummies (P_m) and post-period dummies, one per PT category
    treated = df['variant_places'] & df['after']
    for cat in (1, 2, 3, 4):
        df[f'P_m{cat}'] = treated & (df['pt_cat'] == cat)
    for cat in (1, 2, 3, 4):
        df[f'after{cat}'] = df['after'] & (df['pt_cat'] == cat)

    # Rain interacted with the post-treatment period
    df['rain_m'] = df['rain'] & df['after']

    # Set the multiindex
    return df.set_index(['osm_id', 'time'])

def did_model2_by_pt(data=None, target_var=None, treatment_month=None, compare_year=2023):
    """Fit the DiD model with treatment effects split by PT category and tabulate the estimates.

    Parameters
    ----------
    data : pandas.DataFrame
        Panel produced by ``data_preparation_by_pt``: indexed by ``(osm_id, time)`` and holding
        ``P_m1``..``P_m4``, ``after1``..``after4``, ``rain``, ``rain_m``, ``time_fe`` and the
        outcome column. The input is not modified.
    target_var : str
        Name of the outcome column used as the dependent variable.
    treatment_month : int or str
        Not used in the estimation; written to the ``treatment_month`` column of the result.
    compare_year : int, default 2023
        Not used in the estimation; written to the ``compare_year`` column of the result.
        Defaults to the value that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        One row per reported regressor with columns ``y`` (regressor name), ``parameter``,
        ``std_error``, ``pvalue``, ``ci`` (1.96 x ``std_error``), ``target_var``,
        ``treatment_month`` and ``compare_year``.

    Notes
    -----
    ``model.fit()`` is called without arguments, so whatever covariance estimator linearmodels
    uses by default applies.
    NOTE(review): confirm that no clustering of standard errors is intended.
    """
    # Regressors that are reported; the fixed-effect dummies are estimated but not returned.
    # (Named `report_vars` to avoid shadowing the builtin `vars`.)
    report_vars = ['P_m1', 'P_m2', 'P_m3', 'P_m4',
                   'rain_m', 'rain',
                   'after1', 'after2', 'after3', 'after4']
    dependent = data[target_var]

    # Add fixed effects dummies. Only `time_fe` is encoded, so the whole frame is not duplicated;
    # the resulting column names ('time_fe_<level>') are the same as when encoding via `columns=`.
    time_dummies = pd.get_dummies(data['time_fe'], prefix='time_fe', drop_first=True)

    # Collect all exogenous variables including the fixed effects
    exog = pd.concat([data[report_vars], time_dummies], axis=1)

    # Model specification: entity (osm_id) effects, rank check disabled,
    # regressors absorbed by the entity effects are dropped
    model = PanelOLS(dependent, exog, entity_effects=True, check_rank=False, drop_absorbed=True)

    # Fit the model
    results = model.fit()

    # Summary: keep only the reported regressors
    df_r = pd.concat([results.params, results.std_errors, results.pvalues], axis=1).\
        loc[report_vars, :].reset_index().rename(columns={'index': 'y'})
    # Scale standard error to CI
    df_r['ci'] = df_r['std_error'] * 1.96
    df_r['target_var'] = target_var
    df_r['treatment_month'] = treatment_month
    df_r['compare_year'] = compare_year
    return df_r

In [182]:
# By-PT-category model: 2022 against 2023, June-August merged into one post-treatment period
remove_sun = False  # set but not passed to any call in this cell
rs = True
df = data_preparation_by_pt(
    data=df_visits,
    year_list=[2023, 2022],
    treatment_yr=2022,
    treatment_month=6,
    remove_sep=rs,
    treat_all=True,
)
df_res = did_model2_by_pt(
    data=df,
    target_var='num_visits_wt',
    treatment_month='all',
)
df_res

Unnamed: 0,y,parameter,std_error,pvalue,ci,target_var,treatment_month
0,P_m1,12.202579,4.171138,0.00343931,8.175431,num_visits_wt,all
1,P_m2,16.083214,4.175425,0.0001172219,8.183834,num_visits_wt,all
2,P_m3,19.575597,4.176007,2.76404e-06,8.184974,num_visits_wt,all
3,P_m4,21.917239,4.177231,1.547558e-07,8.187374,num_visits_wt,all
4,rain_m,1.806508,0.236785,2.353673e-14,0.464099,num_visits_wt,all
5,rain,-0.462806,0.207274,0.025561,0.406258,num_visits_wt,all
6,after1,8.252647,4.426508,0.06227,8.675956,num_visits_wt,all
7,after2,6.63035,4.431723,0.1346249,8.686177,num_visits_wt,all
8,after3,4.837564,4.43249,0.2751026,8.687681,num_visits_wt,all
9,after4,2.729948,4.434187,0.5381196,8.691007,num_visits_wt,all


In [183]:
# Save the by-PT-category estimates (the DataFrame index is not written)
df_res.to_parquet('results/did/all_states_model_v5_pt3_pois_6_8_by_pt.parquet', index=False)

## 7. Model 2 - by POI label

In [9]:
# List the POI labels that each get their own treatment dummy in this section
print(','.join(lbs2include))

Home & Lifestyle,Accomodations,Art & Culture,Café,Entertainment venues,Fast food,Games and activities,Historic,Information and services,Nightclub,Parks and gardens,Recreation & Sports Centres,Recreational facilities,Retail stores,Tourist attractions,Viewing and observation,Water Sports,Wellness & Relaxation,Adventure & Wildlife,Ball Sports,Beverages,Cosmetics and beauty,Equestrian & Riding,Events and fairs,Extreme & Adventure Sports,Fashion and clothing,Library,Place of worship,Pub,Racket & Martial Arts,Rentals and sharing,Restaurant,Technology & Hobbies,Training center,Wellness and fitness


In [10]:
def data_preparation_by_poi(data=None, year_list=[2019, 2022], treatment_yr=2022, 
                     treatment_month=6, remove_sep=True, treat_all=False, labels=None):
    """Build the DiD panel with one treatment dummy and one post-period dummy per POI label.

    Parameters
    ----------
    data : pandas.DataFrame
        Visit records. Columns read here: ``osm_id``, ``state``, ``year``, ``month``,
        ``weekday``, ``label`` and ``precipitation``. The input is not modified.
    year_list : list
        Years to keep. Rows from other years are dropped.
    treatment_yr : int
        Year whose rows are flagged in ``variant_places``.
    treatment_month : int
        Post-treatment month compared against May (month 5). Ignored when ``treat_all`` is True.
    remove_sep : bool
        If True, rows with ``month == 9`` are dropped. This only has an effect when
        ``treatment_month == 9``, because all other paths never keep September.
    treat_all : bool
        If True, months 5-8 are kept and months 6, 7 and 8 all count as post-treatment.
    labels : iterable of str, optional
        POI labels to create dummies for. Defaults to the notebook-level ``lbs2include``,
        which is what was used before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(osm_id, time)`` where ``time`` is the weekday. Added columns:
        ``time_fe`` (state-year-month-weekday category), ``variant_places``, ``rain``, ``after``,
        then ``P_m_<label>`` / ``after_<label>`` for every label, and ``rain_m``.

    Notes
    -----
    ``time`` is the weekday only, so the ``(osm_id, time)`` index repeats whenever a place has
    rows for the same weekday in several months or years.
    NOTE(review): confirm that downstream panel code is meant to work with repeated index pairs.
    """
    if labels is None:
        labels = lbs2include

    # Filter first and copy once, instead of copying the full frame before discarding most rows
    kept_months = [5, 6, 7, 8] if treat_all else [5, treatment_month]
    keep = data['year'].isin(year_list) & data['month'].isin(kept_months)
    if remove_sep:
        keep &= data['month'] != 9
    df = data.loc[keep, :].copy()

    # State x year x month x weekday key, later expanded into fixed-effect dummies
    df['time_fe'] = (
        df['state'].astype(str) + '-' + df['year'].astype(str) + '-'
        + df['month'].astype(str) + '-' + df['weekday'].astype(str)
    ).astype('category')
    # Plain (non-categorical) weekday copy used as the second index level
    df['time'] = df['weekday']
    for col in ('osm_id', 'year', 'month', 'weekday'):
        df[col] = df[col].astype('category')

    # Treatment
    df['variant_places'] = df['year'] == treatment_yr
    df['rain'] = df['precipitation'] > 0
    if treat_all:
        df['after'] = df['month'].isin([6, 7, 8])
    else:
        df['after'] = df['month'] == treatment_month

    # Treatment dummy (P_m_<label>) and post-period dummy (after_<label>) for each label
    treated = df['variant_places'] & df['after']
    for lb in labels:
        is_label = df['label'] == lb
        df[f'P_m_{lb}'] = treated & is_label
        df[f'after_{lb}'] = df['after'] & is_label

    # Rain interacted with the post-treatment period
    df['rain_m'] = df['rain'] & df['after']

    # Set the multiindex
    return df.set_index(['osm_id', 'time'])

def did_model2_by_poi(data=None, target_var=None, treatment_month=None, labels=None, compare_year=2023):
    """Fit the DiD model with treatment effects split by POI label and tabulate the estimates.

    Parameters
    ----------
    data : pandas.DataFrame
        Panel produced by ``data_preparation_by_poi``: indexed by ``(osm_id, time)`` and holding
        ``P_m_<label>`` / ``after_<label>`` for every label, ``rain``, ``rain_m``, ``time_fe``
        and the outcome column. The input is not modified.
    target_var : str
        Name of the outcome column used as the dependent variable.
    treatment_month : int or str
        Not used in the estimation; written to the ``treatment_month`` column of the result.
    labels : iterable of str, optional
        POI labels whose dummies enter the model. Must match the labels used to prepare
        ``data``. Defaults to the notebook-level ``lbs2include``.
    compare_year : int, default 2023
        Not used in the estimation; written to the ``compare_year`` column of the result.
        Defaults to the value that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        One row per reported regressor with columns ``y`` (regressor name), ``parameter``,
        ``std_error``, ``pvalue``, ``ci`` (1.96 x ``std_error``), ``target_var``,
        ``treatment_month`` and ``compare_year``.

    Notes
    -----
    ``model.fit()`` is called without arguments, so whatever covariance estimator linearmodels
    uses by default applies.
    NOTE(review): confirm that no clustering of standard errors is intended.
    """
    if labels is None:
        labels = lbs2include

    # Regressors that are reported; the fixed-effect dummies are estimated but not returned
    exog_vars = [f'P_m_{lb}' for lb in labels] + ['rain_m', 'rain'] +\
        [f'after_{lb}' for lb in labels]
    dependent = data[target_var]

    # Add fixed effects dummies. Only `time_fe` is encoded, so the whole frame is not duplicated;
    # the resulting column names ('time_fe_<level>') are the same as when encoding via `columns=`.
    time_dummies = pd.get_dummies(data['time_fe'], prefix='time_fe', drop_first=True)

    # Collect all exogenous variables including the fixed effects
    exog = pd.concat([data[exog_vars], time_dummies], axis=1)

    # Model specification: entity (osm_id) effects, rank check disabled,
    # regressors absorbed by the entity effects are dropped
    model = PanelOLS(dependent, exog, entity_effects=True, check_rank=False, drop_absorbed=True)

    # Fit the model
    results = model.fit()

    # Summary: keep only the reported regressors
    df_r = pd.concat([results.params, results.std_errors, results.pvalues], axis=1).\
        loc[exog_vars, :].reset_index().rename(columns={'index': 'y'})
    # Scale standard error to CI
    df_r['ci'] = df_r['std_error'] * 1.96
    df_r['target_var'] = target_var
    df_r['treatment_month'] = treatment_month
    df_r['compare_year'] = compare_year
    return df_r

In [11]:
# By-POI-label model: 2022 against 2023, June-August merged into one post-treatment period
remove_sun = False  # set but not passed to any call in this cell
rs = True
df = data_preparation_by_poi(
    data=df_visits,
    year_list=[2023, 2022],
    treatment_yr=2022,
    treatment_month=6,
    remove_sep=rs,
    treat_all=True,
)
df_res = did_model2_by_poi(
    data=df,
    target_var='num_visits_wt',
    treatment_month='all',
)
df_res

Unnamed: 0,y,parameter,std_error,pvalue,ci,target_var,treatment_month,compare_year
0,P_m_Home & Lifestyle,10.751136,4.198036,1.043748e-02,8.228151,num_visits_wt,all,2023
1,P_m_Accomodations,11.234301,4.191939,7.362793e-03,8.216200,num_visits_wt,all,2023
2,P_m_Art & Culture,21.665732,4.236736,3.158135e-07,8.304002,num_visits_wt,all,2023
3,P_m_Café,14.467513,4.226746,6.197013e-04,8.284422,num_visits_wt,all,2023
4,P_m_Entertainment venues,24.385057,4.541961,7.925649e-08,8.902243,num_visits_wt,all,2023
...,...,...,...,...,...,...,...,...
67,after_Rentals and sharing,4.999051,4.546064,2.714876e-01,8.910285,num_visits_wt,all,2023
68,after_Restaurant,8.016910,4.437868,7.084394e-02,8.698221,num_visits_wt,all,2023
69,after_Technology & Hobbies,4.973976,4.550722,2.743905e-01,8.919416,num_visits_wt,all,2023
70,after_Training center,9.748890,4.829429,4.352458e-02,9.465681,num_visits_wt,all,2023


In [12]:
# Save the by-POI-label estimates (the DataFrame index is not written)
df_res.to_parquet('results/did/all_states_model_v5_pt3_pois_6_8_by_poi.parquet', index=False)