# Analysis on cleaned dataset from Data.ipynb

In [None]:
# Only needs to be run once, before the first execution of this notebook:
#%pip install linearmodels

In [13]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'


# Fixed effects regression:
from linearmodels import PanelOLS
import statsmodels.api as sm


In [222]:
# Load the cleaned CSV exports written by Data.ipynb.
# NOTE(review): these are absolute, user-specific paths; they have to be
# adjusted before the notebook can run on another machine.
data_csv = '/Users/maxweber/Desktop/DataMasterThesis/data_clean.csv'
houshold_csv = '/Users/maxweber/Desktop/DataMasterThesis/houshold_clean.csv'

data = pd.read_csv(data_csv)
# the household file is indexed by the (SSUID, month_total) pair
houshold = pd.read_csv(houshold_csv, index_col=['SSUID', 'month_total'])

In [10]:
def get_grouping(df_column, group_dict, reb_dummies_df, naming_interaction = 'j'):
    """Interact a set of dummy columns with group membership.

    Every value of ``df_column`` is assigned to the first group in
    ``group_dict`` whose member collection contains it (values that belong to
    no group get the flag 0). For each group the columns of
    ``reb_dummies_df`` are multiplied row-wise by the 0/1 membership
    indicator of that group.

    Parameters
    ----------
    df_column : pandas.Series
        Column holding the values to be grouped; its index defines the rows
        of the result.
    group_dict : dict
        Maps a group id to a collection of the values belonging to that group.
    reb_dummies_df : pandas.DataFrame
        Dummy columns to interact with the group indicators; aligned with
        ``df_column`` on the index.
    naming_interaction : str, default 'j'
        Infix for the new column names, which are built as
        ``<dummy name>-<naming_interaction><group id>``.

    Returns
    -------
    pandas.DataFrame
        ``df_column`` followed by one block of interaction columns per group
        (in the iteration order of ``group_dict``); missing values are
        replaced by 0.
    """
    # Flag each row with the key of the first group that lists its value (0 = no group).
    group_flag = df_column.apply(
        lambda x: next((k for k, v in group_dict.items() if x in v), 0)
    )

    # Start from the input column so that the result carries its index.
    df_group_interactions = pd.DataFrame(df_column)
    for group_id in group_dict:
        # Derive the 0/1 indicator straight from the flag: a group without any
        # member then yields all-zero interactions instead of a KeyError on a
        # dummy column that was never created.
        in_group = (group_flag == group_id).astype(int)
        dummies_group = reb_dummies_df.multiply(in_group, axis = 'index')
        # str() keeps the naming scheme working for non-string column labels.
        dummies_group.columns = [
            str(name) + '-' + naming_interaction + str(group_id)
            for name in dummies_group.columns
        ]
        df_group_interactions = df_group_interactions.merge(
            dummies_group, left_index = True, right_index = True
        )

    # Rows without a counterpart in reb_dummies_df come out as NaN; treat them as 0.
    return df_group_interactions.fillna(0)

# 2SLS approach by hand

In [56]:
# household-level frame (the dataframe grouped by HH), reloaded from disk
houshold = (
    pd.read_csv(
        '/Users/maxweber/Desktop/DataMasterThesis/houshold_clean.csv',
        index_col=['SSUID', 'month_total'],
    )
    .reset_index(drop=False)  # SSUID / month_total become ordinary columns again
    .fillna(0)                # every missing value is replaced by 0
)
houshold.columns

Index(['SSUID', 'month_total', 'tfearn', 'erbamth', 'TFTOTINC', 'ERBATAMT',
       'SREFMON', 'ems', 'EHHNUMPP', 'erebate-6', 'erebate-5', 'erebate-4',
       'erebate-3', 'erebate-2', 'erebate-1', 'erebate0', 'erebate1',
       'erebate2', 'erebate3', 'erebate4', 'erebate5', 'erebate6', 'erebate7',
       'erebate8', 'erebate9', 'erebate10', 'erebate11'],
      dtype='object')

In [82]:
# inspect the row index of the household frame (reset_index above should leave a default RangeIndex)
houshold.index

RangeIndex(start=0, stop=196864, step=1)

In [85]:
# analysis on subset with time-interaction terms
# Take an explicit copy so the columns added below go onto an independent frame
# and do not depend on the chained-assignment warning being silenced.
hh_new = houshold[['ERBATAMT', 'tfearn', 'month_total', 'SSUID', 'ems', 'EHHNUMPP', 'SREFMON']].copy()

# sums of the rebate dummies for relative months 0/1 and 2/3
hh_new['reb'] = houshold['erebate0'] + houshold['erebate1']
hh_new['reb_lag'] = houshold['erebate2'] + houshold['erebate3']
# rebate amount scaled by the indicator, so that it is 0 in the months without rebate payout
hh_new['erbatamt'] = hh_new['ERBATAMT'] * hh_new['reb']
hh_new['erbatamt_lag'] = hh_new['ERBATAMT'] * hh_new['reb_lag']

# (SSUID, month_total) becomes the panel index; drop=False keeps both as columns too
hh_new = hh_new.set_index(['SSUID', 'month_total'], drop=False)

# interaction terms built from month, household size (EHHNUMPP) and marital status (ems)
hh_new['time_interact'] = (
    'hh-' + hh_new['month_total'].astype(int).astype(str)
    + '-' + hh_new['EHHNUMPP'].astype(str)
    + '-' + hh_new['ems'].astype(str)
)
print('# unique: ', hh_new['time_interact'].nunique())

# define the time-interaction terms as own dataframe (same index as hh_new)
time_interact = hh_new[['time_interact']].copy()
time_interact.head()

# unique:  121


Unnamed: 0_level_0,Unnamed: 1_level_0,time_interact
SSUID,month_total,Unnamed: 2_level_1
19128000276,6.0,hh-6-3-1
19128000276,7.0,hh-7-3-1
19128000276,8.0,hh-8-3-1
19128000276,9.0,hh-9-3-1
19128000276,10.0,hh-10-3-1


In [88]:
# First stage for the rebate amount: regress erbatamt on the two rebate
# indicators plus a constant, with entity effects and the time_interact
# categories passed as other effects.
exog_vars = hh_new[['reb', 'reb_lag']]
exog = sm.add_constant(exog_vars)

model_S1_erbatamt = PanelOLS(
    hh_new['erbatamt'],
    exog,
    entity_effects = True,
    time_effects = False,
    other_effects = time_interact,
    check_rank = True,
    drop_absorbed = True,
)
# default covariance; alternative left by the author: cov_type='clustered', cluster_entity = True
params_S1_erbatamt = model_S1_erbatamt.fit()
params_S1_erbatamt

  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,erbatamt,R-squared:,0.7296
Estimator:,PanelOLS,R-squared (Between):,0.6356
No. Observations:,196864,R-squared (Within):,0.7677
Date:,"Sun, Jun 12 2022",R-squared (Overall):,0.7467
Time:,14:56:42,Log-likelihood,-1.266e+06
Cov. Estimator:,Unadjusted,,
,,F-statistic:,2.322e+05
Entities:,24608,P-value,0.0000
Avg Obs:,8.0000,Distribution:,"F(2,172134)"
Min Obs:,8.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,1.6894,0.4960,3.4061,0.0007,0.7173,2.6616
reb,964.38,1.5694,614.51,0.0000,961.31,967.46
reb_lag,-3.4918,1.2897,-2.7074,0.0068,-6.0196,-0.9640


In [89]:
# First stage for the lagged rebate amount: same regressors and effects as
# the erbatamt first stage, with erbatamt_lag as dependent variable.
exog_vars = hh_new[['reb', 'reb_lag']]
exog = sm.add_constant(exog_vars)

# NOTE(review): this rebinds the name model_S1_erbatamt to the lag model;
# only the fit result gets its own name (params_S1_erbatamt_lag).
model_S1_erbatamt = PanelOLS(
    hh_new['erbatamt_lag'],
    exog,
    entity_effects = True,
    time_effects = False,
    other_effects = time_interact,
    check_rank = True,
    drop_absorbed = True,
)
# default covariance; alternative left by the author: cov_type='clustered', cluster_entity = True
params_S1_erbatamt_lag = model_S1_erbatamt.fit()
params_S1_erbatamt_lag

0,1,2,3
Dep. Variable:,erbatamt_lag,R-squared:,0.7203
Estimator:,PanelOLS,R-squared (Between):,0.5425
No. Observations:,196864,R-squared (Within):,0.7571
Date:,"Sun, Jun 12 2022",R-squared (Overall):,0.7246
Time:,14:57:17,Log-likelihood,-1.311e+06
Cov. Estimator:,Unadjusted,,
,,F-statistic:,2.216e+05
Entities:,24608,P-value,0.0000
Avg Obs:,8.0000,Distribution:,"F(2,172134)"
Min Obs:,8.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,2.4789,0.6227,3.9809,0.0001,1.2584,3.6994
reb,-7.5448,1.9702,-3.8294,0.0001,-11.406,-3.6832
reb_lag,971.15,1.6192,599.79,0.0000,967.97,974.32


In [90]:
# adding fitted values to dataframe in preparation of 2nd stage regression
print(params_S1_erbatamt.params.loc[['reb', 'reb_lag']])
print(params_S1_erbatamt_lag.params.loc[['reb', 'reb_lag']])

# Relabel copies, in case fitted_values hands back the frame stored on the
# result object (renaming that one in place would alter the fit result).
erbatamt_fitted_base_reb = params_S1_erbatamt.fitted_values.copy()
erbatamt_fitted_base_reb.columns = ['erbatamt_fitted_base_reb']

erbatamt_fitted_base_reb_lag = params_S1_erbatamt_lag.fitted_values.copy()
erbatamt_fitted_base_reb_lag.columns = ['erbatamt_fitted_base_reb_lag']

# Remove fitted columns left over from an earlier run of this cell, so that
# re-running it does not append duplicate column names to hh_new.
fitted_cols = ['erbatamt_fitted_base_reb', 'erbatamt_fitted_base_reb_lag']
hh_new = pd.concat(
    [
        hh_new.drop(columns=fitted_cols, errors='ignore'),
        erbatamt_fitted_base_reb,
        erbatamt_fitted_base_reb_lag,
    ],
    axis = 1,
)

reb        964.382407
reb_lag     -3.491786
Name: parameter, dtype: float64
reb         -7.544792
reb_lag    971.146627
Name: parameter, dtype: float64


In [91]:
# 2nd stage FE regression: tfearn on the two first-stage fitted rebate amounts,
# with entity and time effects.
# NOTE(review): the first stages absorb other_effects=time_interact, whereas
# this stage uses time_effects=True instead - confirm that the different
# control sets are intended.
# NOTE(review): standard errors from a hand-rolled second stage presumably do
# not account for the first-stage estimation - verify before reporting them.
exog_vars = hh_new[['erbatamt_fitted_base_reb', 'erbatamt_fitted_base_reb_lag']]
exog = sm.add_constant(exog_vars)

model_second_stage = PanelOLS(
    hh_new['tfearn'],
    exog,
    entity_effects = True,
    time_effects = True,
    check_rank = True,
    drop_absorbed = True,
)
params_second_stage = model_second_stage.fit()
params_second_stage

0,1,2,3
Dep. Variable:,tfearn,R-squared:,6.194e-06
Estimator:,PanelOLS,R-squared (Between):,4.711e-05
No. Observations:,196864,R-squared (Within):,-3.745e-05
Date:,"Sun, Jun 12 2022",R-squared (Overall):,3.702e-05
Time:,14:57:19,Log-likelihood,-1.776e+06
Cov. Estimator:,Unadjusted,,
,,F-statistic:,0.5335
Entities:,24608,P-value,0.5866
Avg Obs:,8.0000,Distribution:,"F(2,172244)"
Min Obs:,8.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,5320.1,6.6580,799.05,0.0000,5307.0,5333.1
erbatamt_fitted_base_reb,-0.0006,0.0217,-0.0284,0.9774,-0.0432,0.0420
erbatamt_fitted_base_reb_lag,-0.0167,0.0177,-0.9414,0.3465,-0.0514,0.0180
