# ECON 570: Final Project

Masashi Yoshioka (3200-3439-52), Jincen Jiang (XXXX-XXXX-XX), Shuxian Mao (XXXX-XXXX-XX)

In [1]:
# Suppress warnings
# import warnings
# warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import pyreadr
import random
import statsmodels.formula.api as smf
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from utils.functions import fn_generate_data, fn_generate_variables, fn_generate_df_prop, fn_generate_df_matched, fn_IPTW

np.random.seed(570)
random.seed(570)

  from pandas import Int64Index as NumericIndex


## 1. Introduction

Draft by Shuxian:

a. What question are you answering with your project?  
b. Why should we care about this question?  
c. Have there been previous papers in the economics literature addressing this question? Summarize previous research briefly.

## 2. Data

a. Describe the source(s) of the data  
b. Discuss whether you are removing any outliers from your data, or doing any other sampling restrictions  
c. Discuss whether you are transforming the data and/or merging/appending multiple datasets together  
d. Show comprehensive summary statistics.  
Remember: summary statistics are a very important part of all applied work!

In [3]:
# LaLonde's original dataset
df_nsw = pyreadr.read_r('./data/nsw.rda')['nsw']
df_nsw.head()

Unnamed: 0,data_id,treat,age,education,black,hispanic,married,nodegree,re75,re78
0,Lalonde Sample,1.0,37.0,11.0,1.0,0.0,1.0,1.0,0.0,9930.045898
1,Lalonde Sample,1.0,22.0,9.0,0.0,1.0,0.0,1.0,0.0,3595.894043
2,Lalonde Sample,1.0,30.0,12.0,1.0,0.0,0.0,0.0,0.0,24909.449219
3,Lalonde Sample,1.0,27.0,11.0,1.0,0.0,0.0,1.0,0.0,7506.145996
4,Lalonde Sample,1.0,33.0,8.0,1.0,0.0,0.0,1.0,0.0,289.789886


In [4]:
df_nsw.describe()

Unnamed: 0,treat,age,education,black,hispanic,married,nodegree,re75,re78
count,722.0,722.0,722.0,722.0,722.0,722.0,722.0,722.0,722.0
mean,0.411357,24.520776,10.267313,0.800554,0.105263,0.16205,0.779778,3042.896585,5454.635844
std,0.492421,6.625947,1.704774,0.399861,0.307105,0.368752,0.414683,5066.143382,6252.943413
min,0.0,17.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,19.0,9.0,1.0,0.0,0.0,1.0,0.0,0.0
50%,0.0,23.0,10.0,1.0,0.0,0.0,1.0,936.307953,3951.889038
75%,1.0,27.0,11.0,1.0,0.0,0.0,1.0,3993.20697,8772.004395
max,1.0,55.0,16.0,1.0,1.0,1.0,1.0,37431.660156,60307.929688


In [5]:
# Dehejia and Wahba subsamples
df_nsw_dw = pyreadr.read_r('./data/nsw_dw.rda')['nsw_dw']
df_nsw_dw.head()

Unnamed: 0,data_id,treat,age,education,black,hispanic,married,nodegree,re74,re75,re78
0,Dehejia-Wahba Sample,1.0,37.0,11.0,1.0,0.0,1.0,1.0,0.0,0.0,9930.045898
1,Dehejia-Wahba Sample,1.0,22.0,9.0,0.0,1.0,0.0,1.0,0.0,0.0,3595.894043
2,Dehejia-Wahba Sample,1.0,30.0,12.0,1.0,0.0,0.0,0.0,0.0,0.0,24909.449219
3,Dehejia-Wahba Sample,1.0,27.0,11.0,1.0,0.0,0.0,1.0,0.0,0.0,7506.145996
4,Dehejia-Wahba Sample,1.0,33.0,8.0,1.0,0.0,0.0,1.0,0.0,0.0,289.789886


In [6]:
df_nsw_dw.describe()

Unnamed: 0,treat,age,education,black,hispanic,married,nodegree,re74,re75,re78
count,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0
mean,0.41573,25.370787,10.195506,0.833708,0.08764,0.168539,0.782022,2102.265309,1377.138375,5300.763683
std,0.493402,7.100282,1.792119,0.372762,0.28309,0.374766,0.413337,5363.582366,3150.9608,6631.491681
min,0.0,17.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,20.0,9.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,0.0,24.0,10.0,1.0,0.0,0.0,1.0,0.0,0.0,3701.812012
75%,1.0,28.0,11.0,1.0,0.0,0.0,1.0,824.388916,1220.83606,8124.714844
max,1.0,55.0,16.0,1.0,1.0,1.0,1.0,39570.679688,25142.240234,60307.929688


In [7]:
# Combine all of the datasets
df_nsw.loc[df_nsw['treat'] == 1, 'data_id'] = 'LT'
df_nsw.loc[df_nsw['treat'] == 0, 'data_id'] = 'LC'

df_nsw_dw.loc[df_nsw_dw['treat'] == 1, 'data_id'] = 'DWT'
df_nsw_dw.loc[df_nsw_dw['treat'] == 0, 'data_id'] = 'DWC'

df_psid1 = pyreadr.read_r('./data/psid_controls.rda')['psid_controls']
df_psid2 = pyreadr.read_r('./data/psid_controls2.rda')['psid_controls2']
df_psid3 = pyreadr.read_r('./data/psid_controls3.rda')['psid_controls3']
df_cps1 = pyreadr.read_r('./data/cps_controls.rda')['cps_controls']
df_cps2 = pyreadr.read_r('./data/cps_controls2.rda')['cps_controls2']
df_cps3 = pyreadr.read_r('./data/cps_controls3.rda')['cps_controls3']

df = pd.concat([df_nsw, df_nsw_dw, df_psid1, df_psid2, df_psid3, df_cps1, df_cps2, df_cps3]).reset_index(drop = True)
df

Unnamed: 0,data_id,treat,age,education,black,hispanic,married,nodegree,re75,re78,re74
0,LT,1.0,37.0,11.0,1.0,0.0,1.0,1.0,0.0,9930.045898,
1,LT,1.0,22.0,9.0,0.0,1.0,0.0,1.0,0.0,3595.894043,
2,LT,1.0,30.0,12.0,1.0,0.0,0.0,0.0,0.0,24909.449219,
3,LT,1.0,27.0,11.0,1.0,0.0,0.0,1.0,0.0,7506.145996,
4,LT,1.0,33.0,8.0,1.0,0.0,0.0,1.0,0.0,289.789886,
...,...,...,...,...,...,...,...,...,...,...,...
22823,CPS3,0.0,18.0,11.0,0.0,0.0,0.0,1.0,0.0,10150.500000,0.0
22824,CPS3,0.0,24.0,1.0,0.0,1.0,1.0,1.0,0.0,19464.609375,0.0
22825,CPS3,0.0,21.0,18.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
22826,CPS3,0.0,32.0,5.0,1.0,0.0,1.0,1.0,0.0,187.671295,0.0


In [8]:
# Add a difference in revenues
df['dif'] = df['re78'] - df['re75']
df

Unnamed: 0,data_id,treat,age,education,black,hispanic,married,nodegree,re75,re78,re74,dif
0,LT,1.0,37.0,11.0,1.0,0.0,1.0,1.0,0.0,9930.045898,,9930.045898
1,LT,1.0,22.0,9.0,0.0,1.0,0.0,1.0,0.0,3595.894043,,3595.894043
2,LT,1.0,30.0,12.0,1.0,0.0,0.0,0.0,0.0,24909.449219,,24909.449219
3,LT,1.0,27.0,11.0,1.0,0.0,0.0,1.0,0.0,7506.145996,,7506.145996
4,LT,1.0,33.0,8.0,1.0,0.0,0.0,1.0,0.0,289.789886,,289.789886
...,...,...,...,...,...,...,...,...,...,...,...,...
22823,CPS3,0.0,18.0,11.0,0.0,0.0,0.0,1.0,0.0,10150.500000,0.0,10150.500000
22824,CPS3,0.0,24.0,1.0,0.0,1.0,1.0,1.0,0.0,19464.609375,0.0,19464.609375
22825,CPS3,0.0,21.0,18.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000
22826,CPS3,0.0,32.0,5.0,1.0,0.0,1.0,1.0,0.0,187.671295,0.0,187.671295


In [9]:
# Check what is in data_id
df.data_id.unique()

array(['LT', 'LC', 'DWT', 'DWC', 'PSID', 'PSID2', 'PSID3', 'CPS1', 'CPS2',
       'CPS3'], dtype=object)

## 3. Modeling

a. Analyze the data using 2-3 different model specifications (i.e. with/without covariates, using different definitions of the treatment, outcome, or predictor variables, etc.)  
Remember: complicated models are not necessarily better than simple models. There is nothing wrong with OLS regression if that’s the best model.

### 3.1 Regression

Estimate $E[Y | D = 1, X]$ and $E[Y | D = 0, X]$.

#### 3.1.1 Linear Regression

Assume $E[Y|D, X] = \alpha + \tau D + X \beta + u$.

In [10]:
# Obtain a 'true' average treatment effects
df0 = fn_generate_data(treat_id = 'LT', control_id = 'LC', df = df)
results = smf.ols(formula = 'dif ~ treat', data = df0).fit()
results.summary()

0,1,2,3
Dep. Variable:,dif,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,2.286
Date:,"Thu, 31 Mar 2022",Prob (F-statistic):,0.131
Time:,00:03:09,Log-Likelihood:,-7456.6
No. Observations:,722,AIC:,14920.0
Df Residuals:,720,BIC:,14930.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2063.3655,359.259,5.743,0.000,1358.045,2768.686
treat,846.8883,560.142,1.512,0.131,-252.818,1946.594

0,1,2,3
Omnibus:,161.292,Durbin-Watson:,1.743
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1522.87
Skew:,0.709,Prob(JB):,0.0
Kurtosis:,9.972,Cond. No.,2.46


In [11]:
df0['age2'] = df0['age'] ** 2

results = smf.ols(formula = 'dif ~ treat + age + age2 + education + black + hispanic + nodegree', data = df0).fit()
results.summary()

0,1,2,3
Dep. Variable:,dif,R-squared:,0.018
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,1.823
Date:,"Thu, 31 Mar 2022",Prob (F-statistic):,0.0799
Time:,00:03:09,Log-Likelihood:,-7451.3
No. Observations:,722,AIC:,14920.0
Df Residuals:,714,BIC:,14960.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.173e+04,4344.694,2.700,0.007,3202.218,2.03e+04
treat,819.2771,561.491,1.459,0.145,-283.094,1921.648
age,-746.4430,251.135,-2.972,0.003,-1239.495,-253.391
age2,12.3287,4.229,2.915,0.004,4.025,20.632
education,87.1087,217.554,0.400,0.689,-340.013,514.230
black,77.0276,956.036,0.081,0.936,-1799.950,1954.005
hispanic,977.7160,1253.769,0.780,0.436,-1483.799,3439.231
nodegree,-469.8266,891.073,-0.527,0.598,-2219.263,1279.610

0,1,2,3
Omnibus:,183.214,Durbin-Watson:,1.732
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1758.888
Skew:,0.846,Prob(JB):,0.0
Kurtosis:,10.457,Cond. No.,12100.0


In [12]:
results = smf.ols(formula = 'dif ~ treat + age + age2', data = df0).fit()
results.summary()

0,1,2,3
Dep. Variable:,dif,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,3.606
Date:,"Thu, 31 Mar 2022",Prob (F-statistic):,0.0132
Time:,00:03:09,Log-Likelihood:,-7452.3
No. Observations:,722,AIC:,14910.0
Df Residuals:,718,BIC:,14930.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.184e+04,3380.152,3.503,0.000,5205.814,1.85e+04
treat,856.9110,557.678,1.537,0.125,-237.964,1951.786
age,-703.6823,241.631,-2.912,0.004,-1178.070,-229.294
age2,11.5827,4.079,2.839,0.005,3.574,19.591

0,1,2,3
Omnibus:,183.011,Durbin-Watson:,1.749
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1741.797
Skew:,0.847,Prob(JB):,0.0
Kurtosis:,10.418,Cond. No.,9330.0


Average treatment effect on `dif` is about 820-860 using LaLonde's original dataset.

In [13]:
# Obtain a 'true' average treatment effects over the DW subset
df0 = fn_generate_data(treat_id = 'DWT', control_id = 'DWC', df = df)
results = smf.ols(formula = 'dif ~ treat', data = df0).fit()
results.summary()

0,1,2,3
Dep. Variable:,dif,R-squared:,0.011
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,5.064
Date:,"Thu, 31 Mar 2022",Prob (F-statistic):,0.0249
Time:,00:03:09,Log-Likelihood:,-4574.4
No. Observations:,445,AIC:,9153.0
Df Residuals:,443,BIC:,9161.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3287.8921,438.147,7.504,0.000,2426.788,4148.997
treat,1529.1961,679.538,2.250,0.025,193.677,2864.715

0,1,2,3
Omnibus:,202.731,Durbin-Watson:,1.818
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1934.02
Skew:,1.709,Prob(JB):,0.0
Kurtosis:,12.624,Cond. No.,2.47


In [14]:
df0['age2'] = df0['age'] ** 2

results = smf.ols(formula = 'dif ~ treat + age + age2 + education + black + hispanic + nodegree', data = df0).fit()
results.summary()

0,1,2,3
Dep. Variable:,dif,R-squared:,0.031
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,1.97
Date:,"Thu, 31 Mar 2022",Prob (F-statistic):,0.0577
Time:,00:03:09,Log-Likelihood:,-4570.0
No. Observations:,445,AIC:,9156.0
Df Residuals:,437,BIC:,9189.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4745.6828,5052.075,0.939,0.348,-5183.702,1.47e+04
treat,1375.1516,688.395,1.998,0.046,22.176,2728.127
age,-182.7710,288.188,-0.634,0.526,-749.178,383.636
age2,3.4888,4.752,0.734,0.463,-5.851,12.829
education,282.9176,247.451,1.143,0.254,-203.424,769.259
black,-1688.9039,1260.792,-1.340,0.181,-4166.874,789.066
hispanic,-71.1607,1672.512,-0.043,0.966,-3358.328,3216.007
nodegree,-828.7128,1074.532,-0.771,0.441,-2940.606,1283.180

0,1,2,3
Omnibus:,209.866,Durbin-Watson:,1.792
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2186.026
Skew:,1.755,Prob(JB):,0.0
Kurtosis:,13.275,Cond. No.,12500.0


In [15]:
results = smf.ols(formula = 'dif ~ treat + age + age2', data = df0).fit()
results.summary()

0,1,2,3
Dep. Variable:,dif,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,1.793
Date:,"Thu, 31 Mar 2022",Prob (F-statistic):,0.148
Time:,00:03:09,Log-Likelihood:,-4574.2
No. Observations:,445,AIC:,9156.0
Df Residuals:,441,BIC:,9173.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3650.0468,3987.120,0.915,0.360,-4186.070,1.15e+04
treat,1516.9969,682.237,2.224,0.027,176.157,2857.837
age,-46.6970,279.933,-0.167,0.868,-596.866,503.472
age2,1.1926,4.623,0.258,0.797,-7.893,10.279

0,1,2,3
Omnibus:,203.247,Durbin-Watson:,1.813
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1952.469
Skew:,1.712,Prob(JB):,0.0
Kurtosis:,12.674,Cond. No.,9720.0


The ATE is much larger than LaLonde's original data! It ranges from 1,380 to 1,530.

#### 3.1.2 Random Forest Regression

Estimate $\mu_d (X) \equiv E[Y|D = d, X]$ by random forest.

### 3.2 Propensity Score Matching

In [16]:
# First, we will use Lalonde as a treatment and CPS1 as a control
df1 = fn_generate_data(treat_id = 'LT', control_id = 'CPS1', df = df)
df1

Unnamed: 0,data_id,treat,age,education,black,hispanic,married,nodegree,re75,re78,dif
0,LT,1.0,37.0,11.0,1.0,0.0,1.0,1.0,0.000000,9930.045898,9930.045898
1,LT,1.0,22.0,9.0,0.0,1.0,0.0,1.0,0.000000,3595.894043,3595.894043
2,LT,1.0,30.0,12.0,1.0,0.0,0.0,0.0,0.000000,24909.449219,24909.449219
3,LT,1.0,27.0,11.0,1.0,0.0,0.0,1.0,0.000000,7506.145996,7506.145996
4,LT,1.0,33.0,8.0,1.0,0.0,0.0,1.0,0.000000,289.789886,289.789886
...,...,...,...,...,...,...,...,...,...,...,...
16284,CPS1,0.0,22.0,12.0,1.0,0.0,0.0,0.0,6801.435059,2757.437988,-4043.997070
16285,CPS1,0.0,20.0,12.0,1.0,0.0,1.0,0.0,11832.240234,6895.071777,-4937.168457
16286,CPS1,0.0,37.0,12.0,0.0,0.0,0.0,0.0,1559.370972,4221.865234,2662.494263
16287,CPS1,0.0,47.0,9.0,0.0,0.0,1.0,1.0,11384.660156,13671.929688,2287.269531


In [17]:
# Generate variables
X, T, Y = fn_generate_variables(data = df1, outcome = 'dif')

# Estimate propensity score by Random Forest
param_grid_p = {'n_estimators': [50, 100, 500, 1000], 'max_features': [2, 3, 4, 5]}

rfc = GridSearchCV(RandomForestClassifier(), param_grid = param_grid_p, cv = 5,
                   scoring = 'neg_mean_squared_error', return_train_score = False, verbose = 1,
                   error_score = 'raise')
rfc.fit(X, T.ravel())

phat = np.array(rfc.predict_proba(X)[:, 1], ndmin = 2).T

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [18]:
# Generate a data frame with propensity score
# The data with extremely high or low pronepsity scores are removed
df_prop = fn_generate_df_prop(df = df1, prop = phat, truncate_level = 0.01)
df_prop

Unnamed: 0,data_id,treat,age,education,black,hispanic,married,nodegree,re75,re78,dif,propensity_score,propensity_score_logit
0,LT,1.0,37.0,11.0,1.0,0.0,1.0,1.0,0.000000,9930.045898,9930.045898,0.626667,0.517943
1,LT,1.0,22.0,9.0,0.0,1.0,0.0,1.0,0.000000,3595.894043,3595.894043,0.714200,0.915871
2,LT,1.0,30.0,12.0,1.0,0.0,0.0,0.0,0.000000,24909.449219,24909.449219,0.313471,-0.783940
3,LT,1.0,27.0,11.0,1.0,0.0,0.0,1.0,0.000000,7506.145996,7506.145996,0.990000,4.595120
4,LT,1.0,33.0,8.0,1.0,0.0,0.0,1.0,0.000000,289.789886,289.789886,0.802000,1.398842
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152,CPS1,0.0,17.0,9.0,1.0,0.0,0.0,1.0,0.000000,422.629791,422.629791,0.497494,-0.010024
1153,CPS1,0.0,42.0,12.0,1.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.016267,-4.102237
1154,CPS1,0.0,20.0,12.0,1.0,0.0,0.0,0.0,9737.565430,3771.157959,-5966.407471,0.075000,-2.512306
1155,CPS1,0.0,17.0,9.0,1.0,0.0,0.0,1.0,2112.581055,3039.684082,927.103027,0.141000,-1.807009


In [19]:
# Propensity score matching
df_matched = fn_generate_df_matched(df = df_prop, outcome = 'dif', n_neighbors = 1)
df_matched

Unnamed: 0,data_id,treat,age,education,black,hispanic,married,nodegree,re75,re78,dif,propensity_score,propensity_score_logit,matched_index_1,matched_outcome_1
0,LT,1.0,37.0,11.0,1.0,0.0,1.0,1.0,0.000000,9930.045898,9930.045898,0.626667,0.517943,827.0,1053.619019
1,LT,1.0,22.0,9.0,0.0,1.0,0.0,1.0,0.000000,3595.894043,3595.894043,0.714200,0.915871,827.0,1053.619019
2,LT,1.0,30.0,12.0,1.0,0.0,0.0,0.0,0.000000,24909.449219,24909.449219,0.313471,-0.783940,1108.0,1161.493042
3,LT,1.0,27.0,11.0,1.0,0.0,0.0,1.0,0.000000,7506.145996,7506.145996,0.990000,4.595120,827.0,1053.619019
4,LT,1.0,33.0,8.0,1.0,0.0,0.0,1.0,0.000000,289.789886,289.789886,0.802000,1.398842,827.0,1053.619019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152,CPS1,0.0,17.0,9.0,1.0,0.0,0.0,1.0,0.000000,422.629791,422.629791,0.497494,-0.010024,128.0,1953.267944
1153,CPS1,0.0,42.0,12.0,1.0,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.016267,-4.102237,9.0,12418.070312
1154,CPS1,0.0,20.0,12.0,1.0,0.0,0.0,0.0,9737.565430,3771.157959,-5966.407471,0.075000,-2.512306,9.0,12418.070312
1155,CPS1,0.0,17.0,9.0,1.0,0.0,0.0,1.0,2112.581055,3039.684082,927.103027,0.141000,-1.807009,96.0,8048.603027


In [20]:
# ATET based on propensity score matching
df_matched[df_matched.treat == 1]['dif'].mean() - df_matched[df_matched.treat == 1]['matched_outcome_1'].mean()

1650.1522635640324

Much larger than the experimental data.

In [21]:
# Propensity score matching using 10 nearest neighbors
df_matched = fn_generate_df_matched(df = df_prop, outcome = 'dif', n_neighbors = 10)
df_matched

Unnamed: 0,data_id,treat,age,education,black,hispanic,married,nodegree,re75,re78,...,matched_outcome_1,matched_outcome_2,matched_outcome_3,matched_outcome_4,matched_outcome_5,matched_outcome_6,matched_outcome_7,matched_outcome_8,matched_outcome_9,matched_outcome_10
0,LT,1.0,37.0,11.0,1.0,0.0,1.0,1.0,0.000000,9930.045898,...,0.000000,1053.619019,0.000000,0.000000,0.000000,2713.105957,0.000000,0.000000,1956.510010,4520.366211
1,LT,1.0,22.0,9.0,0.0,1.0,0.0,1.0,0.000000,3595.894043,...,0.000000,1053.619019,0.000000,0.000000,0.000000,2713.105957,0.000000,0.000000,1956.510010,4520.366211
2,LT,1.0,30.0,12.0,1.0,0.0,0.0,0.0,0.000000,24909.449219,...,1898.879028,1161.493042,-1448.370972,13883.240234,-205.887100,1642.463867,2796.312744,0.000000,-94.887100,0.000000
3,LT,1.0,27.0,11.0,1.0,0.0,0.0,1.0,0.000000,7506.145996,...,0.000000,1053.619019,0.000000,0.000000,0.000000,2713.105957,0.000000,0.000000,1956.510010,4520.366211
4,LT,1.0,33.0,8.0,1.0,0.0,0.0,1.0,0.000000,289.789886,...,0.000000,1053.619019,0.000000,0.000000,0.000000,2713.105957,0.000000,0.000000,1956.510010,4520.366211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152,CPS1,0.0,17.0,9.0,1.0,0.0,0.0,1.0,0.000000,422.629791,...,1953.267944,0.000000,1991.400024,995.700195,8061.484863,0.000000,0.000000,0.000000,4814.626953,8087.486816
1153,CPS1,0.0,42.0,12.0,1.0,0.0,1.0,0.0,0.000000,0.000000,...,12418.070312,8048.603027,5587.502930,1048.432007,11163.169922,0.000000,24909.449219,4843.175781,0.000000,7458.104980
1154,CPS1,0.0,20.0,12.0,1.0,0.0,0.0,0.0,9737.565430,3771.157959,...,12418.070312,8048.603027,5587.502930,1048.432007,11163.169922,0.000000,24909.449219,4843.175781,0.000000,7458.104980
1155,CPS1,0.0,17.0,9.0,1.0,0.0,0.0,1.0,2112.581055,3039.684082,...,8048.603027,5587.502930,1048.432007,11163.169922,0.000000,12418.070312,24909.449219,4843.175781,0.000000,7458.104980


In [22]:
# ATET based on propensity score matching

colnames = []

for i in range(1, 11):
    colnames += ['matched_outcome_' + str(i)]

tauhats = df_matched[df_matched.treat == 1]['dif'] - df_matched[df_matched.treat == 1][colnames].mean(axis = 1)
np.mean(tauhats)

1531.4350943487973

Still much larger than the experimental data.

### 3.3 Inverse Probability of Treatment Weighted (IPTW) Estimator

In [23]:
# IPTW estimator
fn_IPTW(df = df_prop, outcome = 'dif')

-892.1006998330197

It's negative!

### 3.4 Doubly Robust Estimator

To be added.

## 4. Findings

a. What are your main findings?  
b. Are your findings robust to different model specifications?

## 5. Conclusion

a. Discuss what we’ve learned from your analysis.  
b. Discuss any questions that could be answered in the future.