In [39]:
import pandas as pd
# Directory prefix for the raw input CSVs that this notebook reads.
EXTERNAL_DATA_PATH = "external_data"
# Directory prefix for the prepared CSVs that this notebook writes
# ("." resolves to the current working directory).
PREPARED_DATA_PATH = "."

## Lalonde

Building on Dehejia & Wahba (2002) and Smith & Todd (2005), we use a selected subset of the Lalonde experimental data (with real income in 1974 available) and the PSID observational controls.

In the full Lalonde experimental data, there are 297 treated units and 425 control units; the Dehejia-Wahba subset loaded below contains 445 of these units (185 treated and 260 control). The PSID data has a further 2490 control units. In these datasets, real income in 1978 (`re78`) is the measured outcome. All other variables are measured pre-treatment and serve as covariates.

Given that we will simulate treatment and outcomes, we can utilize both the treated and control group from the experimental data as well as the observational controls in one large data set.

Data source: https://users.nber.org/~rdehejia/data/.nswdata2.html

Dehejia, R. H., & Wahba, S. (2002). Propensity score-matching methods for nonexperimental causal studies. Review of Economics and Statistics, 84(1), 151-161.

Smith, J. A., & Todd, P. E. (2005). Does matching overcome LaLonde's critique of nonexperimental estimators? Journal of Econometrics, 125(1-2), 305-353.




In [40]:
# Read the experimental sample CSV and preview its first rows.
dw_csv_path = EXTERNAL_DATA_PATH + "/lalonde/lalonde_dw.csv"
dw_experimental_data = pd.read_csv(dw_csv_path)
dw_experimental_data.head()

Unnamed: 0,data_id,treat,age,education,black,hispanic,married,nodegree,re74,re75,re78
0,Dehejia-Wahba Sample,1,37,11,1,0,1,1,0.0,0.0,9930.045898
1,Dehejia-Wahba Sample,1,22,9,0,1,0,1,0.0,0.0,3595.894043
2,Dehejia-Wahba Sample,1,30,12,1,0,0,0,0.0,0.0,24909.449219
3,Dehejia-Wahba Sample,1,27,11,1,0,0,1,0.0,0.0,7506.145996
4,Dehejia-Wahba Sample,1,33,8,1,0,0,1,0.0,0.0,289.789886


In [41]:
# Read the PSID observational controls CSV and preview its first rows.
psid_csv_path = EXTERNAL_DATA_PATH + "/lalonde/lalonde_psid_controls.csv"
psid_data = pd.read_csv(psid_csv_path)
psid_data.head()

Unnamed: 0,data_id,treat,age,education,black,hispanic,married,nodegree,re74,re75,re78
0,PSID,0,47,12,0,0,0,0,0.0,0.0,0.0
1,PSID,0,50,12,1,0,1,0,0.0,0.0,0.0
2,PSID,0,44,12,0,0,0,0,0.0,0.0,0.0
3,PSID,0,28,12,1,0,1,0,0.0,0.0,0.0
4,PSID,0,54,12,0,0,1,0,0.0,0.0,0.0


In [42]:
# Stack the experimental sample on top of the PSID controls.
# `pd.concat` replaces `DataFrame.append`, which was deprecated in pandas 1.4
# and removed in pandas 2.0. As with the former `append` call, the original
# row labels of each frame are kept (no `ignore_index`), so labels repeat
# across the two sources; the index is not written out downstream.
combined_lalonde = pd.concat([dw_experimental_data, psid_data])
combined_lalonde.shape

(2935, 11)

In [43]:
# Keep the covariates only: remove the treatment flag, the 1978 outcome
# and the sample-source label.
non_covariate_columns = ["treat", "re78", "data_id"]
lalonde_covars = combined_lalonde.drop(columns=non_covariate_columns)
lalonde_covars.head()

Unnamed: 0,age,education,black,hispanic,married,nodegree,re74,re75
0,37,11,1,0,1,1,0.0,0.0
1,22,9,0,1,0,1,0.0,0.0
2,30,12,1,0,0,0,0.0,0.0
3,27,11,1,0,0,1,0.0,0.0
4,33,8,1,0,0,1,0.0,0.0


In [44]:
# Persist the covariate table without the row index.
lalonde_output_path = PREPARED_DATA_PATH + "/lalonde.csv"
lalonde_covars.to_csv(lalonde_output_path, index=False)

## Collaborative Perinatal Project

The CPP data below is extracted from the package provided by Dorie et al. (2019). The treatment variable and outcome variables have been removed.

In [47]:
# IPython help lookup (`?` suffix) for `pd.read_csv`. Interactive aid only:
# it assigns nothing and can be removed from the final notebook.
pd.read_csv?

In [92]:
# Read the CPP extract and discard the "Unnamed: 0" column in a single
# chained expression, then preview the first rows.
cpp_csv_path = EXTERNAL_DATA_PATH + "/collaborative_perinatal_project.csv"
cpp_data = pd.read_csv(cpp_csv_path).drop(columns="Unnamed: 0")
cpp_data.head()

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,...,x_49,x_50,x_51,x_52,x_53,x_54,x_55,x_56,x_57,x_58
0,29,C,1.0,7.0,60,85,0,0,1,0,...,0,0,0,0,0,0,0,0,45,39
1,27,C,0.0,0.0,64,178,0,0,0,0,...,0,0,0,0,0,0,0,0,46,42
2,27,C,0.0,0.0,60,102,0,0,0,0,...,1,0,0,0,0,0,0,0,45,40
3,37,C,0.0,0.0,65,174,0,0,0,0,...,0,0,0,0,0,0,0,0,47,40
4,24,C,20.0,14.0,63,129,0,0,0,0,...,2,0,0,0,0,0,0,0,47,43


In [93]:
# Names of all object-dtype (string) columns.
cpp_categorical = cpp_data.select_dtypes(include=['O']).columns

# Convert those columns to pandas' categorical dtype. The values themselves
# are kept as labels; they are not replaced by numeric codes here.
cpp_data = cpp_data.astype({name: 'category' for name in cpp_categorical})

cpp_data.head()

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,...,x_49,x_50,x_51,x_52,x_53,x_54,x_55,x_56,x_57,x_58
0,29,C,1.0,7.0,60,85,0,0,1,0,...,0,0,0,0,0,0,0,0,45,39
1,27,C,0.0,0.0,64,178,0,0,0,0,...,0,0,0,0,0,0,0,0,46,42
2,27,C,0.0,0.0,60,102,0,0,0,0,...,1,0,0,0,0,0,0,0,45,40
3,37,C,0.0,0.0,65,174,0,0,0,0,...,0,0,0,0,0,0,0,0,47,40
4,24,C,20.0,14.0,63,129,0,0,0,0,...,2,0,0,0,0,0,0,0,47,43


In [97]:
# One-hot encode the categorical columns: each level becomes its own
# `<column>_<level>` indicator column and the source column is removed.
# `dtype="uint8"` pins 0/1 integer indicators. Newer pandas releases default
# to booleans, which would write True/False instead of 0/1 to the prepared CSV.
cpp_data = pd.get_dummies(cpp_data, columns=cpp_categorical, dtype="uint8")
cpp_data.head()

Unnamed: 0,x_1,x_3,x_4,x_5,x_6,x_7,x_8,x_9,x_10,x_11,...,x_21_L,x_21_M,x_21_N,x_21_O,x_21_P,x_24_A,x_24_B,x_24_C,x_24_D,x_24_E
0,29,1.0,7.0,60,85,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,27,0.0,0.0,64,178,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,27,0.0,0.0,60,102,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,37,0.0,0.0,65,174,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,24,20.0,14.0,63,129,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [105]:
# Collect the names of discrete columns: anything still listed in
# `cpp_categorical`, plus every column holding exactly two distinct values
# (a missing value counts as a distinct value, as with `len(unique())`).
# NOTE: once the categorical columns have been one-hot encoded, their original
# names are no longer present in `cpp_data.columns`, so in practice only the
# two-valued test matches (the indicator columns are picked up that way).
# A single combined condition guarantees a column is never listed twice.
discrete_columns = [
    col
    for col in cpp_data.columns
    if col in cpp_categorical or cpp_data[col].nunique(dropna=False) == 2
]
discrete_columns

['x_17',
 'x_22',
 'x_38',
 'x_51',
 'x_54',
 'x_2_A',
 'x_2_B',
 'x_2_C',
 'x_2_D',
 'x_2_E',
 'x_2_F',
 'x_21_A',
 'x_21_B',
 'x_21_C',
 'x_21_D',
 'x_21_E',
 'x_21_F',
 'x_21_G',
 'x_21_H',
 'x_21_I',
 'x_21_J',
 'x_21_K',
 'x_21_L',
 'x_21_M',
 'x_21_N',
 'x_21_O',
 'x_21_P',
 'x_24_A',
 'x_24_B',
 'x_24_C',
 'x_24_D',
 'x_24_E']

In [104]:
# Persist the encoded CPP table without the row index.
cpp_output_path = PREPARED_DATA_PATH + "/cpp.csv"
cpp_data.to_csv(cpp_output_path, index=False)