In [109]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS

In [39]:
adult_df = pd.read_csv('RDD_data/adult21.csv')
child_df = pd.read_csv('RDD_data/child21.csv')

adult_income_df = pd.read_csv('RDD_data/adultinc21.csv')
child_income_df = pd.read_csv('RDD_data/childinc21.csv')



In [None]:
#RATCAT_A 

#Merge income with Adult and Child dataframes based on 'HHX' 

adult_merged = pd.merge(adult_df, adult_income_df, on = 'HHX')
child_merged = pd.merge(child_df, child_income_df, on = 'HHX')

In [51]:
child_df

Unnamed: 0,URBRRL,RATCAT_C,IMPINCFLG_C,SHTCVD19AV_C,SHTCVD19NM_C,PPSU,PSTRAT,HISPALLP_C,RACEALLP_C,SCHDYMSSTC_C,...,PHSTAT_C,HHSTAT_C,INTV_MON,RECTYPE,IMPNUM_C,RELCHPARENTP1_C,RELCHPARENTP2_C,WTFA_C,HHX,POVRATTC_C
0,1,14,0,,,3,115,3,2,4.0,...,1,1,1,20,1,1,4,6128.031,H025636,5.13
1,1,7,0,,,3,115,3,2,0.0,...,2,1,1,20,1,1,4,4529.750,H018455,1.96
2,1,14,0,,,1,115,2,1,0.0,...,1,1,1,20,1,4,4,5790.022,H047222,7.20
3,1,13,0,,,1,115,2,1,10.0,...,1,1,1,20,1,1,1,16068.950,H020084,4.85
4,3,4,0,,,1,103,3,2,,...,3,1,1,20,1,3,4,4668.747,H046058,1.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8256,4,9,0,,,27,146,2,1,,...,1,1,12,20,1,1,1,6346.783,H007394,2.59
8257,4,5,0,,,29,146,2,1,3.0,...,1,1,12,20,1,1,4,20683.243,H038392,1.44
8258,3,12,0,,,114,108,2,1,0.0,...,2,1,12,20,1,1,1,2153.966,H012473,4.19
8259,3,8,0,,,114,108,2,1,,...,1,1,12,20,1,1,1,3875.047,H002032,2.26


In [44]:
adult_income_df

Unnamed: 0,RATCAT_A,IMPNUM_A,IMPINCFLG_A,RECTYPE,POVRATTC_A,HHX
0,7,1,0,30,1.93,H056808
1,7,2,0,30,1.93,H056808
2,7,3,0,30,1.93,H056808
3,7,4,0,30,1.93,H056808
4,7,5,0,30,1.93,H056808
...,...,...,...,...,...,...
294815,13,6,0,30,4.89,H052354
294816,13,7,0,30,4.89,H052354
294817,13,8,0,30,4.89,H052354
294818,13,9,0,30,4.89,H052354


In [80]:
#Get rows we want


#HISTOP_COST_A is Cost Increase

#HISTOPJOB_A is Number of months without coverage

#PAYBLL12M_A is problems paying medical bills, past 12 months

#MEDDL12M_A is Delayed medical care due to cost, past 12m

#MEDNG12M_A is Needed medical care but did not get it due to cost, past 12m

adult_data = adult_df[['HHX', 'AGEP_A', 'SEX_A', 'HOSPONGT_A', 'RATCAT_A',
                 'CHIP_A','PAYBLL12M_A',
                  'MEDDL12M_A','MEDNG12M_A', ]]
child_data = child_df[['HHX', 'AGEP_C', 'SEX_C', 'HOSPONGT_C', 'RATCAT_C',
                 'CHIP_C', 'PAYBLL12M_C',
                  'MEDDL12M_C','MEDNG12M_C', ]]


child_data = child_data.loc[(child_data['AGEP_C'] >= 14)]
#df['new_column'] = df['a'].apply(lambda x: 1 if x == 0 or x == 1 else 0)
child_data['Treatment'] = child_df['CHIP_C'].apply(lambda x: 1 if x == 1 or x == 2 else 0)

adult_data = adult_data.loc[adult_data['AGEP_A'] <= 24] #Experimental group should not be much higher than 23. 



#Rename Columns
adult_data.rename(columns = {'AGEP_A':'Age', 'SEX_A':'Sex', 'HOSPONGT_A': 'Hospitalized', 
                         'RATCAT_A': 'Income_Ratio', 'CHIP_A': 'CHIP',
                          'PAYBLL12M_A': 'PayBill', 'MEDDL12M_A': 'DelayedCareCost',
                             'MEDNG12M_A':'NoCareCost'},inplace=True)

child_data.rename(columns = {'AGEP_C':'Age', 'SEX_C':'Sex', 'HOSPONGT_C': 'Hospitalized', 
                         'RATCAT_C': 'Income_Ratio', 'CHIP_C': 'CHIP',
                          'PAYBLL12M_C': 'PayBill', 'MEDDL12M_C': 'DelayedCareCost',
                          'MEDNG12M_C':'NoCareCost'}, inplace=True)

adult_data['Treatment'] = 0



In [133]:
#RUN RDD ON above target variable to be 'PAYBLL12M_C'
rdd_data = pd.concat([child_data, adult_data])

# Create the running variable for the first stage: whether the age is over the threshold (18)
rdd_data['Age_over_18'] = np.where(rdd_data['Age'] > 18, 1, 0)

# Add an interaction term between the running variable and the cutoff indicator
rdd_data['Age_over_18_times_Age'] = rdd_data['Age_over_18'] * rdd_data['Age']

#Delayed Care due to Cost
rdd_data['DelayCareCost'] = rdd_data['DelayedCareCost'].apply(lambda x: 1 if x == 1 else 0)

#Inability to Pay hospital Bill
rdd_data['PayBill'] = rdd_data['PayBill'].apply(lambda x: 1 if x == 1 else 0)

#Didn't get care due to Cost
rdd_data['NoCareCost'] = rdd_data['NoCareCost'].apply(lambda x: 1 if x == 1 else 0)



# First stage regression: Treatment on Age_over_18, Age, Age_over_18_times_Age, Income_Ratio
X_first = sm.add_constant(rdd_data[['Age_over_18', 'Age', 'Age_over_18_times_Age', 'Income_Ratio']])
Y_first = rdd_data['Treatment']
first_stage = sm.OLS(Y_first, X_first).fit()
rdd_data['predicted_treatment'] = first_stage.predict(X_first)

In [116]:
Y_first

0        1
5        1
7        0
11       0
12       0
        ..
29330    0
29350    0
29376    0
29452    0
29453    0
Name: Treatment, Length: 4052, dtype: int64

In [114]:
print(rdd_data['predicted_treatment'])

0        0.010328
5        0.049244
7        0.043571
11       0.019607
12       0.027595
           ...   
29330    0.001009
29350   -0.017021
29376   -0.000753
29452   -0.017284
29453   -0.017547
Name: predicted_treatment, Length: 4052, dtype: float64


In [134]:
response_variables = ['NoCareCost', 'DelayedCareCost', 'PayBill']

for response in response_variables:
    # Second stage regression using 2SLS: response on predicted treatment, Age, Age_over_18_times_Age, Income_Ratio
    Y_second = rdd_data[response]
    X_second = sm.add_constant(rdd_data[['Treatment', 'Age', 'Age_over_18_times_Age', 'Income_Ratio']])
    model = sm.OLS(Y_second, X_second, instrument = X_first)
    second_stage = model.fit()
    print(second_stage.summary())


rdd_x = rdd_data[['Treatment', 'Age', 'Income_Ratio']]
rdd_y = rdd_data['NoCareCost']

response_variables = ['NoCareCost', 'DelayedCareCost', 'Hospitalized', 'PayBill']

for response in response_variables:
    rdd_y = rdd_data[response]
    regression = sm.OLS(rdd_y, rdd_x)
    fit = regression.fit()
    print(fit.pvalues)
    print("Effect of response" + str(response) + " is " + str(fit.params['Treatment']) 
          + " with p_value " + str(fit.pvalues['Treatment']))

#For Hospitalized, we get a causal effect of 0.13 

                            OLS Regression Results                            
Dep. Variable:             NoCareCost   R-squared:                       0.020
Model:                            OLS   Adj. R-squared:                  0.019
Method:                 Least Squares   F-statistic:                     20.73
Date:                Thu, 25 May 2023   Prob (F-statistic):           6.26e-17
Time:                        21:10:25   Log-Likelihood:                 1325.9
No. Observations:                4052   AIC:                            -2642.
Df Residuals:                    4047   BIC:                            -2610.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    -0.00

In [137]:
#Now, we want to run our Fuzzy Difference in Discontinuity Estimator.
rdd_data

Unnamed: 0,HHX,Age,Sex,Hospitalized,Income_Ratio,CHIP,PayBill,DelayedCareCost,NoCareCost,Treatment,Age_over_18,Age_over_18_times_Age,DelayCareCost,predicted_treatment,ACA_Treatment
0,H025636,17,1,2.0,14,1,0,2,0,1,0,0,0,0.010328,0
5,H019593,16,1,2.0,5,1,1,2,0,1,0,0,0,0.049244,0
7,H050208,14,1,2.0,14,3,0,2,0,0,0,0,0,0.043571,0
11,H066576,17,1,2.0,11,3,0,2,0,0,0,0,0,0.019607,0
12,H019151,16,2,2.0,12,3,0,2,0,0,0,0,0,0.027595,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29330,H049980,20,1,2.0,8,3,0,2,0,0,1,20,0,0.001009,0
29350,H043126,24,2,2.0,14,3,0,2,0,0,1,24,0,-0.017021,0
29376,H054556,18,1,2.0,14,3,0,2,0,0,0,0,0,-0.000753,0
29452,H027545,22,2,2.0,14,3,0,2,0,0,1,22,0,-0.017284,0


In [None]:
#Mic is Treatment Column

#Oic is 