In [1]:
import numpy as np
import pandas as pd
import csv

import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS

In [2]:
def clean2012(filename):
    """
    'PDMED12M', 'PHOSPYR2', 'HISTOP5','HISTOP6'
    """
    
    with open(filename, 'r') as file:
        lines = file.readlines()
    
    age_p = []
    household = []
    fam = []
    indiv = []
    schip = []
    pdmed = []
    med_cancel = []
    in_hos = []
    cost_too_high = []
    insurance_coverage = []

    for line in lines:
        age = int(line[65:67])

        if age >= 15 and age <= 24:
            hhx = int(line[6:12])
            family = int(line[15:17])
            individual = int(line[17:19])
            chip = int(line[674])
            med_delay = int(line[526])
            m_cancel = int(line[527])
            hos = int(line[528])
            
            if line[698] == ' ':
                cost = 0
            else:
                cost = int(line[698])
                
            if line[699] == ' ':
                insurance = 0
            else:
                insurance = int(line[699])

            age_p.append(age)
            household.append(hhx)
            fam.append(family)
            indiv.append(individual)
            schip.append(chip)
            pdmed.append(med_delay)
            med_cancel.append(m_cancel)
            in_hos.append(hos)
            cost_too_high.append(cost)
            insurance_coverage.append(insurance)

    d = {'Age': age_p, 'Household': household, 'Family' : fam, 'Individual Within Family' : indiv, 
         'CHIP': schip, 'MedDelay': pdmed, 'MedCancel':m_cancel,
         'BeenInHospital': in_hos, 'CostTooHigh': cost_too_high,
         'InsuranceRefused': insurance_coverage}
    
    df = pd.DataFrame.from_dict(d)
    df.fillna(0, inplace=True)
    return df

In [3]:
def clean2014(filename):
    """
    'PDMED12M', 'PHOSPYR2', 'HISTOP5','HISTOP6'
    """
    
    with open(filename, 'r') as file:
        lines = file.readlines()
    
    age_p = []
    household = []
    fam = []
    indiv = []
    schip = []
    pdmed = []
    med_cancel = []
    in_hos = []
    cost_too_high = []
    insurance_coverage = []

    for line in lines:
        age = int(line[65:67])

        if age >= 15 and age <= 24:
            hhx = int(line[6:12])
            family = int(line[15:17])
            individual = int(line[17:19])
            chip = int(line[691])
            m_cancel = int(line[525])
            med_delay = int(line[524])
            hos = int(line[526])
            
            if line[729] == ' ':
                cost = 0
            else:
                cost = int(line[729])
            
            if line[730] == ' ':
                insurance = 0
            else:
                insurance = int(line[730])

            age_p.append(age)
            household.append(hhx)
            fam.append(family)
            indiv.append(individual)
            schip.append(chip)
            pdmed.append(med_delay)
            med_cancel.append(m_cancel)
            in_hos.append(hos)
            cost_too_high.append(cost)
            insurance_coverage.append(insurance)

    d = {'Age': age_p, 'Household': household, 'Family': family, 'Individual Within Family' : indiv,
         'CHIP': schip, 'MedDelay': pdmed, 'MedCancel':med_cancel,
         'BeenInHospital': in_hos, 'CostTooHigh': cost_too_high, 
         'InsuranceRefused': insurance_coverage}
    
    df = pd.DataFrame.from_dict(d)
    df.fillna(0, inplace=True)
    return df

In [15]:
df_2012 = clean2012("personsx2012.dat") # cleaned data for 2012
df_2014 = clean2014("personsx2014.dat")

df_2014['MedDelay'] = df_2014['MedDelay'].apply(lambda x: 1 if x == 1 else 0)

df_2012['MedDelay'] = df_2012['MedDelay'].apply(lambda x: 1 if x == 1 else 0)

df_2014['MedCancel'] = df_2014['MedCancel'].apply(lambda x: 1 if x == 1 else 0)

df_2012['MedCancel'] = df_2012['MedCancel'].apply(lambda x: 1 if x == 1 else 0)

df_2014['CostTooHigh'] = df_2014['CostTooHigh'].apply(lambda x: 1 if x == 1 else 0)

df_2012['CostTooHigh'] = df_2012['CostTooHigh'].apply(lambda x: 1 if x == 1 else 0)

df_2014['InsuranceRefused'] = df_2014['InsuranceRefused'].apply(lambda x: 1 if x == 1 else 0)

df_2012['InsuranceRefused'] = df_2012['InsuranceRefused'].apply(lambda x: 1 if x == 1 else 0)


In [16]:
Y_estimators = ['MedDelay', 'MedCancel', 'CostTooHigh', 'InsuranceRefused']

    
    
Pre_experiment = df_2012.loc[(df_2012['CHIP'] == 1) | (df_2012['CHIP'] == 2)]
Pre_control = df_2012.loc[(df_2012['CHIP'] == 3) & (df_2012['Age'] < 24) & (df_2012['Age'] > 20)]
Post_experiment = df_2014.loc[(df_2014['CHIP'] == 1) | (df_2014['CHIP'] == 2)]
Post_control = df_2014.loc[(df_2014['CHIP'] == 3) & (df_2014['Age'] < 24) & (df_2012['Age'] > 20)]


for Y in Y_estimators:
    Pre_control_mean = Pre_control[Y].mean()
    Pre_experiment_mean = Pre_experiment[Y].mean()
    Post_control_mean = Post_control[Y].mean()
    Post_experiment_mean = Post_experiment[Y].mean()
    
    chip_did = (Post_experiment_mean - Pre_experiment_mean) - (Post_control_mean - Pre_control_mean)
    
    
    print(chip_did)
    

0.015291962217043348
-0.039071876762495183
0.04922641373197948
0.0008526746430807293


In [17]:
#
df_2012['ACA'] = 0
df_2014['ACA'] = np.where((df_2014['Age'] <= 18) & ((df_2014['CHIP'] == 1) | (df_2014['CHIP'] == 2)), 1, 0)

df_2012['Year_indicator'] = 0
df_2014['Year_indicator'] = 1

fuzzy_diff_df = pd.concat([df_2012, df_2014])
fuzzy_diff_df['Age_indicator'] = np.where(fuzzy_diff_df['Age']<= 18, 1, 0)

fuzzy_diff_df['CHIP_indicator'] = np.where(fuzzy_diff_df['CHIP'] == 3, 0, 1)#CHIP value of 1,2 means you are in experimental group

fuzzy_diff_df['Age*Year'] = fuzzy_diff_df['Year_indicator'] * fuzzy_diff_df['Age_indicator']



In [18]:
#Now, we want this to work for Fuzzy Difference in Discontinuity 

chip_x =sm.add_constant(fuzzy_diff_df[['Age_indicator', 'Year_indicator', 'Age*Year']])

fuzzy_diff_df['Predicted_CHIP'] = sm.OLS(fuzzy_diff_df['CHIP_indicator'], chip_x).fit().predict(chip_x)

aca_x = sm.add_constant(fuzzy_diff_df[['Age_indicator', 'Year_indicator', 'Age*Year']])

fuzzy_diff_df['Predicted_ACA'] = sm.OLS(fuzzy_diff_df['ACA'], aca_x).fit().predict(aca_x)

fuzzy_diff_df


Unnamed: 0,Age,Household,Family,Individual Within Family,CHIP,MedDelay,MedCancel,BeenInHospital,CostTooHigh,InsuranceRefused,ACA,Year_indicator,Age_indicator,CHIP_indicator,Age*Year,Predicted_CHIP,Predicted_ACA
0,22,6,1,1,3,1,0,2,0,0,0,0,0,0,0,0.019321,6.981839e-17
1,15,9,1,3,3,0,0,2,0,0,0,0,1,0,0,0.067262,-1.665731e-16
2,20,22,1,1,3,0,0,2,0,0,0,0,0,0,0,0.019321,6.981839e-17
3,20,22,2,1,3,0,0,2,0,0,0,0,0,0,0,0.019321,6.981839e-17
4,22,36,1,1,3,0,0,2,0,0,0,0,0,0,0,0.019321,6.981839e-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14562,23,86622,1,2,3,0,0,2,1,0,0,1,0,0,0,0.022907,-6.650395e-17
14563,23,86639,1,2,3,0,0,2,0,0,0,1,0,0,0,0.022907,-6.650395e-17
14564,21,86639,1,3,3,0,0,2,0,0,0,1,0,0,0,0.022907,-6.650395e-17
14565,23,86667,1,3,3,0,0,2,0,0,0,1,0,0,0,0.022907,-6.650395e-17


In [24]:
#Now, we run the final regression 

Y_estimators = ['MedDelay', 'MedCancel', 'CostTooHigh', 'InsuranceRefused']

fuzzy_diff_df['Pred_CHIP * Pred_ACA'] = fuzzy_diff_df['Predicted_CHIP'] * fuzzy_diff_df['Predicted_ACA']

fuzzy_diff_df['Year * Pred_CHIP * Pred_ACA'] = fuzzy_diff_df['Predicted_CHIP'] * fuzzy_diff_df['Predicted_ACA'] * fuzzy_diff_df['Year_indicator']

fuzzy_diff_x = sm.add_constant(fuzzy_diff_df[['Predicted_CHIP', 'Predicted_ACA', 'Year_indicator', 'Pred_CHIP * Pred_ACA', 'Year * Pred_CHIP * Pred_ACA']])

for response in Y_estimators:
    fuzzy_diff_y = fuzzy_diff_df[response]
    regression = sm.OLS(fuzzy_diff_y, fuzzy_diff_x)
    fit = regression.fit()
    print("Effect of response" + str(response) + " is " + str(fit.params['Year * Pred_CHIP * Pred_ACA']) 
          + " with p_value " + str(fit.pvalues['Year * Pred_CHIP * Pred_ACA']))
    

Effect of responseMedDelay is -0.011141244867012796 with p_value 0.08794167452250086
Effect of responseMedCancel is -0.04564796957751825 with p_value 2.7233980489023682e-33
Effect of responseCostTooHigh is 0.0011376198399542514 with p_value 0.8608399056291371
Effect of responseInsuranceRefused is -3.210029572647366e-05 with p_value 0.9698204799509995
