In [1]:
import numpy as np
import pandas as pd
import csv

import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS

In [2]:
def clean2012(filename):
    """
    'PDMED12M', 'PHOSPYR2', 'HISTOP5','HISTOP6'
    """
    
    with open(filename, 'r') as file:
        lines = file.readlines()
    
    age_p = []
    household = []
    fam = []
    indiv = []
    schip = []
    pdmed = []
    med_cancel = []
    in_hos = []
    cost_too_high = []
    insurance_coverage = []

    for line in lines:
        age = int(line[65:67])

        if age >= 15 and age <= 24:
            hhx = int(line[6:12])
            family = int(line[15:17])
            individual = int(line[17:19])
            chip = int(line[674])
            med_delay = int(line[526])
            m_cancel = int(line[527])
            hos = int(line[528])
            
            if line[698] == ' ':
                cost = 0
            else:
                cost = int(line[698])
                
            if line[699] == ' ':
                insurance = 0
            else:
                insurance = int(line[699])

            age_p.append(age)
            household.append(hhx)
            fam.append(family)
            indiv.append(individual)
            schip.append(chip)
            pdmed.append(med_delay)
            med_cancel.append(m_cancel)
            in_hos.append(hos)
            cost_too_high.append(cost)
            insurance_coverage.append(insurance)

    d = {'Age': age_p, 'Household': household, 'Family' : fam, 'Individual Within Family' : indiv, 
         'CHIP': schip, 'MedDelay': pdmed, 'MedCancel':m_cancel,
         'BeenInHospital': in_hos, 'CostTooHigh': cost_too_high,
         'InsuranceRefused': insurance_coverage}
    
    df = pd.DataFrame.from_dict(d)
    df.fillna(0, inplace=True)
    return df

In [3]:
def clean2014(filename):
    """
    'PDMED12M', 'PHOSPYR2', 'HISTOP5','HISTOP6'
    """
    
    with open(filename, 'r') as file:
        lines = file.readlines()
    
    age_p = []
    household = []
    fam = []
    indiv = []
    schip = []
    pdmed = []
    med_cancel = []
    in_hos = []
    cost_too_high = []
    insurance_coverage = []

    for line in lines:
        age = int(line[65:67])

        if age >= 15 and age <= 24:
            hhx = int(line[6:12])
            family = int(line[15:17])
            individual = int(line[17:19])
            chip = int(line[691])
            m_cancel = int(line[525])
            med_delay = int(line[524])
            hos = int(line[526])
            
            if line[729] == ' ':
                cost = 0
            else:
                cost = int(line[729])
            
            if line[730] == ' ':
                insurance = 0
            else:
                insurance = int(line[730])

            age_p.append(age)
            household.append(hhx)
            fam.append(family)
            indiv.append(individual)
            schip.append(chip)
            pdmed.append(med_delay)
            med_cancel.append(m_cancel)
            in_hos.append(hos)
            cost_too_high.append(cost)
            insurance_coverage.append(insurance)

    d = {'Age': age_p, 'Household': household, 'Family': family, 'Individual Within Family' : indiv,
         'CHIP': schip, 'MedDelay': pdmed, 'MedCancel':med_cancel,
         'BeenInHospital': in_hos, 'CostTooHigh': cost_too_high, 
         'InsuranceRefused': insurance_coverage}
    
    df = pd.DataFrame.from_dict(d)
    df.fillna(0, inplace=True)
    return df

In [4]:
df_2012 = clean2012("personsx2012.dat") # cleaned data for 2012
df_2014 = clean2014("personsx2014.dat")

df_2014['MedDelay'] = df_2014['MedDelay'].apply(lambda x: 1 if x == 1 else 0)

df_2012['MedDelay'] = df_2012['MedDelay'].apply(lambda x: 1 if x == 1 else 0)

df_2014['MedCancel'] = df_2014['MedCancel'].apply(lambda x: 1 if x == 1 else 0)

df_2012['MedCancel'] = df_2012['MedCancel'].apply(lambda x: 1 if x == 1 else 0)

df_2014['CostTooHigh'] = df_2014['CostTooHigh'].apply(lambda x: 1 if x == 1 else 0)

df_2012['CostTooHigh'] = df_2012['CostTooHigh'].apply(lambda x: 1 if x == 1 else 0)

df_2014['InsuranceRefused'] = df_2014['InsuranceRefused'].apply(lambda x: 1 if x == 1 else 0)

df_2012['InsuranceRefused'] = df_2012['InsuranceRefused'].apply(lambda x: 1 if x == 1 else 0)


In [5]:
Y_estimators = ['MedDelay', 'MedCancel', 'CostTooHigh', 'InsuranceRefused']

print("Running Difference in Differences for CHIP")
    
Pre_experiment = df_2012.loc[(df_2012['CHIP'] == 1) | (df_2012['CHIP'] == 2)]
Pre_control = df_2012.loc[(df_2012['CHIP'] == 3) & (df_2012['Age'] < 24) & (df_2012['Age'] > 20)]
Post_experiment = df_2014.loc[(df_2014['CHIP'] == 1) | (df_2014['CHIP'] == 2)]
Post_control = df_2014.loc[(df_2014['CHIP'] == 3) & (df_2014['Age'] < 24) & (df_2012['Age'] > 20)]


for Y in Y_estimators:
    Pre_control_mean = Pre_control[Y].mean()
    Pre_experiment_mean = Pre_experiment[Y].mean()
    Post_control_mean = Post_control[Y].mean()
    Post_experiment_mean = Post_experiment[Y].mean()
    
    chip_did = (Post_experiment_mean - Pre_experiment_mean) - (Post_control_mean - Pre_control_mean)
    
    print("Causal Coefficient for " + str(Y) + "is " + str(chip_did))


Causal Coefficient for MedDelayis 0.015291962217043348
Causal Coefficient for MedCancelis -0.039071876762495183
Causal Coefficient for CostTooHighis 0.04922641373197948
Causal Coefficient for InsuranceRefusedis 0.0008526746430807293
