In [8]:
import numpy as np
import pandas as pd
import csv

In [9]:
def clean2010(filename):
    """
    Parse the 2010 fixed-width person file and keep respondents aged 19-24.

    Every line of ``filename`` is treated as one fixed-width record. The age is
    read from characters 65-66 (zero-based slice ``[65:67]``); records whose age
    is outside 19..24 are skipped. For the remaining records these positions
    are converted to ``int``:

    * ``[6:12]``  -> 'Household'
    * ``[15:17]`` -> 'Family'
    * ``[17:19]`` -> 'Individual Within Family'
    * ``[622]``   -> 'CHIP'
    * ``[526]``   -> 'MedDelay'
    * ``[527]``   -> 'BeenInHospital'
    * ``[646]``   -> 'CostTooHigh'      (a blank character is stored as 0)
    * ``[647]``   -> 'InsuranceRefused' (a blank character is stored as 0)

    The original note on this function listed 'PDMED12M', 'PHOSPYR2',
    'HISTOP5', 'HISTOP6' -- presumably the survey variable names behind the
    last four columns; TODO confirm against the file layout documentation.

    Parameters
    ----------
    filename : str
        Path of the fixed-width data file.

    Returns
    -------
    pandas.DataFrame
        One row per kept record, with the nine columns named above.
    """
    with open(filename, 'r') as handle:
        records = handle.readlines()

    # Column name -> collected values; insertion order fixes the column order.
    columns = {
        'Age': [],
        'Household': [],
        'Family': [],
        'Individual Within Family': [],
        'CHIP': [],
        'MedDelay': [],
        'BeenInHospital': [],
        'CostTooHigh': [],
        'InsuranceRefused': [],
    }

    for record in records:
        years = int(record[65:67])
        if not (19 <= years <= 24):
            continue

        household_id = int(record[6:12])
        family_no = int(record[15:17])
        person_no = int(record[17:19])
        chip_code = int(record[622])
        delay_code = int(record[526])
        hospital_code = int(record[527])

        # A blank in these one-character fields is recorded as 0, not parsed.
        cost_code = 0 if record[646] == ' ' else int(record[646])
        refused_code = 0 if record[647] == ' ' else int(record[647])

        parsed = (years, household_id, family_no, person_no, chip_code,
                  delay_code, hospital_code, cost_code, refused_code)
        for column_name, value in zip(columns, parsed):
            columns[column_name].append(value)

    frame = pd.DataFrame.from_dict(columns)
    return frame.fillna(0)

In [10]:
def clean2012(filename):
    """
    Parse the 2012 fixed-width person file and keep respondents aged 19-24.

    Every line of ``filename`` is treated as one fixed-width record. The age is
    read from characters 65-66 (zero-based slice ``[65:67]``); records whose age
    is outside 19..24 are skipped. For the remaining records these positions
    are converted to ``int``:

    * ``[6:12]``  -> 'Household'
    * ``[15:17]`` -> 'Family'
    * ``[17:19]`` -> 'Individual Within Family'
    * ``[674]``   -> 'CHIP'
    * ``[527]``   -> 'MedDelay'
    * ``[528]``   -> 'BeenInHospital'
    * ``[698]``   -> 'CostTooHigh'      (a blank character is stored as 0)
    * ``[699]``   -> 'InsuranceRefused' (a blank character is stored as 0)

    The original note on this function listed 'PDMED12M', 'PHOSPYR2',
    'HISTOP5', 'HISTOP6' -- presumably the survey variable names behind the
    last four columns; TODO confirm against the file layout documentation.

    Parameters
    ----------
    filename : str
        Path of the fixed-width data file.

    Returns
    -------
    pandas.DataFrame
        One row per kept record, with the nine columns named above.
    """
    with open(filename, 'r') as handle:
        records = handle.readlines()

    # Column name -> collected values; insertion order fixes the column order.
    columns = {
        'Age': [],
        'Household': [],
        'Family': [],
        'Individual Within Family': [],
        'CHIP': [],
        'MedDelay': [],
        'BeenInHospital': [],
        'CostTooHigh': [],
        'InsuranceRefused': [],
    }

    for record in records:
        years = int(record[65:67])
        if not (19 <= years <= 24):
            continue

        household_id = int(record[6:12])
        family_no = int(record[15:17])
        person_no = int(record[17:19])
        chip_code = int(record[674])
        delay_code = int(record[527])
        hospital_code = int(record[528])

        # A blank in these one-character fields is recorded as 0, not parsed.
        cost_code = 0 if record[698] == ' ' else int(record[698])
        refused_code = 0 if record[699] == ' ' else int(record[699])

        parsed = (years, household_id, family_no, person_no, chip_code,
                  delay_code, hospital_code, cost_code, refused_code)
        for column_name, value in zip(columns, parsed):
            columns[column_name].append(value)

    frame = pd.DataFrame.from_dict(columns)
    return frame.fillna(0)

In [11]:
def clean2014(filename):
    """
    Parse the 2014 fixed-width person file and keep respondents aged 19-24.

    Every line of ``filename`` is treated as one fixed-width record. The age is
    read from characters 65-66 (zero-based slice ``[65:67]``); records whose age
    is outside 19..24 are skipped. For the remaining records these positions
    are converted to ``int``:

    * ``[6:12]``  -> 'Household'
    * ``[15:17]`` -> 'Family'
    * ``[17:19]`` -> 'Individual Within Family'
    * ``[691]``   -> 'CHIP'
    * ``[525]``   -> 'MedDelay'
    * ``[526]``   -> 'BeenInHospital'
    * ``[729]``   -> 'CostTooHigh'      (a blank character is stored as 0)
    * ``[730]``   -> 'InsuranceRefused' (a blank character is stored as 0)

    The original note on this function listed 'PDMED12M', 'PHOSPYR2',
    'HISTOP5', 'HISTOP6' -- presumably the survey variable names behind the
    last four columns; TODO confirm against the file layout documentation.

    Parameters
    ----------
    filename : str
        Path of the fixed-width data file.

    Returns
    -------
    pandas.DataFrame
        One row per kept record, with the nine columns named above. A file
        with no 19-24 year-old records yields an empty frame with the same
        columns.
    """
    age_p = []
    household = []
    fam = []
    indiv = []
    schip = []
    pdmed = []
    in_hos = []
    cost_too_high = []
    insurance_coverage = []

    with open(filename, 'r') as file:
        for line in file:
            age = int(line[65:67])
            if not (19 <= age <= 24):
                continue

            hhx = int(line[6:12])
            family = int(line[15:17])
            individual = int(line[17:19])
            chip = int(line[691])
            med_delay = int(line[525])
            hos = int(line[526])

            # A blank in these one-character fields is recorded as 0.
            cost = 0 if line[729] == ' ' else int(line[729])
            insurance = 0 if line[730] == ' ' else int(line[730])

            age_p.append(age)
            household.append(hhx)
            fam.append(family)
            indiv.append(individual)
            schip.append(chip)
            pdmed.append(med_delay)
            in_hos.append(hos)
            cost_too_high.append(cost)
            insurance_coverage.append(insurance)

    # 'Family' must be the per-record list `fam`. Passing the scalar loop
    # variable `family` would broadcast the last record's value to every row
    # and raise NameError when no record was kept.
    d = {'Age': age_p, 'Household': household, 'Family': fam,
         'Individual Within Family': indiv,
         'CHIP': schip, 'MedDelay': pdmed,
         'BeenInHospital': in_hos, 'CostTooHigh': cost_too_high,
         'InsuranceRefused': insurance_coverage}

    df = pd.DataFrame.from_dict(d)
    return df.fillna(0)

In [13]:
# Parse the 2012 person file (clean2012 keeps ages 19-24 only), then collect
# the household id of every kept row as a plain Python list.
df_2012 = clean2012("personsx2012.dat")
hh2012 = df_2012['Household'].tolist()

In [14]:
# Parse the 2014 person file; clean2014 keeps only rows aged 19-24.
df_2014 = clean2014("personsx2014.dat")

In [15]:
# Household id of every kept 2014 row, as a plain Python list.
hh2014 = df_2014['Household'].tolist()

In [107]:
def common_member(a, b):
    """
    Return the distinct elements that appear in both ``a`` and ``b``.

    Parameters
    ----------
    a, b : iterable of hashable
        The two collections to intersect.

    Returns
    -------
    list
        The shared elements, in arbitrary (set) order. When nothing is shared,
        "No common elements" is printed and an empty list is returned, so the
        result can always be passed to list-consuming code such as
        ``Series.isin`` (returning ``None`` here would make that call fail).
    """
    shared = set(a) & set(b)
    if not shared:
        print("No common elements")
    return list(shared)
    
# Household ids that occur in both the 2012 and the 2014 list; the notebook
# treats the members of these households as the treated group.
# Assumption stated by the author: households affect affordability.
# NOTE(review): this presumes an id refers to the same household in both
# survey years -- confirm against the survey documentation.
treat_sample = common_member(hh2012, hh2014)

In [111]:
# 2012 rows whose household is NOT in treat_sample, i.e. the 2012 data with
# the treatment households set aside.
df_2012_filtered = df_2012.loc[~df_2012['Household'].isin(treat_sample)]

In [154]:
def split_samples(df, random_state=None):
    """
    Randomly split ``df`` into two disjoint parts of (almost) equal size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are removed from the second part by index label,
        so the index is assumed to be unique -- with duplicated labels the
        second part would lose every row sharing a sampled label.
    random_state : int, numpy.random.Generator, numpy.random.RandomState or None, optional
        Forwarded to ``DataFrame.sample`` so the split can be reproduced.
        The default ``None`` keeps the unseeded behaviour.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_subset, other_half)``. ``df_subset`` holds ``ceil(len(df) / 2)``
        randomly drawn rows; ``other_half`` holds the remaining rows.
    """
    n_rows = len(df)
    # Integer ceiling of n_rows / 2. Floor division keeps the size an int for
    # even n_rows too, where true division (`/`) would produce a float.
    half = (n_rows + 1) // 2

    df_subset = df.sample(n=half, random_state=random_state)
    other_half = df.drop(df_subset.index)

    return df_subset, other_half

In [155]:
# 19-year-olds from the 2012 rows that are outside the treatment households.
filtered_19 = df_2012_filtered[df_2012_filtered['Age'] == 19]
# Split ONCE and unpack both halves. Calling split_samples twice draws two
# independent random samples, so s1 and s2 would not be complementary and
# could share rows.
s1, s2 = split_samples(filtered_19)

In [156]:
# Share of rows in half-sample s1 (19-year-olds, 2012) with MedDelay == 1.
delay_19_2012 = int((s1['MedDelay'] == 1).sum()) / len(s1)

In [164]:
# Share of 20-year-olds in the 2012 frame whose MedDelay code equals 1.
delay_20_2012 = (
    int(((df_2012['Age'] == 20) & (df_2012['MedDelay'] == 1)).sum())
    / int((df_2012['Age'] == 20).sum())
)

In [165]:
# Share of 21-year-olds in the 2012 frame whose MedDelay code equals 1.
delay_21_2012 = (
    int(((df_2012['Age'] == 21) & (df_2012['MedDelay'] == 1)).sum())
    / int((df_2012['Age'] == 21).sum())
)

In [166]:
# Share of 22-year-olds in the 2012 frame whose MedDelay code equals 1.
delay_22_2012 = (
    int(((df_2012['Age'] == 22) & (df_2012['MedDelay'] == 1)).sum())
    / int((df_2012['Age'] == 22).sum())
)

In [167]:
# Share of 23-year-olds in the 2012 frame whose MedDelay code equals 1.
delay_23_2012 = (
    int(((df_2012['Age'] == 23) & (df_2012['MedDelay'] == 1)).sum())
    / int((df_2012['Age'] == 23).sum())
)

In [168]:
# Share of 24-year-olds in the 2012 frame whose MedDelay code equals 1.
delay_24_2012 = (
    int(((df_2012['Age'] == 24) & (df_2012['MedDelay'] == 1)).sum())
    / int((df_2012['Age'] == 24).sum())
)

In [189]:
# Control difference: 2012 delay share of 21-year-olds minus the delay share
# of the 19-year-old half-sample s1.
control_diff = delay_21_2012 - delay_19_2012 
# assuming the groups of 21-year-olds and 19-year-olds are similar enough
# and there is no unobserved heterogeneity
control_diff # between 19 and 21

0.023473541383989154

In [170]:
# 2012 delay share of 22-year-olds minus that of 20-year-olds.
did2 = delay_22_2012 - delay_20_2012 
# Bare name on the last line so the notebook displays the value.
did2

0.02064091979503821

In [171]:
# 2012 delay share of 23-year-olds minus that of 21-year-olds.
did3 = delay_23_2012 - delay_21_2012 
# Bare name on the last line so the notebook displays the value.
did3

-0.0006380435484913127

In [172]:
# 2012 delay share of 24-year-olds minus that of 22-year-olds.
did4 = delay_24_2012 - delay_22_2012 
# Bare name on the last line so the notebook displays the value.
did4

-0.002740018859480872

We can clearly see that the share of delayed care differs between age groups. One possible factor is income, which is probably one of the largest determinants of accessibility.

In [144]:
# Rows of each survey year whose household id is in treat_sample.
# NOTE(review): neither frame is used by any later cell visible here.
treat_2012 = df_2012.loc[df_2012['Household'].isin(treat_sample)]
treat_2014 = df_2014.loc[df_2014['Household'].isin(treat_sample)]

In [175]:
# Pre-treatment level: share of rows in half-sample s2 (19-year-olds, 2012)
# with MedDelay == 1.
delay_pre_treat = int((s2['MedDelay'] == 1).sum()) / len(s2)

In [176]:
# Post-treatment level: share of 21-year-olds in the full 2014 frame whose
# MedDelay code equals 1.
delay_post_treat = (
    int(((df_2014['Age'] == 21) & (df_2014['MedDelay'] == 1)).sum())
    / int((df_2014['Age'] == 21).sum())
)

In [178]:
# Treatment difference: 2014 delay share of 21-year-olds minus the 2012 delay
# share of the 19-year-old half-sample s2.
treat_diff = delay_post_treat - delay_pre_treat
# Bare name on the last line so the notebook displays the value.
treat_diff

0.01057229161564209

In [191]:
# Difference-in-differences estimate: treatment difference minus control
# difference.
final_did = treat_diff - control_diff
# Bare name on the last line so the notebook displays the value.
final_did

-0.012901249768347063

In [185]:
# Overall share of 2010 rows with MedDelay == 1.
# NOTE(review): df_2010 is not assigned in any cell visible here -- presumably
# it came from a clean2010(...) call in a cell that is no longer present.
# Confirm and restore that cell; otherwise this raises NameError on a fresh
# kernel.
len(df_2010[df_2010['MedDelay'] == 1])/len(df_2010)

0.0975340956054553

In [186]:
# Overall share of 2012 rows (ages 19-24) with MedDelay == 1.
int((df_2012['MedDelay'] == 1).sum()) / len(df_2012)

0.0732799245994345

In [187]:
# Overall share of 2014 rows (ages 19-24) with MedDelay == 1.
int((df_2014['MedDelay'] == 1).sum()) / len(df_2014)

0.057927560566082996