In [213]:
import numpy as np
import pandas as pd
import scipy.stats as stats


In [214]:
# ------------------------------------------------------------------------
# NOTE
# ------------------------------------------------------------------------
# Purpose: expand the cleaned&merged data to case-choices level
#      (expend the choice sets of each consumer)
#
# Input:
#     1. data_merged.h5 , key: cases, the cleaned and merged data
#         each row = one case, one consumer might have more than 1 cases
# Output: 
#     1. data_merged.h5 , key: cases_choices, 
#         each row = one case-choices
# ------------------------------------------------------------------------




In [215]:
# 0. Initialize file path -----------------------------------------------
rootpath = '/Users/jingyuanwang/Dropbox/Course/ECON/IO/NU450/NU450_HW/coding_tutorial'
datapath = rootpath + '/' + 'data'
resultpath = rootpath + '/' + 'results'



In [216]:
# I. Input data ---------------------------------------------------------
filename = 'data_merged'
filekey = 'cases'
inputname = datapath + '/' + filename + '.h5'
df = pd.read_hdf(inputname, key=filekey)
df.index.name = 'case_id'

In [217]:
df.head(13)

Unnamed: 0_level_0,consumer_id,year,insurer_id,provider_id,age,female,num_dependents,price_scale,price_provider_ave,public,price
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,2015,78,13,43,1,1,0.919416,0.965927,1,0.888089
1,1,2015,78,13,54,1,3,0.919416,0.965927,1,0.888089
2,2,2013,67,7,60,1,1,0.740955,1.167918,0,0.865375
3,3,2015,107,13,71,0,1,0.797114,0.965927,1,0.769954
4,4,2015,107,8,46,1,0,1.290198,1.196602,0,1.543853
5,5,2016,67,6,37,1,0,1.026815,1.398097,0,1.435586
6,6,2014,67,13,47,0,0,0.603633,0.896933,1,0.541418
7,8,2015,99,5,5,0,3,1.246874,1.134914,0,1.415094
8,9,2014,67,13,63,0,0,0.603633,0.896933,1,0.541418
9,11,2013,88,13,37,1,0,1.317313,0.917313,1,1.208389


In [218]:
# II. expand data ---------------------------------------------------------
# 1. store provider information in a separate datafram
providers = df[['provider_id','year','public','price_provider_ave']].drop_duplicates()
df = df.drop(columns = ['public','price_provider_ave'] )

In [219]:
# 2. get a new dataframe for the new case-choices level data
consumer_choices = pd.DataFrame()
ids = ['case_id', 'consumer_id', 'year', 'insurer_id', 'provider_id']

In [220]:
# 3. expand the dataset
for year, frame in df.groupby(['year','insurer_id']):
    # (1). get choice set for each year 
    choices[group] = frame['provider_id'].unique()
    frame = pd.concat([frame]*len(choices[group]) , keys= choices[group])
    consumer_choices = pd.concat([consumer_choices,frame])

In [221]:

consumer_choices = (consumer_choices.reset_index()
                    .rename(columns = {'level_0': 'providers_available'})
                    .sort_values(ids)
                    .reset_index(drop =True))

In [222]:
# order variables
colnames = consumer_choices.columns.tolist()
colnames = colnames[1:] + colnames[:1]
consumer_choices = consumer_choices[colnames]

In [223]:
# 4. merge in provider variables
consumer_choices = pd.merge(consumer_choices,providers, 
                            how='left',
                            left_on = ['providers_available', 'year'], 
                            right_on = ['provider_id', 'year']).drop(columns = ['provider_id_y'])

In [224]:
consumer_choices = consumer_choices.rename(columns = {'provider_id_x' : 'provider_id'})

In [225]:
consumer_choices.head(15)

Unnamed: 0,case_id,consumer_id,year,insurer_id,provider_id,age,female,num_dependents,price_scale,price,providers_available,public,price_provider_ave
0,0,0,2015,78,13,43,1,1,0.919416,0.888089,13,1,0.965927
1,0,0,2015,78,13,43,1,1,0.919416,0.888089,6,0,1.377766
2,0,0,2015,78,13,43,1,1,0.919416,0.888089,7,0,1.175127
3,0,0,2015,78,13,43,1,1,0.919416,0.888089,8,0,1.196602
4,0,0,2015,78,13,43,1,1,0.919416,0.888089,2,0,0.80293
5,0,0,2015,78,13,43,1,1,0.919416,0.888089,9,1,1.108957
6,0,0,2015,78,13,43,1,1,0.919416,0.888089,5,0,1.134914
7,0,0,2015,78,13,43,1,1,0.919416,0.888089,4,0,0.942301
8,0,0,2015,78,13,43,1,1,0.919416,0.888089,3,0,0.963161
9,1,1,2015,78,13,54,1,3,0.919416,0.888089,13,1,0.965927


In [226]:
# III. generate variables ---------------------------------------------------------
# 1. age_largerthanmed
# Create a column that for each consumer- hospital-year 
# indicates whether the age of the patient is 
# above or below the median age of patients of the given hospital in the previous year. 
# Drop the first year.

In [227]:
avg_age = consumer_choices.groupby(['provider_id','year']).agg({'age':np.mean})
avg_age = avg_age.reset_index().rename(columns = {'age':'ave_age'})
avg_age['year'] = avg_age['year'] + 1

In [228]:
consumer_choices = (pd.merge(consumer_choices, avg_age, how = 'inner',
                            left_on = ['providers_available','year'],
                            right_on = ['provider_id','year'])
                    .drop(columns = ['provider_id_y'])
                    .rename(columns = {'provider_id_x' : 'provider_id',
                                      'ave_age' : 'ave_age_providerlastyr'})
                    .sort_values(ids).reset_index(drop =True))

In [229]:
consumer_choices['age_largerthanmed'] = (consumer_choices['age'] > consumer_choices['ave_age_providerlastyr'])*1

In [None]:
# 2. price for each provider-insurer


In [235]:
filename = 'price_scales'
data = datapath + '/' + filename + '.dta'
price_scales = pd.read_stata(data)

In [242]:
price_scales.rename(columns = {'provider_id' : 'providers_available',
                               'price':'price_scale'}, inplace=True)
price_scales.head()

Unnamed: 0,insurer_id,providers_available,year,price_scale
0,67,1,2013,0.923635
1,67,1,2014,0.944445
2,67,1,2015,1.079211
3,67,1,2016,1.0601
4,67,2,2013,0.539515


In [244]:
consumer_choices.drop(columns = 'price_scale', inplace = True)

In [246]:
consumer_choices = pd.merge(consumer_choices, price_scales,
                            how = 'left',
                            left_on = ['insurer_id', 'providers_available', 'year'],
                            right_on = ['insurer_id', 'providers_available', 'year'])

In [247]:
consumer_choices['price'] = consumer_choices['price_scale']*consumer_choices['price_provider_ave']

In [248]:
consumer_choices.head()

Unnamed: 0,case_id,consumer_id,year,insurer_id,provider_id,age,female,num_dependents,price,providers_available,public,price_provider_ave,ave_age_providerlastyr,age_largerthanmed,price_scale
0,0,0,2015,78,13,43,1,1,0.888089,13,1,0.965927,42.987092,1,0.919416
1,0,0,2015,78,13,43,1,1,1.46788,6,0,1.377766,42.006418,1,1.065406
2,0,0,2015,78,13,43,1,1,0.971884,7,0,1.175127,42.07563,1,0.827045
3,0,0,2015,78,13,43,1,1,1.110956,8,0,1.196602,45.047904,0,0.928426
4,0,0,2015,78,13,43,1,1,0.851986,2,0,0.80293,42.528302,1,1.061097


In [249]:
# V. Store data --------------------------------------------------------

In [250]:
filename = 'data_merged'
filekey = 'cases_choices'
outputname = datapath + '/' + filename + '.h5'
consumer_choices.to_hdf(outputname, key = filekey, complevel = 2, mode='a')