In [23]:
# Importing modules
import requests, zipfile
from io import BytesIO

import pandas as pd

In [24]:
# Downloading the data
print("Beginning Download...")

# Defining the zip file URL
url = "https://meps.ahrq.gov/mepsweb/data_files/pufs/h209/h209xlsx.zip"

# Last path segment of the URL, i.e. the archive's file name ("h209xlsx.zip")
filename = url.split('/')[-1]

# Downloading the whole archive into memory.
# - timeout: without it a stalled connection would hang the cell indefinitely.
# - raise_for_status(): turns an HTTP error (404, 500, ...) into an exception
#   here, instead of handing an error page to ZipFile further down.
req = requests.get(url, timeout=60)
req.raise_for_status()
print('Downloading Completed')

# Extracting the zip file contents into ./Resources.
# The archive gets its own name: binding it to `zipfile` would shadow the
# module and make this cell fail with AttributeError when it is run again.
# The `with` block closes the archive once extraction is done.
with zipfile.ZipFile(BytesIO(req.content)) as archive:
    archive.extractall('./Resources')

Beginning Download...
Downloading Completed


In [25]:
# Converting Data from .xlsx to .csv and then to DataFrame
# Initializing needed column list
col_list = ['DUID', 'FAMSZE18', 'REGION18', 'ENDRFY18', 'AGE18X', 'SEX', 'RACEV1X', 'EDUCYR', 'HIDEG', 'TTLP18X', 'ADBMI42', 'ADOFTB42', 'UNINS18', 'TOTEXP18', 'TOTSLF18', 'TOTMCR18', 'TOTMCD18', 'TOTVA18', 'TOTPTR18', 'TOTOTH18']

# Reading necessary columns from excel.
# NOTE: `usecols` only selects columns; the resulting frame keeps the
# workbook's own column order, which is NOT the order of `col_list`
# (e.g. ADBMI42/ADOFTB42 come before TTLP18X in the loaded data).
print("Reading Excel File...")
excel = pd.read_excel('./Resources/h209.xlsx', usecols=col_list)

# Writing the selected columns to CSV (header row, no index column) so the
# subset exists as a plain-text file that is quick to load
print("Coverting Excel File to CSV...")
excel.to_csv("./Resources/2018_FYC.csv", index=False, header=True)

# Reading CSV into DF (read_csv already returns a DataFrame)
df_raw = pd.read_csv("./Resources/2018_FYC.csv")

# Reviewing DataFrame
df_raw.head()

Reading Excel File...
Coverting Excel File to CSV...


Unnamed: 0,DUID,FAMSZE18,REGION18,ENDRFY18,AGE18X,SEX,RACEV1X,EDUCYR,HIDEG,ADBMI42,ADOFTB42,TTLP18X,UNINS18,TOTEXP18,TOTSLF18,TOTMCR18,TOTMCD18,TOTVA18,TOTPTR18,TOTOTH18
0,2290001,2,2,2018,27,2,1,16,4,21.4,3,32000,2,2368,225,0,2037,0,107,0
1,2290001,2,2,2018,25,1,1,17,4,30.6,3,0,2,2040,136,0,0,0,1904,0
2,2290002,6,2,2018,34,2,1,10,3,28.2,3,25000,2,173,74,0,0,0,99,0
3,2290002,6,2,2018,39,1,1,10,3,28.7,3,30000,2,0,0,0,0,0,0,0
4,2290002,6,2,2018,11,1,1,4,8,-1.0,-1,0,2,103,69,0,0,0,34,0


In [35]:
# Cleaning Data

# Renaming columns through an explicit source-name -> readable-name mapping.
# Mapping by name (rather than assigning a positional list) keeps every label
# attached to the right data regardless of the order the columns were read in.
COLUMN_NAMES = {
    'DUID': 'ID',
    'FAMSZE18': 'Family_Size',
    'REGION18': 'Region',
    'ENDRFY18': 'Year',
    'AGE18X': 'Age',
    'SEX': 'Sex',
    'RACEV1X': 'Race',
    'EDUCYR': 'Education_Level',
    'HIDEG': 'Highest_Degree',
    'ADBMI42': 'BMI',
    'ADOFTB42': 'Tobacco_Use',
    'TTLP18X': 'Total_Personal_Income',
    'UNINS18': 'Uninsured_2018',
    'TOTEXP18': 'Total_Expenditure',
    'TOTSLF18': 'Exp_Pocket',
    'TOTMCR18': 'Exp_Medicare',
    'TOTMCD18': 'Total_Medicaid',
    'TOTVA18': 'Exp_VA',
    'TOTPTR18': 'Total_Priv_Tri',
    'TOTOTH18': 'Total_Other',
}

# rename() ignores keys that are not present, so running this cell a second
# time (when df_raw already carries the readable names) is a harmless no-op.
df_raw = df_raw.rename(columns=COLUMN_NAMES)

# Fail loudly if any expected column is absent after renaming, rather than
# carrying on with a partially labelled frame.
missing_cols = [name for name in COLUMN_NAMES.values() if name not in df_raw.columns]
if missing_cols:
    raise KeyError(f"df_raw is missing expected columns: {missing_cols}")

# Filtering data to just the adults (rows with Age strictly greater than 20)
is_adult = df_raw['Age'] > 20

# Filtering only for insured customers (rows where Uninsured_2018 equals 2)
is_insured = df_raw['Uninsured_2018'] == 2

# Applying both filters and setting index to ID #.
# NOTE: ID values repeat across rows, so the resulting index is not unique.
# NOTE(review): negative placeholder values (e.g. BMI == -1.0) are not
# filtered out here -- confirm whether they should be treated as missing.
df_trim = df_raw.loc[is_adult & is_insured].set_index('ID')

# Reviewing Dataframe
df_trim.head(10)

Unnamed: 0_level_0,Family_Size,Region,Year,Age,Sex,Race,Education_Level,Highest_Degree,BMI,Tobacco_Use,Total_Personal_Income,Uninsured_2018,Total_Expenditure,Exp_Pocket,Exp_Medicare,Total_Medicaid,Exp_VA,Total_Priv_Tri,Total_Other
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2290001,2,2,2018,27,2,1,16,4,21.4,3,32000,2,2368,225,0,2037,0,107,0
2290001,2,2,2018,25,1,1,17,4,30.6,3,0,2,2040,136,0,0,0,1904,0
2290002,6,2,2018,34,2,1,10,3,28.2,3,25000,2,173,74,0,0,0,99,0
2290002,6,2,2018,39,1,1,10,3,28.7,3,30000,2,0,0,0,0,0,0,0
2290003,4,2,2018,36,2,1,17,5,21.5,3,30217,2,535,232,0,0,0,303,0
2290003,4,2,2018,36,1,1,17,5,-1.0,-1,227866,2,7023,1460,0,0,0,5563,0
2290005,5,2,2018,30,1,1,14,3,28.3,3,31644,2,800,140,0,0,0,660,0
2290005,5,2,2018,32,2,1,12,2,28.9,3,29244,2,5233,1550,0,0,0,3683,0
2290006,2,3,2018,28,2,1,12,3,43.1,3,11000,2,0,0,0,0,0,0,0
2290007,2,3,2018,60,1,1,11,1,21.9,3,5000,2,519,1,0,0,0,518,0
