In [1]:
# Importing modules
import pandas as pd
import requests, zipfile
from io import BytesIO

In [2]:
# Downloading the data
print("Beginning Download...")

# Defining the zip file URL
url = "https://meps.ahrq.gov/mepsweb/data_files/pufs/h209/h209xlsx.zip"

# Split URL to get the file name
filename = url.split('/')[-1]

# Downloading the file by sending the request to the URL.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to ZipFile below.
req = requests.get(url, timeout=60)
req.raise_for_status()
print('Download Complete.')

# Extracting the zip file contents.
# The archive is bound to its own name (not `zipfile`) so the imported module
# is not shadowed and this cell can be re-run; the `with` block closes it.
with zipfile.ZipFile(BytesIO(req.content)) as archive:
    archive.extractall('./Resources')

Beginning Download...
Download Complete.


In [None]:
# Excel -> CSV -> DataFrame conversion

# Columns to pull out of the workbook
col_list = [
    'DUID', 'FAMSZE18', 'REGION18', 'ENDRFY18', 'AGE18X',
    'SEX', 'RACEV2X', 'EDUCYR', 'HIDEG', 'TTLP18X',
    'ADBMI42', 'ADOFTB42', 'UNINS18', 'TOTEXP18', 'TOTSLF18',
    'TOTMCR18', 'TOTMCD18', 'TOTVA18', 'TOTPTR18', 'TOTOTH18',
]

# Load only those columns from the extracted workbook
print("Reading Excel File...")
excel = pd.read_excel('./Resources/h209.xlsx', usecols=col_list)

# Write the subset out as a CSV (no index column, header row included)
print("Coverting Excel File to CSV...")
excel.to_csv("./Resources/2018_FYC.csv", index=False)

# Load the CSV back in as the raw working DataFrame
print("Reading CSV into DataFrame...")
df_raw = pd.read_csv("./Resources/2018_FYC.csv")

print("Conversion Complete.")

# Preview the first rows
df_raw.head()

Reading Excel File...


In [None]:
# Cleaning Data

# Renaming columns by source name rather than by position. The `usecols` list
# passed to read_excel does not control the order of the returned columns, so a
# positional `df_raw.columns = [...]` can silently attach labels to the wrong
# data if the workbook layout differs from the label list.
# NOTE(review): the pairs below were matched from the original label list and
# the MEPS variable names - confirm against the HC-209 codebook.
column_names = {
    'DUID': 'ID',
    'FAMSZE18': 'Family_Size',
    'REGION18': 'Region',
    'ENDRFY18': 'Year',
    'AGE18X': 'Age',
    'SEX': 'Sex',
    'RACEV2X': 'Race',
    'EDUCYR': 'Education_Level',
    'HIDEG': 'Highest_Degree',
    'ADBMI42': 'BMI',
    'ADOFTB42': 'Tobacco_Use',
    'TTLP18X': 'Total_Personal_Income',
    'UNINS18': 'Uninsured_2018',
    'TOTEXP18': 'Total_Expenditure',
    'TOTSLF18': 'Exp_Pocket',
    'TOTMCR18': 'Exp_Medicare',
    'TOTMCD18': 'Total_Medicaid',
    'TOTVA18': 'Exp_VA',
    'TOTPTR18': 'Total_Priv_Tri',
    'TOTOTH18': 'Total_Other',
}
df_raw = df_raw.rename(columns=column_names)

# rename() ignores keys it cannot find (which also makes re-running this cell
# harmless), so verify that every expected column is actually present.
missing_cols = [name for name in column_names.values() if name not in df_raw.columns]
if missing_cols:
    raise KeyError("Columns missing after rename: {}".format(missing_cols))

# Keep only the rows that pass every filter:
#   Age > 20             -> just the adults
#   Uninsured_2018 == 2  -> only insured customers
#   Family_Size > 0      -> valid family sizes
#   Tobacco_Use > 0      -> applicable tobacco answers (1=Yes/2=No)
keep_rows = (
    (df_raw['Age'] > 20)
    & (df_raw['Uninsured_2018'] == 2)
    & (df_raw['Family_Size'] > 0)
    & (df_raw['Tobacco_Use'] > 0)
)

# Removing unnecessary columns; drop() returns a new frame, so df_raw itself
# is left unfiltered and the cell gives the same result on every run.
df_trim = df_raw.loc[keep_rows].drop(columns=['Year', 'Uninsured_2018'])

# Reviewing Dataframe
df_trim.head(10)

In [None]:
# Saving the cleaned DataFrame as a CSV (no index column) for shared use in the
# project. This writes to the same path the raw CSV export used earlier.
df_trim.to_csv("./Resources/2018_FYC.csv", index=False)