In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import cleaning_functions as clean
%matplotlib inline

# Let pandas show up to 200 rows before it truncates a displayed frame
pd.options.display.max_rows = 200

In [2]:
# Load the raw data. skiprows=1 discards the file's first line, so the
# second line of the CSV supplies the column names used in later cells.
health_df = pd.read_csv(
    filepath_or_buffer='data/analytic_data2019.csv',
    skiprows=1,
)

In [3]:
# Columns retained for the analysis:
#   * county_details  - identifying details on each county
#   * target_outcome  - the primary length-of-life Health Outcome (our target)
#   * ranked_measures - the primary Health Factors (potential features)
county_details = [
    "statecode",
    "countycode",
    "fipscode",
    "state",
    "county",
    "year",
    "county_ranked",
]

target_outcome = ["v001_rawvalue"]

# Each Health Factor column is named "v<code>_rawvalue"; the codes below are
# listed in the order the columns should appear in the working frame.
ranked_measures = [
    "v{}_rawvalue".format(code)
    for code in (
        "009", "011", "133", "070", "132", "049",
        "134", "045", "014", "085", "004", "088",
        "062", "005", "050", "155", "021", "069",
        "023", "024", "044", "082", "140", "043",
        "135", "125", "124", "136", "067", "137",
    )
]

In [4]:
# Narrow the frame to the county identifiers, the target, and the candidate
# features, in that order.
columns_to_keep = [*county_details, *target_outcome, *ranked_measures]
health_df = health_df[columns_to_keep]

In [5]:
# Build a dictionary mapping the raw column names to more interpretable ones;
# the mapping is produced by clean.create_column_dict from the raw CSV path.
data_path = 'data/analytic_data2019.csv'
columns_dict = clean.create_column_dict(data_path)

# Rename the columns to the interpretable names. The renamed frame is rebound
# to health_df rather than mutating the existing object with inplace=True,
# which keeps this step a plain assignment like the cells around it.
health_df = health_df.rename(columns=columns_dict)

In [6]:
# Keep only rows flagged as "county ranked" (county_ranked == 1). Rows that
# are not county ranked are dropped because they have less data and no
# premature death info; this also removes the national and state-level rows,
# leaving only county-level data. The index is renumbered from 0 afterwards.
is_ranked_county = health_df["county_ranked"] == 1
county_health_df = health_df.loc[is_ranked_county].reset_index(drop=True)

In [7]:
# Summarize the county-level frame: entry count plus each column's non-null
# count and dtype, to spot columns with missing values.
county_health_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3081 entries, 0 to 3080
Data columns (total 38 columns):
statecode                                         3081 non-null int64
countycode                                        3081 non-null int64
fipscode                                          3081 non-null int64
state                                             3081 non-null object
county                                            3081 non-null object
year                                              3081 non-null int64
county_ranked                                     3081 non-null float64
premature_death_raw_value                         3081 non-null float64
adult_smoking_raw_value                           3081 non-null float64
adult_obesity_raw_value                           3081 non-null float64
food_environment_index_raw_value                  3062 non-null float64
physical_inactivity_raw_value                     3081 non-null float64
access_to_exercise_opportunities_raw_va

In [8]:
# Create a dataframe listing each column along with its count of NaN values,
# to explore which metrics have enough coverage to be usable.
# The displayed result has one row per column of county_health_df, with
# `column` and `na_count` fields.
na_count_df = clean.column_na_count_df(county_health_df)
na_count_df

Unnamed: 0,column,na_count
0,statecode,0
1,countycode,0
2,fipscode,0
3,state,0
4,county,0
5,year,0
6,county_ranked,0
7,premature_death_raw_value,0
8,adult_smoking_raw_value,0
9,adult_obesity_raw_value,0


In [9]:
# The underlying data was rigorously developed, and values were only excluded
# when its authors felt there was no reliable data available, so we made the
# decision to proceed only with counties containing all Health Factors.
#
# Start from county_health_df (ranked counties only) rather than the
# unfiltered health_df, so the county-level filter is applied explicitly
# instead of relying on every non-county row happening to contain a NaN.
county_health_df_no_na = (
    county_health_df
    .dropna(axis=0, how='any')   # drop any row with at least one missing value
    .reset_index(drop=True)      # renumber rows from 0 after the drop
)

In [10]:
# Re-check the summary after dropping incomplete rows: every column should
# now report the same non-null count as the number of entries.
county_health_df_no_na.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2473 entries, 0 to 2472
Data columns (total 38 columns):
statecode                                         2473 non-null int64
countycode                                        2473 non-null int64
fipscode                                          2473 non-null int64
state                                             2473 non-null object
county                                            2473 non-null object
year                                              2473 non-null int64
county_ranked                                     2473 non-null float64
premature_death_raw_value                         2473 non-null float64
adult_smoking_raw_value                           2473 non-null float64
adult_obesity_raw_value                           2473 non-null float64
food_environment_index_raw_value                  2473 non-null float64
physical_inactivity_raw_value                     2473 non-null float64
access_to_exercise_opportunities_raw_va

In [11]:
# Persist the cleaned frame (index column omitted) so that other notebooks
# can import it directly.
county_health_df_no_na.to_csv(
    path_or_buf='cleaned_county_health_data.csv',
    index=False,
)