In [51]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [34]:
# Load the pre-cleaned county health table from the working directory.
cleaned_county_data = pd.read_csv(
    filepath_or_buffer="cleaned_county_health_data.csv",
)

In [35]:
# Summary statistics of the modelling target.
cleaned_county_data["premature_death_raw_value"].describe()

count     2703.000000
mean      8424.311168
std       2467.830038
min       2610.690433
25%       6647.505141
50%       8128.336959
75%       9962.276563
max      22123.700268
Name: premature_death_raw_value, dtype: float64

In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import numpy as np

In [36]:
# Columns excluded from the feature matrix: the target itself plus the
# identifier / bookkeeping columns.
admin_vars = [
    "premature_death_raw_value",
    "statecode",
    "countycode",
    "fipscode",
    "state",
    "county",
    "year",
    "county_ranked",
]

# Response variable for the regression.
target = cleaned_county_data["premature_death_raw_value"]

# Feature matrix: every remaining column.
# NOTE(review): a later cell lists other outcome measures (e.g. life expectancy,
# age-adjusted mortality). If those columns are present here they stay in
# co_vars and may leak the target into the baseline — confirm.
co_vars = cleaned_county_data.drop(columns=admin_vars)
co_vars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2703 entries, 0 to 2702
Data columns (total 52 columns):
poor_or_fair_health_raw_value                                           2703 non-null float64
poor_physical_health_days_raw_value                                     2703 non-null float64
poor_mental_health_days_raw_value                                       2703 non-null float64
low_birthweight_raw_value                                               2703 non-null float64
adult_smoking_raw_value                                                 2703 non-null float64
adult_obesity_raw_value                                                 2703 non-null float64
physical_inactivity_raw_value                                           2703 non-null float64
excessive_drinking_raw_value                                            2703 non-null float64
uninsured_raw_value                                                     2703 non-null float64
preventable_hospital_stays_raw_value           

In [37]:
# Baseline model: standardise every feature, then fit ordinary least squares.
pipe = Pipeline(
    steps=[
        ("sc", StandardScaler()),
        ("lr", LinearRegression()),
    ]
)

# 5-fold cross-validation on all features; with no `scoring` argument the
# estimator's own `score` method is used for each fold.
baseline = cross_val_score(estimator=pipe, X=co_vars, y=target, cv=5)

In [38]:
# Show the array of per-fold scores returned by cross_val_score (cv=5).
baseline

array([0.92792437, 0.95292534, 0.96002449, 0.94713328, 0.95867686])

In [42]:
# Candidate predictor groups, organised by theme. Each list holds column names
# of `cleaned_county_data`.

# Education
marginal_factors_edu = [
    "high_school_graduation_raw_value",
    "some_college_raw_value",
]

# Economy / housing
marginal_factors_econ = [
    "income_inequality_raw_value",
    "homeownership_raw_value",
    "percentage_of_households_with_high_housing_costs",
    "percentage_of_households_with_overcrowding",
    "percentage_of_households_with_lack_of_kitchen_or_plumbing_facilities",
    "severe_housing_problems_raw_value",
    "severe_housing_cost_burden_raw_value",
]

# Demographics
marginal_factors_demo = [
    "pct_american_indian_and_alaskan_native_raw_value",
    "pct_asian_raw_value",
    "pct_native_hawaiian_other_pacific_islander_raw_value",
    "pct_hispanic_raw_value",
    "pct_not_proficient_in_english_raw_value",
    "children_in_single_parent_households_raw_value",
]

# Health behaviours and health status
marginal_factors_health = [
    "physical_inactivity_raw_value",
    "excessive_drinking_raw_value",
    "injury_deaths_raw_value",
    "frequent_physical_distress_raw_value",
    "frequent_mental_distress_raw_value",
    "food_insecurity_raw_value",
    "insufficient_sleep_raw_value",
]

# Social environment
# NOTE(review): 'children_in_single_parent_households_raw_value' is also listed
# in marginal_factors_demo; concatenating the two lists would duplicate the
# column — confirm this overlap is intended.
marginal_factors_social = [
    "children_in_single_parent_households_raw_value",
    "social_associations_raw_value",
    "driving_alone_to_work_raw_value",
    "long_commute_driving_alone_raw_value",
]

In [69]:
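# Disabled: heatmap of the education factors against the target. The next cell
# shows the same correlation matrix as a table instead.
# plt.subplots(figsize = (15, 10))
# sns.heatmap(cleaned_county_data[marginal_factors_edu + ["premature_death_raw_value"]].corr(),
#            center = 0, annot = True, robust = True)
# plt.show()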

In [68]:
# Pairwise correlations among the education factors and the target
# (target placed last).
cleaned_county_data.loc[
    :, [*marginal_factors_edu, "premature_death_raw_value"]
].corr()

Unnamed: 0,high_school_graduation_raw_value,some_college_raw_value,premature_death_raw_value
high_school_graduation_raw_value,1.0,0.054509,-0.111829
some_college_raw_value,0.054509,1.0,-0.589876
premature_death_raw_value,-0.111829,-0.589876,1.0


In [74]:
# Pairwise correlations among the target and the economy / housing factors.
cleaned_county_data.loc[
    :, ["premature_death_raw_value", *marginal_factors_econ]
].corr()

Unnamed: 0,premature_death_raw_value,income_inequality_raw_value,homeownership_raw_value,percentage_of_households_with_high_housing_costs,percentage_of_households_with_overcrowding,percentage_of_households_with_lack_of_kitchen_or_plumbing_facilities,severe_housing_problems_raw_value,severe_housing_cost_burden_raw_value
premature_death_raw_value,1.0,0.423343,-0.029648,-0.002973,0.120098,0.165381,0.082249,0.039574
income_inequality_raw_value,0.423343,1.0,-0.395854,0.43699,0.164867,0.08285,0.419255,0.515453
homeownership_raw_value,-0.029648,-0.395854,1.0,-0.566686,-0.261269,-0.016261,-0.549292,-0.597747
percentage_of_households_with_high_housing_costs,-0.002973,0.43699,-0.566686,1.0,0.136276,-0.031909,0.827213,0.927411
percentage_of_households_with_overcrowding,0.120098,0.164867,-0.261269,0.136276,1.0,0.568444,0.63721,0.165112
percentage_of_households_with_lack_of_kitchen_or_plumbing_facilities,0.165381,0.08285,-0.016261,-0.031909,0.568444,1.0,0.415052,-0.014221
severe_housing_problems_raw_value,0.082249,0.419255,-0.549292,0.827213,0.63721,0.415052,1.0,0.785765
severe_housing_cost_burden_raw_value,0.039574,0.515453,-0.597747,0.927411,0.165112,-0.014221,0.785765,1.0


In [79]:
# Pairwise correlations among the target and the demographic factors.
# Author's note carried over: drop racial/ethnic shares.
cleaned_county_data.loc[
    :, ["premature_death_raw_value", *marginal_factors_demo]
].corr()

Unnamed: 0,premature_death_raw_value,pct_american_indian_and_alaskan_native_raw_value,pct_asian_raw_value,pct_native_hawaiian_other_pacific_islander_raw_value,pct_hispanic_raw_value,pct_not_proficient_in_english_raw_value,children_in_single_parent_households_raw_value
premature_death_raw_value,1.0,0.265042,-0.341503,-0.08329,-0.166164,-0.227582,0.569865
pct_american_indian_and_alaskan_native_raw_value,0.265042,1.0,-0.035651,0.007711,0.034249,0.019184,0.138052
pct_asian_raw_value,-0.341503,-0.035651,1.0,0.460084,0.192174,0.345976,-0.105803
pct_native_hawaiian_other_pacific_islander_raw_value,-0.08329,0.007711,0.460084,1.0,0.096667,0.129708,-0.019342
pct_hispanic_raw_value,-0.166164,0.034249,0.192174,0.096667,1.0,0.831011,0.024836
pct_not_proficient_in_english_raw_value,-0.227582,0.019184,0.345976,0.129708,0.831011,1.0,0.012745
children_in_single_parent_households_raw_value,0.569865,0.138052,-0.105803,-0.019342,0.024836,0.012745,1.0


In [78]:
# Pairwise correlations among the target and the health factors.
cleaned_county_data.loc[
    :, ["premature_death_raw_value", *marginal_factors_health]
].corr()

Unnamed: 0,premature_death_raw_value,physical_inactivity_raw_value,excessive_drinking_raw_value,injury_deaths_raw_value,frequent_physical_distress_raw_value,frequent_mental_distress_raw_value,food_insecurity_raw_value,insufficient_sleep_raw_value
premature_death_raw_value,1.0,0.653342,-0.634386,0.708724,0.732313,0.747417,0.666107,0.530036
physical_inactivity_raw_value,0.653342,1.0,-0.592693,0.364553,0.614784,0.615099,0.492352,0.528372
excessive_drinking_raw_value,-0.634386,-0.592693,1.0,-0.410866,-0.687061,-0.69439,-0.55514,-0.483544
injury_deaths_raw_value,0.708724,0.364553,-0.410866,1.0,0.441789,0.485137,0.331804,0.152359
frequent_physical_distress_raw_value,0.732313,0.614784,-0.687061,0.441789,1.0,0.952572,0.688849,0.65183
frequent_mental_distress_raw_value,0.747417,0.615099,-0.69439,0.485137,0.952572,1.0,0.695684,0.657203
food_insecurity_raw_value,0.666107,0.492352,-0.55514,0.331804,0.688849,0.695684,1.0,0.506948
insufficient_sleep_raw_value,0.530036,0.528372,-0.483544,0.152359,0.65183,0.657203,0.506948,1.0


In [77]:
# Pairwise correlations among the target and the social-environment factors.
cleaned_county_data.loc[
    :, ["premature_death_raw_value", *marginal_factors_social]
].corr()

Unnamed: 0,premature_death_raw_value,children_in_single_parent_households_raw_value,social_associations_raw_value,driving_alone_to_work_raw_value,long_commute_driving_alone_raw_value
premature_death_raw_value,1.0,0.569865,-0.050471,0.233829,0.093439
children_in_single_parent_households_raw_value,0.569865,1.0,-0.072068,0.096037,-0.033874
social_associations_raw_value,-0.050471,-0.072068,1.0,0.042815,-0.30128
driving_alone_to_work_raw_value,0.233829,0.096037,0.042815,1.0,0.106258
long_commute_driving_alone_raw_value,0.093439,-0.033874,-0.30128,0.106258,1.0


In [81]:
# Identifier / bookkeeping columns that carry no predictive signal.
county_descriptors = [
    "statecode",
    "countycode",
    "fipscode",
    "state",
    "county",
    "year",
    "county_ranked",
]

# Other outcome measures.
# NOTE(review): this list is not used anywhere in the visible cells — confirm
# whether these columns were meant to be excluded from the predictors.
other_outcomes = [
    "premature_age_adjusted_mortality_raw_value",
    "life_expectancy_raw_value",
    "poor_physical_health_days_raw_value",
    "poor_mental_health_days_raw_value",
    "low_birthweight_raw_value",
]

# Correlation matrix over everything except the descriptors (target included).
variables = cleaned_county_data.drop(columns=county_descriptors)
all_corr_df = variables.corr()

# Rank every variable by the absolute size of its correlation with the target,
# weakest first; the target's own entry is part of the ranking.
all_corr_df["premature_death_raw_value"].abs().sort_values()

percentage_of_households_with_high_housing_costs                        0.002973
ratio_of_population_to_primary_care_providers_other_than_physicians     0.029418
homeownership_raw_value                                                 0.029648
severe_housing_cost_burden_raw_value                                    0.039574
other_primary_care_providers_raw_value                                  0.046041
pct_below_18_years_of_age_raw_value                                     0.049600
social_associations_raw_value                                           0.050471
uninsured_children_raw_value                                            0.082190
severe_housing_problems_raw_value                                       0.082249
pct_native_hawaiian_other_pacific_islander_raw_value                    0.083290
long_commute_driving_alone_raw_value                                    0.093439
high_school_graduation_raw_value                                        0.111829
pct_females_raw_value       

In [93]:
# Count the features whose correlation with premature death exceeds +0.5.
# The target's own entry is removed by label instead of by comparing a float
# against 1, so a self-correlation that is not exactly 1.0 cannot shift the
# count, and the vectorised .sum() replaces the element-wise builtin sum().
# NOTE(review): only positive correlations are counted, whereas the ranking in
# the previous cell uses absolute values — confirm whether .abs() is wanted here.
int(
    all_corr_df["premature_death_raw_value"]
    .drop(labels="premature_death_raw_value")
    .gt(0.5)
    .sum()
)

16