In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from cm import plot_confusion_matrix

%matplotlib inline

### Reading in and examining the heart attack cost disparities data

In [None]:
# Load the heart attack cost data and preview its first two rows.
heart_attack_path = '../data/mmd_heart_attack_data.csv'
ha_costs_df = pd.read_csv(heart_attack_path)
ha_costs_df.head(n = 2)

### Now examining the cancer data

In [None]:
# Load the cancer cost data and preview its first two rows.
cancer_path = '../data/mmd_cancer_data.csv'
cancer_costs_df = pd.read_csv(cancer_path)
cancer_costs_df.head(n = 2)

### Getting the income data and cleaning it a bit

In [None]:
# Load the IRS county-level income file and preview its first two rows.
irs_income_path = '../data/irs_county_2016.csv'
income_df = pd.read_csv(irs_income_path)
income_df.head(n = 2)

In [None]:
# Keep only the IRS columns used below and give them readable names.
# The dict order fixes both the selection order and the old -> new pairing.
irs_column_names = {
    'STATE': 'state',
    'COUNTYNAME': 'county',
    'agi_stub': 'income_bucket',
    'N1': 'return_count',
    'mars1': 'single_returns',
    'MARS2': 'joint_returns',
    'MARS4': 'head_of_house_returns',
    'N2': 'exemptions',
    'NUMDEP': 'dependents',
    'ELDERLY': 'elderly',
    'A00100': 'agi',
    'N02650': 'returns_with_total_inc',
    'A02650': 'total_inc_amt',
    'N02300': 'returns_with_unemployment',
    'A02300': 'unemployment_comp',
}
income_df = income_df[list(irs_column_names)]
income_df.columns = list(irs_column_names.values())
income_df.head(2)

#### Creating a new df that aggregates by state and county to get the totals for each county

In [None]:
# Collapse the rows of each (state, county) pair into a single row of totals.
# NOTE(review): every non-key column is summed, including income_bucket, whose
# sum has no obvious meaning — confirm it is not used downstream.
county_groups = income_df.groupby(['state', 'county'])
income_agg = county_groups.sum().reset_index()
income_agg.head(2)

In [None]:
# Average total income per return, rounded to a whole number.
# NOTE(review): the * 1000 presumably converts IRS amounts reported in
# thousands of dollars — confirm against the data dictionary.
scaled_total_income = income_agg['total_inc_amt'] * 1000
income_agg['avg_income'] = (scaled_total_income / income_agg['returns_with_total_inc']).round(0)

#### Week 4 Coding Tasks
 - rename `analysis_value` columns in `ha_costs_df` and `cancer_costs_df`
 

In [None]:
# Preview the heart attack frame before its value column is renamed.
ha_costs_df.head(2)

In [None]:
# Replace the generic column name with one specific to heart attack costs.
ha_column_names = {'analysis_value': 'ha_avg_cost'}
ha_costs_df = ha_costs_df.rename(columns = ha_column_names)
ha_costs_df.head(n = 2)

In [None]:
# Replace the generic column name with one specific to cancer costs.
cancer_column_names = {'analysis_value': 'cancer_avg_cost'}
cancer_costs_df = cancer_costs_df.rename(columns = cancer_column_names)
cancer_costs_df.head(n = 2)

#### Create a new dataframe  `combined_df` 
- merge `county`, `urban`, and `ha_avg_cost` from `ha_costs_df` with `county` and `cancer_avg_cost` from `cancer_costs_df`


In [None]:
# Narrow each cost table to its merge keys plus the columns needed later.
# `urban` is taken from the cancer table only.
cancer_columns = ['state', 'county', 'urban', 'cancer_avg_cost']
ha_columns = ['state','county', 'ha_avg_cost']
cancer_cost_sub = cancer_costs_df[cancer_columns]
ha_cost_sub = ha_costs_df[ha_columns]

In [None]:
# Inner-join the two cost tables so only counties present in both remain.
merge_keys = ['state', 'county']
combined_df = cancer_cost_sub.merge(ha_cost_sub, on = merge_keys, how = 'inner')
print(combined_df.shape)
combined_df.head(n = 2)

#### Now combine all the data and create the cost_income_ratio variables

- merge `avg_income` from `income_agg` with `combined_df` and save back to the `combined_df` variable  

- create `ha_cost_income_ratio` (`ha_avg_cost` / `avg_income`)  

- create `cancer_cost_income_ratio` (`cancer_avg_cost` / `avg_income`)

In [None]:
# Keep only the merge keys and the derived average income per county.
income_columns = ['state','county', 'avg_income']
county_incomes = income_agg[income_columns]
county_incomes.head(n = 2)

#### Before you can merge county incomes and combined_df, the data in the `state` columns will need to match

In [None]:
# Load the state lookup table; its `name` and `abbrev` columns are used later.
state_abbrev_path = '../data/state_abbrev.csv'
state_abbrev = pd.read_csv(state_abbrev_path)
state_abbrev.head(n = 3)

In [None]:
# Translate state names into abbreviations so `state` can serve as a merge key
# against the income data.
state_to_abbrev = state_abbrev.set_index('name')['abbrev'].to_dict()
# Values missing from the lookup (including abbreviations written by an earlier
# run of this cell) keep their current value instead of becoming NaN, so the
# cell is safe to re-run.
combined_df['state'] = combined_df['state'].map(state_to_abbrev).fillna(combined_df['state'])
combined_df.head()

In [None]:
# Attach average income to each county; the inner join drops counties without
# a matching (state, county) row in the income data.
combined_df = combined_df.merge(county_incomes, on = ['state', 'county'], how = 'inner')
combined_df.head(n = 2)

#### What is our target variable? What are the predictor variables?
- The target is whether `cancer_cost_ratio` (`cancer_avg_cost` / `avg_income`) is above or below its mean.
- Predictors are `ha_avg_cost`, `urban`, and `avg_income`

In [None]:
# Express each average cost as a fraction of the county's average income.
county_avg_income = combined_df['avg_income']
combined_df['ha_cost_ratio'] = combined_df['ha_avg_cost'].div(county_avg_income)
combined_df['cancer_cost_ratio'] = combined_df['cancer_avg_cost'].div(county_avg_income)

In [None]:
# Inspect the frame now that both cost-to-income ratio columns exist.
combined_df.head()

In [None]:
# Mean of each ratio across all counties; used as the classification threshold.
avg_cancer_ratio = combined_df['cancer_cost_ratio'].mean()
avg_ha_ratio = combined_df['ha_cost_ratio'].mean()

In [None]:
# Boolean flags: True where a county's ratio is strictly above the mean.
combined_df['above_avg_cancer_ratio'] = combined_df['cancer_cost_ratio'].gt(avg_cancer_ratio)
combined_df['above_avg_ha_ratio'] = combined_df['ha_cost_ratio'].gt(avg_ha_ratio)

In [None]:
# Inspect the frame now that the two above-average flags exist.
combined_df.head()

In [None]:
# One-hot encode `urban`, dropping the first level so one indicator remains
# per remaining level (later cells reference the `urban_Urban` column).
# NOTE(review): this replaces the `urban` column, so the cell cannot be re-run
# without rebuilding combined_df first.
categorical_columns = ['urban']
combined_df = pd.get_dummies(data = combined_df, columns = categorical_columns, drop_first = True)
combined_df.head(n = 3)

In [None]:
# First model: the urban indicator is the only predictor.
feature_columns = ['urban_Urban']
X = combined_df[feature_columns]
y = combined_df['above_avg_cancer_ratio']

In [None]:
# Seeded split so the same rows land in train and test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state = 2020,
)

In [None]:
# Fit a logistic regression with scikit-learn's default settings.
logistic_model = LogisticRegression()
logistic_model.fit(X = X_train, y = y_train)

In [None]:
# Horizontal bar chart of the fitted coefficient for each predictor.
coefficients = logistic_model.coef_[0]
feature_names = X.columns.tolist()
fig, ax = plt.subplots(figsize = (7, 5))
sns.barplot(x = coefficients, y = feature_names, edgecolor = 'black', ax = ax)
ax.set_title('Logistic Regression Coefficients');

In [None]:
# Class proportions in the test set. The larger share is the accuracy obtained
# by always predicting the majority class, i.e. the baseline to beat.
y_test.value_counts(normalize = True)

In [None]:
# Predict on the held-out rows and report the share predicted correctly.
y_pred = logistic_model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

#### Our model is a bit better than the naive model

In [None]:
# Per-class precision, recall and F1 for the test-set predictions.
report_text = metrics.classification_report(y_test, y_pred)
print(report_text)

#### Let's check AUC

In [None]:
# Column 1 of predict_proba is the probability of the second class
# (True, since the target is boolean).
class_probabilities = logistic_model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

test_auc = metrics.roc_auc_score(y_test, y_pred_prob)
print(test_auc)

In [None]:
# ROC curve for the urban-only logistic model, drawn on the current axes.
fp_rate, tp_rate, thresholds = metrics.roc_curve(y_test, y_pred_prob)

roc_ax = plt.gca()
roc_ax.plot(fp_rate, tp_rate)
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.0])
roc_ax.set_title('ROC curve for Cancer Cost Above Average classifier')
roc_ax.set_xlabel('False Positive Rate (1 - Specificity)')
roc_ax.set_ylabel('True Positive Rate (Sensitivity)')
roc_ax.grid(True)

In [None]:
# Load the county-level geographic profile data.
geographic_path = '../data/geographic_profile.csv'
geographic_info = pd.read_csv(geographic_path)

In [None]:
# Display the geographic profile frame (pandas truncates long output).
geographic_info

In [None]:
# Upper-case and strip the state names, then translate them to abbreviations.
# Names missing from the lookup become NaN.
state_to_abbrev = state_abbrev.set_index('name')['abbrev'].to_dict()
cleaned_state_names = geographic_info['state'].str.upper().str.strip()
geographic_info['state'] = cleaned_state_names.map(state_to_abbrev)

In [None]:
# Missing values per column; a non-zero count for `state` means some names did
# not match the abbreviation lookup.
geographic_info.isna().sum()

In [None]:
# Keep the merge keys plus the three poverty / unemployment measures.
geographic_columns = [
    'state',
    'county',
    'Percent Below Federal Poverty Level (5y Avg.)',
    'Percent of Senior Population Below Federal Poverty Level (5y Avg.)',
    'Unemployment Rate (5y Avg.)',
]
geographic_info = geographic_info[geographic_columns]

In [None]:
# Drop the raw cost and income columns, keeping keys, targets and the urban flag.
modelling_columns = ['state', 'county', 'above_avg_cancer_ratio', 'above_avg_ha_ratio', 'urban_Urban']
combined_df = combined_df[modelling_columns]

In [None]:
# No `on` is given, so pandas joins on the columns the two frames share
# (default inner join).
combined_df = combined_df.merge(right = geographic_info)

# Adding in the geographic profile data

In [None]:
# Every column except the identifiers and the two target flags is a predictor.
non_feature_columns = ['state', 'county', 'above_avg_cancer_ratio', 'above_avg_ha_ratio']
X = combined_df.drop(columns = non_feature_columns)
y = combined_df['above_avg_cancer_ratio']

In [None]:
# Re-split with the same seed now that X has more columns.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state = 2020,
)

In [None]:
# Refit the logistic regression on the wider predictor set.
logistic_model = LogisticRegression()
logistic_model.fit(X = X_train, y = y_train)

In [None]:
# Coefficient for each predictor in the refitted logistic model.
coefficients = logistic_model.coef_[0]
feature_names = X.columns.tolist()
fig, ax = plt.subplots(figsize = (7, 5))
sns.barplot(x = coefficients, y = feature_names, edgecolor = 'black', ax = ax)
ax.set_title('Logistic Regression Coefficients');

In [None]:
# Test-set accuracy of the logistic model.
y_pred = logistic_model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# plot_confusion_matrix comes from the local `cm` module; presumably it draws
# the confusion matrix annotated with the chosen metric — confirm in cm.py.
plot_confusion_matrix(
    y_test,
    y_pred,
    labels = ['below', 'above'],
    metric = 'accuracy',
)

In [None]:
# Probability of the True class (column 1) and the resulting test-set AUC.
class_probabilities = logistic_model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

test_auc = metrics.roc_auc_score(y_test, y_pred_prob)
print(test_auc)

In [None]:
# ROC curve for the logistic model, drawn on the current axes.
fp_rate, tp_rate, thresholds = metrics.roc_curve(y_test, y_pred_prob)

roc_ax = plt.gca()
roc_ax.plot(fp_rate, tp_rate)
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.0])
roc_ax.set_title('ROC curve for Cancer Cost Above Average classifier')
roc_ax.set_xlabel('False Positive Rate (1 - Specificity)')
roc_ax.set_ylabel('True Positive Rate (Sensitivity)')
roc_ax.grid(True)

In [None]:
# Fit a random forest on the same training split. A fixed random_state makes
# the forest (and every metric and plot derived from it) reproducible across
# runs, matching the seed already used for train_test_split.
rf_model = RandomForestClassifier(random_state = 2020).fit(X_train, y_train)

In [None]:
# Test-set accuracy of the random forest.
y_pred = rf_model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix for the random forest predictions, via the local `cm` helper.
plot_confusion_matrix(
    y_test,
    y_pred,
    labels = ['below', 'above'],
    metric = 'accuracy',
)

In [None]:
# Probability of the True class (column 1) from the forest, and its test AUC.
class_probabilities = rf_model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

test_auc = metrics.roc_auc_score(y_test, y_pred_prob)
print(test_auc)

In [None]:
# ROC curve for the random forest, drawn on the current axes.
fp_rate, tp_rate, thresholds = metrics.roc_curve(y_test, y_pred_prob)

roc_ax = plt.gca()
roc_ax.plot(fp_rate, tp_rate)
roc_ax.set(
    xlim = [0.0, 1.0],
    ylim = [0.0, 1.0],
    title = 'ROC curve for Cancer Cost Above Average classifier',
    xlabel = 'False Positive Rate (1 - Specificity)',
    ylabel = 'True Positive Rate (Sensitivity)',
)
roc_ax.grid(True)

In [None]:
# Pair each training column with the forest's importance score for it, then
# plot the ten largest.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_,
})
top_importances = importances.sort_values('importance', ascending = False).head(10)

importance_ax = sns.barplot(data = top_importances, x = 'importance', y = 'feature')
importance_ax.set_title('Ten Highest Importance Features');

# Adding States

In [None]:
# One indicator column per state; combined_df itself is left unchanged.
state_columns = ['state']
combined_df_dummy = pd.get_dummies(data = combined_df, columns = state_columns)

In [None]:
# Predictors now include the state indicators; county and both targets are excluded.
non_feature_columns = ['county', 'above_avg_cancer_ratio', 'above_avg_ha_ratio']
X = combined_df_dummy.drop(columns = non_feature_columns)
y = combined_df_dummy['above_avg_cancer_ratio']

In [None]:
# Seeded split followed by a default logistic regression fit.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state = 2020,
)
logistic_model = LogisticRegression()
logistic_model.fit(X = X_train, y = y_train)

In [None]:
# Coefficient for each predictor, including the per-state indicators.
coefficients = logistic_model.coef_[0]
feature_names = X.columns.tolist()
fig, ax = plt.subplots(figsize = (7, 5))
sns.barplot(x = coefficients, y = feature_names, edgecolor = 'black', ax = ax)
ax.set_title('Logistic Regression Coefficients');

In [None]:
# Test-set accuracy of the logistic model with state indicators.
y_pred = logistic_model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix for the logistic predictions, via the local `cm` helper.
plot_confusion_matrix(
    y_test,
    y_pred,
    labels = ['below', 'above'],
    metric = 'accuracy',
)

In [None]:
# Probability of the True class (column 1) and the resulting test-set AUC.
class_probabilities = logistic_model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

test_auc = metrics.roc_auc_score(y_test, y_pred_prob)
print(test_auc)

In [None]:
# ROC curve for the logistic model with state indicators, on the current axes.
fp_rate, tp_rate, thresholds = metrics.roc_curve(y_test, y_pred_prob)

roc_ax = plt.gca()
roc_ax.plot(fp_rate, tp_rate)
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.0])
roc_ax.set_title('ROC curve for Cancer Cost Above Average classifier')
roc_ax.set_xlabel('False Positive Rate (1 - Specificity)')
roc_ax.set_ylabel('True Positive Rate (Sensitivity)')
roc_ax.grid(True)

In [None]:
# Random forest on the predictors that include state indicators. A fixed
# random_state makes the forest reproducible across runs, matching the seed
# already used for train_test_split.
rf_model = RandomForestClassifier(random_state = 2020)
rf_model.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the random forest.
y_pred = rf_model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix for the random forest predictions, via the local `cm` helper.
plot_confusion_matrix(
    y_test,
    y_pred,
    labels = ['below', 'above'],
    metric = 'accuracy',
)

In [None]:
# Probability of the True class (column 1) from the forest, and its test AUC.
class_probabilities = rf_model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

test_auc = metrics.roc_auc_score(y_test, y_pred_prob)
print(test_auc)

In [None]:
# ROC curve for the random forest, drawn on the current axes.
fp_rate, tp_rate, thresholds = metrics.roc_curve(y_test, y_pred_prob)

roc_ax = plt.gca()
roc_ax.plot(fp_rate, tp_rate)
roc_ax.set(
    xlim = [0.0, 1.0],
    ylim = [0.0, 1.0],
    title = 'ROC curve for Cancer Cost Above Average classifier',
    xlabel = 'False Positive Rate (1 - Specificity)',
    ylabel = 'True Positive Rate (Sensitivity)',
)
roc_ax.grid(True)

In [None]:
# Pair each training column with its importance score and plot the ten largest.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_,
})
top_importances = importances.sort_values('importance', ascending = False).head(10)

importance_ax = sns.barplot(data = top_importances, x = 'importance', y = 'feature')
importance_ax.set_title('Ten Highest Importance Features');

# Adding the Health Outcomes Data

In [None]:
# Read the second sheet (index 1) of the workbook, taking column names from
# its second row (index 1).
health_rankings_path = '../data/2018 County Health Rankings Data - v2.xls'
health_rankings = pd.read_excel(health_rankings_path, sheet_name = 1, header = 1)

In [None]:
# Give the two rank / quartile column pairs distinct, descriptive names.
# NOTE(review): 'Rank.1' and 'Quartile.1' are presumably pandas' de-duplicated
# names for the repeated sheet headers — confirm against the workbook.
ranking_column_names = {
    'Rank': 'Outcomes_Rank',
    'Quartile': 'Outcomes_Quartile',
    'Rank.1': 'Factors_Rank',
    'Quartile.1': 'Factors_Quartile',
}
health_rankings = health_rankings.rename(columns = ranking_column_names)

In [None]:
# Convert each rank column to numeric (non-numeric entries become NaN), then
# rescale it as 1 - rank / (number of ranked counties + 1), so that smaller
# rank numbers map to larger values.
ranked_county_count = health_rankings['# of Ranked Counties']
for rank_col in ('Outcomes_Rank', 'Factors_Rank'):
    normalized_col = 'Normalized_{}'.format(rank_col.split('_')[0])
    health_rankings[rank_col] = pd.to_numeric(health_rankings[rank_col], errors = 'coerce')
    health_rankings[normalized_col] = 1 - health_rankings[rank_col] / (ranked_county_count + 1)

In [None]:
# Display the rankings frame with the new normalized columns (pandas truncates
# long output).
health_rankings

In [None]:
# Keep only counties that have both normalized scores. The explicit .copy()
# makes the result an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
health_rankings = health_rankings.dropna(subset = ['Normalized_Outcomes', 'Normalized_Factors']).copy()

In [None]:
# Upper-case and strip the `State` names, then translate them to abbreviations
# in a new lower-case `state` column. Names missing from the lookup become NaN.
state_to_abbrev = state_abbrev.set_index('name')['abbrev'].to_dict()
cleaned_state_names = health_rankings['State'].str.upper().str.strip()
health_rankings['state'] = cleaned_state_names.map(state_to_abbrev)

In [None]:
# Build the lower-case `county` merge key by appending ' County' to each name.
# NOTE(review): assumes combined_df names every county with a ' County' suffix;
# rows named differently (e.g. parishes) would not match — confirm.
county_base_names = health_rankings['County']
health_rankings['county'] = county_base_names + ' County'

In [None]:
# Add the two normalized scores. No `on` is given, so pandas joins on the
# columns the two frames share (default inner join).
health_columns = ['state', 'county', 'Normalized_Outcomes', 'Normalized_Factors']
combined_df = combined_df.merge(right = health_rankings[health_columns])

In [None]:
# Rebuild the design matrix with state indicators, split it with the usual
# seed, and fit a default logistic regression.
combined_df_dummy = pd.get_dummies(data = combined_df, columns = ['state'])
non_feature_columns = ['county', 'above_avg_cancer_ratio', 'above_avg_ha_ratio']
X = combined_df_dummy.drop(columns = non_feature_columns)
y = combined_df_dummy['above_avg_cancer_ratio']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state = 2020,
)
logistic_model = LogisticRegression()
logistic_model.fit(X = X_train, y = y_train)

In [None]:
# Test-set accuracy of the logistic model with health ranking features.
y_pred = logistic_model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix for the logistic predictions, via the local `cm` helper.
plot_confusion_matrix(
    y_test,
    y_pred,
    labels = ['below', 'above'],
    metric = 'accuracy',
)

In [None]:
# Random forest on the predictors that include the health ranking scores. A
# fixed random_state makes the forest reproducible across runs, matching the
# seed already used for train_test_split.
rf_model = RandomForestClassifier(random_state = 2020)
rf_model.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the random forest.
y_pred = rf_model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix for the random forest predictions, via the local `cm` helper.
plot_confusion_matrix(
    y_test,
    y_pred,
    labels = ['below', 'above'],
    metric = 'accuracy',
)

In [None]:
# Probability of the True class (column 1) from the forest, and its test AUC.
class_probabilities = rf_model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

test_auc = metrics.roc_auc_score(y_test, y_pred_prob)
print(test_auc)

In [None]:
# ROC curve for the random forest, drawn on the current axes.
fp_rate, tp_rate, thresholds = metrics.roc_curve(y_test, y_pred_prob)

roc_ax = plt.gca()
roc_ax.plot(fp_rate, tp_rate)
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.0])
roc_ax.set_title('ROC curve for Cancer Cost Above Average classifier')
roc_ax.set_xlabel('False Positive Rate (1 - Specificity)')
roc_ax.set_ylabel('True Positive Rate (Sensitivity)')
roc_ax.grid(True)

In [None]:
# Pair each training column with its importance score and plot the ten largest.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_,
})
top_importances = importances.sort_values('importance', ascending = False).head(10)

importance_ax = sns.barplot(data = top_importances, x = 'importance', y = 'feature')
importance_ax.set_title('Ten Highest Importance Features');

There are many other variables that could be added from the Excel workbook. For example, here is code to grab food insecurity.

In [None]:
# Read spreadsheet columns B, C and AK from the sixth sheet (index 5), taking
# column names from its second row (index 1).
# NOTE(review): later cells use State, County and '% Food Insecure', so these
# three columns presumably carry those headers — confirm against the workbook.
rankings_workbook_path = '../data/2018 County Health Rankings Data - v2.xls'
food_insecurity = pd.read_excel(
    rankings_workbook_path,
    sheet_name = 5,
    header = 1,
    usecols = 'B,C,AK',
)

In [None]:
# Build lower-case `state` / `county` merge keys: state names are upper-cased,
# stripped and mapped to abbreviations (unmatched names become NaN); county
# names get a ' County' suffix.
state_to_abbrev = state_abbrev.set_index('name')['abbrev'].to_dict()
cleaned_state_names = food_insecurity['State'].str.upper().str.strip()
food_insecurity['state'] = cleaned_state_names.map(state_to_abbrev)
county_base_names = food_insecurity['County']
food_insecurity['county'] = county_base_names + ' County'

In [None]:
# Add the food insecurity percentage. No `on` is given, so pandas joins on the
# columns the two frames share (default inner join).
food_columns = ['state', 'county', '% Food Insecure']
combined_df = combined_df.merge(right = food_insecurity[food_columns])

In [None]:
# Display the merged frame with the food insecurity column (pandas truncates
# long output).
combined_df

In [None]:
# Rebuild the design matrix (state indicators plus every remaining predictor),
# split it with the usual seed, and fit a random forest.
combined_df_dummy = pd.get_dummies(combined_df, columns = ['state'])
X = combined_df_dummy.drop(columns = ['county', 'above_avg_cancer_ratio', 'above_avg_ha_ratio'])
y = combined_df_dummy.above_avg_cancer_ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2020)
# A fixed random_state makes the forest reproducible across runs, matching the
# seeded split above.
rf_model = RandomForestClassifier(random_state = 2020)
rf_model.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the random forest with food insecurity added.
y_pred = rf_model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix for the random forest predictions, via the local `cm` helper.
plot_confusion_matrix(
    y_test,
    y_pred,
    labels = ['below', 'above'],
    metric = 'accuracy',
)

In [None]:
# Probability of the True class (column 1) from the forest, and its test AUC.
class_probabilities = rf_model.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

test_auc = metrics.roc_auc_score(y_test, y_pred_prob)
print(test_auc)

In [None]:
# ROC curve for the random forest, drawn on the current axes.
fp_rate, tp_rate, thresholds = metrics.roc_curve(y_test, y_pred_prob)

roc_ax = plt.gca()
roc_ax.plot(fp_rate, tp_rate)
roc_ax.set(
    xlim = [0.0, 1.0],
    ylim = [0.0, 1.0],
    title = 'ROC curve for Cancer Cost Above Average classifier',
    xlabel = 'False Positive Rate (1 - Specificity)',
    ylabel = 'True Positive Rate (Sensitivity)',
)
roc_ax.grid(True)

In [None]:
# Pair each training column with its importance score and plot the ten largest.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_,
})
top_importances = importances.sort_values('importance', ascending = False).head(10)

importance_ax = sns.barplot(data = top_importances, x = 'importance', y = 'feature')
importance_ax.set_title('Ten Highest Importance Features');