In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

%matplotlib inline

### Reading in and examining the heart attack cost disparities data

In [None]:
# Load the heart attack cost data and preview its first two rows.
ha_costs_df = pd.read_csv(filepath_or_buffer='../data/mmd_heart_attack_data.csv')
ha_costs_df.head(n=2)

### Now examining the cancer data

In [None]:
# Load the cancer cost data and preview its first two rows.
cancer_costs_df = pd.read_csv(filepath_or_buffer='../data/mmd_cancer_data.csv')
cancer_costs_df.head(n=2)

### Getting the income data and cleaning it a bit

In [None]:
# Load the 2016 IRS county file and preview its first two rows.
income_df = pd.read_csv(filepath_or_buffer='../data/irs_county_2016.csv')
income_df.head(n=2)

In [None]:
# Keep only the IRS columns used below and give them readable names.
# The dict order determines the resulting column order.
irs_to_readable = {
    'STATE': 'state',
    'COUNTYNAME': 'county',
    'agi_stub': 'income_bucket',
    'N1': 'return_count',
    'mars1': 'single_returns',
    'MARS2': 'joint_returns',
    'MARS4': 'head_of_house_returns',
    'N2': 'exemptions',
    'NUMDEP': 'dependents',
    'ELDERLY': 'elderly',
    'A00100': 'agi',
    'N02650': 'returns_with_total_inc',
    'A02650': 'total_inc_amt',
    'N02300': 'returns_with_unemployment',
    'A02300': 'unemployment_comp',
}
income_df = income_df[list(irs_to_readable)].rename(columns=irs_to_readable)
income_df.head(n=2)

#### Creating a new df that aggregates by state and county to get the totals for each county

In [None]:
# Collapse the rows of each (state, county) pair into one row by summing the other columns.
# NOTE(review): `income_bucket` is summed as well; that total is presumably not meaningful.
income_agg = income_df.groupby(['state', 'county'], as_index=False).sum()
income_agg.head(n=2)

In [None]:
# Average income per return, rounded to a whole number.
# The amount is scaled by 1000 first (presumably it is reported in thousands of dollars; confirm).
income_agg['avg_income'] = (
    income_agg['total_inc_amt'] * 1000 / income_agg['returns_with_total_inc']
).round(0)

#### Week 4 Coding Tasks
 - rename `analysis_value` columns in `ha_costs_df` and `cancer_costs_df`
 

In [None]:
# Look at the current column names before renaming `analysis_value`.
ha_costs_df.head(n=2)

In [None]:
# Give the generic `analysis_value` column a name specific to the heart attack data.
ha_costs_df = ha_costs_df.rename({'analysis_value': 'ha_avg_cost'}, axis='columns')
ha_costs_df.head(n=2)

In [None]:
# Give the generic `analysis_value` column a name specific to the cancer data.
cancer_costs_df = cancer_costs_df.rename({'analysis_value': 'cancer_avg_cost'}, axis='columns')
cancer_costs_df.head(n=2)

#### Create a new dataframe  `combined_df` 
- merge `county`, `urban`, and `ha_avg_cost` from `ha_costs_df` with `county` and `cancer_avg_cost` from `cancer_costs_df`


In [None]:
# Narrow each cost frame to the merge keys plus the columns carried into the combined frame.
cancer_cost_sub = cancer_costs_df.loc[:, ['state', 'county', 'urban', 'cancer_avg_cost']]
ha_cost_sub = ha_costs_df.loc[:, ['state', 'county', 'ha_avg_cost']]

In [None]:
# Inner join: keep only the (state, county) pairs present in both cost tables.
combined_df = ha_cost_sub.merge(cancer_cost_sub, how='inner', on=['state', 'county'])
print(combined_df.shape)
combined_df.head(n=2)

#### Now combine all the data and create the cost_income_ratio variables

- merge `avg_income` from `income_agg` with `combined_df` and save back to the `combined_df` variable  

- create `ha_cost_income_ratio` (`ha_avg_cost` / `avg_income`)  

- create `cancer_cost_income_ratio` (`cancer_avg_cost` / `avg_income`)

In [None]:
# Only the merge keys and the average income are needed from the aggregated IRS data.
county_incomes = income_agg.loc[:, ['state', 'county', 'avg_income']]
county_incomes.head(n=2)

#### Before you can merge `county_incomes` and `combined_df`, the data in the `state` columns will need to match

In [None]:
# Load the lookup table that pairs state names with their abbreviations.
state_abbrev = pd.read_csv(filepath_or_buffer='../data/state_abbrev.csv')
state_abbrev.head(n=3)

In [None]:
# Translate the values in `state` from the lookup table's `name` to its `abbrev`.
# Values not found in the lookup (including abbreviations left by an earlier run of
# this cell) are kept as they are, so re-running the cell does not turn the whole
# column into NaN.
state_name_to_abbrev = state_abbrev.set_index('name')['abbrev'].to_dict()
combined_df['state'] = (
    combined_df['state'].map(state_name_to_abbrev).fillna(combined_df['state'])
)
combined_df.head()

In [None]:
# Attach each county's average income; rows without a matching (state, county) are dropped.
combined_df = combined_df.merge(county_incomes, how='inner', on=['state', 'county'])
combined_df.head(n=2)

#### What is our target variable? What are the predictor variables?
- The target is whether or not `cancer_avg_cost` is above or below the mean. 
- Predictors are `ha_avg_cost`, `urban`, and `avg_income`

In [None]:
# Mean cancer cost across all rows of the combined frame.
avg_cancer_cost = combined_df['cancer_avg_cost'].mean()

In [None]:
# Count the rows whose cancer cost is missing.
combined_df['cancer_avg_cost'].isna().sum()

In [None]:
# Label each row 1 when its cancer cost is above the overall mean, otherwise 0.
# A vectorized comparison replaces the explicit Python loop. Missing costs compare
# as False and are therefore labelled 0, exactly as the loop did.
above_below_list = (
    (combined_df['cancer_avg_cost'] > avg_cancer_cost).astype(int).tolist()
)

In [None]:
# Spot-check the first ten labels.
above_below_list[:10]

In [None]:
# Attach the 0/1 labels as the target column.
combined_df = combined_df.assign(above_avg_cancer_cost=above_below_list)
combined_df.head(n=5)

In [None]:
# Display the mean cancer cost that serves as the above/below threshold.
avg_cancer_cost

In [None]:
# One-hot encode `urban`, dropping the first level so only the remaining indicator column(s) are kept.
combined_df = pd.get_dummies(data=combined_df, columns=['urban'], drop_first=True)
combined_df.head(n=3)

In [None]:
# Predictors and the binary target for the classifier.
X = combined_df.loc[:, ['ha_avg_cost', 'avg_income', 'urban_Urban']]
y = combined_df['above_avg_cancer_cost']

In [None]:
# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2020,
)

#### Scaling our predictors puts them on a common scale (mean 0, standard deviation 1) so that no feature dominates simply because of its units

In [None]:
# Create the scaler; it is fit on the training predictors in a later cell.
scaler = StandardScaler()

In [None]:
# Learn the scaling parameters from the training predictors, then apply them to that same data.
X_train = scaler.fit(X_train).transform(X_train)

In [None]:
# Preview the first few scaled rows instead of dumping the entire array into the output.
X_train[:5]

In [None]:
# Fit a logistic regression with default settings on the scaled training data.
logistic_model = LogisticRegression()
logistic_model.fit(X=X_train, y=y_train)

In [None]:
# Horizontal bars: one fitted coefficient per predictor, labelled with the predictor name.
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(
    x=logistic_model.coef_[0],
    y=X.columns.tolist(),
    ax=ax,
    edgecolor='black',
)
ax.set_title('Logistic Regression Coefficients');

#### Before using the model to predict with the test data, transform `X_test` with the scaler

In [None]:
# Scale the test predictors with the parameters already learned from the training data.
# Use `transform`, not `fit_transform`: refitting here would rescale the test set by its
# own mean and standard deviation, so it would no longer be on the scale the model was
# trained on and test-set statistics would leak into the preprocessing.
X_test = scaler.transform(X_test)

In [None]:
# Predicted class labels for the scaled test predictors.
y_pred = logistic_model.predict(X=X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Share of each class in the test labels; a baseline to compare the model's accuracy against.
y_test.value_counts(normalize = True)

#### Our model is a bit better than the naive model

In [None]:
# Per-class precision, recall and F1 on the test set.
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

#### Let's check AUC

In [None]:
# Keep the second probability column (presumably the probability of class 1, "above average").
y_pred_prob = logistic_model.predict_proba(X=X_test)[:, 1]

In [None]:
# Area under the ROC curve, computed from the predicted probabilities.
print(metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob))

In [None]:
# ROC curve points across all decision thresholds.
fp_rate, tp_rate, thresholds = metrics.roc_curve(y_true=y_test, y_score=y_pred_prob)

# Draw on an explicit Axes rather than through the pyplot state machine.
fig, ax = plt.subplots()
ax.plot(fp_rate, tp_rate)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_title('ROC curve for Cancer Cost Above Average classifier')
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.grid(True)