In [None]:
import pandas as pd

### Reading in and examining the heart attack cost disparities data

In [None]:
# Load the heart attack cost data and preview its first two rows.
ha_costs_df = pd.read_csv('../data/mmd_heart_attack_data.csv')
ha_costs_df.head(n=2)

In [None]:
# Restrict the heart attack data to Tennessee rows, then report (rows, columns).
tn_ha_costs = ha_costs_df[ha_costs_df['state'] == 'TENNESSEE']
tn_ha_costs.shape

### Now examining the cancer data

In [None]:
# Load the cancer cost data and preview its first two rows.
cancer_costs_df = pd.read_csv('../data/mmd_cancer_data.csv')
cancer_costs_df.head(n=2)

In [None]:
# (rows, columns) of the cancer data as loaded, before any filtering by state.
cancer_costs_df.shape

In [None]:
# Restrict the cancer data to Tennessee rows, then report (rows, columns).
tn_cancer_costs = cancer_costs_df[cancer_costs_df['state'] == 'TENNESSEE']
tn_cancer_costs.shape

### There are 91 _counties_ in the TN heart attack data and 95 in the TN cancer data
- Which counties appear in the cancer data but are missing from the heart attack data?

In [None]:
# Unique county names found in each Tennessee subset.
cancer_counties = list(tn_cancer_costs.county.unique())
ha_counties = list(tn_ha_costs.county.unique())

# Counties present in the cancer data but absent from the heart attack data.
# The result is sorted so the displayed order is the same on every run (the
# iteration order of a set of strings is arbitrary); key=str keeps the sort
# from failing if a county name happens to be missing (NaN).
sorted(set(cancer_counties).difference(ha_counties), key=str)

### Getting the income data and cleaning it a bit

In [None]:
# Load the 2016 IRS county-level income data and preview its first two rows.
income_df = pd.read_csv('../data/irs_county_2016.csv')
income_df.head(n=2)

In [None]:
# Restrict the income data to rows whose STATE is 'TN' and preview the result.
tn_income = income_df[income_df['STATE'] == 'TN']
tn_income.head(n=2)

In [None]:
# (rows, columns) of the Tennessee subset of the income data.
tn_income.shape

In [None]:
# Keep only the columns used in this analysis. The explicit .copy() makes
# tn_income an independent DataFrame instead of a slice of income_df, so the
# later cells that modify it do not trigger pandas' SettingWithCopyWarning.
tn_income = tn_income[[
    'STATE', 'COUNTYNAME', 'agi_stub', 'N1', 'mars1', 'MARS2', 'MARS4', 'N2',
    'NUMDEP', 'ELDERLY', 'A00100', 'N02650', 'A02650', 'N02300', 'A02300',
]].copy()
tn_income.head(2)

In [None]:
# Give each IRS field code a descriptive, lowercase column name. An explicit
# old -> new mapping keeps every rename visible next to the code it replaces.
tn_income = tn_income.rename(columns={
    'STATE': 'state',
    'COUNTYNAME': 'county',
    'agi_stub': 'income_bucket',
    'N1': 'return_count',
    'mars1': 'single_returns',
    'MARS2': 'joint_returns',
    'MARS4': 'head_of_house_returns',
    'N2': 'exemptions',
    'NUMDEP': 'dependents',
    'ELDERLY': 'elderly',
    'A00100': 'agi',
    'N02650': 'returns_with_total_inc',
    'A02650': 'total_inc_amt',
    'N02300': 'returns_with_unemployment',
    'A02300': 'unemployment_comp',
})

## Week two coding tasks
#### Replacing coded values in the `income_bucket` column with descriptive text
- create a dictionary mapping codes to descriptions
- use `replace()` to update the df with text

In [None]:
# Maps each numeric income_bucket code to a human-readable description.
income_dict = {
    0: 'Total',
    1: 'Under $1',
    2: 'Between 1 and $10,000',
    3: 'Between 10,000 and $25,000',
    4: 'Between 25,000 and $50,000',
    5: 'Between 50,000 and $75,000',
    6: 'Between 75,000 and $100,000',
    7: 'Between 100,000 and $200,000',
    8: '$200,000 or more',
}

In [None]:
# Swap the numeric income codes for their descriptions. assign() builds a new
# DataFrame rather than writing into a slice of income_df, which avoids
# SettingWithCopyWarning. replace() leaves any value that is not a key of
# income_dict untouched, so re-running this cell is harmless.
tn_income = tn_income.assign(
    income_bucket=tn_income['income_bucket'].replace(income_dict)
)
tn_income.head(2)

#### Creating a new dataframe that aggregates by county to get the totals for each county

In [None]:
# Total every numeric column per county. numeric_only=True keeps the text
# columns (state, income_bucket) out of the aggregation; "summing" strings
# would only concatenate them into meaningless values.
# NOTE(review): income_dict maps code 0 to 'Total' -- if tn_income contains
# such pre-aggregated rows they are double-counted here; confirm and filter
# them out first if so.
income_county_agg = (
    tn_income
    .groupby('county')
    .sum(numeric_only=True)
    .reset_index()
)
income_county_agg.head(2)

- which county has the greatest number of returns?
- which county has the greatest total income?

In [None]:
# Top three counties by number of returns filed.
income_county_agg.nlargest(n=3, columns='return_count')

In [None]:
# Top three counties by total income amount.
income_county_agg.nlargest(n=3, columns='total_inc_amt')