In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### Reading in and examining the heart attack cost disparities data

In [None]:
# Load the heart attack cost CSV and preview its first two rows.
ha_costs_df = pd.read_csv(filepath_or_buffer='../data/mmd_heart_attack_data.csv')
ha_costs_df.head(n=2)

In [None]:
# Keep only the rows whose `state` equals 'TENNESSEE', then show (rows, columns).
tn_ha_costs = ha_costs_df[ha_costs_df['state'].eq('TENNESSEE')]
tn_ha_costs.shape

### Now examining the cancer data

In [None]:
# Load the cancer cost CSV and preview its first two rows.
cancer_costs_df = pd.read_csv(filepath_or_buffer='../data/mmd_cancer_data.csv')
cancer_costs_df.head(n=2)

In [None]:
# Keep only the rows whose `state` equals 'TENNESSEE', then show (rows, columns).
tn_cancer_costs = cancer_costs_df[cancer_costs_df['state'].eq('TENNESSEE')]
tn_cancer_costs.shape

### Getting the income data and cleaning it a bit

In [None]:
# Load the county-level income CSV and preview its first two rows.
income_df = pd.read_csv(filepath_or_buffer='../data/irs_county_2016.csv')
income_df.head(n=2)

In [None]:
# Keep only the rows whose `STATE` equals 'TN' and preview the first two.
tn_income = income_df[income_df['STATE'].eq('TN')]
tn_income.head(n=2)

In [None]:
# Show (rows, columns) of the 'TN' subset of the income data.
tn_income.shape

In [None]:
# Narrow the frame to the fifteen columns kept for the rest of the notebook.
tn_income = tn_income[
    [
        'STATE',
        'COUNTYNAME',
        'agi_stub',
        'N1',
        'mars1',
        'MARS2',
        'MARS4',
        'N2',
        'NUMDEP',
        'ELDERLY',
        'A00100',
        'N02650',
        'A02650',
        'N02300',
        'A02300',
    ]
]

In [None]:
# Give the columns descriptive names. The assignment is positional, so the
# order here must match the current column order of `tn_income`.
tn_income.columns = [
    'state',
    'county',
    'income_bucket',
    'return_count',
    'single_returns',
    'joint_returns',
    'head_of_house_returns',
    'exemptions',
    'dependents',
    'elderly',
    'agi',
    'returns_with_total_inc',
    'total_inc_amt',
    'returns_with_unemployment',
    'unemployment_comp',
]

### Week two coding tasks
#### Replacing coded values in the `income_bucket` column with descriptive text
- create a dictionary mapping codes to descriptions
- use `replace()` to update the df with text

In [None]:
# Lookup from the numeric income-bucket code to its descriptive label.
income_dict = {
    0: 'Total',
    1: 'Under $1',
    2: 'Between 1 and $10,000',
    3: 'Between 10,000 and $25,000',
    4: 'Between 25,000 and $50,000',
    5: 'Between 50,000 and $75,000',
    6: 'Between 75,000 and $100,000',
    7: 'Between 100,000 and $200,000',
    8: '$200,000 or more',
}

In [None]:
# Swap the numeric bucket codes for their text labels from `income_dict`.
# `tn_income` was produced by slicing another frame, so assigning a column on it
# in place (especially via attribute syntax) raises SettingWithCopyWarning.
# `assign` returns a new frame instead, and re-running the cell is harmless
# because already-replaced labels no longer match any key in `income_dict`.
tn_income = tn_income.assign(
    income_bucket=tn_income['income_bucket'].replace(income_dict)
)
tn_income.head(2)

#### Creating a new df that aggregates by county to get the totals for each county

In [None]:
# Sum every numeric column per county, giving one row per county.
# `numeric_only=True` keeps the result the same across pandas versions: on
# pandas >= 2.0 a plain sum would concatenate the string columns (`state`,
# `income_bucket`) into long meaningless strings instead of dropping them.
# NOTE(review): `income_dict` maps code 0 to 'Total'. If the data holds a
# 'Total' row per county alongside the per-bucket rows, these sums count every
# return twice -- confirm against the data before using the raw totals.
income_county_agg = (
    tn_income
    .groupby('county')
    .sum(numeric_only=True)
    .reset_index()
)
income_county_agg.head(2)

### Week 3 coding tasks

#### Create a barplot to show the total number of returns for each bucket
- first filter to use only the rows where the `county` is **Tennessee**

In [None]:
# Keep only the rows whose `county` equals 'Tennessee' and preview the first two.
statewide_totals = tn_income[tn_income['county'].eq('Tennessee')]
statewide_totals.head(n=2)

In [None]:
# Bar chart of `return_count` for each `income_bucket` in `statewide_totals`.
plt.bar(x='income_bucket', height='return_count', data=statewide_totals)
# The bucket labels are long, so turn them vertical.
plt.xticks(rotation=90)
# The trailing semicolon suppresses the text repr of the returned object.
plt.title('Number of Returns by Income Bucket');

#### Create a new column / feature `avg_income`

In [None]:
# avg_income = total_inc_amt * 1000 / returns_with_total_inc, rounded to 0 decimals.
# NOTE(review): the factor of 1000 presumably converts amounts stored in
# thousands of dollars -- confirm against the data dictionary.
income_county_agg['avg_income'] = (
    income_county_agg['total_inc_amt']
    .mul(1000)
    .div(income_county_agg['returns_with_total_inc'])
    .round(0)
)
income_county_agg.head(n=3)

#### Create a histogram to show distribution of average incomes in the counties

In [None]:
# Drop the row whose `county` is 'Tennessee' so only the other rows remain.
income_county_agg = income_county_agg[income_county_agg['county'].ne('Tennessee')]

In [None]:
# Ten-bin histogram of the `avg_income` column.
income_county_agg['avg_income'].hist(bins=10)
# The trailing semicolon suppresses the text repr of the returned object.
plt.title('Distribution of Average Incomes for TN Counties');

#### Create boxplots and swarmplots for the `analysis_value` in cancer and heart attack data to compare urban and rural counties

- cancer data

In [None]:
# Boxplot of the cancer `analysis_value`, split by `urban` with Urban drawn first.
plt.figure(figsize=(8, 6))

sns.boxplot(
    data=tn_cancer_costs,
    x='urban',
    y='analysis_value',
    order=["Urban", "Rural"],
)
# Blank both axis labels; the title carries the description.
plt.xlabel('')
plt.ylabel('')
plt.title('Cancer Analysis Value - Urban vs Rural');

In [None]:
# Swarmplot of the cancer `analysis_value`, split by `urban` with Urban drawn first.
sns.swarmplot(
    data=tn_cancer_costs,
    x='urban',
    y='analysis_value',
    order=["Urban", "Rural"],
)
# Blank both axis labels; the title carries the description.
plt.xlabel('')
plt.ylabel('')
plt.title('Cancer Analysis Value - Urban vs Rural');

In [None]:
# Boxplot of the heart attack `analysis_value`, split by `urban` with Urban drawn first.
sns.boxplot(
    data=tn_ha_costs,
    x='urban',
    y='analysis_value',
    order=["Urban", "Rural"],
)
# Blank both axis labels; the title carries the description.
plt.xlabel('')
plt.ylabel('')
plt.title('Heart Attack Analysis Value - Urban vs Rural');

In [None]:
# Swarmplot of the heart attack `analysis_value`, split by `urban` with Urban drawn first.
sns.swarmplot(
    data=tn_ha_costs,
    x='urban',
    y='analysis_value',
    order=["Urban", "Rural"],
)
# Blank both axis labels; the title carries the description.
plt.xlabel('')
plt.ylabel('')
plt.title('Heart Attack Analysis Value - Urban vs Rural');