In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# pd.read_csv('case_study.csv')

In [None]:
#Load our dataframe account for the index columns
df = pd.read_csv('case_study.csv', index_col=0)

In [None]:
df

In [None]:
# Quickly check to see the size of our DF, both rows and columns
df.shape

In [None]:
pd.set_option('display.max_columns', 358)

## Subset your dataframe to only include the states in the tri-state area (NJ, CT, NY)

In [None]:
 # Keep only the tri-state rows (state codes 9 = Conn, 34 = NJ, 36 = NY) and assign the
 # result back to `df`, so every later cell works on the subset rather than the full frame.
 # `.copy()` gives an independent frame so later column assignments do not raise
 # SettingWithCopyWarning.
 df = df[df['_STATE'].isin([9, 34, 36])].copy()
 df.head()

In [None]:
# your code here
df['_STATE'].value_counts()

Now check to see how large our subset df is.

In [None]:
# your code here
df.shape

**PHYSHLTH** is the column we are interested in for this analysis, so let's start to investigate the column. Since we have a data cookbook (a.k.a. data dictionary), we want to start there before looking at the code.

In [None]:
df['PHYSHLTH'].describe()

## 1. Handle the non-normal values

As you can see from the data dictionary, there are a number of responses that are coded with values that could throw off our descriptive statistics.  Let's clean up by mapping the responses to a more appropriate value, and handling any missing values.  

In [None]:
#your code here

zero_dict = {88:0, 
            77: np.nan,
            99: np.nan}

df['PHYSHLTH'] = df['PHYSHLTH'].replace(zero_dict)

In [None]:
df.shape

In [None]:
df.dropna(subset=['PHYSHLTH'], inplace=True)

In [None]:
df.shape

## 2. Generate the descriptive statistics

Now that our target variable is cleaned up, let's calculate the descriptive statistics, generate a graph that quickly shows those statistics, and generate a graph that shows the histogram of the data.

In [None]:
# descriptive statistics
df['PHYSHLTH'].describe()

In [None]:
fig, ax = plt.subplots()

# Box plot of the cleaned PHYSHLTH column (a single box)
ax.boxplot(df['PHYSHLTH'])

# Tick labels must be a sequence with one entry per box. A bare string would be
# iterated character by character ('P', 'H', 'Y', ...), which mislabels the box
# or fails the label-count check.
ax.set_xticklabels(['PHYSHLTH'])

# Add a y-axis label
ax.set_ylabel("Days Sick")

plt.show()


In [None]:
# Histogram showing the distribution of the cleaned PHYSHLTH column
fig, ax = plt.subplots()
# Plot PHYSHLTH using 30 bins
ax.hist(df['PHYSHLTH'], bins=30)

ax.set_xlabel('Sick Days')
ax.set_ylabel('# of observations')
ax.set_title("Distribution of People's Sick Days")

plt.show()

## 3. Generate a graphic that compares the average number of sick days for each state. 

In [None]:
# code to calculate the average number of sick days by state

df.groupby('_STATE')['PHYSHLTH'].mean()

In [None]:
# Average PHYSHLTH per state. groupby sorts by the state code, so the bar labels are
# derived from the resulting index instead of being hard-coded in an assumed order.
state_means = df.groupby('_STATE')['PHYSHLTH'].mean()
means = state_means.values

# State code -> display name; any code missing from the mapping falls back to the raw
# value so `states` always has exactly one label per entry in `means`.
state_labels = {9: 'Conn', 34: 'NJ', 36: 'NY'}
states = [state_labels.get(code, str(code)) for code in state_means.index]

In [None]:
# code to graph those averages using matplotlib
fig, ax = plt.subplots()

# Plot a bar-chart; the categorical x values already provide the tick labels
ax.bar(states, means)

# Set title and labels. Rotate the existing tick labels rather than overwriting them with
# set_xticklabels, which expects fixed tick positions and would duplicate the label list.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel("AVG Days Sick")
ax.set_title('Comparing Health by State')

plt.show()

In [None]:
# Overlaid, normalized histograms of PHYSHLTH for each state

fig, ax = plt.subplots()

# One step-outline histogram per state code; density=True normalizes each state's
# histogram so states with different sample sizes are directly comparable.
for state_code, state_label in ((9, 'Conn'), (34, 'NJ'), (36, 'NY')):
    ax.hist(df.loc[df['_STATE'] == state_code, 'PHYSHLTH'],
            bins=15, density=True, histtype='step', label=state_label)

# Set labels and title. With density=True the y values are probability densities
# (area under each outline is 1), not percentages.
ax.set_xlabel('Sick Days')
ax.set_ylabel('Density')
ax.set_title("Distribution of People's Sick Days")
ax.legend()

plt.show()

## 4. Turn the number of sick days into a classification of chronically sick people.

Chronically sick people account for a significant proportion of the costs for health insurers. If we can identify what leads people to be chronically sick, we can address them with specific treatments.



In [None]:
def is_Chronic(df):
    """Flag a single record as chronically sick.

    Parameters
    ----------
    df : mapping-like (e.g. one DataFrame row)
        Must support ``df['PHYSHLTH']``.

    Returns
    -------
    int
        1 when ``PHYSHLTH`` is strictly greater than 10, otherwise 0
        (a NaN value compares False and therefore yields 0).
    """
    # NOTE(review): the CHRONIC column elsewhere in this notebook uses ">= 15";
    # confirm which threshold is intended.
    return 1 if df['PHYSHLTH'] > 10 else 0
# Vectorized equivalent of applying is_Chronic to every row: 1 where PHYSHLTH > 10,
# else 0 (NaN compares False -> 0). Avoids a slow Python-level call per row.
df['CHRON_SICK'] = (df['PHYSHLTH'] > 10).astype(int)

In [None]:
# code to create new column for chronically sick

# CHRONIC is 1 when PHYSHLTH is 15 or more, otherwise 0
df['CHRONIC'] = np.where(df['PHYSHLTH']>=15, 1,0)

## 5. Create a graphic that compares the percentage of people labeled as 'sick' in each state. 

In [None]:
df.groupby('_STATE')['CHRONIC'].mean()

In [None]:
# Share of respondents flagged CHRONIC in each state (mean of the 0/1 column).
# Compute the grouped mean once, keep the raw values for plotting, and display the Series.
chronic_by_state = df.groupby('_STATE')['CHRONIC'].mean()
chronic_means = chronic_by_state.values
chronic_by_state

In [None]:
# create graphic to compare chronically sick by state
fig, ax = plt.subplots()

# Plot a bar-chart of the chronically sick share per state. chronic_means holds
# fractions (means of a 0/1 column), so scale by 100 to match the percentage axis label.
ax.bar(states, chronic_means * 100)

# Set labels and title. Rotate the existing categorical tick labels rather than
# overwriting them with set_xticklabels, which expects fixed tick positions.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel("Percentage of Chronically Ill")
ax.set_title('Comparing Health by State')

plt.show()

## 6. Independent Variables (categorical)

Let's take the column `SMOKDAY2` and start to prepare it so that we can see how it is possibly correlated to `PHYSHLTH`.

1. Clean up the values.
2. Calculate the descriptive statistics of the variable.
3. Generate a histogram for the variable.
4. Generate graphics that show how the responses to this variable might differ by state.
5. Generate a graphic that compares the physical health of the different groups of 'smokers'.

In [None]:
df.shape

In [None]:
df['SMOKDAY2'].value_counts()

In [None]:
df['SMOKDAY2'].value_counts().sum()

In [None]:
df['smoking'] = np.where(df['SMOKE100']==2, 3, df['SMOKDAY2'])




In [None]:
df['smoking'].value_counts().sum()

In [None]:
df['smoking'] = np.where(df['SMOKE100']==7, 7, df['smoking'])
df['smoking'] = np.where(df['SMOKE100']==9, 9, df['smoking'])

In [None]:
df['smoking'].count()

In [None]:
df['smoking'].describe()

In [None]:
smoking_avg = df['smoking'].value_counts()/df['smoking'].count()
smoking_avg

In [None]:
smoking_avg.index

In [None]:
# Bar chart of the share of respondents in each smoking status
fig, ax = plt.subplots()

# smoking_avg is ordered by frequency (value_counts), so build the labels from its index
# instead of assuming a fixed order; unmapped codes fall back to the raw value.
smoking_code_labels = {3: 'Non-Smoker', 1: 'Everyday', 2: 'Some Days',
                       7: "Don't Know", 9: "Refused"}
smoking_bar_labels = [smoking_code_labels.get(code, str(code)) for code in smoking_avg.index]

# Plot a bar-chart; smoking_avg holds fractions, so scale by 100 to match the axis label
ax.bar(smoking_bar_labels, smoking_avg * 100)

# Set title and labels; rotate the existing categorical tick labels
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel("Percent of survey")
ax.set_title('Smoking Status')

plt.show()

In [None]:
df.groupby(['_STATE', 'smoking'])['smoking'].count()

In [None]:
df.groupby(['_STATE', 'smoking']).agg({'smoking': 'count'})

In [None]:
# Number of respondents for each (state, smoking status) pair
state_smoking = df.groupby(['_STATE', 'smoking']).agg({'smoking': 'count'})
# Percentage of each smoking status within its state: divide every count by its state's
# total. transform('sum') returns the per-state total aligned to the original index, which
# keeps the (state, smoking) index unchanged across pandas versions and avoids calling
# float() on a Series.
state_pcts = 100 * state_smoking / state_smoking.groupby(level=0).transform('sum')

In [None]:
state_pcts

In [None]:
state_pcts.unstack()

In [None]:
state_pcts.unstack().plot(kind='bar', stacked=True)

In [None]:
#your code here

zero_dict = {3:'non', 
            2: 'some',
            1: 'every',
            7: 'unknown',
            9: 'unknown'}

df['smoking_cat'] = df['smoking'].replace(zero_dict)

In [None]:
df['smoking_cat']

In [None]:
# Number of respondents for each (state, smoking category) pair
state_smoking_cat = df.groupby(['_STATE', 'smoking_cat']).agg({'smoking': 'count'})
# Percentage of each smoking category within its state (per-state totals via transform,
# so the (state, category) index is preserved).
state_pcts_cat = 100 * state_smoking_cat / state_smoking_cat.groupby(level=0).transform('sum')
# Relabel the state codes by value. MultiIndex.set_levels(..., inplace=True) is no longer
# available in pandas 2.x, and a value-based mapping does not depend on level order.
state_pcts_cat = state_pcts_cat.rename(index={9: 'Conn', 34: 'NJ', 36: 'NY'}, level=0)
state_pcts_cat

In [None]:
ax = state_pcts_cat.unstack().plot(kind='bar', stacked=True)
ax.legend(bbox_to_anchor=(1.5, 1.0))
plt.show()

In [None]:
cat_avg = df.groupby('smoking_cat')['PHYSHLTH'].mean()
cat_std = df.groupby('smoking_cat')['PHYSHLTH'].std()
cat_avg

In [None]:
# Bar chart of average sick days for each smoking category
fig, ax = plt.subplots()

# cat_avg is indexed by smoking_cat; build one display label per category from that index
# so labels and bars cannot get out of step. Unmapped categories fall back to the raw value.
smoking_cat_labels = {'every': 'Everyday', 'non': 'Non-Smoker',
                      'some': 'Some Days', 'unknown': "Don't Know / Refused"}
cat_bar_labels = [smoking_cat_labels.get(cat, str(cat)) for cat in cat_avg.index]

# Plot a bar-chart
ax.bar(cat_bar_labels, cat_avg)

# Set title and labels; rotate the existing categorical tick labels
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel("Days Sick")
ax.set_title('Days Sick by Smoking Status')

plt.show()

## Justin's way with `normalize=True`

In [None]:
# Relabel the coded columns. Assign the result back instead of using inplace=True through
# attribute access: an in-place replace on a selected column is not guaranteed to write
# through to `df` (it does not under pandas copy-on-write).
# NOTE: this overwrites the numeric codes in _STATE and SMOKDAY2 for all later cells.
df['_STATE'] = df['_STATE'].replace([9, 34, 36], ['Conn', 'NJ', 'NY'])
df['SMOKDAY2'] = df['SMOKDAY2'].replace(
    [1, 2, 3, 7, 9],
    ['Every Day', 'Some Days', 'Not at All', 'Not Sure', 'Refused'])

# Percentage of each SMOKDAY2 response within each state (normalize=True gives fractions)
x = df.groupby('_STATE')['SMOKDAY2'].value_counts(normalize=True).unstack() * 100
fig, ax = plt.subplots()
x.plot.bar(stacked=True, ax=ax)
ax.set_ylabel('Percent of responses within state')
ax.legend(bbox_to_anchor=(1.1, 1.05))
plt.show()

## 7. Independent Variables (continuous):

Now let's look at a continuous variable (ALCDAY5) and see how that might impact a person's health. 

1. Clean up the values.
2. Calculate the descriptive statistics of the variable.
3. Generate a histogram for the variable.
4. Generate graphics that show how the responses to this variable might differ by state.
5. Calculate the correlation coefficient between `PHYSHLTH` and `ALCDAY5`.
6. Generate a graphic that shows the possible relationship between `PHYSHLTH` and `ALCDAY5`.

In [None]:
alc_dict = {888:0, 
            777: np.nan,
            999: np.nan
            }

df['alcohol'] = df['ALCDAY5'].replace(alc_dict)



In [None]:
df['alcohol']

In [None]:
def calc_monthly(x):
    """Convert a coded alcohol-frequency value to a number of days per month.

    - Values above 200 have 200 subtracted from them.
    - Values above 100 (up to 200) have 100 subtracted, are multiplied by 4.34524,
      truncated to an int, and capped at 30.
    - Anything else (including 0 and NaN) is returned unchanged.

    NOTE(review): presumably the 1xx codes mean "days per week" and the 2xx codes
    "days per month" -- confirm against the data dictionary.
    """
    if x > 200:
        return x - 200
    if x > 100:
        scaled_days = int((x - 100) * 4.34524)
        return min(scaled_days, 30)
    return x

In [None]:
df['alcohol']= df['alcohol'].apply(calc_monthly)

In [None]:
df['alcohol'].describe()

In [None]:
# Histogram showing the distribution of the derived 'alcohol' column
fig, ax = plt.subplots()
# Plot 'alcohol' using 30 bins
ax.hist(df['alcohol'], bins=30)

ax.set_xlabel('Days Drinking')
ax.set_ylabel('# of observations')
ax.set_title("Distribution of Days Drinking")

plt.show()

In [None]:
# code to calculate averages per state
alcohol_state = df.groupby('_STATE')['alcohol'].mean().values
df.groupby('_STATE')['alcohol'].mean()

In [None]:
fig, ax = plt.subplots()

# Scatter plot: 'alcohol' on the x-axis against 'PHYSHLTH' on the y-axis
ax.scatter(df["alcohol"], df["PHYSHLTH"])

ax.set_xlabel("Drinking Days")
ax.set_ylabel("Sick Days")
ax.set_title('Relationship between Drinking and Health')


plt.show()

In [None]:
df[['PHYSHLTH', 'alcohol']].corr()['PHYSHLTH']

In [None]:
from src.student_caller import one_random_student
from src.student_list import student_first_names

## 8. Statistical Inference


- Create a 95% confidence interval for the population mean of physically ill days.

In [None]:
one_random_student(student_first_names)

In [None]:
# Calculate the mean of PHYSHLTH

# Use stats.norm.ppf to calculate the z-stat associated with the 95%

# Calculate the standard error

# Calculate the right and left ends of the confidence interval

-  Create a 95% confidence interval for the true proportion of chronically sick people.

In [None]:
# link to lecture notebook with the equation for confidence interval of a binary response
# https://github.com/flatiron-school/ds-confidence_intervals/blob/main/confidence_intervals.ipynb

$Standard\ Error = \sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}$ <br/>
$\hat{p} = Probability\ of\ chronically\ sick$

In [None]:
# Calculate p_hat

# Calculate the Standard Error

# Calculate the z-score associated with 95% confidence

# Calculate left and right parts of the confidence interval

## 9. Hypothesis Tests 

**For the following questions that require hypothesis tests, you must complete the following:**

*1. Write down the null and alternative hypothesis you are testing.* 


*2. Select the appropriate test and calculate the test statistic and P-values.*

*3. Determine the critical value for the 95% confidence interval.*

*4. Evaluate the test statistic against the critical value.*

*5. Determine if you reject or fail to reject the null hypothesis and write a sentence explaining the results of your hypothesis test.*  

### The following questions are regarding the `PHYSHLTH` variable. 
- What is the probability of getting a value greater than the sample mean of days physically ill, if the population mean is 4?

In [None]:
# Determine what test statistic to calculate

# Calculate the difference in means (the numerator)

# Calculate the standard error (the denominator)

# Feed the sample statistic into the appropriate stats method to return a p-value


In [None]:
one_random_student(student_first_names)

- Is there a statistically significant difference between men and women in the number of days a person feels physically ill?

In [None]:
# Null Hypothesis

# Alternative Hypothesis

# Select the appropriate test

# Select an alpha level

# Subset the data to isolate the two groups, and isolate the feature we want to inspect

# Feed the data into the appropriate scipy.stats test

# Determine whether to accept or reject the null hypothesis



In [None]:
one_random_student(student_first_names)

- Perform a statistical test to determine if there is a statistically significant difference in the physical health of people from the three states.

In [None]:
# What is the null hypothesis

# What is the alternative hypothesis

# Select the appropriate test

# Select an alpha level

# Subset the data

# Feed in the arrays to the appropriate stats test.

# Decide whether to accept or reject the null hypothesis

In [None]:
one_random_student(student_first_names)

### The following questions are regarding the  `CHRONIC` variable.
- What is the probability of drawing a value larger than this sample proportion if the true population proportion of chronically sick people is 0.10?

In [None]:
# Check here for how to calculate a test statistic for a proportion
# https://online.stat.psu.edu/stat800/lesson/5/5.2

# Standard Error = sqrt[ P * ( 1 - P ) / n ]
# Calculate the standard error

# z = (p_hat - P) / Standard Error

# Calculate the z-score of the sample using the appropriate stats package.



In [None]:
one_random_student(student_first_names)

- Is there a statistically significant difference in the percentage of men and women who are chronically sick?

In [None]:
# Null Hypothesis

# Alternative Hypothesis

# Select the appropriate test
# https://online.stat.psu.edu/stat800/lesson/5/5.5

# Select an alpha level

# Subset the data to isolate the two groups, and isolate the feature we want to inspect

# Feed the data into the appropriate scipy.stats test
# look into from statsmodels.stats.proportion import proportions_ztest

# Determine whether to accept or reject the null hypothesis


In [None]:
one_random_student(student_first_names)

- Perform a statistical test to determine if there is a statistically significant difference in the proportion of chronically sick people across the three states.

In [None]:
# Null Hypothesis

# Alternative Hypothesis

# Select the appropriate test

# Select an alpha level

# Subset the data to isolate the two groups, and isolate the feature we want to inspect

# Create a contingency table
# See this notebook from lessons: https://github.com/flatiron-school/ds-anova/blob/main/chi_squared_testing.ipynb

# Feed the contingency table into the stats.chi2_contingency
# Determine whether to accept or reject the null hypothesis

# 10. Independent Investigation

Now that you have investigated physical health and chronic sickness and their relationships with states and gender, you will conduct a similar investigation with variables of your choosing.

Select a continuous dependent variable and a binary dependent variable that you would like to investigate. Then select a categorical independent variable with at least 3 groups, for which you will test whether the response to the independent variable affects the outcomes of the dependent variables.

For your continuous dependent variable perform the following:
  - Determine the sample mean and sample standard deviation
  - Create a confidence interval for a population mean.
  - Perform a two-sample test of means for gender.
  - Perform a one-way ANOVA using your categorical independent variable.

Select/create a binary variable from the survey questions as your dependent variable and perform the following:
  - Create a confidence interval for a population proportion.
  - Perform a two proportions z-test using gender.
  - Perform a Chi-Square test using your categorical independent variable.