In [4]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.power import TTestIndPower
from statsmodels.stats.proportion import proportions_ztest

%matplotlib inline

In [None]:
# CLEAN DATA
# PHYSHLTH: code 88 is rewritten to 0, then rows coded 77 or 99 are removed.
# (presumably survey codes for "none" / "don't know" / "refused" -- confirm against the codebook)
all_data['PHYSHLTH'] = np.where(all_data['PHYSHLTH'] == 88, 0, all_data['PHYSHLTH'])
physhlth_excluded = all_data['PHYSHLTH'].isin([77, 99])
all_data.drop(index=all_data[physhlth_excluded].index, inplace=True)

# SEX: rows coded 9 are removed.
all_data.drop(index=all_data[all_data['SEX'] == 9].index, inplace=True)

# chronic: 1 when PHYSHLTH is above 16, otherwise 0.
all_data['chronic'] = np.where(all_data['PHYSHLTH'] > 16, 1, 0)

# Remove rows only when BOTH smoking columns are missing.
all_data.dropna(how='all', subset=['SMOKE100', 'SMOKDAY2'], inplace=True)

# DAYSDRNK is derived from ALCDAY5:
#   101-107 -> (code - 100) * 4
#   201-230 -> code - 200
#   888     -> 0
#   anything else (including missing) -> 99
# NOTE(review): 99 acts as a sentinel here; confirm it is filtered out before
# DAYSDRNK is used numerically (e.g. in a correlation).
alcday = all_data['ALCDAY5']
conditions = [
    (alcday > 100) & (alcday < 108),
    (alcday > 200) & (alcday < 231),
    alcday == 888,
]
choices = [(alcday - 100) * 4, alcday - 200, 0]
all_data['DAYSDRNK'] = np.select(conditions, choices, default=99)

In [1]:
# INSPECT DATA
# Common first-look calls. Only the last expression in a cell is rendered, so
# run these one per cell; 'COLUMN' is a placeholder for a real column name.
#   all_data.head()
#   all_data.describe()
#   all_data['COLUMN'].value_counts()

# Share of each `chronic` value within every `_STATE` (normalize=True gives
# proportions that sum to 1 per state).
chron_data = all_data.groupby('_STATE')['chronic'].value_counts(normalize=True)
# Keep only the entries whose `chronic` index level equals 1.
y_vals = chron_data.iloc[chron_data.index.get_level_values('chronic') == 1]

In [4]:
# PLOTTING
fig, ax = plt.subplots()
y_vals = list(all_data['PHYSHLTH'])

# Single-series templates. As written, all three draw onto the same axes.
ax.boxplot(y_vals)
ax.hist(y_vals, bins=15)
ax.bar(x_vals, y_vals)

# multiple hist: one outlined, density-normalised histogram per group
for group_vals, group_label in ((x_conn, 'Conn'), (x_nj, 'NJ'), (x_ny, 'NY')):
    ax.hist(group_vals, bins=15, histtype='step', density=True, label=group_label)

# multiple bars: each series is drawn at the same x positions
# NOTE(review): with identical x positions later bars paint over earlier ones;
# confirm whether stacking or offsetting was intended.
for bar_heights in (not_at_all_vals, everyday_vals, somedays_vals):
    ax.bar(x_vals, bar_heights)

ax.set_ylabel("# Days Sick")
ax.set_title("Title")

In [None]:
# STATISTICS

# CORRELATION
# Series.corr between the DAYSDRNK and PHYSHLTH columns.
drink_days = all_data['DAYSDRNK']
daysdrnk_correlation = drink_days.corr(all_data['PHYSHLTH'])

# CONFIDENCE INTERVAL FOR VALUE-based DATA
# mean +/- 1.96 standard errors (1.96 is the hard-coded multiplier here).
standard_error = sample_std / np.sqrt(sample_num)
half_width = 1.96 * standard_error
lower_limit, upper_limit = sample_mean - half_width, sample_mean + half_width

# CONFIDENCE INTERVAL FOR PERCENT-based DATA
# proportion +/- z_val standard errors, with SE = sqrt(p * (1 - p) / n).
# This reuses (overwrites) standard_error, lower_limit and upper_limit.
standard_error = np.sqrt((chronic_proportion*(1-chronic_proportion))/n_count)
half_width = z_val * standard_error
lower_limit, upper_limit = chronic_proportion - half_width, chronic_proportion + half_width

#HYPOTHESIS TESTING - STEPS
*1. Write down the null and alternative hypothesis you are testing.* 
*2. Select the appropriate test and calculate the test statistic and P-values.*
*3. Determine the critical value for the 95% confidence interval.*
*4. Evaluate the test statistic against the critical value.*
*5. Determine if you reject or fail to reject the null hypothesis and write a sentence explaining the results of your hypothesis test.*  

#TWO-TAILED ONE-SAMPLE VERSUS POPULATION T-TEST
# Manual statistic: (sample mean - hypothesised mean of 4) / standard error.
one_sample_se = sample_std/np.sqrt(sample_num)
t_stat = (sample_mean - 4)/one_sample_se
# Library equivalent, run on the PHYSHLTH column against the same value of 4.
results = stats.ttest_1samp(all_data['PHYSHLTH'], 4)

#TWO-SAMPLE CONTINUOUS DATA VERSUS EACH OTHER T-TEST
# Manual statistic using a pooled variance (pool_var is computed elsewhere).
two_sample_se = np.sqrt(pool_var*((1/male_count)+(1/female_count)))
t_statistic = (male_mean - female_mean) / two_sample_se
# Library equivalent; equal_var=True selects the pooled-variance test.
two_sample_result = stats.ttest_ind(male_sample, female_sample, equal_var=True)
t_stat, p_val = two_sample_result

#TWO-SAMPLE CONTINUOUS DATA BEFORE AND AFTER PAIRED T-TEST (same size)
# NOTE(review): a paired test needs matched observations of equal length;
# male_sample/female_sample look like placeholders here -- confirm.
stats.ttest_rel(a=male_sample, b=female_sample)

#ONE-WAY CONTINUOUS DATA MULTIPLE VARIABLE F-TEST
# Map each state code in all_states to that state's PHYSHLTH values.
f_data = {
    each_state: sample_populations.loc[sample_populations['_STATE'] == each_state, 'PHYSHLTH']
    for each_state in all_states
}
# Only the groups keyed 9, 34 and 36 are compared.
# (presumably the state codes behind the Conn / NJ / NY labels used when plotting -- confirm)
f_stat, p_val = stats.f_oneway(*(f_data[state_code] for state_code in (9, 34, 36)))

#ONE-SAMPLE PERCENTAGE DATA VERSUS POPULATION Z-TEST
# Manual statistic: the standard error is built from the NULL proportion pop_mean.
z_stat = (chronic_mean - pop_mean) / np.sqrt((pop_mean*(1-pop_mean))/chronic_num)
# Library equivalent. By default proportions_ztest builds the variance from the
# SAMPLE proportion, which gives a different z than the manual formula above;
# prop_var pins the variance to the null proportion so the two agree.
# NOTE(review): 0.10 is assumed to be the same null proportion as pop_mean -- confirm.
z_stat, p_value = proportions_ztest(count=chronic_sample_count, nobs=chronic_num, value=0.10, alternative='two-sided', prop_var=0.10)

#TWO-VARIABLE VERSUS EACH OTHER PERCENTAGE DATA Z-TEST
# Pooled proportion across both groups.
p_ast = (male_count_chronic + female_count_chronic) / (male_count + female_count)
# Standard error of the difference under the pooled proportion.
proportion_se = np.sqrt((p_ast*(1-p_ast)*(1/male_count + 1/female_count)))
z_stat = np.round((male_mean - female_mean) / proportion_se, 3)
# Cross-check: rounds the (statistic, p-value) pair returned by ttest_ind.
# NOTE(review): this is a pooled t-test, not a z-test; it should only approximate
# z_stat for large samples -- confirm that is the intent.
z_stat_calc = np.round(stats.ttest_ind(male_sample, female_sample, equal_var=True), 3)

#MULTIPLE VARIABLE VERSUS EACH OTHER PERCENTAGE DATA CHI-SQUARE TEST
# Contingency table: rows are NOALC == 0 / NOALC == 1, columns are the distinct
# EVERMARRIED values in value_counts() order. The width follows the data rather
# than being fixed at 3, so extra categories no longer raise IndexError and
# fewer categories no longer leave an all-zero column.
married_levels = all_data['EVERMARRIED'].value_counts().index
table = np.zeros((2, len(married_levels)))
for idx, value in enumerate(married_levels):
    in_level = all_data['EVERMARRIED'] == value
    # Counting True entries equals len() of the filtered frame.
    table[0, idx] = ((all_data['NOALC'] == 0) & in_level).sum()
    table[1, idx] = ((all_data['NOALC'] == 1) & in_level).sum()
chisq_test = stats.contingency.chi2_contingency(table)
# Index 3 of the result holds the expected frequencies under independence.
expected_counts = chisq_test[3]
# Hand-computed Pearson statistic to compare against chisq_test[0].
# For a 2x2 table scipy applies Yates' continuity correction by default, so
# the two numbers will differ in that case.
manual_chisq = np.divide((table - expected_counts)**2, expected_counts).sum() #test of chi-square

# CALCULATING CRITICAL VALUES AND P-VALUES
# ppf is the inverse CDF, so its first argument is a cumulative probability
# (1 - alpha/2 for a two-tailed test at level alpha), not an observed p-value.
upper_tail_prob = 1-0.025  # two-tailed test at alpha = 0.05
z_critical = stats.norm.ppf(upper_tail_prob)
t_critical = stats.t.ppf(upper_tail_prob, df=sample_num-1) #one-sample vs population t-test
t_critical = stats.t.ppf(1-0.025, (male_count+female_count-2)) #two-sample pooled t-test
# norm.pdf returns the height of the density curve, not a probability, so it is
# not used for p-values. `z_score` replaces the original `z-score`, which Python
# parses as the subtraction `z - score`.
two_tailed_p_value = 2 * (1 - stats.norm.cdf(abs(z_score)))
p_value = stats.norm.cdf(z_score)  # lower-tail probability P(Z <= z_score)

# TYPES OF ERRORS IN CALCULATIONS
Type 1 errors (false positives) are when we accept an alternative hypothesis which is actually false.
The $\alpha$ (significance level) that we pick is the likelihood that we will get a type 1 error due to random chance.
Type 2 errors (false negatives) are when we reject an alternative hypothesis which is actually true.

# STATING HYPOTHESIS SOLUTION
# results is read as (test statistic, p-value): results[0] and results[1].
# The test is two-tailed, so the MAGNITUDE of the statistic is compared with the
# critical value; without abs() a significant negative statistic could never
# reject the null.
# NOTE(review): assumes t_crit is the positive (upper-tail) critical value -- confirm.
if (abs(results[0]) > t_crit) and (results[1] < alpha):
    print("Null hypothesis rejected. Results are statistically significant with t-value =",
    round(results[0], 2), "critical t-value =", t_crit, "and p-value =", np.round((results[1]), 10))
else:
    print("Fail to reject the Null hypothesis with t-value =",
    round(results[0], 2), ", critical t-value =", t_crit, "and p-value =", np.round((results[1]), 10))

In [None]:
# EFFECT SIZE
def Cohen_d(group1, group2):
    """Return Cohen's d for two samples, using the pooled sample variance.

    Both arguments must support ``len()``, ``.mean()`` and ``.var(ddof=1)``.
    The result is (mean of group1 - mean of group2) divided by the square
    root of the pooled variance, so it is positive when group1 has the
    larger mean.
    """
    mean_gap = group1.mean() - group2.mean()
    sizes = (len(group1), len(group2))
    # ddof=1 requests the sample (n - 1) variance for each group.
    variances = (group1.var(ddof=1), group2.var(ddof=1))
    # Weight each variance by its degrees of freedom, then normalise.
    weighted_sum = (sizes[0]-1) * variances[0] + (sizes[1]-1) * variances[1]
    pooled_variance = weighted_sum / (sizes[0] + sizes[1] - 2)
    return mean_gap / np.sqrt(pooled_variance)

# POWER
# solve_power solves for whichever ONE of effect_size / nobs1 / alpha / power is
# None; passing all four raises ValueError, so each question gets its own call.
test = TTestIndPower()
# Power achieved with the sample size at hand.
achieved_power = test.solve_power(effect_size=d, nobs1=n_default, alpha=alpha_default, power=None)
# Observations needed in group 1 to reach 80% power (ratio keeps its default of 1).
required_nobs1 = test.solve_power(effect_size=d, nobs1=None, alpha=alpha_default, power=0.8)
achieved_power, required_nobs1


