# Statistics Codebook

## Imports

In [None]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats import weightstats as stests # z-test code
from statsmodels.stats.proportion import proportions_ztest # proportions z-test
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import chi2_contingency # chi-squared test with similar proportions
from scipy.stats import chi2
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

### Z-score

#### By Hand

In [None]:
# Z-score of a single value relative to a distribution of points:
# (value - population mean) / standard deviation.
# The value is named x_bar here; mu is the population mean, std the standard deviation.
deviation_from_mean = x_bar - mu
z = deviation_from_mean / std

In [None]:
# Z-score when working with a sampling distribution: the standard deviation
# is divided by sqrt(n) to give the standard error of the mean.
standard_error = std / np.sqrt(n)
z = (x_bar - mu) / standard_error

In [None]:
# Critical value of the z-score for an upper-tailed test.
# Uses `stats` (scipy.stats), which is the name imported in the Imports cell.
alpha = 0.05  # significance level
z_critical_val = stats.norm.ppf(1 - alpha)

In [None]:
# Critical values for a two-tailed z-test at 95% confidence
# (alpha = 0.05, split as 0.025 in each tail).
zcrit_val1 = stats.norm.ppf(1 - .025)  # upper critical value
zcrit_val2 = stats.norm.ppf(1 - .975)  # lower critical value

#### Python function one-sample Z-test from statsmodels

In [None]:
# https://towardsdatascience.com/hypothesis-testing-in-machine-learning-using-python-a0dc89e169ce
# One-sample z-test: H0 is that the mean of df['bp_before'] equals 156.
# No `alternative` is passed, so statsmodels' default alternative is used.
ztest, pval = stests.ztest(df['bp_before'], x2=None, value=156)
print(float(pval))
if pval < 0.05:
    print("reject null hypothesis")
else:
    # A non-significant result never proves H0; we only fail to reject it.
    print("fail to reject null hypothesis")

#### Python function two-sample Z-test from statsmodels

In [None]:
# https://towardsdatascience.com/hypothesis-testing-in-machine-learning-using-python-a0dc89e169ce
# Two-sample, two-sided z-test: H0 is that the means of df['bp_before'] and
# df['bp_after'] differ by 0.
ztest, pval1 = stests.ztest(df['bp_before'], x2=df['bp_after'], value=0, alternative='two-sided')
print(float(pval1))
# Decide on this test's own p-value (pval1), not a p-value left over from another cell.
if pval1 < 0.05:
    print("reject null hypothesis")
else:
    # A non-significant result never proves H0; we only fail to reject it.
    print("fail to reject null hypothesis")

#### Getting Percentile & Probability (hypothesis testing)

In [None]:
# The CDF gives the percentile: the probability of a value at or BELOW z
# (also usable with a t-value via stats.t.cdf).
lower_tail_area = stats.norm.cdf(z)
print("Percentile = ", lower_tail_area)

# The survival function (1 - CDF) gives the probability of a value ABOVE z
# (also usable with a t-value via stats.t.sf).
upper_tail_area = stats.norm.sf(z)
print("Probability = ", upper_tail_area)

### T-score

* Comparing 2 means to see if they are equal or not equal. Correlation does not equal causation however.

#### One-sample t-test (hypothesis testing)

In [None]:
# Using Python to get the t-statistic & p-value for a 1 sample t-test.
# The first argument is the array of sample observations (not a standard
# deviation); the second is the hypothesised population mean.
stats.ttest_1samp(sample_data, mu)  # (sample observations, population mean under H0)

#### Two-sample t-test and t-critical values for 2 tailed test (hypothesis testing)

In [None]:
# In Python, get the t-statistic & p-value for two independent samples.
# This is a two-sided test for the null hypothesis that 2 independent samples
# have identical average (expected) values.
# The test assumes identical population variances by default; equal_var=False
# drops that assumption (use it when the samples differ in size or spread).
# nan_policy='omit' leaves NaN values out of the test.
stats.ttest_ind(sample_data_1, sample_data_2, equal_var=False, nan_policy='omit')

In [None]:
# T-test for the means of two independent samples, computed from descriptive
# statistics (mean, standard deviation, number of observations) instead of raw data.
# Two-sided test of the null hypothesis that both samples have identical
# average (expected) values; returns the statistic and the p-value.
# nobs = number of observations, aka n.
stats.ttest_ind_from_stats(
    mean1=mean1,
    std1=std1,
    nobs1=nobs1,
    mean2=mean2,
    std2=std2,
    nobs2=nobs2,
    equal_var=True,
    alternative='two-sided',
)

In [None]:
# t-critical values for a 2 tailed test at 95% confidence
# (alpha = 0.05, i.e. the .025 and .975 quantiles).
print(stats.t.ppf(0.025, n-1))  # lower critical value; degrees of freedom = n-1
print(stats.t.ppf(0.975, n-1))  # upper critical value

# OR

# Upper critical value in one line (alpha divided by 2), here with
# (n1 + n2) - 2 degrees of freedom for two samples.
stats.t.ppf(1-(.05/2), (n1 + n2)-2)

#### Confidence Interval for Normally Distributed Data (margin of error)

In [None]:
# Margin of error: standard error of the mean times the critical value
# (z here; a t critical value can be used in its place).
std_error_of_mean = pop_std / np.sqrt(n)
std_error_of_mean * z

#### Confidence Intervals for Non-Normally Distributed Data

In [None]:
# t-based confidence interval for a mean.
# The four values below are illustrative placeholders - replace them with your own.
n = 30             # sample size
mean = 50.0        # sample mean
std = 10.0         # sample standard deviation
confidence = 0.95  # two-sided confidence level

# Two-sided interval: alpha is split evenly between both tails, so the
# critical value is the (1 - alpha/2) quantile; n-1 is the degrees of freedom.
t_value = stats.t.ppf(1 - (1 - confidence) / 2, n - 1)
margin_error = std / np.sqrt(n) * t_value
confidence_interval = (mean - margin_error, mean + margin_error)

#### Confidence Intervals for Binomial Distribution

left endpt.: $\hat{p} - z\times\sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}$ <br/>
right endpt.: $\hat{p} + z\times\sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}$

In [None]:
# Confidence interval for a proportion (normal approximation).
# The two values below are illustrative placeholders - replace them with your own.
p_hat = 0.6  # sample proportion
n = 100      # sample size

z = stats.norm.ppf(0.975)  # critical value for a two-sided 95% interval
step = z * np.sqrt(p_hat * (1 - p_hat) / n)  # margin of error

confidence_interval = (p_hat - step, p_hat + step)

## ANOVA - the $F$ test

$F = \frac{s^2_{between}}{s^2_{within}}$

In [None]:
# Build two reproducible random samples that share a spread but differ in centre.
np.random.seed(42)
n_points = 100
spread = 3
one = np.random.normal(loc=0, scale=spread, size=n_points)  # centred on 0
two = np.random.normal(loc=1, scale=spread, size=n_points)  # centred on 1

In [None]:
# One-way ANOVA: "one-way" means there is a single input variable.
# More than two samples can be passed by adding them to the tuple.
anova_groups = (one, two)
stats.f_oneway(*anova_groups)

In [None]:
# Two-sample t-test with equal variances assumed.
# Its p-value is identical to the one-way ANOVA p-value for the same two
# samples, and the t-statistic squared equals the F-statistic.
ttest_groups = (one, two)
t = stats.ttest_ind(*ttest_groups, equal_var=True)

In [None]:
# Squaring the two-sample t-statistic gives the F-statistic.
pow(t.statistic, 2)

## Proportions

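* Formula to get the test statistic, where $\hat{p}$ is the sample proportion, $p_0$ the hypothesized proportion and $n$ the sample size:
$z = \frac{\hat{p} - p_0}{\sqrt{\frac{p_0(1 - p_0)}{n}}}$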

#### By Hand

In [None]:
# Sample proportion: x over n, where n is the total count of data.
# x is presumably the number of observations with the trait of interest - confirm.
p_hat = x / n

In [None]:
# 1 sample z prop test
# if expected proportion = p1
p0 = x_hat.mean()  # presumably x_hat is a 0/1 series, so this is the observed proportion - confirm
n_obs = len(x_hat)  # sample size comes from the data; p0 is a scalar and has no len()
st_error = ((p0 * (1 - p0)) / n_obs) ** .5  # equivalently np.sqrt(p0 * (1 - p0) / n_obs)
# NOTE(review): the textbook statistic is (observed - expected) / SE, with the
# SE computed from the expected proportion; sign and SE here follow the
# original cell - confirm which convention is intended.
z_stat = (p1 - p0) / st_error
z_stat

# Critical value for an upper-tailed test at alpha = .05
zcrit_val = stats.norm.ppf(1 - .05)

In [None]:
plan_null = 'H0: There is no statistically significant difference in the percentage of men and women who have a healthcare plan.'
plan_alt = 'H1: There is a statistically significant difference in the percentage of men and women who have a healthcaare plan.'

# 2 z prop test
# Split the HLTHPLN1 column by the SEX code (1 and 2).
male_plan = df[df['SEX']==1]['HLTHPLN1']
female_plan = df[df['SEX']==2]['HLTHPLN1']

# Rows whose HLTHPLN1 value is 1, taken from the series defined just above.
yesplan_m = male_plan[male_plan==1]
yesplan_f = female_plan[female_plan==1]

# Pooled proportion across both groups, then each group's own proportion.
prob = (len(yesplan_m) + len(yesplan_f)) / (len(male_plan) + len(female_plan))
p_hat_plan_male = len(yesplan_m)/len(male_plan)
p_hat_plan_female = len(yesplan_f)/len(female_plan)
den = (1/(len(male_plan)) + (1/len(female_plan)))
num_plan = p_hat_plan_male - p_hat_plan_female
den_plan = (prob * (1-prob) * den)**.5  # pooled standard error of the difference

#z statistic
plan_z_2samp = num_plan / den_plan

#crit val (two-tailed, .025 in each tail)
plan_zcrit_val1 = stats.norm.ppf(1-.025)  # upper critical value
plan_zcrit_val2 = stats.norm.ppf(1-.975)  # lower critical value

# NOTE: this conclusion is a fixed string; it is not derived from plan_z_2samp
# or the critical values above, so re-check it whenever the data changes.
plan_z2sampprop_conclusion = 'Because the z-statistic is less than the critical value, we reject the null hypthesis.'
plan_z2sampprop_conclusion

#### Python one-sample proportion Z test from statsmodels

In [None]:
# Z test for proportions - https://www.statsmodels.org/stable/generated/statsmodels.stats.proportion.proportions_ztest.html
# See url for examples
# count = number of successes, nobs = number of observations.
# For a one-sample test the null-hypothesis proportion must be supplied via
# `value`; statsmodels raises a ValueError when it is left out.
proportions_ztest(count, nobs, value=expected_proportion)

#### Python two-sample proportion Z test from statsmodels

In [None]:
significance = 0.025
# our samples - 82% are good in one (410/500), and ~95% are good in the other (379/400)
# note - the samples do not need to be the same size
sample_success_a, sample_size_a = (410, 500)
sample_success_b, sample_size_b = (379, 400)
# check our sample against Ho for Ha != Ho
successes = np.array([sample_success_a, sample_success_b])
samples = np.array([sample_size_a, sample_size_b])
# note, no need for a Ho value here - it's derived from the other parameters
stat, p_value = proportions_ztest(count=successes, nobs=samples, alternative='two-sided')
# report
print('z_stat: %0.3f, p_value: %0.3f' % (stat, p_value))
if p_value > significance:
    print("Fail to reject the null hypothesis - we have nothing else to say")
else:
    print("Reject the null hypothesis - suggest the alternative hypothesis is true")

## Simple Linear Regression

#### Covariance

For two random variables $X$ and $Y$, each with $n$ values:

$\sigma_{XY} = \frac{\Sigma^n_{i = 1}(x_i - \mu_x)(y_i - \mu_y)}{n}$ <br/>

#### Correlation

Pearson Correlation: A correlation of -1 means that X and Y are perfectly negatively correlated, and a correlation of 1 means that X and Y are perfectly positively correlated. <br/>$\ r_P = \frac{\Sigma^n_{i = 1}(x_i - \mu_x)(y_i - \mu_y)}{\sqrt{\Sigma^n_{i = 1}(x_i - \mu_x)^2\Sigma^n_{i = 1}(y_i -\mu_y)^2}}$

#### Numpy Covariance

In [None]:
X = [1, 3, 5]
Y = [2, 9, 10]

# Covariance by hand: sum the products of deviations from the means
# (mean of X is 3, mean of Y is 7) and divide by the number of points.
sum((x_i - 3) * (y_i - 7) for x_i, y_i in zip(X, Y)) / 3

# Same number from NumPy: ddof=0 divides by n, and [0, 1] picks the
# X-Y entry of the covariance matrix.
np.cov(X, Y, ddof=0)[0, 1]

# The full covariance matrix.
np.cov(X, Y, ddof=0)

#### Numpy Correlation

In [None]:
# Correlation matrix of X and Y.
np.corrcoef(X, Y)
# The same correlation worked out by hand for X = [1, 3, 5], Y = [2, 9, 10].
4 / np.sqrt(19)
# Correlation = covariance / (std(X) * std(Y)). The two sides are computed
# along different floating-point paths, so compare with a tolerance rather
# than exact equality.
np.isclose(
    np.corrcoef(X, Y)[0, 1],
    np.cov(X, Y, ddof=0)[0, 1] / (np.std(X) * np.std(Y)),
)

In [None]:
# SciPy's Pearson correlation; the first element of the result is the
# correlation coefficient.
pearson_result = stats.pearsonr(X, Y)
pearson_result[0]

### Regression Equation

The solution for a simple regression best-fit line is as follows:

- slope: <br/>$ m = r_P\frac{\sigma_y}{\sigma_x} = \frac{cov(X, Y)}{var(X)}$

- y-intercept:<br/> $ b = \mu_y - m\mu_x$

#### Regression Without Error in `statsmodels`

In [None]:
# Model y as a function of x: y is the target (dependent variable),
# x is the predictor (independent variable).
fitted_line = sm.formula.ols(formula="y ~ x", data=test_df).fit()
fitted_line.summary()

#### Regression with Error in `statsmodels`

In [None]:
# gauss() is used below but is not imported in the Imports cell, so import it here.
from random import gauss

# Simulate y = 3x + 5 plus Gaussian noise (mean 0, standard deviation 5).
x = np.arange(20)
y = np.array([3*pt + 5 + gauss(mu=0, sigma=5) for pt in x])

# Build the frame in one step instead of creating it empty and filling columns.
df2 = pd.DataFrame({'x': x, 'y': y})

# Ordinary least squares fit of y on x.
model = sm.formula.ols(formula='y~x', data=df2).fit()

model.summary()