# Statistics Codebook

#### Imports

In [None]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import chi2_contingency # chi-squared test with similar proportions
from scipy.stats import chi2

#### Z-score

In [None]:
# Z-score for population data: how many standard deviations a single point
# lies from the mean of the distribution it is compared against.
# NOTE(review): the name x_bar suggests a sample mean, but this form (no sqrt(n))
# is the single-observation formula — confirm which value is meant to go here.
z = (x_bar - mu)/(std) # (value - population mean) / standard deviation

In [None]:
# Z-score when working with a sampling distribution: the sample mean is
# compared against mu in units of the standard error (std / sqrt(n)).
standard_error = std / np.sqrt(n)
z = (x_bar - mu) / standard_error

In [None]:
# Critical value of the z-score for a one-tailed test at significance level alpha.
# Uses the `stats` module imported at the top of the notebook (there is no
# `st` alias defined anywhere, so `st.norm` would raise NameError).
alpha = .05  # significance level
z_critical_val = stats.norm.ppf(1 - alpha)  # quantile that leaves alpha in the upper tail

#### Getting Percentile & Probability (hypothesis testing)

In [None]:
# Percentile: the cumulative distribution function gives the probability of
# observing a value at or BELOW the given z-score.
percentile = stats.norm.cdf(z)  # can use for t-value also
print("Percentile = ", percentile)

# Probability: the survival function (1 - cdf) gives the probability of
# observing a value ABOVE the given z-score.
tail_probability = stats.norm.sf(z)  # can use for t-value also
print("Probability = ", tail_probability)

In [None]:
# One-sample t-test: returns the t-statistic and the p-value.
# The first argument must be the array of sample observations (not a standard
# deviation); the second is the population mean assumed under the null hypothesis.
stats.ttest_1samp(sample_data, mu)  # (sample observations, hypothesised population mean)

#### T-critical value for 1 tailed test (hypothesis testing)

In [None]:
# Two-sample (independent) t-test: returns the t-statistic and the p-value.
# equal_var=False runs Welch's t-test, which does not assume the two groups
# share the same variance (the usual choice when the groups differ in size or spread).
stats.ttest_ind(sample_data_1, sample_data_2, equal_var=False)

#### T-critical value for 2 tailed test (hypothesis testing)

In [None]:
# t-critical values for a two-tailed test at alpha = .05 (95% confidence):
# alpha is split evenly between the tails, so the cut-offs are the
# .025 and .975 quantiles of the t distribution.
print(stats.t.ppf(0.025, n-1))  # lower critical value; degrees of freedom = n - 1
print(stats.t.ppf(0.975, n-1))  # upper critical value (mirror image of the lower one)

# Two-sample version in one line: alpha divided by 2, degrees of freedom = (n1 + n2) - 2.
# Uses `stats` (scipy.stats); no `st` alias is imported in this notebook.
stats.t.ppf(1-(.05/2), (n1 + n2)-2)

#### Confidence Interval for Normally Distributed Data (margin of error)

In [None]:
# Margin of error = standard error of the mean * critical value.
standard_error = pop_std / np.sqrt(n)
standard_error * z  # or (t)

#### Confidence Intervals for Non-Normally Distributed Data

In [None]:
# Template: replace each `...` placeholder with your own values before running.
# `std` (the sample standard deviation) must also be defined beforehand.
n = ...  # fill in: sample size
mean = ...  # fill in: sample mean
# 0.95 quantile of the t distribution with n-1 degrees of freedom.
# This is the one-sided 95% cut-off, i.e. it yields a two-sided 90% interval;
# use 0.975 instead for a two-sided 95% interval.
t_value = stats.t.ppf(0.95, n-1)
margin_error = std / (np.sqrt(n)) * t_value  # standard error * critical t-value
confidence_interval = (mean - margin_error, mean + margin_error)

#### Confidence Intervals for Binomial Distribution

left endpoint: $\hat{p} - z\times\sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}$ <br/>
right endpoint: $\hat{p} + z\times\sqrt{\frac{\hat{p}(1 - \hat{p})}{n}}$

In [None]:
# Template: replace each `...` placeholder with your own values before running.
p_hat = ...  # fill in: sample proportion
n = ...  # fill in: sample size
z = stats.norm.ppf(0.975)  # critical z for a two-sided 95% interval (2.5% in each tail)
step = z * np.sqrt(p_hat * (1-p_hat) / n)  # margin of error: z * sqrt(p_hat * (1 - p_hat) / n)

confidence_interval = (p_hat - step, p_hat + step)  # (left endpoint, right endpoint)

#### ANOVA - the $F$ test

$F = \frac{s^2_{between}}{s^2_{within}}$

In [None]:
# Build two reproducible samples of 100 normally distributed points each,
# both with standard deviation 3, centred at 0 (`one`) and 1 (`two`).
np.random.seed(42)  # fixed seed so every run draws the same numbers
# Arguments to normal are (center, std, n-points); the draws happen in order: one, then two.
one, two = (np.random.normal(center, 3, 100) for center in (0, 1))

In [None]:
# One-way ANOVA F-test on the two groups `one` and `two`.
# The "one-way" just means that there is a single
# input variable.

stats.f_oneway(one, two)

In [None]:
# Two-sample t-test on the same two groups with equal variances assumed.
# Its p-value is identical to the one reported by stats.f_oneway(one, two).

t = stats.ttest_ind(one, two, equal_var=True) # t-statistic squared = F-statistic; equal_var=True is needed for the p-values to match

In [None]:
# The square of the two-sample t-stat = the F-stat
# (`t` is the result object returned by stats.ttest_ind).
t.statistic**2

### Proportions

In [None]:
p_hat = x/n # Sample proportion: x is a count taken from the sample data (presumably the number of "successes" — confirm) and n is the total count of data