In [None]:
Hypothesis testing is of two types:

One-tailed test
Two-tailed test

In [1]:
import numpy as np
from scipy.stats import describe
v = np.random.normal(size=100)
result = describe(v)
print(result)

DescribeResult(nobs=100, minmax=(-2.422035144227755, 2.9173455909556694), mean=0.12823655374249834, variance=0.890099151098581, skewness=0.10337587794728012, kurtosis=-0.04748807571643843)


# T test

In [2]:
import numpy as np
from scipy.stats import ttest_ind

sample1 = np.random.normal(size=100)
sample2= np.random.normal(size=100)
res = ttest_ind(sample1, sample2)
print(res)

TtestResult(statistic=0.5053859940348663, pvalue=0.6138498627978984, df=198.0)


In [5]:
res = ttest_ind(sample1, sample2).statistic
print(res)

0.5053859940348663


# One-sample t-test example in Python
Let us learn how to conduct a one-sample t-test in Python using the scipy.stats.ttest_1samp() function. 

In [6]:
import numpy as np  
from scipy import stats  

# Population Mean 
mu = 10
# Sample Size
N1 = 21
# Degrees of freedom  
dof = N1 - 1
# Generate a random sample with mean = 11 and standard deviation = 1
x = np.random.randn(N1) + 11
print(x)
# Using the Stats library, compute t-statistic and p-value
t_stat, p_val = stats.ttest_1samp(a=x, popmean = mu)
print("t-statistic = " + str(t_stat))  
print("p-value = " + str(p_val)) 

[12.23474468  8.5567262  11.47498808 12.22166551  9.99158327 12.09474042
 10.94860252 11.1911949  11.15070894 10.87331131 10.34318228 12.09591814
 11.41145735 11.34857568 10.5610383  11.24527304 10.12022792 12.85130674
 10.04660838 10.3306809  11.09518213]
t-statistic = 4.943935709814096
p-value = 7.819290895027923e-05


In [7]:
stats.ttest_1samp(a=x, popmean = mu)

TtestResult(statistic=4.943935709814096, pvalue=7.819290895027923e-05, df=20)

# Two-sample t-test
Let’s consider that the first factory shares 21 samples of ball bearings where the mean diameter of the sample comes out to be 10.5 cm. On the other hand, the second factory shares 25 samples with a mean diameter of 9.5 cm. Both have a standard deviation of 1 cm. 

In [9]:
# Sample Sizes
N1, N2 = 21, 25
# Degrees of freedom  
dof = min(N1,N2) - 1
# Gaussian distributed data with mean = 10.5 and var = 1  
x = np.random.randn(N1) + 10.5
# Gaussian distributed data with mean = 9.5 and var = 1  
y = np.random.randn(N2) + 9.5
## Using the internal function from SciPy Package  
t_stat, p_val = stats.ttest_ind(x, y)  
print("t-statistic = " + str(t_stat))  
print("p-value = " + str(p_val))

t-statistic = 1.7663556203787387
p-value = 0.0842733692326086


In [None]:
Interpretation of the test results
Referring to the p-value of 0.0026 which is less than the significance level of 0.05, we reject the null hypothesis stating that the bearings from the two factories are not identical.

# Paired t-test example in Python
Let’s perform a paired t-test to verify if the change is statistically significant.

In [8]:
# Sample Sizes
N = 25
# Degrees of freedom  
dof = N - 1
# Gaussian distributed data with mean = 10.5 and var = 1  
x = np.random.randn(N) + 10.5
# Gaussian distributed data with mean = 9.9 and var = 1  
y = np.random.randn(N) + 9.9
t_stat, p_val = stats.ttest_rel(x,y)
print("t-statistic = " + str(t_stat))  
print("p-value = " + str(p_val))

t-statistic = 3.8796818059180973
p-value = 0.0007137375257872422


In [None]:
Interpretation of the test results
The low p-value indicates that the null hypothesis is rejected, i.e., there is no change in the diameter of the ball bearings after introducing the new casting machine. 

# Chisquares test

In [9]:
import numpy as np
import pandas as pd
import scipy.stats as stats
national = pd.DataFrame(["white"]*100000 + ["hispanic"]*60000 +\
                        ["black"]*50000 + ["asian"]*15000 + ["other"]*35000)
minnesota = pd.DataFrame(["white"]*600 + ["hispanic"]*300 + \
                         ["black"]*250 +["asian"]*75 + ["other"]*150)

national_table = pd.crosstab(index=national[0], columns="count")
minnesota_table = pd.crosstab(index=minnesota[0], columns="count")

print( "National")
print(national_table)
print(" ")
print( "Minnesota")
print(minnesota_table)

National
col_0      count
0               
asian      15000
black      50000
hispanic   60000
other      35000
white     100000
 
Minnesota
col_0     count
0              
asian        75
black       250
hispanic    300
other       150
white       600


In [11]:
national_ratios = national_table/len(national)  # Get population ratios

observed = minnesota_table
expected = national_ratios * len(minnesota)   # Get expected counts

# chi_square statistics
chi_squared_stat = (((observed-expected)**2)/expected).sum()
print(chi_squared_stat)

crit = stats.chi2.ppf(q = 0.95, # Find the critical value for 95% confidence*
                      df = 4)   # Df = number of variable categories - 1
print("Critical value:", crit)
print()

p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,  # Find the p-value
                             df=4)
print("P value:",p_value)
print()



col_0
count    18.194805
dtype: float64
Critical value: 9.487729036781154

P value: [0.00113047]



In [12]:
stats.chisquare(f_obs= observed,  f_exp= expected)   # Array of expected counts

Power_divergenceResult(statistic=array([18.19480519]), pvalue=array([0.00113047]))

# Chi-Squared Test of Independence

In [13]:
np.random.seed(10)

# Sample data randomly at fixed probabilities
voter_race = np.random.choice(a= ["asian","black","hispanic","other","white"],
                              p = [0.05, 0.15 ,0.25, 0.05, 0.5],
                              size=1000)

# Sample data randomly at fixed probabilities
voter_party = np.random.choice(a= ["democrat","independent","republican"],
                              p = [0.4, 0.2, 0.4],
                              size=1000)

voters = pd.DataFrame({"race":voter_race, 
                       "party":voter_party})

voter_tab = pd.crosstab(voters.race, voters.party, margins = True)

voter_tab.columns = ["democrat","independent","republican","row_totals"]

voter_tab.index = ["asian","black","hispanic","other","white","col_totals"]

observed = voter_tab.iloc[0:5,0:3]   # Get table without totals for later use
voter_tab


Unnamed: 0,democrat,independent,republican,row_totals
asian,21,7,32,60
black,65,25,64,154
hispanic,107,50,94,251
other,15,8,15,38
white,189,96,212,497
col_totals,397,186,417,1000


In [14]:
expected =  np.outer(voter_tab["row_totals"][0:5],
                     voter_tab.loc["col_totals"][0:3]) / 1000

expected = pd.DataFrame(expected)

expected.columns = ["democrat","independent","republican"]
expected.index = ["asian","black","hispanic","other","white"]

expected

Unnamed: 0,democrat,independent,republican
asian,23.82,11.16,25.02
black,61.138,28.644,64.218
hispanic,99.647,46.686,104.667
other,15.086,7.068,15.846
white,197.309,92.442,207.249


In [None]:
[[ 23.82 ,  11.16 ,  25.02 ],
       [ 61.138,  28.644,  64.218],
       [ 99.647,  46.686, 104.667],
       [ 15.086,   7.068,  15.846],
       [197.309,  92.442, 207.249]]))

In [16]:
chi_squared_stat = (((observed-expected)**2)/expected).sum().sum()

print(chi_squared_stat)


7.169321280162059


In [17]:
stats.chisquare(f_obs= observed,  f_exp= expected)

Power_divergenceResult(statistic=array([1.47078796, 2.50934232, 3.189191  ]), pvalue=array([0.83180311, 0.64296369, 0.52667855]))

In [14]:
crit = stats.chi2.ppf(q = 0.95, # Find the critical value for 95% confidence*
                      df = 8)   # *

print("Critical value: ",crit)
# print(crit)

p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,  # Find the p-value
                             df=8)
print("P value: ",p_value)
# print(p_value)

Critical value:  15.50731305586545
P value:  0.518479392948842


In [15]:
stats.chi2_contingency(observed= observed)

Chi2ContingencyResult(statistic=7.169321280162059, pvalue=0.518479392948842, dof=8, expected_freq=array([[ 23.82 ,  11.16 ,  25.02 ],
       [ 61.138,  28.644,  64.218],
       [ 99.647,  46.686, 104.667],
       [ 15.086,   7.068,  15.846],
       [197.309,  92.442, 207.249]]))

# ANOVA

In [None]:
One-Way ANOVA¶
The one-way ANOVA tests whether the mean of some numeric variable differs across the levels of one categorical variable. 
It essentially answers the question: do any of the group means differ from one another? We won't get into the details of
carrying out an ANOVA by hand as it involves more calculations than the t-test, but the process is similar: you go through 
several calculations to arrive at a test statistic and then you compare the test statistic to a critical value based 
on a probability distribution. In the case of the ANOVA, you use the "f-distribution".

In [None]:
The scipy library has a function for carrying out one-way ANOVA tests called scipy.stats.f_oneway(). 
Let's generate some fake voter age and demographic data and use the ANOVA to compare average ages across the groups:

In [20]:
np.random.seed(12)
races =   ["asian","black","hispanic","other","white"]

# Generate random data
voter_race = np.random.choice(a= races,
                              p = [0.05, 0.15 ,0.25, 0.05, 0.5],
                              size=1000)
voter_age = stats.poisson.rvs(loc=18,
                              mu=30,
                              size=1000)
# Group age data by race
voter_frame = pd.DataFrame({"race":voter_race,"age":voter_age})
groups = voter_frame.groupby("race").groups

# Extract individual groups
asian = voter_age[groups["asian"]]
black = voter_age[groups["black"]]
hispanic = voter_age[groups["hispanic"]]
other = voter_age[groups["other"]]
white = voter_age[groups["white"]]


array([51, 57, 49, 61, 53, 49, 43, 55, 52, 50, 41, 51, 49, 45, 51, 46, 45,
       42, 41, 50, 42, 43, 50, 50, 42, 48, 45, 44, 44, 60, 56, 47, 43, 44,
       45, 52, 45, 52, 43, 43, 38, 52, 47, 47, 41, 44, 61, 44, 40, 45, 45,
       44, 46, 48, 41, 51, 38, 45, 50, 43, 40, 52, 45, 60, 49, 51, 48, 48,
       45, 51, 57, 51, 50, 52, 47, 48, 44, 44, 49, 37, 50, 51, 56, 48, 58,
       41, 48, 49, 54, 34, 45, 56, 54, 41, 46, 43, 47, 51, 52, 55, 56, 48,
       57, 46, 45, 43, 47, 46, 50, 45, 42, 45, 51, 47, 46, 42, 43, 51, 46,
       54, 46, 45, 45, 47, 53, 50, 42, 44, 47, 47, 52, 42, 47, 43, 43, 43,
       42, 49, 43, 37, 53, 53, 55, 49, 40, 51, 56], dtype=int64)

In [21]:
# Perform the ANOVA
stats.f_oneway(asian, black, hispanic, other, white)

F_onewayResult(statistic=1.7744689357329695, pvalue=0.13173183201930463)

In [23]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

model = ols('age ~ race',                 # Model formula
            data = voter_frame).fit()
                
anova_result = sm.stats.anova_lm(model, typ=2)
print (anova_result)

             sum_sq     df         F    PR(>F)
race        199.369    4.0  1.774469  0.131732
Residual  27948.102  995.0       NaN       NaN


In [24]:
np.random.seed(12)

# Generate random data
voter_race = np.random.choice(a= races,
                              p = [0.05, 0.15 ,0.25, 0.05, 0.5],
                              size=1000)

# Use a different distribution for white ages
white_ages = stats.poisson.rvs(loc=18, 
                              mu=32,
                              size=1000)

voter_age = stats.poisson.rvs(loc=18,
                              mu=30,
                              size=1000)

voter_age = np.where(voter_race=="white", white_ages, voter_age)

# Group age data by race
voter_frame = pd.DataFrame({"race":voter_race,"age":voter_age})
groups = voter_frame.groupby("race").groups   

# Extract individual groups
asian = voter_age[groups["asian"]]
black = voter_age[groups["black"]]
hispanic = voter_age[groups["hispanic"]]
other = voter_age[groups["other"]]
white = voter_age[groups["white"]]

# Perform the ANOVA
stats.f_oneway(asian, black, hispanic, other, white)


F_onewayResult(statistic=10.164699828386366, pvalue=4.5613242113994585e-08)

In [28]:
# Alternate method
model = ols('age ~ race',                 # Model formula
            data = voter_frame).fit()
                
anova_result = sm.stats.anova_lm(model,typ = 2)
print (anova_result)

                sum_sq     df        F        PR(>F)
race       1284.123213    4.0  10.1647  4.561324e-08
Residual  31424.995787  995.0      NaN           NaN


In [20]:
The test result suggests the groups don't have the same sample means in this case, since the p-value is significant at a 99% confidence level. We know that it is the white voters who differ because we set it up that way in the code, but when testing real data, you may not know which group(s) caused the test to throw a positive result. To check which groups differ after getting a positive ANOVA result, you can perform a follow up test or "post-hoc test".

One post-hoc test is to perform a separate t-test for each pair of groups. You can perform a t-test between all pairs using by running each pair through the stats.ttest_ind() we covered in the lesson on t-tests:

SyntaxError: unterminated string literal (detected at line 1) (977181348.py, line 1)