In [28]:
import numpy as np
import pandas as pd
from scipy.stats import norm, t
from scipy.stats import ttest_ind, ttest_1samp, ttest_ind_from_stats

# Z-Test - 1 Sample

In [2]:
def z_test_1_samp_observed_mean(observed_mean, num_samples, sigma, alpha):
    pass

def z_test_1_samp(data, sigma, alpha):
    pass

## Customers entering a store

In [80]:
"""
The average number of customers entering a store every day is 500,
with a standard deviation of 125.

A marketing company claims to increase this number.
After 70 days, it is seen that the average is 530.
Test the claim of the marketing company at a 95% confidence (or 5% significance)
"""

"""
H0: mu = 500
Ha: mu > 500
Test-Statistic is sample mean
Right-tailed
Alpha = 0.05
"""

z_stat = (530 - 500)/(125/np.sqrt(70))
p_value = 1 - norm.cdf(z_stat)
print("Z-stat = ", z_stat)
print("P-value = ", p_value)
alpha = 0.05
if p_value < alpha:
    print("Reject H0")
else:
    print("Fail to reject H0")

Z-stat =  2.007984063681781
P-value =  0.022322492581293485
Reject H0


In [83]:
ttest_ind_from_stats(530, 125, 70, 500, 125, 100000, alternative="greater")

Ttest_indResult(statistic=2.0072816380114653, pvalue=0.02236118353955271)

## India runs on chai (assessment)

In [91]:
"""
The Chai Point stall at Bengaluru airport estimates that each person visiting the store drinks an average of 1.7 small cups of tea.

Assume a population standard deviation of 0.5 small cups. A sample of 30 customers collected over a few days averaged 1.85 small cups of tea per person.

Test the claim using a z-test at an alpha = 0.05 significance value, with a critical z-score value of .

Note: Round off the z-score to two decimal places.
"""


"""
H0: 1.7 cups
Ha: > 1.7 cups

Test statistic = sample mean; 
Observed test statistic = 1.85

Right tailed

Alpha = 0.05
"""
z_stat = (1.85 - 1.7)/(0.5/np.sqrt(30))
p_value = 1 - norm.cdf(z_stat)
print("Z-stat = ", z_stat)
print("P-value = ", p_value)
alpha = 0.05
if p_value < alpha:
    print("Reject H0")
else:
    print("Fail to reject H0")


Z-stat =  1.6431676725155
P-value =  0.05017412323114523
Fail to reject H0


In [92]:
ttest_ind_from_stats(1.85, 0.5, 30, 1.7, 0.5, 100000, alternative="greater")

Ttest_indResult(statistic=1.6429212528076707, pvalue=0.05020118514142627)

## Local football game

In [4]:
"""
A local football game sees 3.5 goals per match on average, with standard deviation of 0.7. 
A sample of 45 matches was taken. 
What should be the maximum average goal of these 45 matches such that we can continue
to believe the statement that the population average is 3.5 goals, at a 10% confidence
"""

z_critical = norm.ppf(0.9)
x = 3.5 + z_critical * 0.7/np.sqrt(45)
print(x)

3.633729699470687


# Z-Test - 2 Sample

In [5]:
def z_test_2_samp_manual():
    pass

def z_test_2_samp_package():
    pass

# T-Test - 1 Sample

In [6]:
def t_test_1_samp_manual():
    pass

def t_test_1_samp_package():
    pass

In [None]:
def ttest_1_samp_from_data(d1, popmean, alternative):
    pass

## Students solving assessments

In [118]:
"""
An instructor claims that it takes 90 days to solve all assessment questions.
A sample of 9 students took an average of 77.4 days with sample std dev = 29.6.
Would you reject the instructor's claim at a 5% significance?
"""
t_stat = (77.4-90)/( 29.6 / np.sqrt(9))
p_value = t.cdf(t_stat, df=8) 
alpha = 0.05
print("T-stat = ", t_stat)
print("P value = ", p_value)
if p_value < alpha:
    print("Reject H0")
else:
    print("Fail to reject H0")

T-stat =  -1.2770270270270263
P value =  0.11870416200249816
Fail to reject H0


In [115]:
ttest_ind_from_stats(77.4, 29.6, 9, 90, 29.6, 10000000, alternative="less")

Ttest_indResult(statistic=-1.2770264523652521, pvalue=0.10079646907934484)

## Shoe company claim

In [144]:
"""
A shoe company says their shoes last for 6 years on average. 
A survey of 15 shoes showed an average lifetime of 5.4 years with sample std dev of 1.1 years
Test the claim of the company at 5% significance
"""
t_stat = (5.4 - 6) / (1.1/np.sqrt(15))
p_value = t.cdf(t_stat, df=10000000) 
alpha = 0.05
print("T-stat = ", t_stat)
print("P value = ", p_value)
if p_value < alpha:
    print("Reject H0")
else:
    print("Fail to reject H0")

T-stat =  -2.1125363706585896
P value =  0.017320247256100447
Reject H0


In [133]:
ttest_ind_from_stats(5.4, 1.1, 15, 6, 1.1, 10000000, alternative="less")

Ttest_indResult(statistic=-2.1125347862580943, pvalue=0.017320315127828587)

In [135]:
ttest_ind_from_mean_and_std_dev(5.4, 1.1, 15, 6, 1.1, 10000000, alternative="less")

T-stat =  -2.1125347862580943
P-value =  0.017320315127828587


## Quality assurance (assessment)

In [65]:
"""
The quality assurance department claims that on average the non-fat milk contains more than 190 mg of Calcium per 500 ml packet.

To check this claim 45 packets of milk are collected and the content of calcium is recorded. perform the appropriate test with 90% confidence.

"""
data = pd.Series([193, 321, 222, 158, 176, 149, 154, 223, 233, 177, 280, 244, 138, 210, 167, 129, 254, 167, 194, 191, 128, 191, 144, 184, 330, 216, 212, 142, 216, 197, 231, 133, 205, 192, 195, 243, 224, 137, 234, 171, 176, 249, 222, 234, 191])
print("Observed sample mean = ", data.mean())
t_stat, p_value = ttest_1samp(data, popmean=190, alternative="greater")
print("Test statistic = ", t_stat)
print("P-value = ", p_value)
if p_value < 0.1:
    print("Reject H0")
else: 
    print("Fail to reject H0")


Observed sample mean =  199.48888888888888
Test statistic =  1.3689029903414232
P-value =  0.0889889155615061
Reject H0


In [121]:
ttest_ind_from_stats(data.mean(), data.std(), len(data), 190, data.std(), 10000000, alternative="greater")

Ttest_indResult(statistic=1.3688999103200898, pvalue=0.08551529727616433)

# T-Test - 2 Sample

In [17]:
def ttest_ind_from_data(d1, d2, alternative="two-sided"):
    """
    d1: pandas Series
    d2: pandas Series
    alternative: {‘two-sided’, ‘less’, ‘greater’}, optional
    """
    n1 = len(d1)
    n2 = len(d2)
    
    m1 = d1.mean()
    m2 = d2.mean()
    
    s1 = d1.std()
    s2 = d2.std()
    
    df = n1 + n2 - 2
    
    s = np.sqrt((((n1-1)*(s1**2)) + ((n2-1)*(s2**2))) / (n1 + n2 - 2))
    
    t_stat = (m1 - m2) / (s*np.sqrt(1/n1+ 1/n2))
    
    if alternative == "two-sided":
        p_value = 2*(1 - t.cdf(t_stat, df=df))
    if alternative == "less":
        p_value = t.cdf(t_stat, df=df)
    if alternative == "greater":
        p_value = 1 - t.cdf(t_stat, df=df)
    print("T-stat = ", t_stat)
    print("P-value = ", p_value)

In [34]:
def ttest_ind_from_mean_and_std_dev(m1, s1, n1, m2, s2, n2, alternative="two-sided"):
    """
    m1: mean of first set of samples
    m2: mean of second set of samples
    n1: number of samples in first set
    alternative: {‘two-sided’, ‘less’, ‘greater’}, optional
    """
    df = n1 + n2 - 2
    
    s = np.sqrt((((n1-1)*(s1**2)) + ((n2-1)*(s2**2))) / (n1 + n2 - 2))
    
    t_stat = (m1 - m2) / (s*np.sqrt(1/n1+ 1/n2))
    
    if alternative == "two-sided":
        p_value = 2*(1 - t.cdf(t_stat, df=df))
    if alternative == "less":
        p_value = t.cdf(t_stat, df=df)
    if alternative == "greater":
        p_value = 1 - t.cdf(t_stat, df=df)
    print("T-stat = ", t_stat)
    print("P-value = ", p_value)

## Average hourly wage (assessment)

The average hourly wage of a sample of 150 workers in plant 'A' was Rs.2·87 with a standard deviation of Rs. 1·08.

The average wage of a sample of 200 workers in plant 'B' was Rs. 2·56 with a standard deviation of Rs. 1·28.

Can an applicant safely assume that the hourly wages paid by plant 'A' are higher than those paid by plant 'B' at a 1% significance level?

In [35]:
ttest_ind_from_mean_and_std_dev(2.87, 1.08, 150, 2.56, 1.28, 200, alternative="greater")

T-stat =  2.3947738234213953
P-value =  0.008579470194694738


In [134]:
t_stat, p_value = ttest_ind_from_stats(2.87, 1.08, 150, 2.56, 1.28, 200, alternative="greater")
print("Test statistic = ", t_stat)
print("P-value", p_value)
if p_value < 0.01:
    print("Reject H0")
else: 
    print("Fail to reject H0")

Test statistic =  2.3947738234213953
P-value 0.008579470194694674
Reject H0


## Smokers (assessment)

When smokers smoke, nicotine is transformed into cotinine, which can be tested.

The average cotinine level in a group of 50 smokers was 243.5 ng ml.

Assuming that the standard deviation is known to be 229.5 ng ml, at 95 % confidence, test the assertion that the mean cotinine level of all smokers is equal to 300.0 ng ml.

In [74]:
t_stat, p_value = ttest_ind_from_stats(243.5, 229.5, 50, 300, 229.5, 100000000)
print("Test statistic = ", t_stat)
print("P-value", p_value)
if p_value < 0.01:
    print("Reject H0")
else: 
    print("Fail to reject H0")

Test statistic =  -1.7408071088958776
P-value 0.0817173985378351
Fail to reject H0


## Institution's claim (assessment)

In [77]:
t_stat, p_value = ttest_ind_from_stats(110, 15, 50, 100, 15, 100000000, alternative="greater")
print("Test statistic = ", t_stat)
print("P-value", p_value)
if p_value < 0.05:
    print("Reject H0")
else: 
    print("Fail to reject H0")

Test statistic =  4.7140440293994565
P-value 1.2142423949320067e-06
Reject H0


##  Assessment across 2 batches

In [125]:
"""
The number of hours a student spends solving assessment problems across two batches are known.
Test if one batch takes lesser time.
"""
batch_1 = pd.Series([25,30,45,49,47,35,32,42])
batch_2 = pd.Series([45,47,25,22,29,32,27,28,40,49,50,33])
t_stat, p_value = ttest_ind(batch_1, batch_2, alternative="two-sided")
print("Test statistic = ", t_stat)
print("P-value= ", p_value)
if p_value < 0.05:
    print("Reject H0")
else: 
    print("Fail to reject H0")

Test statistic =  0.5795450171026676
P-value=  0.5694047618891788
Fail to reject H0


In [123]:
ttest_ind_from_stats(batch_1.mean(), batch_1.std(), len(batch_1), batch_2.mean(), batch_2.std(), len(batch_2), alternative="two-sided")

Ttest_indResult(statistic=0.5795450171026676, pvalue=0.5694047618891788)

# Unified approach

In [161]:
data = pd.Series(
        [110, 112, 99, 120, 105, 107, 109, 110, 100, 105, 102, 107, 109, 110, 100, 105, 102 ])

In [182]:
data = pd.Series(np.random.randint(95, 107, size=1000))

In [183]:
data.mean()

100.513

In [184]:
ttest_1samp(data, 100, alternative="greater")

TtestResult(statistic=4.659411508734172, pvalue=1.7999925319966461e-06, df=999)

In [185]:
ttest_ind_from_stats(data.mean(), data.std(), len(data), 100, data.std(),  10000000000 ,alternative="greater")     

Ttest_indResult(statistic=4.659411275763614, pvalue=1.585575239438631e-06)

In [174]:
1 - t.cdf(5.121421648770584, df=16)

5.126421573631301e-05

In [191]:
1 - t.cdf(5.121421648770584, df=1000000)

1.5164840583281602e-07

In [176]:
np.random.normal(loc=243, scale=229.5**2)

-408.0889225881908