# One sample T-test

In [3]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

population_ages1 = stats.poisson.rvs(loc=18, mu=35, size=150000)
population_ages2 = stats.poisson.rvs(loc=18, mu=10, size=100000)
population_ages = np.concatenate([population_ages1, population_ages2])

minnesota_ages1 = stats.poisson.rvs(loc=18, mu=30, size=30)
minnesota_ages2 = stats.poisson.rvs(loc=18, mu=10, size=20)
minnesota_ages = np.concatenate([minnesota_ages1, minnesota_ages2])

print(population_ages.mean())
print(minnesota_ages.mean())

43.00018
39.86


In [13]:
result = stats.ttest_1samp(a=minnesota_ages,
                           popmean=population_ages.mean())
print(result.pvalue)
print(result.statistic)

0.03713056312183023
-2.1426998449717276


In [16]:
error = 0.05
t_score1 = stats.t.ppf(q=error/2, # q means quantile!!!
                       df=49)
t_score2 = stats.t.ppf(q=1-error/2,
                       df=49)
print(f"If values like outside of these t scores: {t_score1}, {t_score2}.\nThen we can say that we have the evidence of rejecting NULL hypothesis.")

proba = stats.t.cdf(x=result.statistic,df=49) * 2 # *2 since two tailed
print(proba)
print(np.isclose(proba, result.pvalue))

If values like outside of these t scores: -2.0095752371292397, 2.0095752371292397.
Then we can say that we have the evidence of rejecting NULL hypothesis.
0.03713056312183023
True


In [17]:
SE = minnesota_ages.std() / np.sqrt(50)
CI = stats.t.interval(0.95,
                      49,
                      minnesota_ages.mean(),
                      SE)
print(f"CI = {CI}") # We expect not to capture true population mean

CI = (36.94451700013237, 42.77548299986763)


In [18]:
SE = minnesota_ages.std() / np.sqrt(50)
CI = stats.t.interval(0.99,
                      49,
                      minnesota_ages.mean(),
                      SE)
print(f"CI = {CI}") # We to capture true population mean since 0.99 > 1-alpha_level

CI = (35.971937351125916, 43.74806264887408)


# Two sample T-test

We compare two data samples to one another. `NULL` hypothesis states that both groups are the same.

In [27]:
np.random.seed(1)
wilcoin_ages1 = stats.poisson.rvs(loc=18, mu=33, size=30)
wilcoin_ages2 = stats.poisson.rvs(loc=18, mu=13, size=20)
wilcoin_ages = np.concatenate([wilcoin_ages1, wilcoin_ages2])

In [29]:
alpha_level = 0.05
result2 = stats.ttest_ind(wilcoin_ages, 
                          minnesota_ages, 
                          equal_var=False) # Perform Welch's T-test
print("We reject NULL hypothesis") if result2.pvalue < alpha_level else print("We do not reject NULL hypothesis")
result2

We do not reject NULL hypothesis


TtestResult(statistic=1.5557986661183318, pvalue=0.12299454330100068, df=97.547544166257)

# Paired T-test

It is designed to test the differences between independent groups.

In [46]:
np.random.seed(14)
before = stats.norm.rvs(scale=30, loc=250, size=100)
after = before + stats.norm.rvs(scale=5, loc=-1.25, size=100)
weight_df = pd.DataFrame({"weight_before":before,
                          "weight_after":after,
                          "weight_difference":after-before})
weight_df.describe()

Unnamed: 0,weight_before,weight_after,weight_difference
count,100.0,100.0,100.0
mean,244.863755,242.467136,-2.396619
std,31.079057,31.296827,5.177046
min,176.843508,172.403133,-18.109797
25%,225.707398,222.007707,-4.989827
50%,243.294904,242.979325,-2.542519
75%,264.720763,261.352167,0.291699
max,330.897067,338.5646,11.173041


In [47]:
result3 = stats.ttest_rel(a=before,
                          b=after)
result3

TtestResult(statistic=4.629316964157346, pvalue=1.1151477052678931e-05, df=99)