In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import math

%matplotlib inline

# One sample t-test 
Does sample mean differ from  population mean

In [66]:
# USA/Population Ages
population_ages1 = stats.poisson.rvs(loc = 18, mu = 35, size = 100000)
population_ages2 = stats.poisson.rvs(loc = 18, mu = 10, size = 150000)

population_ages = np.concatenate((population_ages1,population_ages2))
population_ages.mean()

38.004996

In [67]:
# California Ages
california_ages1 = stats.poisson.rvs(loc = 18, mu = 30, size = 100)
california_ages2 = stats.poisson.rvs(loc = 18, mu = 10, size = 150)

california_ages = np.concatenate((california_ages1,california_ages2))
california_ages.mean()

36.404

*We conduct a t-test at 95% confidence to check wheather sample mean age differ from population mean age.*

**Null hypothesis is = mean ages do not differ**

In [68]:
stats.ttest_1samp(a = california_ages,popmean = population_ages.mean())

Ttest_1sampResult(statistic=-2.2620727925468826, pvalue=0.024555518502154285)

In [69]:
stats.t.ppf(q = 0.025, df = 249)   # calculating t-alpha

-1.9695368676395828

In [70]:
stats.t.cdf(x = -2.7291005440102665, df = 249)*2   # calculating p-value to match with stats.ttest_1samp pvalue

0.006803654671954885

**We reject the NULL hypothesis** i.e. No sufficient evidence to infer California age mean is equal to population age mean. 

In [71]:
#95% confidence level interval for California mean does not capture population mean

stats.t.interval(alpha = 0.95, df = 249, loc = california_ages.mean(), scale = california_ages.std()/math.sqrt(250))

(35.01283880619037, 37.79516119380963)

In [72]:
#99%confidence level interval for California mean captures population mean

stats.t.interval(alpha = 0.99, df = 249, loc = california_ages.mean(), scale = california_ages.std()/math.sqrt(250))

(34.57054266558209, 38.23745733441792)

# Two sample t-test

Do means of two independant data samples differ from one another

In [80]:
texas_ages1 = stats.poisson.rvs(loc = 18, mu = 30, size = 30)
texas_ages2 = stats.poisson.rvs(loc = 18, mu = 10, size = 20)
texas_ages = np.concatenate((texas_ages1,texas_ages2))

texas_ages.mean()

39.36

In [83]:
stats.ttest_ind(a = texas_ages, b = california_ages, equal_var = False)

Ttest_indResult(statistic=1.8819043338945496, pvalue=0.063668921622248)

p value is grter than the significance. therefore we do not reject the hypothesis that the sample means are same

# Paired t test

We create data about patients' weight before and after a weight loss program.
The weight data is normally distributed

In [95]:
before = stats.norm.rvs(loc = 250, scale = 30, size = 100)
after = before - stats.norm.rvs(loc = 5, scale = 5, size = 100)
weights = pd.DataFrame({'before_weight': before,
                        'after_weight': after,
                        'weight_loss': before - after})

In [96]:
weights.describe()

Unnamed: 0,before_weight,after_weight,weight_loss
count,100.0,100.0,100.0
mean,249.730113,245.29102,4.439093
std,28.134868,28.897596,5.300107
min,182.521426,177.382945,-6.999968
25%,229.915754,225.417047,1.414571
50%,249.001917,245.974963,4.012442
75%,269.007308,261.888421,6.896199
max,318.978788,313.98457,18.497795


In [97]:
stats.ttest_rel(a = weights['before_weight'],b = weights['after_weight'], axis = 0)

Ttest_relResult(statistic=8.375476573459151, pvalue=3.7405157378347123e-13)