In [None]:
# Let's say we want to do an experiment: find the average height of all Dutch adults.
# But we don't have the time and money to measure all 17 million of them,
# so we will measure a subset of 100 people and compute their average height.
# We also compute the standard deviation within that subset, which lets us build a (95%)
# confidence interval around the average height of that subset.

In [70]:
import numpy as np
from scipy.stats import norm
# Parameters of the simulated population of heights (values presumably in metres).
loc = 1.7      # mean of the normal distribution the heights are drawn from
scale = 0.1    # standard deviation of that distribution
size = 1.7e7   # population size (17 million); a float, so it is cast to int when used
seed = 42      # fixed seed so that Restart & Run All reproduces the same population

# Draw the whole population from a seeded generator. Without a seed every run yields
# different numbers and no printed result can be reproduced. The generator already
# returns a fresh array, so no extra .copy() is needed.
rng = np.random.default_rng(seed)
heights_of_all_Dutch_adults = rng.normal(loc=loc, scale=scale, size=int(size))

# Sanity check: mean and standard deviation should be close to loc and scale.
print(heights_of_all_Dutch_adults.mean(), heights_of_all_Dutch_adults.min(), heights_of_all_Dutch_adults.max(),
      heights_of_all_Dutch_adults.std())

1.7000026009587104 1.1675004818886743 2.30659991470501 0.09999390384097719


In [69]:
# Let us take 1000 "measurements" (samples).
# Calculate the 95% confidence interval for each measurement.

In [60]:
sample_size = 100          # number of people measured in one experiment
number_of_samples = 1000   # number of repeated experiments

# The slices below silently come out short or empty if the population is too small,
# so check up front that it can supply every sample.
assert number_of_samples * sample_size <= len(heights_of_all_Dutch_adults), \
    "population too small for the requested number of samples"

# Cut the population into consecutive, non-overlapping slices of `sample_size` heights.
# The population array is named `heights_of_all_Dutch_adults`; the previously used name
# `heights_of_all_Dutch_people` is not defined in this notebook and raises NameError on
# a fresh kernel.
all_samples = [heights_of_all_Dutch_adults[i*sample_size:(i+1)*sample_size] for i in range(number_of_samples)]

In [61]:
# Point estimate per experiment: the arithmetic mean of the heights in each sample.
sample_means = [drawn.mean() for drawn in all_samples]

In [62]:
# Sample standard deviation per experiment. ddof=1 divides by n-1 (Bessel's correction),
# which is the estimator the confidence-interval formula calls for. numpy's default
# ddof=0 divides by n, underestimates the spread and makes the intervals too narrow.
sample_stds = [sample.std(ddof=1) for sample in all_samples]

In [63]:
# Two-sided interval: put alpha/2 of the probability mass in each tail of the
# standard normal distribution and look up the matching quantile.
confidence_level = 0.95
alpha = 1 - confidence_level
z = norm.ppf(1 - alpha/2)
print(z)

# Interval for each sample: mean -/+ z * std / sqrt(n).
root_n = np.sqrt(sample_size)
lower_limits = []
upper_limits = []
for k in range(number_of_samples):
    half_width = z*sample_stds[k]/root_n
    lower_limits.append(sample_means[k] - half_width)
    upper_limits.append(sample_means[k] + half_width)

1.959963984540054


In [None]:
# Now the core message of this talk: Verify that about 95% of the confidence intervals contain the true mean.

In [64]:
# Flag every interval: 1 when the true mean `loc` lies strictly between its limits, else 0.
inside_or_out = [int(lower_limits[k] < loc < upper_limits[k]) for k in range(number_of_samples)]

In [65]:
# Number of intervals that contain the true mean (the flags are exactly 0 or 1).
inside_or_out.count(1)

951

In [67]:
# Observed coverage: the share of all intervals that captured the true mean.
covered = sum(inside_or_out)
fraction_in = covered/number_of_samples

In [68]:
# Display the observed coverage fraction; compare it with confidence_level (0.95).
fraction_in

0.951