In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

## Lecture 25 ##

## Central Limit Theorem

In [None]:
# Read the CSV (path is relative to the notebook's working directory) into a
# Table; the bare name on the last line displays it as the cell output.
united = Table.read_table('united_summer2015.csv')
united

In [None]:
# Histogram of the 'Delay' column; bin edges run from -20 to 290 in steps of 10.
united.hist('Delay', bins = np.arange(-20, 300, 10))

In [None]:
# Pull out the 'Delay' column values and summarize the whole population.
delays = united.column('Delay')
mean_delay = np.mean(delays)
# np.std defaults to ddof=0, i.e. the population SD (divides by N, not N-1).
sd_delay = np.std(delays)

# Display both summaries together as a tuple.
mean_delay, sd_delay

In [None]:
# 50th percentile (median) of the delays, for comparison with the mean above.
percentile(50, delays)

In [None]:
# Redraw the population histogram of 'Delay' (bin edges -20 to 290, step 10).
united.hist('Delay', bins = np.arange(-20, 300, 10))

In [None]:
# Simulate the sampling distribution of the mean delay for samples of 400 rows.
sample_size = 400

# Draw 10,000 random samples and record each sample's mean 'Delay'.
# The array is built in one step from a comprehension: calling np.append inside
# the loop would allocate and copy the entire array on every iteration.
means_400 = np.array([
    np.mean(united.sample(sample_size).column('Delay'))
    for _ in range(10000)
])

In [None]:
# Empirical histogram of the simulated sample means (20 equal-width bins).
Table().with_columns('Sample Mean', means_400).hist(bins = 20)

# NOTE(review): the title reads the global `sample_size`, which a later cell
# reassigns to 900; re-running this cell after that one would mislabel the plot.
plots.title('Sample Size ' + str(sample_size))
plots.xlabel('Sample Average')
# Print the population mean so it can be compared with the histogram's center.
print('Population Average: ', mean_delay);

In [None]:
# Average of the simulated sample means; compare with `mean_delay`.
np.average(means_400)

## Variability of the Sample Average ##

In [None]:
# Repeat the simulation with a larger sample size to compare variability.
sample_size = 900

# Draw 10,000 random samples and record each sample's mean 'Delay'.
# The array is built in one step from a comprehension: calling np.append inside
# the loop would allocate and copy the entire array on every iteration.
means_900 = np.array([
    np.mean(united.sample(sample_size).column('Delay'))
    for _ in range(10000)
])

In [None]:
# Put the two arrays of simulated sample means side by side, one column per
# sample size. `with_columns` (plural) is the Table method that accepts several
# label/values pairs; `with_column` (singular) is intended for a single pair.
means_tbl = Table().with_columns(
    '400', means_400,
    '900', means_900
)

In [None]:
# Histogram both columns of means_tbl with the same bin edges (5 to 30.5 in
# steps of 0.5) so the two sample sizes can be compared directly.
means_tbl.hist(bins = np.arange(5, 31, 0.5))
# Trailing semicolon suppresses the Text object repr in the cell output.
plots.title('Distribution of Sample Average');

In [None]:
# Number of rows in the population table.
united.num_rows

In [None]:
# num_rows to the 400th power: the number of possible length-400 sequences of
# rows when rows may repeat.
united.num_rows ** 400

In [None]:
"""Empirical distribution of random sample means"""

def sample_means(sample_size, repetitions=10000):
    """Simulate, plot, and summarize the empirical distribution of sample means.

    Repeatedly draws a random sample of `sample_size` rows from the
    notebook-level `united` table, records the mean of each sample's 'Delay'
    column, then draws a histogram of those means and prints summary
    statistics for comparison with the population.

    Parameters
    ----------
    sample_size : int
        Number of rows drawn in each random sample.
    repetitions : int, optional
        Number of samples to simulate. Defaults to 10000.

    Returns
    -------
    None
        The histogram and the printed statistics are the only outputs.
    """
    # Fetch the population column once; it is used for both summary lines.
    population_delays = united.column('Delay')

    # Build the array in one step; appending to a NumPy array inside a loop
    # would copy the whole array on every repetition.
    means = np.array([
        np.mean(united.sample(sample_size).column('Delay'))
        for _ in range(repetitions)
    ])

    # Named `means_table` so the local does not shadow this function's name.
    means_table = Table().with_columns('Sample Means', means)

    # Display empirical histogram and print all relevant quantities
    means_table.hist(bins=20)
    plots.xlabel('Sample Means')
    plots.title('Sample Size ' + str(sample_size))
    print("Sample size: ", sample_size)
    print("Population mean:", np.mean(population_delays))
    print("Average of sample means: ", np.mean(means))
    print("Population SD:", np.std(population_delays))
    print("SD of sample means:", np.std(means))

In [None]:
# Simulate with sample size 100 (sqrt(n) = 10): plots the histogram of sample
# means and prints the summary statistics.
sample_means(100)

In [None]:
# Simulate with sample size 400 (sqrt(n) = 20).
sample_means(400)

In [None]:
# Simulate with sample size 625 (sqrt(n) = 25).
sample_means(625)

In [None]:
# Population SD, followed by population SD / sqrt(n) for n = 100, 400, 625
# (10, 20, 25 are the square roots of the sample sizes simulated above).
# Compare these with the "SD of sample means" values printed by sample_means.
sd_delay, sd_delay / make_array(10, 20, 25)

In [None]:
# Sample sizes to simulate: 50, 100, ..., 400.
sample_sizes = np.arange(50, 401, 50)

# For each sample size n, simulate 10,000 sample means and keep their SD.
# Both levels are built with comprehensions so no array is regrown with
# np.append (which copies the whole array on every call).
sd_of_sample_means = np.array([
    np.std([
        np.mean(united.sample(n).column('Delay'))
        for _ in range(10000)
    ])
    for n in sample_sizes
])

In [None]:
# Tabulate, for each sample size, the simulated SD of the sample means next to
# the value population SD / sqrt(n). `with_columns` (plural) is the Table
# method that accepts several label/values pairs; `with_column` (singular) is
# intended for a single pair.
sd_comparison = Table().with_columns(
    'Sample Size n', sample_sizes,
    'SD of 10,000 Sample Means', sd_of_sample_means,
    'Population_SD/sqrt(n)', sd_delay/np.sqrt(sample_sizes)
)

In [None]:
# Display the comparison table.
sd_comparison

In [None]:
# Scatter plot with 'Sample Size n' on the horizontal axis; presumably both
# remaining columns are drawn so the two SD series can be compared.
sd_comparison.scatter('Sample Size n')

# Designing Experiments

In [None]:
# Simulate with sample size 900 (sqrt(n) = 30): plots the histogram of sample
# means and prints the summary statistics.
sample_means(900)

In [None]:
# Population SD of the delays (np.std uses ddof=0 by default); this is the same
# computation as `sd_delay` earlier in the notebook.
population_sd = np.std(delays)
population_sd

In [None]:
# Population SD divided by sqrt(900) = 30, for comparison with the
# "SD of sample means" value printed by sample_means(900).
population_sd/np.sqrt(900)

## SD of 0/1 Population ##

In [None]:
# A 0/1 population of ten values: the first `ones` entries are 1.0 and the
# remaining entries are 0.0.

ones = 5
zero_one_population = np.concatenate([np.ones(ones), np.zeros(10 - ones)])
zero_one_population

In [None]:
# SD of the 0/1 population built in the previous cell (0.5 when exactly half of
# the ten entries are ones).
np.std(zero_one_population) 

In [None]:
# For every possible count of ones k = 1..9 in a 0/1 population of size 10,
# record the proportion of ones (k/10) and the SD of that population.
ones_counts = np.arange(1, 10)

pop_proportions = ones_counts / 10
pop_SDs = np.array([
    np.std(np.append(np.ones(k), np.zeros(10 - k)))
    for k in ones_counts
])

# One row per population: proportion of ones next to its SD.
sd_table = Table().with_columns(
    'Population Proportion', pop_proportions,
    'Population SD', pop_SDs
)

sd_table

In [None]:
# Scatter plot using column 0 ('Population Proportion') for the horizontal axis.
sd_table.scatter(0)