In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

# Lecture 23 #

## Standard Deviation ##

In [None]:
# A small data set used to work through the standard deviation step by step.
values = make_array(2, 3, 3, 9)
values

In [None]:
# Put the values in a table; later cells add one column per intermediate step.
sd_table = Table().with_columns('Value', values)
sd_table

In [None]:
# Step 1: the average (mean) of the values.
average_value = np.mean(values)
average_value

In [None]:
# Step 2: each value's deviation from the average (value minus average).
deviations = values - average_value
sd_table = sd_table.with_column('Deviation', deviations)
sd_table

In [None]:
# Positive and negative deviations cancel: their sum is zero
# (up to floating-point rounding), so the plain average deviation is useless
# as a measure of spread.
sum(deviations)

In [None]:
# Step 3: square the deviations so that they can no longer cancel each other.
sd_table = sd_table.with_column('Squared Deviation', deviations ** 2)
sd_table

In [None]:
# Step 4 - variance of the data:
# the mean of the squared deviations from the average.

variance = np.mean(deviations ** 2)
variance

In [None]:
# Step 5 - standard deviation (SD):
# root mean squared deviation from average
# = square root of the variance.
# Taking the root brings the result back to the units of the original values.

sd = variance ** 0.5
sd

In [None]:
# Cross-check: np.std divides by n by default (ddof=0), i.e. the same
# root-mean-squared-deviation formula that was worked through by hand.
np.std(values)

## Chebyshev's Bounds ##

In [None]:
# Read baby.csv and drop the 'Maternal Smoker' column.
# NOTE(review): presumably dropped because it is not numeric and the cells
# below compute means/SDs and histograms for every column - confirm.
births = Table.read_table('baby.csv').drop('Maternal Smoker')

In [None]:
# Column labels remaining after the drop.
births.labels

In [None]:
# overlay=False draws a separate histogram for each column instead of
# superimposing them on a single set of axes.
births.hist(overlay = False)

In [None]:
# Mean and SD of the 'Maternal Pregnancy Weight' column.
# Note: `sd` here replaces the `sd` of the small example array above.
mpw = births.column('Maternal Pregnancy Weight')
mean, sd = np.mean(mpw), np.std(mpw)

print("Mean is: ", mean)
print("SD: ", sd)

# Also show the pair as the cell's output value.
(mean, sd)

In [None]:
# Keep the rows whose pregnancy weight lies within 3 SDs of the mean.
# Relies on `mean` and `sd` as computed for 'Maternal Pregnancy Weight'.
# NOTE(review): are.between presumably includes the lower bound and excludes
# the upper bound - confirm against the datascience predicate docs.
within_3_SDs = births.where(
    'Maternal Pregnancy Weight', are.between(mean - 3*sd, mean + 3*sd))

In [None]:
# Proportion within 3 SDs of the mean:
# rows kept by the filter divided by all rows.

within_3_SDs.num_rows / births.num_rows

In [None]:
# Chebyshev's bound:
# for any distribution, the proportion within z SDs of the mean
# is at least 1 - 1/z**2. For z = 3 the proportion should be at least

1 - 1/3**2

In [None]:
# Reminder of the column labels; the next cell loops over each of them.
births.labels

In [None]:
# See if Chebyshev's bounds work for distributions with various shapes.
# For every column of `births`, print the percent of rows that fall within
# z SDs of that column's mean, for z = 2, 3, 4, 5.
#
# The per-column statistics deliberately use their own names
# (feature_values / feature_mean / feature_sd) so that this loop does not
# overwrite the notebook-level `values`, `mean` and `sd`. Earlier cells
# (e.g. the 3-SD filter on 'Maternal Pregnancy Weight') read `mean` and `sd`,
# and would silently use the wrong column's statistics if re-run after a loop
# that clobbered them.

for feature in births.labels:
    feature_values = births.column(feature)
    feature_mean = np.mean(feature_values)
    feature_sd = np.std(feature_values)
    print()
    print(feature)
    for z in make_array(2, 3, 4, 5):
        chosen = births.where(
            feature,
            are.between(feature_mean - z*feature_sd, feature_mean + z*feature_sd))
        proportion = chosen.num_rows / births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '%')

## Standard Units ##

In [None]:
def standard_units(x):
    """Return x expressed in standard units.

    Each element is replaced by the number of SDs it lies above (positive)
    or below (negative) the mean of x, using np.mean and np.std.
    """
    center = np.mean(x)
    spread = np.std(x)
    return (x - center) / spread

In [None]:
# Array of the 'Maternal Age' column.
ages = births.column('Maternal Age')

In [None]:
# The same ages, re-expressed as "SDs above or below the mean age".
ages_standard_units = standard_units(ages)

In [None]:
# Sanity check: in standard units the mean should be (very nearly) 0
# and the SD (very nearly) 1, up to floating-point rounding.
np.mean(ages_standard_units), np.std(ages_standard_units)

## Visualizing Distribution in Standard Units

In [None]:
# Side-by-side table: each age in its original units and in standard units.
both = Table().with_columns(
    'Age in Years', ages,
    'Age in Standard Units', ages_standard_units
)
both

In [None]:
# Mean and SD of the ages in their original units, for comparison with the
# standard-unit histogram below.
np.mean(ages), np.std(ages)

In [None]:
# Histogram of ages in years; bin edges run from 15 to 45 in steps of 2.
both.hist('Age in Years', bins = np.arange(15, 46, 2))

In [None]:
# Histogram of the same ages in standard units; bin edges start at -2.2 and
# advance by 0.35, and the x-axis is then limited to [-2, 3.1].
# NOTE(review): the 0.35 width is presumably 2 years divided by the SD of
# the ages, so these bins line up with the 2-year bins above - confirm.
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
plots.xlim(-2, 3.1);

## The SD and Bell-Shaped Curves

In [None]:
# Bin edges at 56.5, 57.5, ..., 72.5 so each unit-wide bin is centered on a
# whole number; x-axis ticks are placed every 2 units from 57 to 71.
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))
plots.xticks(np.arange(57, 72, 2));

Estimates by eye

The average is approximately: 

Locate the point of inflection on the right. The SD is approximately:

In [None]:
# Actual mean and SD of 'Maternal Height', to compare with the by-eye estimates.
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)

In [None]:
# The interval one SD either side of the mean: (mean - SD, mean + SD).
np.mean(heights) - np.std(heights), np.mean(heights) + np.std(heights)

In [None]:
# Proportion of rows whose 'Maternal Height' is within one SD of the mean.
height_mean = np.mean(heights)
height_sd = np.std(heights)
within_1_sd = births.where(
    'Maternal Height',
    are.between(height_mean - height_sd, height_mean + height_sd))
within_1_sd.num_rows / births.num_rows

Maternal Height
   * Average plus or minus 1 SD:  67.38 %
   * Average plus or minus 2 SDs: 97.19 %
   * Average plus or minus 3 SDs: 99.66 %
   * Average plus or minus 4 SDs: 99.91 %
   * Average plus or minus 5 SDs: 100.0 %

## Central Limit Theorem

In [None]:
# Read the flight-delay data from united.csv.
united = Table.read_table('united.csv')
# NOTE(review): united_bins is not used by any cell visible in this notebook
# (the histogram below builds its own bins) - confirm before removing.
united_bins = np.arange(-20, 300, 10)
united

In [None]:
# Histogram of the 'Delay' column with bins 10 wide from -20 to 200;
# values outside that range are not covered by these bins.
united.hist('Delay', bins=np.arange(-20, 201, 10), unit = 'minute')

In [None]:
# Mean and SD of all delays in the table (treated as the population below).
delays = united.column('Delay')
delay_mean, delay_sd = np.mean(delays), np.std(delays)
(delay_mean, delay_sd)

In [None]:
# Median delay: the 50th percentile of the delays.
percentile(50, delays)

In [None]:
def one_sample_mean(sample_size):
    """
    Draw a random sample of `sample_size` rows from the `united` table
    and return the mean of the sample's 'Delay' column.
    """
    delays_in_sample = united.sample(sample_size).column('Delay')
    return np.mean(delays_in_sample)

In [None]:
# Mean delay of one random sample of 100 flights; the sample is random, so
# the value changes from run to run.
one_sample_mean(100)

In [None]:
def ten_thousand_sample_means(sample_size, repetitions=10000):
    """Simulate the mean of a random sample many times.

    Calls one_sample_mean(sample_size) `repetitions` times and collects the
    results.

    Parameters:
        sample_size: number of rows drawn for each sample.
        repetitions: how many sample means to simulate (default 10000,
            which is what the function name refers to).

    Returns:
        A float array of length `repetitions` holding the simulated means.
    """
    # Collect into a list and convert once: np.append inside the loop would
    # copy the whole array on every iteration (quadratic work overall).
    return np.array(
        [one_sample_mean(sample_size) for _ in range(repetitions)],
        dtype=float)

In [None]:
# Simulated means of samples of size 100, one per repetition of the simulation.
sample_means_100 = ten_thousand_sample_means(100)
sample_means_100

In [None]:
# Number of simulated sample means collected.
len(sample_means_100)

In [None]:
# Empirical distribution of the simulated means for sample size 100,
# with the population average printed for comparison.
Table().with_column(
    'Mean of 100 flight delays', sample_means_100).hist(bins=20)

print('Population Average:', delay_mean)

In [None]:
# Repeat the simulation with samples of size 400 and plot the distribution
# of those means, again printing the population average for comparison.
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column(
    'Mean of 400 flight delays', sample_means_400).hist(bins=20)

print('Population Average:', delay_mean)

In [None]:
# Both sets of simulated means in one table, so the two distributions
# (sample sizes 100 and 400) can be compared directly.
Table().with_columns(
    'Mean of 100 flight delays', sample_means_100, 'Mean of 400 flight delays', sample_means_400).hist(bins=20)

In [None]:
# Repeat the simulation with samples of size 1000 and plot the distribution
# of those means, again printing the population average for comparison.
sample_means_1000 = ten_thousand_sample_means(1000)
Table().with_column(
    'Mean of 1000 flight delays', sample_means_1000).hist(bins=20)

print('Population Average:', delay_mean)

In [None]:
# All three sets of simulated means (sample sizes 100, 400 and 1000) in one
# table, to compare how the spread changes with sample size.
Table().with_columns(
    'Mean of 100 flight delays', sample_means_100, 'Mean of 400 flight delays', sample_means_400,'Mean of 1000 flight delays', sample_means_1000 ).hist(bins=20)