In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

In [None]:
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 points on a new 5x5 figure. x and z are independent standard
    normal samples and y = r*x + sqrt(1 - r**2)*z, which keeps the SD of y
    near 1 and makes the correlation between x and y approximately r.
    Both axes are fixed to [-4, 4] so plots for different values of r can be
    compared directly.

    Parameters:
        r: target correlation; must lie in [-1, 1].

    Raises:
        ValueError: if r is outside [-1, 1]. In that case sqrt(1 - r**2) is
            NaN, so every y value would be NaN.
    """
    # Written as a negated chained comparison so that NaN is rejected too.
    if not -1 <= r <= 1:
        raise ValueError("r must be between -1 and 1")
    plots.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    y = r*x + (np.sqrt(1-r**2))*z
    plots.scatter(x, y, color='darkblue', s=20)
    plots.xlim(-4, 4)
    plots.ylim(-4, 4)

# Lecture 25 #

## 95% Confidence Interval

## Review: Lecture 22

### Method Based on Bootstrap (resample)

From a random sample, construct a 95% confidence interval for the average age of the mothers in the population.


In [None]:
# Original sample: the table that every bootstrap resample below is drawn from

births = Table.read_table('baby.csv')
births.show(3)  # preview only the first 3 rows

In [None]:
def one_bootstrap_mean():
    """Return the average 'Maternal Age' of one bootstrap resample of births.

    births.sample() is called with no arguments, which in the datascience
    library draws as many rows as the table has, with replacement.
    """
    bootstrap_ages = births.sample().column('Maternal Age')
    return np.average(bootstrap_ages)

In [None]:
# Generate means from 3000 bootstrap samples.
# The array is built in one step: np.append returns a fresh copy of the whole
# array on every call, so growing it inside a loop does quadratic work.
num_repetitions = 3000
bstrap_means = np.array([one_bootstrap_mean() for _ in range(num_repetitions)])

In [None]:
# Get the endpoints of the 95% confidence interval:
# the middle 95% of the bootstrap means lies between their 2.5th and 97.5th percentiles
left = percentile(2.5, bstrap_means)
right = percentile(97.5, bstrap_means)

In [None]:
# Histogram of the bootstrap means, with the interval [left, right]
# drawn as a thick yellow segment along the horizontal axis (y = 0)
resampled_means = Table().with_columns(
    'Bootstrap Sample Mean', bstrap_means
)
resampled_means.hist(bins=15)
plots.plot([left, right], [0, 0], color='yellow', lw=8);

In [None]:
# Display the two endpoints of the bootstrap interval
left, right

## Method Based on CLT

In [None]:
# Size, average, and SD of the maternal ages in the original sample
sampled_ages = births.column('Maternal Age')
sample_size = len(sampled_ages)
sample_average = np.average(sampled_ages)
sample_SD = np.std(sampled_ages)  # np.std with default arguments divides by n, not n - 1
sample_size, sample_average, sample_SD

We need to add and subtract `2 * (population SD) / (sample_size ** 0.5)` from the sample average

...but we don't have the population SD.

In [None]:
# Try estimating it from the sample:
# use the sample SD in place of the unknown population SD

estimated_SD_of_sample_average = sample_SD / (sample_size**0.5)
estimated_SD_of_sample_average

In [None]:
# Approximate 95% confidence interval for population mean:
# sample average plus or minus 2 estimated SDs of the sample average

sample_average - 2*estimated_SD_of_sample_average, sample_average + 2*estimated_SD_of_sample_average

## SD of 0/1 Population

In [None]:
# Population of size 10: number_of_ones 1's followed by (10 - number_of_ones) 0's

number_of_ones = 8
zero_one_population = np.append(np.ones(number_of_ones), np.zeros(10 - number_of_ones))

# Print the SD rounded to 2 decimal places
print('Standard Deviation:', np.round(np.std(zero_one_population),2))

# Last expression of the cell: display the population itself
zero_one_population

In [None]:
def sd_of_zero_one_population(number_of_ones, population_size=10):
    """Return the SD of a population made up only of 0's and 1's.

    The population has population_size elements: number_of_ones with value 1
    and (population_size - number_of_ones) with value 0.

    Parameters:
        number_of_ones: how many elements equal 1; an integer from 0 to
            population_size inclusive.
        population_size: total number of elements; a positive integer.
            Defaults to 10.

    Returns:
        The SD of the population, computed with np.std (which divides by the
        number of elements).

    Raises:
        ValueError: if population_size is not positive or number_of_ones is
            not between 0 and population_size.
    """
    if population_size < 1:
        raise ValueError("population_size must be a positive integer")
    if not 0 <= number_of_ones <= population_size:
        raise ValueError("number_of_ones must be between 0 and population_size")
    zero_one_population = np.append(np.ones(number_of_ones),
                                    np.zeros(population_size - number_of_ones))
    return np.std(zero_one_population)

In [None]:
# One row for each possible number of ones in a population of 10 (0 through 10),
# together with the corresponding proportion of ones
possible_ones = np.arange(11)
zero_one_pop = Table().with_columns(
    'Number of Ones', possible_ones,
    'Proportion of Ones', possible_ones / 10
)
zero_one_pop.show()

In [None]:
# Compute the SD for every value in 'Number of Ones' and attach the results as an 'SD' column
sds = zero_one_pop.apply(sd_of_zero_one_population, 'Number of Ones')
zero_one_pop = zero_one_pop.with_column('SD', sds)
zero_one_pop.show()

In [None]:
# SD (vertical) against proportion of ones (horizontal).
# For a 0/1 population with proportion p of ones the SD equals sqrt(p * (1 - p)),
# so it is 0 at p = 0 and p = 1 and largest (0.5) at p = 0.5.
zero_one_pop.scatter('Proportion of Ones', 'SD')

## Approach to Prediction

In [None]:
# Build a two-column table: the average of the two parents' heights and the child's height.
# NOTE(review): .drop(3) removes the column at index 3 of the CSV — presumably a
# precomputed mid-parent height that is recomputed below; confirm against family_heights.csv.
family_heights = Table.read_table('family_heights.csv').drop(3)
parent_averages = (family_heights.column('father') + family_heights.column('mother'))/2
heights = Table().with_columns(
    'Parent Average', parent_averages,
    'Child', family_heights.column('childHeight')
)
heights

In [None]:
# 'Parent Average' on the horizontal axis; the only other column, 'Child', on the vertical axis
heights.scatter('Parent Average')

In [None]:
def predict_child(p_avg, window=0.5):
    """Predict the height of a child whose parents have a parent average height of p_avg.

    The prediction is the average height of the children in `heights` whose
    parent average height is within `window` of p_avg. The range is half-open:
    are.between includes its lower bound and excludes its upper bound, so the
    rows used satisfy p_avg - window <= 'Parent Average' < p_avg + window.

    Parameters:
        p_avg: the parent average height to predict for.
        window: half-width of the neighbourhood around p_avg. Defaults to 0.5.

    Returns:
        The average 'Child' height of the rows in range, or NaN if no row
        falls in the range.
    """
    close_points = heights.where('Parent Average', are.between(p_avg - window, p_avg + window))
    # Averaging an empty column yields NaN along with a runtime warning;
    # return NaN directly so the no-data case is explicit and quiet.
    if close_points.num_rows == 0:
        return np.nan
    return np.average(close_points.column('Child'))

In [None]:
# Add a 'Prediction' column: predict_child applied to each value of 'Parent Average'
heights_with_predictions = heights.with_columns(
    'Prediction', heights.apply(predict_child, 'Parent Average'))

In [None]:
# Overlay the remaining columns ('Child' and 'Prediction') against 'Parent Average'
heights_with_predictions.scatter('Parent Average')

## Association

In [None]:
# Load hybrid.csv; the cells below use its 'mpg', 'msrp', 'acceleration' and 'class' columns
hybrid = Table.read_table('hybrid.csv')
hybrid

In [None]:
# All rows: 'msrp' (vertical) against 'mpg' (horizontal)
hybrid.scatter('mpg', 'msrp')

In [None]:
# All rows: 'msrp' (vertical) against 'acceleration' (horizontal)
hybrid.scatter('acceleration', 'msrp')

In [None]:
# Keep only the rows whose 'class' is 'SUV', then show how many there are
suv = hybrid.where('class', 'SUV')
suv.num_rows

In [None]:
# SUVs only: 'msrp' against 'acceleration'
suv.scatter('acceleration', 'msrp')

In [None]:
# SUVs only: 'msrp' against 'mpg'
suv.scatter('mpg', 'msrp')

In [None]:
def standard_units(x):
    """Convert any array of numbers to standard units.

    Each value is replaced by its deviation from the average of x, divided by
    the SD of x (np.std with its default arguments).
    """
    deviations = x - np.average(x)
    return deviations / np.std(x)

In [None]:
# The SUV mpg/msrp data with both variables converted to standard units.
# scatter(0, 1) plots column 1 against column 0; both axes are fixed to [-3, 3].
Table().with_columns(
    'mpg (standard units)',  standard_units(suv.column('mpg')), 
    'msrp (standard units)', standard_units(suv.column('msrp'))
).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

In [None]:
# Original units again, for comparison with the standard-units version in the next cell
suv.scatter('acceleration', 'msrp')

In [None]:
# The SUV acceleration/msrp data with both variables converted to standard units.
# scatter(0, 1) plots column 1 against column 0; both axes are fixed to [-3, 3].
Table().with_columns(
    'acceleration (standard units)', standard_units(suv.column('acceleration')), 
    'msrp (standard units)',         standard_units(suv.column('msrp'))
).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

## Correlation

In [None]:
# r = 1: the noise term has coefficient 0, so y equals x and every point lies on one upward-sloping line
r_scatter(1)

In [None]:
# r = -1: y equals -x, so every point lies on one downward-sloping line
r_scatter(-1)

In [None]:
# r = 0.7: y is 0.7*x plus independent noise scaled by sqrt(1 - 0.7**2)
r_scatter(0.7)

In [None]:
# r = 0: y is just the noise z, generated independently of x
r_scatter(0)

In [None]:
# A small hand-made data set of six (x, y) points for computing r step by step
x = np.arange(1, 7, 1)
y = make_array(2, 3, 1, 5, 2, 7)
t = Table().with_columns(
        'x', x,
        'y', y
    )
t

In [None]:
# The six points in their original units
t.scatter('x', 'y', s=30, color='red')

In [None]:
# Step 1: add x and y in standard units; these become columns 2 and 3 of t
t = t.with_columns(
        'x (standard units)', standard_units(x),
        'y (standard units)', standard_units(y)
    )
t

In [None]:
# The same points in standard units: column 3 against column 2
t.scatter(2, 3, s=30, color='red')

In [None]:
# Step 2: multiply each point's x and y in standard units (columns 2 and 3)
t = t.with_columns(
    'product of standard units', t.column(2) * t.column(3))
t

In [None]:
# r is the average of the products of the standard units
# (recomputed here from columns 2 and 3 of t)

r = np.average(t.column(2) * t.column(3))
r

In [None]:
def correlation(t, x, y):
    """Return the correlation between two columns of a table.

    t is a table; x and y are column labels. Both columns are converted to
    standard units, and the result is the average of their elementwise products.
    """
    su_x, su_y = (standard_units(t.column(label)) for label in (x, y))
    return np.average(su_x * su_y)

In [None]:
# Same calculation as the step-by-step r above, now through the function
correlation(t, 'x', 'y')

In [None]:
# SUVs: 'msrp' against 'mpg', shown again next to its correlation in the next cell
suv.scatter('mpg', 'msrp')

In [None]:
# Correlation between 'mpg' and 'msrp' among the SUVs
correlation(suv, 'mpg', 'msrp')

In [None]:
# SUVs: 'msrp' against 'acceleration', shown again next to its correlation in the next cell
suv.scatter('acceleration', 'msrp')

In [None]:
# Correlation between 'acceleration' and 'msrp' among the SUVs
correlation(suv, 'acceleration', 'msrp')