# Module 4 Webinar Examples


In [None]:
# import packages
import numpy as np
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import binom
import math
import warnings

warnings.filterwarnings('ignore')
sns.set_context('paper')
%matplotlib inline

import matplotlib.pylab as pylab
params = {'legend.fontsize': 'large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'large',
         'ytick.labelsize':'large'}
pylab.rcParams.update(params)

## Baker example

In [None]:
# Right-tailed z-test for a sample mean with a known population standard deviation.
m = 17        # observed sample mean
h0 = 15       # mean under the null hypothesis
n = 40        # sample size
sigma = .5    # population standard deviation

# standard error of the sample mean
se = sigma/np.sqrt(n)

# P(sample mean >= m) under the null distribution N(h0, se).
# The survival function (sf) is used instead of 1 - cdf: m lies roughly 25 standard
# errors above h0, where cdf rounds to 1.0 and 1 - cdf collapses to exactly 0.0.
p_value = stats.norm.sf(m, loc=h0, scale=se)
print('p value = ', p_value)


In [None]:
# z-score
# Same test as above, but standardizing the sample mean first and reading the
# p-value off the standard normal distribution.

m = 17        # observed sample mean
h0 = 15       # mean under the null hypothesis
n = 40        # sample size
sigma = .5    # population standard deviation

# standard error of the sample mean
se = sigma/np.sqrt(n)

# number of standard errors the sample mean lies above the null mean
z_score = (m-h0) / se
print('z = ', z_score)

# Right-tail probability. sf is used instead of 1 - cdf because for a z-score this
# large cdf rounds to 1.0 and the subtraction would print exactly 0.0.
p_value = stats.norm.sf(z_score)
print('p value = ', p_value)


## Converting between z-scores and p-values:

How can we convert a z-score to a p-value? 

Using the cumulative distribution function, `stats.norm.cdf`, which returns the area under the curve to the left of the z-score:

In [None]:
# cumulative probability of the standard normal distribution at z = -1.96
stats.norm.cdf(-1.96)

How can we convert a p-value to a z-score?

Using the percent point function, `stats.norm.ppf`, i.e. the inverse of the cumulative distribution function:

In [None]:
# z-score at which the standard normal cumulative probability equals .975
stats.norm.ppf(.975)

When using these functions, we need to think really carefully about whether we are conducting a one-tailed (left or right) test or two-tailed test - think about where on the curve we are looking!

<b>Example</b>: what are the critical z-scores for a two-tailed test with $\alpha$ = .1?  

Since this is a two-tailed test, we want to divide $\alpha$ between the two tails, i.e. have .05 in each tail.

Therefore, the z-score in the right tail will be:

In [None]:
# right-tail critical value: cumulative probability .95 leaves .05 above it
z_right = stats.norm.ppf(.95)
print(z_right)

and the z-score in the left tail will be:

In [None]:
# left-tail critical value: cumulative probability .05 lies below it
z_left = stats.norm.ppf(.05)
print(z_left)

## Type I and type II errors

<b>Example</b>: You want to test the claim that an exercise intervention increases productivity scores from 40 to 45. You collect scores from 35 individuals. Assume the population has a standard deviation of 10, and is normally distributed.  

Null distribution is centered at 40.  
Alternative distribution is centered at 45.   

Use $\alpha$ = .05, one-tailed

In [None]:
# visualize the scenario
# Draw the null and alternative sampling distributions of the mean, then shade the
# tail of the null beyond the critical value (red) and the part of the alternative
# that falls below it (blue).

n = 35
pop_mean = 40
pop_stdev = 10
alpha = .05

m1 = 40   # center of the null distribution
m2 = 45   # center of the alternative distribution
se = pop_stdev / np.sqrt(n)
thr = stats.norm.ppf(1-alpha) # z-score associated with critical X

step = 0.01
cutoff = m1 + thr*se   # critical X expressed on the score axis

plt.figure(figsize=(10,6))

# both curves are drawn over +/- 4 standard errors around their own center
x1 = np.arange(m1-4*se, m1+4*se, step)
y1 = stats.norm.pdf(x1, loc=m1, scale=se)
plt.plot(x1, y1)

x2 = np.arange(m2-4*se, m2+4*se, step)
y2 = stats.norm.pdf(x2, loc=m2, scale=se)
plt.plot(x2, y2)

# null distribution to the right of the cutoff
x_reject = np.arange(cutoff, m1+4*se, step)
plt.fill_between(x=x_reject,
                 y1=stats.norm.pdf(x_reject, loc=m1, scale=se),
                 facecolor='red', alpha=0.35)

# alternative distribution to the left of the cutoff
x_miss = np.arange(m2-4*se, cutoff, step)
plt.fill_between(x=x_miss,
                 y1=stats.norm.pdf(x_miss, loc=m2, scale=se),
                 facecolor='blue', alpha=0.35)

# (x position, y position, text) for every annotation, in drawing order
labels = [
    (m1-se, 0.12, "Null Hypothesis"),
    (m1-se, 0.1, 'Fail to Reject the null '),
    (m1-se, 0.08, r'$1- \alpha$'),
    (m2-.5*se, 0.12, "Alternative Hypothesis"),
    (m2-.5*se, 0.1, 'Power ' r'$1-\beta$'),
    (m2-se, 0.01, "Type 1 Error"),
    (m1+.5*se, 0.01, "Type 2 Error"),
]
for x_pos, y_pos, label in labels:
    plt.text(x=x_pos, y=y_pos, s=label)
plt.show()

In [None]:
# calculate critical X on null distribution

# ppf converts a cumulative probability directly to a score (X) when the loc and
# scale parameters are given. The probability is written as 1 - alpha, not a literal
# .95, so the cutoff always follows the significance level set for this scenario.

x_crit = stats.norm.ppf(1 - alpha, loc=pop_mean, scale=se)
x_crit.round(2)

In [None]:
# determine beta on alternative distribution
# beta is the cumulative probability of the alternative distribution at the critical X

mean_alt = 45

beta = stats.norm.cdf(x_crit, mean_alt, se)
np.round(beta, 2)

In [None]:
# calculate power: the complement of beta
power = 1 - beta
np.round(power, 2)

### Similar example with t-distribution.

Let's repeat the process above using a t-distribution. We will use n = 15.

Recall:  
CDF: X-score --> p-value  
PPF: p-value --> X-score  

stats.norm.cdf(X, loc, scale)  
stats.t.cdf(X, df, loc, scale)  

stats.norm.ppf(pval, loc, scale)  
stats.t.ppf(pval, df, loc, scale)

In [None]:
# Scenario parameters for the t-distribution version of the example.
n = 15          # sample size
pop_mean = 40   # mean under the null hypothesis

m1 = 40         # center of the null distribution
m2 = 45         # center of the alternative distribution

# Named sample_stdev (not sample_std) so this value does not share a name with the
# sample_std() function defined later in the notebook; with a shared name, re-running
# this cell after that definition would divide a function by a number.
sample_stdev = 10

# estimated standard error of the sample mean
se = sample_stdev / np.sqrt(n)

In [None]:
# calculate critical X on null distribution
# same step as before, but from a t-distribution with n - 1 degrees of freedom
x_crit = stats.t.ppf(.95, n-1, loc=pop_mean, scale=se)
np.round(x_crit, 2)

In [None]:
# determine beta on alternative distribution
# the alternative is modelled as the same t-distribution re-centered at mean_alt
mean_alt = 45
beta = stats.t.cdf(x_crit, n-1, loc=mean_alt, scale=se)
np.round(beta, 2)

In [None]:
# calculate power: the complement of beta
power = 1 - beta
np.round(power, 2)

## Statistics professor example

In [None]:
def sample_std(scores):
    """Return the sample standard deviation of a one-dimensional set of scores.

    The squared deviations from the mean are divided by ``n - 1`` rather than ``n``.

    Parameters
    ----------
    scores : array-like
        Numeric observations; a NumPy array or a plain sequence such as a list.

    Returns
    -------
    numpy.float64
        The sample standard deviation.
    """
    # ddof=1 selects the n - 1 denominator
    return np.std(scores, ddof=1)

# One-sample, right-tailed t-test of the observed scores against a mean of 65.
scores = np.array([65, 65, 70, 67, 66, 63, 63, 68, 72, 71])
n = len(scores)

# estimated standard error of the mean, from the sample standard deviation
std_err = sample_std(scores) / np.sqrt(n)

# t statistic, then the right-tail probability with n - 1 degrees of freedom
t = (scores.mean() - 65) / std_err
p_right = 1-stats.t.cdf(t, df=n-1)

print('t =', np.round(t,2))
print('p =', np.round(p_right, 4))