# Notes from Mastering Statistics

In [1]:
# Libraries

import numpy as np
import pandas as pd
import math
import statistics
import scipy
from statistics import NormalDist
from scipy.stats import norm
from scipy.stats import chi2

## Probability in Proportions

**79% of voters in a city are Democrats. If we sample 100 people, what is the probability that more than 68 will vote Democrat?**

In [2]:
p = 0.79       # n * p = 79 >= 5
n = 100        # n * (1-p) = 21 >= 5

phat = 0.68

z = (phat - p) / np.sqrt( (p*(1-p)) / (n) )

print(z)

-2.7006573887712992


In [3]:
# We want area on the right, so we need to invert.
# P(z>-2.7) = P(z<2.7)

P = norm.cdf(-z, 0, 1)
P

0.9965398707258555

**47% of people in a city want to buy a bike. If we surveey 61 people randomly, what is the probability that less than 40% want to buy a bike?**

In [4]:
p = 0.47      # n * p = 28.67 >= 5
n = 61        # n * (1-p) = 32.33 >= 5

phat = 0.40

z = (phat - p) / np.sqrt( (p*(1-p)) / (n) )

print(z)

-1.0954084676318019


In [5]:
# We want area on the left
# P(z<-1.09)

P = norm.cdf(z, 0, 1)
P

0.13666886303692077

**72% of shoppers at a store are women. 40 shoppers are chosen randomly. What is the probability that the proportion of women sampled differs from the mean by more than 3%.**

In [6]:
p = 0.72      # n * p = 16.8 >= 5
n = 40        # n * (1-p) = 11.2 >= 5

phat = 0.69   # and 0.72, but the two areas are the same.

z = (phat - p) / np.sqrt( (p*(1-p)) / (n) )

print(z)

-0.4225771273642586


In [7]:
# We want the area on the left and its symmetrical
# P(z<-0.42) * 2

P = 2 * norm.cdf(z, 0, 1)
P

0.6726038174415163

**Body temperature of adults are normally distributed with a mean of 98.6 deg F and standard deviation of 0.73 deg F. Find the probability that sample of 36 adults have an average body temperature less than 98.3 deg F.**

In [8]:
mu = 98.6
sigma = 0.73

n = 36    # > 30, Normal Distribution

s = sigma / np.sqrt(n)
xbar = 98.3

z = (xbar - mu) / s

P = norm.cdf(z, 0, 1)

print("Z:", round(z,2))
print()
print("(P < 98.3):", round(P*100,2),"%")

Z: -2.47

(P < 98.3): 0.68 %


**IQ scores for the USA have a mean of 100 and standard deviation of 15. If a sample of 50 people are given an IQ test, what is the probability that their mean IQ will be less than 95?**

In [9]:
mu = 100
sigma = 15

n = 50 # > 30, Normal Distribution

s = sigma / np.sqrt(n)
xbar = 95

z = (xbar - mu) / s

P = norm.cdf(z, 0, 1)

print("Z:", round(z,2))
print()
print("(P < 95):", round(P*100,2),"%")

Z: -2.36

(P < 95): 0.92 %


**The population IQ mean is 100 with standard deviation of 15. If we sample 50 people, what is the probability that the average of the sample is greater than 105?** 

In [10]:
mu = 100
sigma = 15

n = 50 # > 30, Normal Distribution

s = sigma / np.sqrt(n)
xbar = 105

z = (xbar - mu) / s

P = norm.cdf(-z, 0, 1)     # We change the sign because the normal dist. give us the area on the left and we want on the right

print("Z:", round(z,2))   
print()
print("(P > 105):", round(P*100,2),"%")

Z: 2.36

(P > 105): 0.92 %


**The population IQ mean is 100 with standardd deviation of 15. If we sample 50 people, what is the probability that their mean IQ will differ from the population mean by more than 5?**

In [11]:
mu = 100
sigma = 15

n = 50 # > 30, Normal Distribution

s = sigma / np.sqrt(n)

xbar1 = mu + 5
xbar2 = mu - 5

z1 = (xbar1 - mu) / s
z2 = (xbar2 - mu) / s

P1 = norm.cdf(-z1, 0, 1)  # we want right side
P2 = norm.cdf(z2, 0, 1)  # we want left side

P = P1 + P2

print("Z1:", round(z1,2))
print("Z2:", round(z2,2))
print()
print("(P < 95 ou  P > 105):", round(P*100,2),"%")

Z1: 2.36
Z2: -2.36

(P < 95 ou  P > 105): 1.84 %


**Averge heartrate is 140 bpm, with standard deviation of 12 bpm. What is the probability that randomly chosen fetal heartrate differs from the mean by mmore than 25 bpm?**

In [12]:
mu = 140
sigma = 12

xbar1 = mu - 25
xbar2 = mu + 25

z1 = (xbar1 - mu) / sigma
z2 = (xbar2 - mu) / sigma

P1 = norm.cdf(z1, 0, 1)  # we want right side
P2 = norm.cdf(-z2, 0, 1)  # we want left side

P = P1 + P2

print("Z1:", round(z1,2))
print("Z2:", round(z2,2))
print()
print("(P < 115 ou  P > 165):", round(P*100,2),"%")

Z1: -2.08
Z2: 2.08

(P < 115 ou  P > 165): 3.72 %


**The average person plays video games 7.5 hours per week with standard deviation of 3 hours. If I choose 110 peope randomly, what is the probability that their mean game play is more than 8 hrs per week?**

In [13]:
mu = 7.5
sigma = 3

n = 110 # > 30, Normal Distribution

s = sigma / np.sqrt(n)

xbar = 8

z = (xbar - mu) / s

P = norm.cdf(-z, 0, 1)     # We want the right side

print("Z:", round(z,2))   
print()
print("(P > 8hrs):", round(P*100,2),"%")

Z: 1.75

(P > 8hrs): 4.02 %


**In a city with 100,000 people, 79% of voters in a city are Democrats. If sample 100 people, what is the probability that more than 68 mil vote Democrats?**

In [14]:
p = 0.79
n = 100 # > 30, Normal Distribution

# n*p = 79  >= 5, check
# n*(1-p) = 21 >= 5, check

phat = 68/100

z = (phat - p) / np.sqrt( (p*(1-p)) / (n) )

P = norm.cdf(-z, 0, 1)     # We want the right side

print("Z:", round(z,2))   
print()
print("(P > 68 mil):", round(P*100,2),"%")

Z: -2.7

(P > 68 mil): 99.65 %


**47% of people in a city want to buy a bike. If we survey 61 people randomly, what is the probability that less than 40% want to buy a bike?**

In [15]:
p = 0.47
n = 61 # > 30, Normal Distribution

# n*p = 28.7     >= 5, check
# n*(1-p) = 32.3 >= 5, check

phat = 0.4

z = (phat - p) / np.sqrt( (p*(1-p)) / (n) )

P = norm.cdf(z, 0, 1)     # We want the left side

print("Z:", round(z,2))   
print()
print("(P < 40% buy a bike):", round(P*100,2),"%")

Z: -1.1

(P < 40% buy a bike): 13.67 %


## Confidence Interval

**A survey of 200 males shows that they read on average 15.7 hours per week. If the margin of error is 2.2 hours at 95% confidence, construct the confidence interval.**

In [16]:
xbar = 115.7
E = 2.2

lcb = xbar - E   # Lower class boundary
ucb = xbar + E   # Upper class boundary

print('95% C.I.:',(lcb,ucb))

95% C.I.: (113.5, 117.9)


**A survey of 600 people finds they sleep an average of 10.5 hours per night. If the margin of error at 98% confidence level is 1.3 hours, construct C.I.**

In [17]:
xbar = 10.5
E = 1.3

lcb = xbar - E
ucb = xbar + E

print('98% C.I.:',(lcb,ucb))

98% C.I.: (9.2, 11.8)


**We ask 100 people how much a Supreme Pizza costs. The average answer is \\$25.99 with standard deviation of $15.50. Construct a 99% C.I. that contains the average price of a Supreme Pizza nationwide.**

In [18]:
xbar = 25.99
n = 100
s = 15.5
z = 2.576   # z for 99% C.I.

E = z * ( s / np.sqrt(n))
E

3.9928000000000003

In [19]:
lcb = xbar - E
ucb = xbar + E

print('99% C.I.:',(lcb,ucb))

99% C.I.: (21.9972, 29.982799999999997)


**78 students surveyed said that study an average of 15 hours per week with standard deviation of 2.3 hours. What is the margin of error for 90% C.I. for the student population:**

In [20]:
xbar = 15
n = 78
s = 2.3
z = 1.645   # z for 90% C.I.

E = z * ( s / np.sqrt(n))
E

0.42839701586860773

In [21]:
lcb = xbar - E
ucb = xbar + E

print('90% C.I.:',(lcb,ucb))

90% C.I.: (14.571602984131392, 15.428397015868608)


**85 home owners answered that they spend \\$67 per month on repairs with standard deviation of \\$14. Find the 99% Confidence Interval for how much money spent on repairs per month by all home owners.**

In [22]:
n = 85
xbar = 67
s = 14
z = 2.576   # z for 99% C.I.

E = z * ( s / np.sqrt(n))

lcb = round(xbar - E, 4)
ucb = round(xbar + E, 4)


print('99% C.I.:',(lcb,ucb))

99% C.I.: (63.0883, 70.9117)


## T-Student Distribution

**25 tigers were found to have average weight of 600 pounds with standard deviation of 90 pounds. What is the margin of error on a 98% confidence interval?**

In [23]:
n = 25     # < 30, t-student distribution

xbar = 600
s = 90

loc = 0.98
alpha = (1-loc)/2

dof = n - 1

t = scipy.stats.t.ppf(alpha, dof)
t = abs(t)
t

2.4921594731575762

In [24]:
E = t * ( s / np.sqrt(n))

print(round(E,2),'pounds')

44.86 pounds


**A sample of 4 bus crashes show on average 49 people died with standard deviation f 15. Find a 99% confidence interval for the number of fatalities in a bus crash nationwide.**

In [25]:
n = 4     # < 30, t-student distribution

xbar = 49
s = 15

loc = 0.99
alpha = (1-loc)/2

dof = n - 1

t = scipy.stats.t.ppf(alpha, dof)
t = abs(t)
t

5.84090929975643

In [26]:
E = t * ( s / np.sqrt(n))

lcb = xbar - E
ucb = xbar + E

print('90% C.I.:',(lcb,ucb))

90% C.I.: (5.1931802518267745, 92.80681974817323)


## Error in Proportions C.I.

**13 out of 147 teachers know sign language. Find the margin of error for a 90% confidence interval for the proportion of teachers who know sign language.**

In [27]:
n = 147

phat = 13/147

z = 1.645   # z for 90% C.I.

E = z * np.sqrt( (phat*(1-phat)) / n )

print(round(E*100,2),'%')

3.85 %


**190 adults are surveyed and 71 say that they eat out regularly. Construct a 95% C.I. for the proportion of adults who eat out regulaarly.**

In [28]:
n = 190

phat = 71/190

z = 1.960   # z for 95% C.I.

E = z * np.sqrt( (phat * (1-phat)) / (n) )

lcb = phat - E
ucb = phat + E

lcbp = round(lcb*100, 2)
ucbp = round(ucb*100, 2)

print('95% C.I.(in percentage):',(lcbp,ucbp))

95% C.I.(in percentage): (30.49, 44.25)


**You want to find a 98% C.I. for the proportion of college students who get a student loan. You read an estimate that states that around 78% of students to get a loan. You want your max error to be 5%. How many samples do you need?**

In [29]:
phat= 0.78
E = 0.05
z = 2.326   # z for 98% C.I.

n = (phat * (1-phat) * z**2) / (E**2)

print(math.ceil(n))   # Round up the number

372


## Chi-Square Distribution

**A pencil factory is testing the variance in the mass of pencil produced. Sampling 15 pencils randomly shows variance of 3.4 grams. What is a 95% confidence interval for variance of all pencils produced?**

In [30]:
# Chi-Square Distribution (for variances and standards deviation)

from scipy.stats import chi2_contingency
from scipy.stats import chi2

var = 3.4
n = 15
dof = n - 1

loc = 0.95
alpha_2 = (1-loc)/2
one_alpha_2 = 1 - alpha_2


area_left = chi2.ppf(alpha_2, dof)        # Area to the left

area_right = chi2.ppf(one_alpha_2, dof)   # Area to the right

lcb = (dof * var) / area_right
ucb = (dof * var) / area_left

lcbr = round(lcb, 2)
ucbr = round(ucb, 2)

print('95% C.I.:',(lcbr,ucbr))

95% C.I.: (1.82, 8.46)


**On a factory assembly line for soft drinks, 40 are sampled and the amount of liquid inside had a standard deviation of 7.7 ml. Find a 98% C.I. for the standard deviation of liquid in the drinks produced by this factory.**

In [31]:
# Chi-Square Distribution (for variances and standards deviation)

#from scipy.stats import chi2_contingency
#from scipy.stats import chi2

s = 7.7
var = s**2
n = 40
dof = n - 1

loc = 0.98
alpha_2 = (1-loc)/2
one_alpha_2 = 1 - alpha_2


area_left = chi2.ppf(alpha_2, dof)         # Area to the left

area_right = chi2.ppf(one_alpha_2, dof)    # Area to the right

lcb = np.sqrt( (dof * var) / area_right )  # "squrt, because now we want the answeer in
ucb = np.sqrt( (dof * var) / area_left )   #  standard deviation and not in variance"

lcbr = round(lcb, 2)
ucbr = round(ucb, 2)

print('98% C.I.(in ml):',(lcbr,ucbr))

98% C.I.(in ml): (6.09, 10.39)


## Hypothesis Test

**Doctors at a hospital believe they see on average at least 8 patients per day. Management claims that they don't see this many patients consistently. They sample 19 doctors who report a sample mean of 7.5 patients seen per day with standard deviation of 11 patients. Test management's claim at 0.025 level of significance.**

In [32]:
# Ho = mu >= 8
# Ha = mu < 8      -> left tail test

<img src="images/lefttailtest.jpg"/>

In [33]:
alpha = 0.025

n = 19        # < 30, t-student distribution
dof = n - 1

mu = 8
xbar = 7.5
s = 1.1

# t-critical
t_alpha = scipy.stats.t.ppf(alpha, dof)
t_alpha

-2.10092204024096

In [34]:
# t-statistic
t_stat = (xbar - mu) / (s / np.sqrt(n))
t_stat

-1.981317701609397

In [35]:
# left tail test: 
#
# If t-statistic > t-critical   i.e, if t-statistic is at left to t-critical  -> Reject Ho!

if abs(t_stat) < abs(t_alpha):
    print('Fail to reject Ho.')
else:
    print('Reject Ho.')

Fail to reject Ho.


**A gocery store assumes that average shopper spends no more thann \\$100 in the store. The new store manager believes that they spend more. He chooses 27 shoppers randmly and they spend an average \\$104.93 with a stanndard deviation of $9.07. Test the manager's claim at the 0.05 significance level.**

In [36]:
# Ho: mu =< 100
# Ha: mu > 100   -> right tail test

<img src="images/righttailtest.jpg"/>

In [37]:
alpha = 0.05

n = 27        # < 30, t-student distribution
dof = n - 1

mu = 100
xbar = 104.93
s = 9.07

# t-critical
t_alpha = scipy.stats.t.ppf(alpha, dof)

# t-statistic
t_stat = (xbar - mu) / (s / np.sqrt(n))

print('t_alpha:',abs(t_alpha))
print()
print('t-statistic:', t_stat)
print()

if abs(t_stat) < abs(t_alpha):
    print('Fail to reject Ho.')
else:
    print('Reject Ho!')

t_alpha: 1.7056179197592731

t-statistic: 2.824369508703829

Reject Ho!


**At water bottling factory, a machine is supposed to put 2 liters of water into the bottles. After an overhaul, management thinks the machine is no longer putting the correct amount of water in. They sample 20 bottles and find an average of 2.10L of water with standard deviation of 0.33L. Test the claim at 0.01 level of significance.**

In [38]:
# Ho: mu == 2L
# Ha: mu != 2L    -> 2 tail test

<img src="images/2tailtest.jpg"/>

In [39]:
alpha = 0.01
alpha_2 = alpha / 2      # 2 tail test

n = 20                   # < 30, t-student distribution
dof = n - 1

mu = 2
xbar = 2.1
s = 0.33

# t-critical
t_alpha_2 = scipy.stats.t.ppf(alpha_2, dof)

# t-statistic
t = (xbar - mu) / (s / np.sqrt(n))

print('t_alpha:',abs(t_alpha_2))
print()
print('t-statistic:', t)
print()

if ( -abs(t_alpha_2) < t) and (t < abs(t_alpha_2)) :
    print('Fail to reject Ho.')
else:
    print('Reject Ho!')

t_alpha: 2.860934606449914

t-statistic: 1.3551927136362374

Fail to reject Ho.


**A magazine report that teenagers make on average at least 4 phone calls per night. The school principal thinks that the magazine is wrong. He samples 25 teens and gets an average of 3.4 calls per night with a standard deviation of 0.9 calls. Test the principals claim at a 99% level of confidence.**

In [40]:
# Ho: mu >= 4 calls
# Ha: mu < 4 calls  -> left tail test

<img src="images/lefttailtest.jpg"/>

In [41]:
alpha = 0.01

n = 25        # < 30, t-student distribution
dof = n - 1

mu = 4
xbar = 3.4
s = 0.9

# t-critical
t_alpha = round( scipy.stats.t.ppf(alpha, dof), 2)  # area on the left

# t-statistic
t_stat = round( (xbar - mu) / (s / np.sqrt(n)), 2)

print('t_alpha:', t_alpha)
print()
print('t-statistic:', t_stat)
print()


if t_stat < t_alpha:    ### Left Analysis ###
    print('Reject Ho!')
else:
    print('Fail to reject Ho.')

t_alpha: -2.49

t-statistic: -3.33

Reject Ho!


**A pizza chain advertises they will deliver your pizza in no more than 20 min from when you placee the order. You don't believe this and decide to test the claim at 95% level of confidence. You sample 7 friends who report an average of 22.7 min delivery time with standard deviation of 4.3 min. Test the claim.**

In [42]:
# Ho: mu =< 20 min
# Ha: mu > 20 min  -> right tail test

<img src="images/righttailtest.jpg"/>

In [43]:
alpha = 0.05

n = 7        # < 30, t-student distribution
dof = n - 1

mu = 20
xbar = 22.7
s = 4.3

# t-critical
t_alpha = (-1) * round( scipy.stats.t.ppf(alpha, dof), 2)  # (-1)*, area on the right in t-distribution

# t-statistic
t_stat = round( (xbar - mu) / (s / np.sqrt(n)), 2)

print('t_alpha:', t_alpha)
print()
print('t-statistic:', t_stat)
print()


if t_stat > t_alpha:    ### Right Analysis ###
    print('Reject Ho!')
else:
    print('Fail to reject Ho.')

t_alpha: 1.94

t-statistic: 1.66

Fail to reject Ho.


**A computer company thinks it takes customers on average 15 min to set up their computer. A newly redesigned model has been launched, and managers want to know if it takes users different from 15 min to set up. They sample 20 users who report average of 14.1 min with standard deviation of 1.9 min. Test the claim at 0.05 level of significance.** 

In [44]:
# Ho: mu == 15 min
# Ha: mu != 15 min    -> 2 tail test

<img src="images/2tailtest.jpg"/>

In [45]:
alpha = 0.05
alpha_2 = alpha / 2      # 2 tail test

n = 20                   # < 30, t-student distribution
dof = n - 1

mu = 15
xbar = 14.1
s = 1.9

# t-critical
t_alpha_2 = round(scipy.stats.t.ppf(alpha_2, dof), 2)

# t-statistic
t_stat = round((xbar - mu) / (s / np.sqrt(n)), 2)

print('t_alpha:',abs(t_alpha_2))
print()
print('t-statistic:', t_stat)
print()


if ( (-1)*abs(t_stat) < (-1)*abs(t_alpha_2)) or (abs(t_stat) > abs(t_alpha_2)) :  ### 2 tail Analysis ###    
    print('Reject Ho!')
else:
    print('Fail to reject Ho.')

t_alpha: 2.09

t-statistic: -2.12

Reject Ho!


**Records show that students on average score less than or equal to 850 on a test. A test prep company says that students who take their course with score on average higher than this. To test, they sample 1000 students wh score an average of 856 with a standardd deviation of 98 after taking the course. At 0.05 level of significance, test the company claim.**

In [46]:
# Ho: mu =< 850 pts
# Ha: mu > 850 pts  -> right tail test

<img src="images/righttailtest.jpg"/>

In [47]:
alpha = 0.05
loc = 1 - alpha

n = 1000       # > 30, normal distribution

mu = 850
xbar = 856
s = 98

# t-critical
t_alpha = round( norm.ppf(loc), 2)

# t-statistic
t_stat = round( (xbar - mu) / (s / np.sqrt(n)), 2)

print('t_alpha:', t_alpha)
print()
print('t-statistic:', t_stat)
print()


if t_stat > t_alpha:    ### Right Analysis ###
    print('Reject Ho!')
else:
    print('Fail to reject Ho.')

t_alpha: 1.64

t-statistic: 1.94

Reject Ho!


## Hypothesis Test with p-value

**A newspaper reports that the average age a woman gets a married is 25 years old or younger. A researcher thinks the average is older. He samples 213 women and gets an average age of 25.4 years with standard deviation of 2.3 years. With 95% level confidence, test the researcher's claim.**

In [48]:
# Ho: mu =< 25 years
# Ha: mu > 25 years  -> right tail test

<img src="images/righttailtest.jpg"/>

In [49]:
alpha = 0.05

mu = 25

n = 213 # > 30, Normal Distribution
xbar = 25.4
s = 2.3

z = (xbar - mu) / (s / np.sqrt(n))

p_value = norm.cdf(-z, 0, 1)     # We want the right side in normal distribution

print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('p-value < alpha: Reject Ho!')
else:
    print('p-value > alpha: Fail to reject Ho.')

Z-statistic: 2.54
P-value: 0.0056
Alpha: 0.05

p-value < alpha: Reject Ho!


**A study showed that on average women in a city had 1.48 kids. A researcher believes this number is wrong. He surveys 128 women in this city nd finds that on average these wmen had 1.39 kid with standard deviation of 0.84 kid. At 90% loc, test the claim.**

In [50]:
# Ho: mu == 1.48 kid
# Ha: mu != 1.39 kid   -> 2 tail test

<img src="images/2tailtest.jpg"/>

In [51]:
alpha = 0.10

mu = 1.48

n = 128 # > 30, Normal Distribution
xbar = 1.39
s = 0.84

z = (xbar - mu) / (s / np.sqrt(n))

p_value = (2) * norm.cdf(z, 0, 1)   # 2 areas

print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('p-value < alpha: Reject Ho!')
else:
    print('p-value > alpha: Fail to reject Ho.')

Z-statistic: -1.21
P-value: 0.2254
Alpha: 0.1

p-value > alpha: Fail to reject Ho.


**The government says the average weight of males is 162.9 pounds or greater. A researcher think this is too high. He does  study of 39 males and gets an average weight of 160.1 pounds with standard deviation of 5.6 pounds. At 0.01 level of significcance, test the claim.**

In [52]:
# Ho: mu >= 162.9 lbs
# Ha: mu < 162.9 lbs  -> left tail test

<img src="images/lefttailtest.jpg"/>

In [53]:
alpha = 0.01

mu = 162.9

n = 39        # > 30, Normal Distribution
xbar = 160.1
s = 5.6

z = (xbar - mu) / (s / np.sqrt(n))

p_value = norm.cdf(z, 0, 1)

print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('p-value < alpha: Reject Ho!')
else:
    print('p-value > alpha: Fail to reject Ho.')

Z-statistic: -3.12
P-value: 0.0009
Alpha: 0.01

p-value < alpha: Reject Ho!


**A local fire dept has been advertising that 65% of staff like their assigned gear.  A new hire believes that less than 65% like their gear, so he asked 50 firefighters and 27 said they did liked the gear. Test the claim at 95% level of confidence.**

In [54]:
# Ho: p >=  0.65
# Ha: p < 0.65  -> left tail test

<img src="images/lefttailtest.jpg"/>

In [55]:
alpha = 0.05

n = 50        # > 30, Normal Distribution
p = 0.65

# n*p = 32.5     >= 5, check
# n*(1-p) = 17.5 >= 5, check

phat = 0.54

z = (phat - p) / np.sqrt( (p*(1-p))/(n) )

p_value = norm.cdf(z, 0, 1)

print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('p-value < alpha: Reject Ho!')
else:
    print('p-value > alpha: Fail to reject Ho.')

Z-statistic: -1.63
P-value: 0.0515
Alpha: 0.05

p-value > alpha: Fail to reject Ho.


**A report states that at least 75% of women like red roses. Angie thinks this figure too high. She asks 125 women and finds that 92 do like red roses. At a 0.10 level of significance, test the claim.**

In [56]:
# Ho: p >=  0.75
# Ha: p < 0.75    -> left tail test

<img src="images/lefttailtest.jpg"/>

In [57]:
alpha = 0.10

n = 125        # > 30, Normal Distribution
p = 0.75

# n*p = 32.5     >= 5, check
# n*(1-p) = 17.5 >= 5, check

phat = 92/125

z = (phat - p) / np.sqrt( (p*(1-p))/(n) )

p_value = norm.cdf(z, 0, 1)

print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('p-value < alpha: Reject Ho!')
else:
    print('p-value > alpha: Fail to reject Ho.')

Z-statistic: -0.36
P-value: 0.3589
Alpha: 0.1

p-value > alpha: Fail to reject Ho.


**A report states that 1% of college degrees are in mathematics. A researcher doesn't believe this is correct. He samples 12,317 graduates and finds that 128 have math degrees. Test the claim at 0.10 level of significance.**

In [58]:
# Ho: p == 0.01 
# Ha: p != 001   -> 2 tail test

<img src="images/2tailtest.jpg"/>

In [59]:
alpha = 0.10

n = 12317        # > 30, Normal Distribution
p = 0.01

# n*p     >= 5, check
# n*(1-p) >= 5, check

phat = 148/12317

z = (phat - p) / np.sqrt( (p*(1-p))/(n) )  

p_value =  2 * norm.cdf(-z, 0, 1)         # If z is positive, change de sign to cath the area on the left
                                          # 2 * because 2 tail test
print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('p-value < alpha: Reject Ho!')
else:
    print('p-value > alpha: Fail to reject Ho.')

Z-statistic: 2.25
P-value: 0.0245
Alpha: 0.1

p-value < alpha: Reject Ho!


## Hypothesis Test with Chi-Square

**A pencil manufacturer requires that mass of their pencils have standar deviation that won't exceed 0.08 grams. An inspector thinks that the standar deviation is larger. He samples 30 pencils and finds they have a mean mass of 1.62 g and sandardd deviation of 0.0804 g. Test the clam at 0.05 level of significance.**

In [60]:
# Ho: sigma =< 0.08 g
# Ha: sigma > 0.08 g  -> right tail test

<img src="images/chirightt.jpg"/>

In [61]:
# Chi-Square Distribution (for variances and standards deviation)

#from scipy.stats import chi2_contingency
#from scipy.stats import chi2

alpha = 0.05
loc = 1 - alpha

sigma = 0.08
s = 0.0804
n = 30
dof = n-1


# t-critical
t_alpha = chi2.ppf(loc, dof)    # Area on the right

# t-statistic
t_stat = (dof * s**2) / (sigma**2)


print('t_alpha:', round(t_alpha,2))
print()
print('t-statistic:', round(t_stat,2))
print()


if t_stat > t_alpha:    ### Right Analysis ###
    print('(t_statistic > t_alpha): Reject Ho!')
else:
    print('(t_statistic < t_alpha): Fail to reject Ho.')

t_alpha: 42.56

t-statistic: 29.29

(t_statistic < t_alpha): Fail to reject Ho.


**Drug A is know to have variance of 0.0009 in its active ingredient. Drug B claims to be better because  it has a smaller variance. Company B tests this by sampling 100 of drug B pills. The active ingredient has a mean of 2.47 mg and standard deviation of 0.026 mg. At 0.01 level of significance, test company B's claim.**

In [62]:
# Ho: variance >=  0.0009
# Ha: variance < 0.0009    -> left tail test

<img src="images/chileftt.jpg"/>

In [63]:
# Chi-Square Distribution (for variances and standards deviation)
#from scipy.stats import chi2

alpha = 0.01
loc = 1 - alpha

sigma_2 = 0.0009  # variance
std = 0.026       # std from sample
n = 100
dof = n-1


# t-critical
t_alpha = chi2.ppf(alpha, dof)    # Area to the left


# t-statistic
t_stat = (dof * std**2) / (sigma_2)


print('t_alpha:', round(t_alpha,2))
print()
print('t-statistic:', round(t_stat,2))
print()


if t_stat < t_alpha:    ### Left Analysis ###
    print('(t_statistic < t_alpha): Reject Ho!')
else:
    print('(t_statistic > t_alpha): Fail to reject Ho.')

t_alpha: 69.23

t-statistic: 74.36

(t_statistic > t_alpha): Fail to reject Ho.


**A pizza manufacturer says his pizzas have a variance in the diameter of 16 cm. He installs new equipment and now no longer thinks this is the case. He select 40 pizzas off the line and finds they have a standard deviation 3.2 cm. Test the claim at 90% level of confidence.**

In [64]:
# Ho: variance == 16 cm 
# Ha: variance != 16 cm   -> 2 tail test

<img src="images/chi2tailt.jpg"/>

In [65]:
# Chi-Square Distribution (for variances and standards deviation)
#from scipy.stats import chi2

alpha = 0.10

alpha_2 = alpha/2
one_alpha_2 = 1 - alpha_2

sigma_2 = 16    # variance
s_2 = 3.2       # variance from sample
n = 40
dof = n-1


# t-critical
t_alpha_2 = chi2.ppf(alpha_2, dof)            # Area to the left
t_one_alpha_2 = chi2.ppf(one_alpha_2, dof)    # Area on the right

# t-statistic
t_stat = (dof * s_2**2) / (sigma_2)

print('t (1-alpha/2):', round(t_alpha_2,2))
print()
print('t (alpha/2):', round(t_one_alpha_2,2))
print()
print('t-statistic:', round(t_stat,2))
print()


if (t_stat < t_alpha_2) or (t_stat > t_one_alpha_2):    ### 2 tail Analysis ###
    print('(t statistic < t 1-alpha/2) or (t statistic > t alpha/2): Reject Ho!')
else:
    print('(t 1-aplpha/2 < t statistic < t alpha/2): Fail to reject Ho.')

t (1-alpha/2): 25.7

t (alpha/2): 54.57

t-statistic: 24.96

(t statistic < t 1-alpha/2) or (t statistic > t alpha/2): Reject Ho!


## Hypothesis Test with 2 Means

**A researcher thinks that grocery shoppers spend more when they haven't eaten. To test, he samples 41 shoppers who didn't eat breakfast. These people sent on average \\$71.27 with standard deviation of \\$8.05. 52 shoppers who did eat breakfast spent on average \\$69.43 with standard deviation $9.22. Test with 95% LOC.**

In [66]:
# Ho: mu1 - mu2 =< 0
# Ha: mu1 - mu2 > 0  -> right tail test

<img src="images/righttailtest.jpg"/>

In [67]:
loc = 0.95
alpha = 1 - loc


# Empty (mu1)
n1 = 41
xbar1 = 72.27
s1 = 8.05

# Full (mu2)
n2 = 52
xbar2 = 69.43
s2 = 9.22

delta_mu = 0

z = ( (xbar1 - xbar2) - (delta_mu) )/ (np.sqrt( (s1**2/n1) + (s2**2/n2) ))

p_value = norm.cdf(-z, 0, 1)     # We want the right side in normal distribution


print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('(p-value < alpha): Reject Ho!')
else:
    print('(p-value > alpha): Fail to reject Ho.')

Z-statistic: 1.58
P-value: 0.0566
Alpha: 0.05

(p-value > alpha): Fail to reject Ho.


**A new cholesterol drug claims to lower cholesterol by over 20 pts. Group A of 55 people eercise and take the drug and lower cholesterol by average of 44.7 pts with standdard deviation of 6.8 pts. Group B is 55 different people who exeercise but don't take the drug and lower cholesterol by average 23.1 pts with standard deviation 5.3 pts. Tet at 0.01 LOC.**

In [68]:
# Ho: mu1 - mu2 =< 20
# Ha: mu1 - mu2 > 20  -> right tail test

<img src="images/righttailtest.jpg"/>

In [69]:
loc = 0.99
alpha = 1 - loc


# Group A (take the drug)
n1 = 55
xbar1 = 44.7
s1 = 6.8

# Group B (not take the drug)
n2 = 55
xbar2 = 23.1
s2 = 5.3

delta_mu = 20

z = ( (xbar1 - xbar2) - (delta_mu) )/ (np.sqrt( (s1**2/n1) + (s2**2/n2) ))

p_value = norm.cdf(-z, 0, 1)     # We want the right side in normal distribution


print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('(p-value < alpha): Reject Ho!')
else:
    print('(p-value > alpha): Fail to reject Ho.')

Z-statistic: 1.38
P-value: 0.0844
Alpha: 0.01

(p-value > alpha): Fail to reject Ho.


**It was always accepted that people from two nearby cities exercises the same amount. A researcher proposes that the two cities don't exercise the same. City A asks 36 people who answer that they exercise on average 2.9 hours/week with standard deviation 1.1 h/w. City B asks 38 people who answer they exercise 2.7 hours/week with standard deviation 1.0 h/w. test claim at 0.05 level off significancce.**

In [70]:
# Ho: mu1 - mu2 == 0 
# Ha: mu1 - mu2 != 0   -> 2 tail test

<img src="images/2tailtest.jpg"/>

In [71]:
alpha = 0.05

# City A
n1 = 36
xbar1 = 2.9
s1 = 1.1

# City B
n2 = 38
xbar2 = 2.7
s2 = 1.0

delta_mu = 0

z = ( (xbar1 - xbar2) - (delta_mu) )/ (np.sqrt( (s1**2/n1) + (s2**2/n2) ))

p_value = 2 * norm.cdf(-z, 0, 1)     # We want the right side in normal distribution
                                     # (2*) because is a 2 tail test

print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('(p-value < alpha): Reject Ho!')
else:
    print('(p-value > alpha): Fail to reject Ho.')

Z-statistic: 0.82
P-value: 0.4139
Alpha: 0.05

(p-value > alpha): Fail to reject Ho.


**It's claimed that people who go to home improvement classes finish projects in ess time. Group A is 10 people who attend a class. on average, they finish projects in 14.1 hours with standard deviation of 2.3 hs. Group B is 10 people who don't attend a cass and finish projects on average 15.0 hs with standard deviation 2.4 hs. Test the claim at 0.01 levl of significance.**

In [72]:
# Ho: mu1 - mu2 >=  0
# Ha: mu1 - mu2 < 0    -> left tail test

<img src="images/lefttailtest.jpg"/>

In [73]:
### Pooled! ###

alpha = 0.01
loc = 1 - alpha

# Data A (took class)
n1 = 10
xbar1 = 14.1
s1 = 2.3

# Data B (No class)
n2 = 10
xbar2 = 15.0
s2 = 2.4

dof = n1 + n2 - 2

delta_mu = 0


# t-critical
t_alpha = round( scipy.stats.t.ppf(alpha, dof), 2)  # area on the left

# t-statistic    # Pooled!
t_stat = ( (xbar1 - xbar2) - (delta_mu) )  /  ( np.sqrt(  (  ( (n1-1)*s1**2 ) +  (  (n1-1)*s1**2  )  ) / (n1+n2-2) ) * np.sqrt( (1/n1) + (1/n2) ) )



print('t_alpha:', round(t_alpha,2))
print()
print('t-statistic:', round(t_stat,2))
print()


if t_stat < t_alpha:    ### Left Analysis ###
    print('(t_statistic < t_alpha): Reject Ho!')
else:
    print('(t_statistic > t_alpha): Fail to reject Ho.')

t_alpha: -2.55

t-statistic: -0.87

(t_statistic > t_alpha): Fail to reject Ho.


**A test prep company claims that their class increases test scores. In city A, 15 students took the class and got an average score of 942 points with standard deviation 103 points. In city B, 18 students did not take the class and got average of 898 score with standard deviation 95 points. Test the claim at 0.95 LOS. Because students are in different cities, assume variances are not equal**

In [74]:
# Ho: mu1 - mu2 <=  0
# Ha: mu1 - mu2 > 0    -> right tail test

<img src="images/righttailtest.jpg"/>

In [75]:
### Not Pooled! ###

alpha = 0.05
loc = 1 - alpha

# City A (took class)
n1 = 15
xbar1 = 942
s1 = 103

# City B (No class)
n2 = 18
xbar2 = 898
s2 = 95


# DOF for Not Pooled     alternative: (dof = n1 - 1) if n1 < n2 else (dof = n2 -1)
if n1 < n2:
    dof = n1 -1
else:
    dof = n2 -1
    

delta_mu = 0


# t-critical
t_alpha = (-1) * round( scipy.stats.t.ppf(alpha, dof), 2)  # area on the right

# t-statistic    # Pooled!
t_stat = ( (xbar1 - xbar2) - (delta_mu) )  /  ( np.sqrt(  (  ( (n1-1)*s1**2 ) +  (  (n1-1)*s1**2  )  ) / (n1+n2-2) ) * np.sqrt( (1/n1) + (1/n2) ) )



print('t_alpha:', round(t_alpha,2))
print()
print('t-statistic:', round(t_stat,2))
print()


if t_stat < t_alpha:    ### Right Analysis ###
    print('(t_statistic < t_alpha): Fail to reject Ho.')
else:
    print('(t_statistic > t_alpha): Reject Ho!')

t_alpha: 1.76

t-statistic: 1.29

(t_statistic < t_alpha): Fail to reject Ho.


**A reporter thinks that the president's approval rating has improved following an article he published. Before the article, 480 of 1200 citizens approved of the president. After the article, 550 of 1180 citizens approved of his work. Test the claim at 5% level of significance.**

In [76]:
# Claim: p1 < p2

# Ho: p1 - p2 >=  0
# Ha: p1 - p2 < 0    -> left tail test

<img src="images/lefttailtest.jpg"/>

In [77]:
alpha = 0.05
loc = 1 - alpha

# Before
x1 = 480
n1 = 1200

# After
x2 = 550
n2 = 1180

p1_hat = x1/n1
p2_hat = x2/n2

pbar = (x1 + x2) / (n1 + n2)

delta_p = 0

z = ( (p1_hat - p2_hat) - delta_p ) / np.sqrt( pbar*(1-pbar)*((1/n1)+(1/n2)) )

p_value = norm.cdf(z, 0, 1)

print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('p-value < alpha: Reject Ho!')
else:
    print('p-value > alpha: Fail to reject Ho.')

Z-statistic: -3.25
P-value: 0.0006
Alpha: 0.05

p-value < alpha: Reject Ho!


**A dealer claims that more cars are purchase by single women then single men. When studying single women, 100 out 500 purchases were made. In the men's study, 72 of 500 single men bought cars. Test the claim at 0.01 level of significance.**

In [78]:
# Claim: p1 > p2

# Ho: p1 - p2 <=  0
# Ha: p1 - p2 > 0    -> right tail test

<img src="images/righttailtest.jpg"/>

In [79]:
alpha = 0.01
loc = 1 - alpha

# Women
x1 = 100
n1 = 500

# Men
x2 = 72
n2 = 500

p1_hat = x1/n1
p2_hat = x2/n2

pbar = (x1 + x2) / (n1 + n2)

delta_p = 0

z = ( (p1_hat - p2_hat) - delta_p ) / np.sqrt( pbar*(1-pbar)*((1/n1)+(1/n2)) )

p_value = norm.cdf(-z, 0, 1)

print("Z-statistic:", round(z,2))   
print("P-value:", round(p_value,4))
print("Alpha:", round(alpha,4))
print()


if p_value < alpha:    ### For All Analysis ###
    print('p-value < alpha: Reject Ho!')
else:
    print('p-value > alpha: Fail to reject Ho.')

Z-statistic: 2.35
P-value: 0.0095
Alpha: 0.01

p-value < alpha: Reject Ho!


Ha is correct, single women buy more cars than single men.

## Hypothesis Test with 2 dependent samples (paired)

**A memory class claims that after taking class you will loose keys less often. To test, 12 people are interviewed before and after the class. Test the claim at 0.05 level of significance.** 

**Before = [8,10,6,7,4,11,12,5,6,3,6,4] ; After =  [6,5,6,6,5,9,4,5,4,4,5,4]**

In [80]:
#         Ho: mud >= 0
# (Claim) Ha: mud < 0    -> left tail test

<img src="images/lefttailtest.jpg"/>

In [81]:
alpha = 0.05
loc = 1 - alpha

before = np.array([8,10,6,7,4,11,12,5,6,3,6,4])
after = np.array([6,5,6,6,5,9,4,5,4,4,5,4])

diff = after - before

n = len(diff)

dbar = np.sum(diff) /n

s = np.std(diff)

dof = n - 1

mud = 0


# t-critical
t_alpha = round( scipy.stats.t.ppf(alpha, dof), 2)  # area on the left

# t-statistic
t_stat = round( (dbar - mud) / (s / np.sqrt(n) ), 2)


print('t_alpha:', t_alpha)
print()
print('t-statistic:', t_stat)
print()


if t_stat < t_alpha:    ### Left Analysis ###
    print('Reject Ho!')
else:
    print('Fail to reject Ho.')

t_alpha: -1.8

t-statistic: -2.2

Reject Ho!
