# More Hypothesis Tests

<img src="images/All.png"/>

### General Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#import warnings
#warnings.filterwarnings("ignore")

### Normal Distribution - Two Tail Test

**According to *Glassdoor*, the popular salary information website the mean data scientist salary is \$113,000. The sample that is available on glass stores based on self-reported numbers and you would like to see if its value was correct.**

**The population variance is known and its standard deviation is equal to fifteen thousand dollars.**

    data = [117313,104002,113038,101936,84560,113136,80740,100536,
            105052,87201,91986,94868,90745,102848,85927,112276,108637,
            96818,92307,114564,109714,108833,115295,89279,81720,89344,
            114426,90410,95118,113382]

In [2]:
# Ho: mu == $ 113,000
# Ha: mu != $ 113,000

In [3]:
from scipy.stats import norm

salary = np.array([117313,104002,113038,101936,84560,113136,80740,100536,
                   105052,87201,91986,94868,90745,102848,85927,112276,108637,
                   96818,92307,114564,109714,108833,115295,89279,81720,89344,
                   114426,90410,95118,113382])
n = 30

mu = 113000    # mu from Ho
sigma = 15000  # population standard deviation

alpha = 0.05       # 95% C.I.
alpha_2 = alpha/2  # Two tail test

xbar = np.mean(salary)

# z-statistic
z_stat = (xbar - mu) / (sigma/np.sqrt(n))

# z-alpha/2
z_alpha_2 = norm.ppf(1-alpha_2)

# -(z-alpha/2)
neg_z_alpha_2 = norm.ppf(alpha_2)


print('z-statistic:', round(z_stat,2))
print('-z(alpha/2):', round(neg_z_alpha_2,2))
print('z(alpha/2):', round(z_alpha_2,2))
print()

# OBS: p_value = 2 * norm.cdf(-z, 0, 1) , because is a two tail test

if (z_stat < z_alpha_2) or (z_stat > neg_z_alpha_2):   ### 2 tail Analysis ###
    print('(z_stat < -z(alpha/2)) or (z_stat > z(alpha/2)): Reject Ho!')
else:
    print('(-z(alpha/2) < z_stat < z(alpha/2)): Fail to reject Ho.')

z-statistic: -4.67
-z(alpha/2): -1.96
z(alpha/2): 1.96

(z_stat < -z(alpha/2)) or (z_stat > z(alpha/2)): Reject Ho!


### T-Distribution - Left Tail Test

**Imagine you are the marketing analyst of a company and you've been asked to estimate of the email open rate of one of the firm's competitors is above your company's your company has an open rate of 40 percent. An email open rate is a measure of how many people on the email list actually open the emails they have.**

**You struggle to figure out how to get such specific information about a competitor company but then you see that an employee of that competitor company posted a selfie on Facebook saying "Hey Lowell the e-mail management software we are using Drives me nuts". In the background, you can see her screen and it shows clearly the summaries of the last 10 e-mail campaigns that were sent and their corresponding open rates.**

    open rate = [26%, 23%, 42%, 49%, 23%, 59%, 29%, 29%, 57%, 40%]

In [4]:
# Ho: mu >= 40%  
# Ha: mu < 40%  , sample mean is here (left tail test)

In [5]:
from scipy import stats
import scipy

open_rate = np.array([0.26,0.23,0.42,0.49,0.23,0.59,0.29,0.29,0.57,0.40])

xbar = np.mean(open_rate)     # sample mean
sigma = stats.tstd(open_rate) # standard deviation
SE = stats.sem(open_rate)     # standard error


print(f'Sample Mean: {round(xbar*100,2)}%')
print(f'Sample Standard Deviation: {round(sigma*100,2)}%')
print(f'Standard Error: {round(SE*100,2)}%')
print()


# t-statistic
mu = 0.40
t_stat = (xbar - mu) / SE

# t-alpha
n = 10
dof = n - 1
alpha = 0.05
t_alpha = scipy.stats.t.ppf(alpha, dof)


print(f't-stat: {round(t_stat,2)}')
print(f't-alpha: {round(t_alpha,2)}')

if abs(t_stat) < abs(t_alpha):
    print('Fail to reject Ho.')
else:
    print('Reject Ho!')

print()

# p-value
p_value = stats.t.sf(abs(t_stat), df=dof)

print(f'p-value: {p_value}')
print(f'alpha: {round(alpha,4)}')

if abs(alpha) < abs(p_value):
    print('(p-value > alpha): Fail to reject Ho.')
else:
    print('(p-value < alpha): Reject Ho!')

Sample Mean: 37.7%
Sample Standard Deviation: 13.74%
Standard Error: 4.34%

t-stat: -0.53
t-alpha: -1.83
Fail to reject Ho.

p-value: 0.3046341429137719
alpha: 0.05
(p-value > alpha): Fail to reject Ho.


### Two Dependent Sample - Right Tail Test

**There was this drug company developing a new pill that supposedly increased levels of magnesium recipients. There were 10 people involved in the study that were taking the drug for some time and we calculated confidence intervals that helped us study the effects of that drug they indicated the range of plausible
values for the population mean this time. We want to come to a single definite conclusion about the effectiveness of the drug.**

    before = [ 2.0, 1.4, 1.3, 1.1, 1.8, 1.6, 1.5, 0.7, 0.9, 1.5]
     after = [ 1.7, 1.7, 1.8, 1.3, 1.7, 1.5, 1.6, 1.7, 1.7, 2.4]

In [6]:
# Ho: mud <= 0 , where mud = After - Before
# Ha: mud > 0 , (dbar > 0) right tail test

In [7]:
# Dependent Samples

alpha = 0.05
loc = 1 - alpha

before = np.array([2.0,1.4,1.3,1.1,1.8,1.6,1.5,0.7,0.9,1.5])
after = np.array([1.7,1.7,1.8,1.3,1.7,1.5,1.6,1.7,1.7,2.4])

diff = after - before

dbar = np.sum(diff) /n   # Mean
sigma = stats.tstd(diff) # standard deviation
SE = stats.sem(diff)     # standard error

print(f'Sample Mean: {round(xbar*100,2)}%')
print(f'Sample Standard Deviation: {round(sigma*100,2)}%')
print(f'Standard Error: {round(SE*100,2)}%')
print()

# t-statistic
mu = 0
t_stat = (dbar - mu) / SE

# t-alpha
n = len(diff)
dof = n - 1
alpha = 0.05
t_alpha = (-1)*scipy.stats.t.ppf(alpha, dof)  #

print(f't-stat: {round(t_stat,2)}')
print(f't-alpha: {round(t_alpha,2)}')

if abs(t_stat) < abs(t_alpha):
    print('Fail to reject Ho.')
else:
    print('Reject Ho! So... (mud > 0)')

print()

# p-value
p_value = stats.t.sf(abs(t_stat), df=dof)

print(f'p-value: {p_value}')
print(f'alpha: {round(alpha,4)}')

if abs(alpha) < abs(p_value):
    print('(p-value > alpha): Fail to reject Ho.')
else:
    print('(p-value < alpha): Reject Ho! So... (mud > 0)')

Sample Mean: 37.7%
Sample Standard Deviation: 45.47%
Standard Error: 14.38%

t-stat: 2.29
t-alpha: 1.83
Reject Ho! So... (mud > 0)

p-value: 0.023696968198698924
alpha: 0.05
(p-value < alpha): Reject Ho! So... (mud > 0)


### Population Know - Two Independent Samples  - Two Tail Test

**We are about to test the average grades of students from two different departments in a UK university. The two departments are engineering and management we were told by the dean that engineering is a tougher discipline than people tend to get lower grades he believes that on average management students outperform engineering students by 4 percentage points.**

**Here's the table that summarizes the data the sample sizes are one hundred and seventy respectively.**

|  | Engineering | Management |
| --- | --- | --- |
| Size | 100 | 70 |
| Mean | 58% | 65% |
| Population std | 10% | 6% |

In [8]:
# Ho: muE - muM == -4%
# Ha: muE - muM != -4% , (dbar = -7%) two tail test

When the population is known for independent samples the 'standard error' of the difference:

    SE_diff = np.sqrt(sigmaE**2/nE +sigmaM**2/nM)

In [9]:
from scipy.stats import norm
from scipy import stats

nE = 100
nM = 70

muE = 0.58
muM = 0.65

dbar = muE - muM   # sample mean for Diff

sE = 0.10
sM = 0.06

SE_diff = np.sqrt(sE**2/nE +sM**2/nM)   # Standard Error for Diff


print(f'Sample Mean: {round(dbar*100,2)}%')
print(f'Standard Error: {round(SE_diff*100,2)}%')
print()

# Z-statistic
mu = - 0.04
z_stat = (dbar - mu)/SE_diff
# Another way to see:
# z = ( (xbar1 - xbar2) - (delta_mu) )/ (np.sqrt( (s1**2/n1) + (s2**2/n2) ))

alpha = 0.05       # 95% C.I.
alpha_2 = alpha/2  # Two tail test

# z-alpha/2
z_alpha_2 = norm.ppf(1-alpha_2)

# -(z-alpha/2)
neg_z_alpha_2 = norm.ppf(alpha_2)

print('z-statistic:', round(z_stat,2))
print('-z(alpha/2):', round(neg_z_alpha_2,2))
print('z(alpha/2):', round(z_alpha_2,2))

# OBS: p_value = 2 * norm.cdf(-z, 0, 1) , because is a two tail test

if (z_stat < z_alpha_2) or (z_stat > neg_z_alpha_2):   ### 2 tail Analysis ###
    print('(z_stat < -z(alpha/2)) or (z_stat > z(alpha/2)): Reject Ho!')
else:
    print('(-z(alpha/2) < z_stat < z(alpha/2)): Fail to reject Ho.')
    
print()

# p-value
p_value = (2) * norm.cdf(z_stat, 0, 1)  # two tail test

print(f'p-value: {p_value}')
print(f'alpha: {round(alpha,4)}')

if abs(alpha) < abs(p_value):
    print('(p-value > alpha): Fail to reject Ho.')
else:
    print('(p-value < alpha): Reject Ho! So... (mud != -4%)')

Sample Mean: -7.0%
Standard Error: 1.23%

z-statistic: -2.44
-z(alpha/2): -1.96
z(alpha/2): 1.96
(z_stat < -z(alpha/2)) or (z_stat > z(alpha/2)): Reject Ho!

p-value: 0.01477252634819342
alpha: 0.05
(p-value < alpha): Reject Ho! So... (mud != -4%)


### Independent Samples - Unknow Variances - Two Tail Test

**We are trying to see if apples in New York are as expensive as the ones in L.A. You went to 10 grocery shops in New York and your friend Paul lives in L.A. went to 8 grocery shops there. You got all the prices and put them in a table what the population variance of apple prices is but you assume it should be the same for New York and L.A.**

    NY_apples = [3.80,3.76,3.87,3.99,4.02,4.25,4.13,3.98,3.99,3.62]
    LA_apples = [3.02,3.22,3.24,3.02,3.06,3.15,3.81,3.44]

In [10]:
# Ho: muNY - muLA == 0
# Ha: muNY - muLA != 0 , two tail test

In [11]:
# independent variable, unknown variances (we assume equals)

NY = np.array([3.80,3.76,3.87,3.99,4.02,4.25,4.13,3.98,3.99,3.62])
LA = np.array([3.02,3.22,3.24,3.02,3.06,3.15,3.81,3.44])

# Sample Sizes
nNY = len(NY)
nLA = len(LA)

# Means
xbarNY = np.mean(NY)
xbarLA = np.mean(LA)

# Standard Deviations
sNY = stats.tstd(NY)
sLA = stats.tstd(LA)

# Pooled Variance
polled_S2 = (((nNY-1)*sNY**2)+((nLA-1)*sLA**2))/(nNY+nLA-2)

# Standard Error for Diff
SE_diff = np.sqrt(sNY**2/nNY +sLA**2/nLA)   

print(f'Sample Sizes: NY:{round(nNY)}, LA:{round(nLA,2)}')
print(f'Sample Means: NY:{round(xbarNY,2)}, LA:{round(xbarLA,2)}')
print(f'Standard Deviations: NY:{round(sNY,2)}, LA:{round(sLA,2)}')
print(f'Pooled Variance: {round(polled_S2,2)}')
print(f'Standard Error for Diff Means: {round(SE_diff,2)}')
print()

# t-statistic
delta_mu = 0
t_stat = ( (xbarNY - xbarLA) - (delta_mu) )/ (SE_diff)

dof = nNY + nLA -2
alpha = 0.05

# p-value
p_value = stats.t.sf(abs(t_stat), df=dof)

print(f'p-value: {p_value}')
print(f'alpha: {round(alpha,4)}')

if abs(alpha) < abs(p_value):
    print('(p-value > alpha): Fail to reject Ho.')
else:
    print('(p-value < alpha): Reject Ho! So... (muNY-muLA != 0)')

Sample Sizes: NY:10, LA:8
Sample Means: NY:3.94, LA:3.24
Standard Deviations: NY:0.18, LA:0.27
Pooled Variance: 0.05
Standard Error for Diff Means: 0.11

p-value: 5.6735455629297295e-06
alpha: 0.05
(p-value < alpha): Reject Ho! So... (muNY-muLA != 0)


### Independent Samples - Unknow Variances (but assumed to be equal)

**You have data on the amount of times people click on a pop-up add on 24 Mondays and 21 Saturdays on an e-learning platform for several years. The samples are drawn independently. Statistically speaking, is there strong evidence that the number of clicks the add records on Mondays is higher than the number of clicks on Saturdays?**

|  | Monday | Saturday |
| --- | --- | --- |
| Size | 24 | 21 |
| Mean | 1078.00 | 908.20 |
| Std. Deviation | 633.00 | 469.80 |

In [12]:
# Ho: muMo - muSa <= 0
# Ha: muMo - muSa > 0 (right tail test)

In [13]:
# Testing of two means. Independent samples
# population variances unknown, but assumed to be equal

# Sample Sizes
nMo = 24
nSa = 21

# Means
xbarMo = 1078.00
xbarSa = 908.20

# Standard Deviations
sMo = 633.00
sSa = 469.80

# Pooled Variance
polled_S2 = (((nMo-1)*sMo**2)+((nSa-1)*sSa**2))/(nMo+nSa-2)

# Standard Error for Diff
SE_diff = np.sqrt(sMo**2/nMo +sSa**2/nSa)  


print(f'Var(mon):{round(sMo**2,2)}, Var(sat):{round(sSa**2,2)}')
print(f'Pooled Variance: {round(polled_S2,2)}')
print(f'Standard Error for Diff Means: {round(SE_diff,2)}')
print()

# t-statistic
delta_mu = 0
t_stat = ( (xbarMo - xbarSa) - (delta_mu) )/ (SE_diff)
print(f't-statistic: {round(t_stat,2)}')

dof = nNY + nLA -2
alpha = 0.05

# p-value
p_value = stats.t.sf(abs(t_stat), df=dof)

print(f'p-value: {p_value}')
print(f'alpha: {round(alpha,4)}')

if abs(alpha) < abs(p_value):
    print('(p-value > alpha): Fail to reject Ho. So... (muMo-muSa == 0)')
else:
    print('(p-value < alpha): Reject Ho! So... (muMo-muSa != 0)')

Var(mon):400689.0, Var(sat):220712.04
Pooled Variance: 316978.79
Standard Error for Diff Means: 164.94

t-statistic: 1.03
p-value: 0.15928773468517068
alpha: 0.05
(p-value > alpha): Fail to reject Ho. So... (muMo-muSa == 0)


### Pooled - variances of the 2 populations assumed equal

**We will test whether a particular company is discriminating against some of its employees on a gender basis. Our fictitious company is called Sparke fortress incorporated. It is a big company with more than five thousand employees. And here we will work with a sample of 174 of them.**

|  | Female | Male |
| --- | --- | --- |
| Size | 98 | 76 |
| Mean | 65736.91 | 72300.53 |
| Sample Variance | 40713712.34 | 382264352.09 |

In [14]:
# Ho: muM - muF <= 0
# Ha: muM - muF > 0  (right tail test)

In [15]:
# Pooled - variances of the 2 populations assumed equal

# Sample Sizes
nF = 98
nM = 76

# Means
xbarF = 65736.91
xbarM = 72300.00

# Sample Variance
sF2 = 407131712.34
sM2 = 382264352.09

# Pooled Variance
polled_S2 = (((nF-1)*sF2)+((nM-1)*sM2))/(nF+nM-2)

# Standard Error for Diff
SE_diff = np.sqrt(sF2/nF +sM2/nM)   

print(f'Pooled Variance: {round(polled_S2,2)}')
print(f'Standard Error for Diff Means: {round(SE_diff,2)}')
print()

# t-statistic
delta_mu = 0
t_stat = ( (xbarM - xbarF) - (delta_mu) )/ (SE_diff)
print(f't-statistic: {round(t_stat,2)}')

dof = nF + nM -2
alpha = 0.05

# p-value
p_value = stats.t.sf(abs(t_stat), df=dof)  # for 2 tail test (X2)
# Exact same result with: p_value = norm.cdf(-t_stat, 0, 1)

print(f'p-value: {p_value}')
print(f'alpha: {round(alpha,4)}')

if abs(alpha) < abs(p_value):
    print('(p-value > alpha): Fail to reject Ho. So... (muM - muF <= 0)')
else:
    print('(p-value < alpha): Reject Ho! So... (muM - muF > 0)')

Pooled Variance: 396288386.65
Standard Error for Diff Means: 3030.54

t-statistic: 2.17
p-value: 0.015857630806420425
alpha: 0.05
(p-value < alpha): Reject Ho! So... (muM - muF > 0)


### Hypothesis Tests for Linear Regression

**Test at 95% level of confidence if there is a significant linear relationship between the number of absences and GPA.**

    absences = [0,0,0,0,1,1,1,2,2,2,3,3,5,7,8]
    GPA = [3.6,3.9,2.4,3.1,3.5,4.0,3.6,2.8,3.0,2.2,3.9,3.1,2.1,2.8,1.7]

In [24]:
# Ho: ro == 0                 , means NO linear relationship
# Ha: ro != 0 (two tail test) , means linear relationship

loc = 0.95
alpha = 1 - loc
alpha_2 = alpha/2

abse = np.array([0,0,0,0,1,1,1,2,2,2,3,3,5,7,8])
gpa = np.array([3.6,3.9,2.4,3.1,3.5,4.0,3.6,2.8,3.0,2.2,3.9,3.1,2.1,2.8,1.7])

#plt.scatter(gpa,abs,c='g',marker='.')

data = pd.DataFrame({'Absence': abse, 'GPA': gpa})

n = len(abse)
dof = n - 2

r = data['Absence'].corr(data['GPA'])
# r = np.corrcoef(abse,gpa)[1,0]

# t-statistic
t_stat = (r) / np.sqrt((1-r**2)/(n-2))  

# t-alpha/2
t_alpha_2 = scipy.stats.t.ppf(alpha_2, dof)

# -(t-alpha/2)
neg_t_alpha_2 = scipy.stats.t.ppf(1-alpha_2, dof)

print('r:', round(r,4))
print()
print('t-statistic:', round(t_stat,2))
print('-t(alpha/2):', round(neg_t_alpha_2,2))
print('t(alpha/2):', round(t_alpha_2,2))
print()

if (t_stat < t_alpha_2) or (t_stat > neg_t_alpha_2):   ### 2 tail Analysis ###
    print('(t_stat < -t(alpha/2)) or (t_stat > t(alpha/2)): Reject Ho!')
else:
    print('(-t(alpha/2) < t_stat < t(alpha/2)): Fail to reject Ho.')
    
p_value = 2 * stats.t.sf(abs(t_stat), df=dof)

print()
print('p_value', p_value)
print('alpha', round(alpha,2))

if abs(alpha) < abs(p_value):
    print('(p-value > alpha): Fail to reject Ho. So... no linear relationship')
else:
    print('(p-value < alpha): Reject Ho! So... linear relationship')

r: -0.5866

t-statistic: -2.61
-t(alpha/2): 2.16
t(alpha/2): -2.16

(t_stat < -t(alpha/2)) or (t_stat > t(alpha/2)): Reject Ho!

p_value 0.02152434744255244
alpha 0.05
(p-value < alpha): Reject Ho! So... linear relationship


In [18]:
# Check
r, p = scipy.stats.pearsonr(abse, gpa)

print('r:',r)
print('p:',p)

r: -0.5866193234859391
p: 0.021524347442552532
