# One sample and two sample  (test for mean and not for proportion) 

In [1]:
import numpy       as np
import pandas      as pd
import scipy.stats as stats

### 1) Z test

A one sample Z test is one of the most basic types of hypothesis test.


### Example 1: 

A principal of a prestigious city college claims that the average intelligence of the students of the college is above average. 

A random sample of 100 students IQ scores have  a mean score of 115. The mean population mean IQ is 100 with a standard deviation of 15.

Is there sufficient evidence to support the principal's claim?

In [2]:
#H0 = IQ <= mu (100)
#H1 = IQ > mu (100)
# Z-test method

n = 100
mu = 100
sigma = 15
xbar = 115
alpha = 0.05

se = sigma/np.sqrt(n)

zscore = (xbar-mu)/se

rejection_region = stats.norm.isf(0.05)

# Since it's a one-tailed right-tailed test ->
# If the zscore > rejection_region, 
#    There's enough evidence to reject H0. Hence, we accept H1
# Else, 
#    We fail to reject H0. Hence, we accept H0 

print('Se is : ', se)
print('Zscore of sample mean is : ', zscore)
print('Lower limit of the rejection region is : ', rejection_region)
print('\nThere {0} enough evidence to prove principal''s claim'.format(np.where(zscore>=rejection_region, 'IS','IS NOT')))


Se is :  1.5
Zscore of sample mean is :  10.0
Lower limit of the rejection region is :  1.6448536269514729

There IS enough evidence to prove principals claim


### 2) t test

### One sample t test

### Example 2

Suppose that a doctor claims that 17 year olds have an average body temperature that is higher than the commonly accepted average human temperature of 98.6 degree F.

A simple random statistical sample of 25 people, each of age 17 is selected. 

| ID | Temperature |
| --- | ----- |
| 1 | 98.56 | 
| 2 | 98.66 |
| 3 | 97.54 |
| 4 | 98.71 |
| 5 | 99.22 |
| 6 | 99.49 |
| 7 | 98.14 |
| 8 | 98.84 |
| 9 | 99.28 |
| 10 | 98.48 |
| 11 | 98.88 |
| 12 | 97.29 |
| 13 | 98.88 |
| 14 | 99.07 |
| 15 | 98.81 |
| 16 | 99.49 |
| 17 | 98.57 |
| 18 | 97.98 |
| 19 | 97.75 |
| 20 | 97.69 |
| 21 | 99.28 |
| 22 | 98.52 |
| 23 | 98.82 |
| 24 | 98.81 |
| 25 | 98.22 |


In [4]:
temperature = np.array([98.56, 98.66, 97.54, 98.71, 99.22, 99.49, 98.14, 98.84,\
                         99.28, 98.48, 98.88, 97.29, 98.88, 99.07, 98.81, 99.49,\
                         98.57, 97.98, 97.75, 97.69, 99.28, 98.52, 98.82, 98.81, 98.22])            

# H0 = 17 years' body temperature is not higher
# H1 = 17 year's body temperature is higher


mu = 98.6
n = 25
xbar = temperature.mean()
S = temperature.std(ddof=1)
alpha = 0.05

se = S / np.sqrt(n)

# Using Critical Value:

tstat = (xbar - mu)/ se
print(tstat)
critical_value = stats.t.isf(0.05, df=n-1)
reject_H = tstat <= critical_value
print('tstat = ', tstat)
print('critical_value = ',critical_value)
print(reject_H)


# Using p-Value:

p_value = stats.t.cdf(tstat, df=n-1)
p_value

# Using stats library:

tcrit, p_value_two_side = stats.ttest_1samp(temperature, mu)
p_value_rt_side = p_value_two_side / 2
print(tcrit)
print(p_value_rt_side)

-0.006668602694974533
tstat =  -0.006668602694974533
critical_value =  1.7108820799094282
True
-0.006668602694974534
0.4973671933764293


### Two sample t test - Unpaired Data

### Example 3

Compare two unrelated samples. Data was collected on the weight loss of 16 women and 20 men enrolled in a weight reduction program.
At $\alpha$ = 0.05, test whether the weight loss of these two samples is different.

In [5]:
Weight_loss_Male   = [ 3.69, 4.12, 4.65, 3.19,  4.34, 3.68, 4.12, 4.50, 3.70, 3.09,3.65, 4.73, 3.93, 3.46, 3.28, 4.43, 4.13, 3.62, 3.71, 2.92]
Weight_loss_Female = [2.99, 1.80, 3.79, 4.12, 1.76, 3.50, 3.61, 2.32, 3.67, 4.26, 4.57, 3.01, 3.82, 4.33, 3.40, 3.86]

In [6]:
Wt_loss_Male = np.array(Weight_loss_Male)
Wt_loss_Female = np.array(Weight_loss_Female)

xbar1 = Wt_loss_Male.mean()
xbar2 = Wt_loss_Female.mean()

S1 = Wt_loss_Male.std(ddof=1)
S2 = Wt_loss_Female.std(ddof=1)

n1 = len(Wt_loss_Male)
n2 = len(Wt_loss_Female)


# F Test - to test whether two independent populations have the same variability

Fstat = (S1**2)/(S2**2)
f_crit = stats.f.isf(q = 0.05, dfn = n1-1, dfd = n2-1)
equal_variance = Fstat < f_crit
print('Fstat : ', Fstat)
print('F - critical value : ',f_crit)
print('equal-variance = ',equal_variance)
print('\n')

#H0 = mu1 = mu2
#H1 = mu1 != mu2


dof = n1+n2-2
UCL = stats.t.isf(0.025, df=dof)
LCL = stats.t.ppf(0.025, df=dof)

print('UCL and LCL :', UCL, ' and ', LCL)
print('\n')

# Calculation of 'se1' and 'se2' differs; for equal-variance = True/False.
# Formula for T-statistic calculation remains same; for equal-variance = True/False.

'''
print('Calculation of se1 and se2 differs; for equal-variance = True/False.')
print('Formula for T-statistic calculation remains same; for equal-variance = True/False.\n\n')


# Using T-statistics if equal-variance = False (separate-variance formula)

print('Using T-statistics if equal-variance = False (separate-variance formula)')
print('-------------------------------------------')

se1 = (np.power(S1,2)/n1)
se2 = (np.power(S2,2)/n2)

tstat = (xbar1 - xbar2)/np.sqrt(se1 + se2)
accept_H0 = LCL <= tstat <= UCL 

print('t-statistics : ', tstat)
print('Does t-statistics lies in rejection regions : ',np.where(accept_H0,'No','Yes'))
print('Did we fail to reject H0? : ',np.where(accept_H0,'Yes','No')) 

t_statistic, p_value  =  stats.ttest_ind(Weight_loss_Male,Weight_loss_Female, equal_var=False)
print('t_statistic using ttest_ind method : ', t_statistic)

'''

# Using T-statistics if equal-variance = True (pooled-variance formula)
print('\n')
print('Using T-statistics if equal-variance = True (pooled-variance formula)')
print('-------------------------------------------')

numerator = ((n1 - 1) * np.power(S1,2)) + ((n2 - 1) * np.power(S2,2))  
denominator = ((n1 - 1) + (n2 - 1))

S = numerator / denominator
se_1 = S/n1
se_2 = S/n2

tstat1 = (xbar1 - xbar2) / np.sqrt(se_1 + se_2)
accept1_H0 = LCL <= tstat1 <= UCL 

print('t-statistics : ', tstat1)
print('Does t-statistics lies in rejection regions : ',np.where(accept1_H0,'No','Yes'))
print('Did we fail to reject H0? : ',np.where(accept1_H0,'Yes','No')) 
t_statistic, p_value  =  stats.ttest_ind(Weight_loss_Male,Weight_loss_Female, equal_var=True)
print('t_statistic using ttest_ind method : ' ,t_statistic)

Fstat :  0.3763228234982662
F - critical value :  2.339819281665458
equal-variance =  True


UCL and LCL : 2.032244509317719  and  -2.032244509317719




Using T-statistics if equal-variance = True (pooled-variance formula)
-------------------------------------------
t-statistics :  1.8271882959812857
Does t-statistics lies in rejection regions :  No
Did we fail to reject H0? :  Yes
t_statistic using ttest_ind method :  1.827188295981286


### Example 4

Compare the following two unrelated samples. Data was collected on the weight of women and men enrolled in a weight reduction program.
At $\alpha$ = 0.05, test whether the weight of these two samples is different.

In [7]:
Weight_Female       =  [ 53.8, 54.4, 51.2, 52.5, 61.0, 50.6, 51.6, 70.0]
Weight_Male         =  [ 72.5, 80.3, 71.3, 67.7, 66.2, 73.4, 61.3, 76.8]

In [8]:
wt_Female = np.array(Weight_Female)
wt_Male = np.array(Weight_Male)

xbar1 = wt_Female.mean()
xbar2 = wt_Male.mean()

S1 = wt_Female.std(ddof=1)
S2 = wt_Male.std(ddof=1)

n1 = len(wt_Female)
n2 = len(wt_Male)

dof = n1 + n2 - 2
UCL = stats.t.isf(0.025,df=dof)
LCL = stats.t.ppf(0.025,df=dof)

print('UCL and LCL :', UCL, ' and ', LCL)
print('\n')

# Using T-statistics if equal-variance = False
print('Using T-statistics if equal-variance = False')
print('-------------------------------------------')

se1 = np.power(S1,2)/n1
se2 = np.power(S2,2)/n2

tstat = (xbar1 - xbar2)/np.sqrt(se1 + se2)
accept_H0 = LCL <= tstat <= UCL 

print('t-statistics : ', tstat)
print('Does t-statistics lies in rejection regions : ',np.where(accept_H0,'No','Yes'))
print('Did we fail to reject H0? : ',np.where(accept_H0,'Yes','No')) 

t_statistic, p_value  =  stats.ttest_ind(wt_Female,wt_Male, equal_var=False)
print('t_statistic using ttest_ind method : ', t_statistic)

print('\n')
# Using T-statistics if equal-variance = True
print('Using T-statistics if equal-variance = True')
print('-------------------------------------------')

numerator = ((n1 - 1) * np.power(S1,2)) + ((n2 - 1) * np.power(S2,2))  
denominator = ((n1 - 1) + (n2 - 1))

S = numerator/denominator

se_1 = S/n1
se_2 = S/n2

tstat1 = (xbar1 - xbar2)/np.sqrt(se_1+se_2)
accept1_H0 = LCL <= tstat1 <= UCL 

print('t-statistics : ', tstat1)
print('Does t-statistics lies in rejection regions : ',np.where(accept1_H0,'No','Yes'))
print('Did we fail to reject H0? : ',np.where(accept1_H0,'Yes','No')) 

t_statistic, p_value  =  stats.ttest_ind(wt_Female,wt_Male, equal_var=True)
print('t_statistic using ttest_ind method : ' ,t_statistic)

UCL and LCL : 2.1447866879169277  and  -2.1447866879169277


Using T-statistics if equal-variance = False
-------------------------------------------
t-statistics :  -4.886344172533443
Does t-statistics lies in rejection regions :  Yes
Did we fail to reject H0? :  No
t_statistic using ttest_ind method :  -4.886344172533444


Using T-statistics if equal-variance = True
-------------------------------------------
t-statistics :  -4.886344172533443
Does t-statistics lies in rejection regions :  Yes
Did we fail to reject H0? :  No
t_statistic using ttest_ind method :  -4.886344172533444


## Two sample t test for paired data

### Example 6

Compare two related samples. Data was collected on the marks scored by 25 students in their final practice exam and the marks scored by the students after attending special coaching classes conducted by their college.
At 5% level of significance, is there any evidence that the coaching classes has any effect on the marks scored.

In [9]:
Marks_before = [ 52, 56, 61, 47, 58, 52, 56, 60, 52, 46, 51, 62, 54, 50, 48, 59, 56, 51, 52, 44, 52, 45, 57, 60, 45]

Marks_after  = [62, 64, 40, 65, 76, 82, 53, 68, 77, 60, 69, 34, 69, 73, 67, 82, 62, 49, 44, 43, 77, 61, 67, 67, 54]

In [10]:
Marks_after = np.array(Marks_after)
Marks_before = np.array(Marks_before)
alpha = 0.05

D = Marks_after - Marks_before
t_statistic, p_value  =  stats.ttest_1samp(D,0)
print('P Value %1.3f' % p_value) 
print('t_statistic ', t_statistic) 

print('\n')

t_statistic, p_value  =  stats.ttest_rel(Marks_after, Marks_before )
print('P Value %1.3f' % p_value) 
print('t_statistic ', t_statistic) 

effect = p_value > alpha
print('Coaching classes has effect on marks : ', effect)

P Value 0.002
t_statistic  3.404831324883169


P Value 0.002
t_statistic  3.404831324883169
Coaching classes has effect on marks :  False


### Example 7
** Alchohol consumption before and after love failure is given in the following table. Conduct a paired t test to check whether the alcholhol consumption is more after the love failure at 5% level of significance.**

In [11]:
Alchohol_Consumption_before = np.array([470, 354, 496, 351, 349, 449, 378, 359, 469, 329, 389, 497, 493, 268, 445, 287, 338, 271, 412, 335])
Alchohol_Consumption_after  = np.array([408, 439, 321, 437, 335, 344, 318, 492, 531, 417, 358, 391, 398, 394, 508, 399, 345, 341, 326, 467])

D  = Alchohol_Consumption_after -Alchohol_Consumption_before
print(D)
print('Mean is %3.2f and standard deviation is %3.2f' %(D.mean(),np.std(D,ddof = 1)))

[ -62   85 -175   86  -14 -105  -60  133   62   88  -31 -106  -95  126
   63  112    7   70  -86  132]
Mean is 11.50 and standard deviation is 95.68


In [12]:
alpha = 0.05

t_statistic, p_value  =  stats.ttest_1samp(D, 0)
print('P Value %1.3f' % p_value)
print('t_statistic  ', t_statistic)
print('\n')  
t_statistic, p_value  =  stats.ttest_rel(Alchohol_Consumption_after, Alchohol_Consumption_before )
print('P Value %1.3f' % p_value)  
print('t_statistic  ', t_statistic)

print('\n')
effect = p_value > alpha
print('The alchohol consumption is more after love failure : ', effect)


P Value 0.597
t_statistic   0.5375404241815105


P Value 0.597
t_statistic   0.5375404241815105


The alchohol consumption is more after love failure :  True


### Example 8

Sugar consumption in grams of 20 patients (both diabetic and non-diabetic) are given below:**

*At 5% level of significance, is there evidence that the mean sugar consumption is different for diabetic and non-diabetic?**    In the following table, 0 means diabetic and 1 means non-diabetic.*
    

In [13]:
weight               = np.array([[9.31, 0],[7.76, 0],[6.98, 1],[7.88, 1],[8.49, 1],[10.05, 1],[8.80, 1],[10.88, 1],[6.13, 1],[7.90, 1], \
                            [11.51, 0],[12.59, 0],[7.05, 1],[11.85, 0],[9.99, 0],[7.48, 0],[8.79, 0],[8.69, 1],[9.68, 0],[8.58, 1],\
                           [9.19, 0],[8.11, 1]])

alpha = 0.05

sugar_diabetic       = weight[:,1] == 0
sugar_diabetic       = weight[sugar_diabetic][:,0]
sugar_nondiabetic    = weight[:,1] == 1
sugar_nondiabetic    = weight[sugar_nondiabetic][:,0] 

print('sugar_diabetic : ',sugar_diabetic)
print('sugar_nondiabetic : ',sugar_nondiabetic)
print('\n')

t_statistic, p_value = stats.ttest_ind(sugar_diabetic,sugar_nondiabetic)
print('P Value %1.3f' % p_value)  
print('t_statistic  ', t_statistic)

different = p_value > alpha
print('Mean sugar consumption is different for diabetic and non-diabetic : ',different)

sugar_diabetic :  [ 9.31  7.76 11.51 12.59 11.85  9.99  7.48  8.79  9.68  9.19]
sugar_nondiabetic :  [ 6.98  7.88  8.49 10.05  8.8  10.88  6.13  7.9   7.05  8.69  8.58  8.11]


P Value 0.028
t_statistic   2.3730593333971224
Mean sugar consumption is different for diabetic and non-diabetic :  False
