In [63]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_1samp, ttest_ind, mannwhitneyu, levene, shapiro,bartlett, wilcoxon
from statsmodels.stats.power import ttest_power

In [64]:
column_nm=['SALESPERSON','Old Scheme','New Scheme']
data=[[1,57,62],[2,103,122],[3,59,54],[4,75,82],[5,84,84],[6,73,86],[7,35,32],[8,110,104],[9,44,38],[10,82,107],[11,67,84],[12,64,85],[13,78,99],[14,53,39],[15,41,34],[16,39,58],[17,80,73],[18,87,53],[19,73,66],[20,65,78],[21,28,41],[22,62,71],[23,49,38],[24,84,95],[25,63,81],[26,77,58],[27,67,75],[28,101,94],[29,91,100],[30,50,68]]
df = pd.DataFrame(data,columns=column_nm)
df.head(5)

Unnamed: 0,SALESPERSON,Old Scheme,New Scheme
0,1,57,62
1,2,103,122
2,3,59,54
3,4,75,82
4,5,84,84


In [65]:
df = df.set_index(['SALESPERSON'])
df.head(5)

Unnamed: 0_level_0,Old Scheme,New Scheme
SALESPERSON,Unnamed: 1_level_1,Unnamed: 2_level_1
1,57,62
2,103,122
3,59,54
4,75,82
5,84,84


## Find the mean of old scheme and new scheme column

In [66]:
df.describe()

Unnamed: 0,Old Scheme,New Scheme
count,30.0,30.0
mean,68.033333,72.033333
std,20.45598,24.062395
min,28.0,32.0
25%,54.0,55.0
50%,67.0,74.0
75%,81.5,85.75
max,110.0,122.0


In [67]:
# Seperating the data into 2 groups
oldScheme = df.iloc[:,0:1].values
newScheme = df.iloc[:,1:2].values

In [68]:
# Shapiro test will tell us whether the data is normal or not.
#Null hypothesis: Data is normal.
shapiro(newScheme)
#Here the p value is 0.5057 which is greater than 0.05 and hence we fail to reject null hypothesis, i.e data is normal.

(0.9687566161155701, 0.5057379007339478)

In [69]:
shapiro(oldScheme)
#Here the p value is 0.9813 which is greater than 0.05 and hence we fail to reject null hypothesis, i.e data is normal.

(0.9885103702545166, 0.9813674092292786)

In [70]:
# For checking equality of variance between groups
# Null Hypothesis: Variances are equal
levene(oldScheme,newScheme)
#Here the p value is 0.3067 which is greater than 0.05 and hence we fail to reject null hypothesis, i.e variance square of group 1 is equal to variance square of 2nd group.

LeveneResult(statistic=array([1.06306154]), pvalue=array([0.30679836]))

In [71]:
# two-sample t-test
# null hypothesis: the two groups have the same mean
t_statistic, p_value = ttest_ind(newScheme, oldScheme)
print(t_statistic, p_value)

[0.69370676] [0.49063516]


In [72]:
# Computing degrees of freedom
step1 = (((20.455980 * 20.455980)/30)+((24.062395*24.062395)/30))
step1 = step1*step1
step2 = ((20.455980 * 20.455980)/30)
step2 = step2*step2
step2 = step2/29
step3 = ((24.062395*24.062395)/30)
step3 = step3*step3
step3 = step3/29
df = step1/(step2+step3)
df

56.53522679695983

In [73]:
poolVariance = ((30-1)*(20.455980*20.455980)+(30-1)*(24.062395*24.062395))/(30+30-2)
poolVariance = np.sqrt(poolVariance)
tstat = (72.033333-68.033333)/(poolVariance*(np.sqrt((1/30)+(1/30))))
print(poolVariance)
print(tstat)

22.332106605696932
0.6937067630188398


##### Confirming t-static value is same using formula and library function.
##### Taking Degrees of freedom: 57 (Rounding 56.5352), signiicance level: 0.05; critical value of t = 2.0025
##### We determined critical value using calculator : http://www.meracalculator.com/math/t-distribution-critical-value-table.php 
##### As t-static value (0.6937) lies in the range of -2.0025 to +2.0025 and p value (0.4906) which is >0.05 we fail to reject null hypothesis.

## As p-value is >0.05 we fail to reject the null hypothesis, i.e there is no significant difference in the means.

### Suppose it has been calculated that in order for Titan to break even, the average output must increase by £5000 in the scheme compared to the old scheme. If this figure is alternative hypothesis, what is:
The probability of a type 1 error?
What is the p- value of the hypothesis test if we test for a difference of $5000?
Power of the test?


In [74]:
# Null hypothesis: The mean difference is greater than 5 (in muliples of 1000 = 5000 )
# Alternate hypothesis: The mean difference is less than 5 (in multiples of 1000 = 5000)

# Standard Error : SE = sqrt[((s1*s1)/n1) + ((s2*s2)/n2)]
# Degrees of freedom (DF) ~ 57
# t_statistic = [ (x1 - x2) - d ] / SE
# Here s1 is the standard deviation of old scheme, s2 is the standard deviation of new scheme,
# n1 is the sample size of old scheme, n2 is the sample size of new scheme, 
# x1 is the mean of old scheme, x2 is the mean of new scheme,
# d is the hypothesized difference between population means, and SE is the standard error.

se = np.sqrt(((20.455980 * 20.455980)/30)+((24.062395*24.062395)/30))
tstat = (((72.033333-68.033333)-5)/se)

print(se)
print(tstat)

5.7661251313061905
-0.17342669075470996


##### Logic of the analysis: Given the alternative hypothesis (μ1 - μ2 < 5), we want to know whether the observed 
##### difference in sample means is small enough (i.e., sufficiently less than 5) to cause us to reject the null hypothesis.

##### The observed difference in sample means (5) produced a t statistic of -0.1734. 
##### We use the t Distribution Calculator: https://stattrek.com/online-calculator/t-distribution.aspx
##### To find P(t < -0.1734) = 0.4315
#### Value of p = 0.4315

### This means we would expect to find an observed difference in sample means of 5 or less in 43% of our samples even if the true difference were actually 5. Therefore, the P-value in this analysis is 0.43.

### Since the P-value (0.43) is greater than the significance level (0.05), we cannot reject the null hypothesis.

In [75]:
# Caculating power of test
x = (np.mean(newScheme) - np.mean(oldScheme)) / np.sqrt(((30-1)*np.var(oldScheme)+(30-1)*np.var(newScheme)) / 30+30-2)
print (x)
print(ttest_power(x, nobs=60, alpha=0.05, alternative='two-sided'))

0.12909555513169643
0.16610169779030565


#### Power of test: 0.3622

### Alternative approach: Computing t statistic value keeping mean1 = 68.033 and mean=72.0333


In [76]:
#Assuming mean value is differed by 5 (Multiples of 1000=5000)
poolVariance = ((30-1)*(20.455980*20.455980)+(30-1)*(24.062395*24.062395))/(30+30-2)
poolVariance = np.sqrt(poolVariance)
tstat = ((68.033333-72.033333)-5)/(poolVariance*(np.sqrt((1/30)+(1/30))))
print(poolVariance)
print(tstat)
#poolVariance = ((30-1)*np.var(oldScheme)+(30-1)*np.var(newScheme))/(30+30-2)
#t_stat =  5/np.sqrt(poolVariance)
#t_stat

22.332106605696932
-1.5608402167923896


In [77]:
se = np.sqrt(((20.455980 * 20.455980)/30)+((24.062395*24.062395)/30))
tstat = (((68.033333-72.033333)+5)/se)

print(se)
print(tstat)

5.7661251313061905
0.17342669075470996


##### The observed difference in sample means (5) produced a t statistic of 0.1734. 
##### We use the t Distribution Calculator: https://stattrek.com/online-calculator/t-distribution.aspx
##### To find P(t < 0.1734) = 0.5685 
#### Value of p = 0.5685

### This means we would expect to find an observed difference in sample means of 5 or greater in ~57 % of our samples if the true difference were actually 5. Therefore, the P-value in this analysis is 0.5685.