In [1]:
# Z-Score

In [2]:
# A z-score tells us how many standard deviations a value lies from the mean.
# It is calculated with the following formula:
#   z = (X - μ) / σ
# where X is the value, μ is the mean and σ is the standard deviation.
# scipy.stats.zscore
# scipy.stats.zscore(a, axis=0, ddof=0, nan_policy='propagate')
# a: an array-like object containing the data
# axis: the axis along which to calculate the z-scores. Default is 0.
# ddof: degrees-of-freedom correction in the calculation of the standard deviation. Default is 0.
# nan_policy: how to handle NaN values in the input. Default is 'propagate', which returns NaN;
# 'raise' throws an error and 'omit' performs the calculation ignoring NaN values.

In [4]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [5]:
# Eight sample observations to standardise.
data = np.array([8, 5, 5, 12, 12, 12, 10, 16])

# With the defaults (axis=0, ddof=0) each element becomes
# (x - mean) / population standard deviation.
stats.zscore(data)


array([-0.56011203, -1.40028008, -1.40028008,  0.56011203,  0.56011203,
        0.56011203,  0.        ,  1.6803361 ])

In [6]:
# Mean of the sample: the μ term of the z-score formula.
print(data.mean())

10.0


In [7]:
# Population standard deviation (ddof=0): the σ term of the z-score formula.
print(data.std())

3.570714214271425


In [8]:
# Interpretation of the z-scores above:
# The first value, 8, is 0.56011203 standard deviations below the mean
# The value 10 equals the mean, so it is 0 standard deviations away from it
# The value 16 is 1.6803361 standard deviations above the mean

In [9]:
# Multi-dimensional arrays

# Three rows of four consecutive integers.
data = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
])

# axis=1 standardises along each row: every value is compared with the mean
# and standard deviation of its own row.
stats.zscore(data, axis=1)

# For example, the first value (1) of the first row is 1.34164079 standard
# deviations below the mean of its row.


array([[-1.34164079, -0.4472136 ,  0.4472136 ,  1.34164079],
       [-1.34164079, -0.4472136 ,  0.4472136 ,  1.34164079],
       [-1.34164079, -0.4472136 ,  0.4472136 ,  1.34164079]])

In [12]:
# Using pandas DataFrames

# Draw the 5x5 table of integers in [0, 15) from a *seeded* generator so that
# "Restart Kernel -> Run All" rebuilds exactly the same table every time.
# The previous unseeded np.random.randint call produced different values on
# each run, so interpretations written against one run went stale on the next.
# NOTE: outputs saved from an earlier, unseeded run will differ from these
# values until the notebook is re-executed.
data = pd.DataFrame(
    np.random.default_rng(42).integers(0, 15, size=(5, 5)),
    columns=['A', 'B', 'C', 'D', 'E'],
)

In [13]:
# Display the DataFrame built in the previous cell (a bare last expression is
# rendered as the cell's output).
data

Unnamed: 0,A,B,C,D,E
0,6,3,4,2,8
1,4,4,2,4,14
2,3,10,13,7,5
3,4,0,14,12,13
4,4,4,7,13,11


In [14]:
# Standardise every column on its own: apply() hands each column to
# stats.zscore (axis=0 is apply's default, spelled out here for clarity).
data.apply(stats.zscore, axis=0)
# The table is random, so the exact numbers depend on the run.
# Worked example: for a column holding 6, 4, 3, 4, 4 the mean is 4.2 and the
# population standard deviation is about 0.98, so the 6 scores
# (6 - 4.2) / 0.98 ≈ 1.84, i.e. about 1.84 standard deviations above the mean
# of its column.

Unnamed: 0,A,B,C,D,E
0,1.837117,-0.369274,-0.837708,-1.297075,-0.664534
1,-0.204124,-0.061546,-1.256562,-0.833834,1.147832
2,-1.224745,1.784827,1.047135,-0.138972,-1.570717
3,-0.204124,-1.292461,1.256562,1.01913,0.845771
4,-0.204124,-0.061546,-0.209427,1.250751,0.241649


In [15]:
# p-value associated with a t-score that results from a hypothesis test
# Syntax: scipy.stats.t.sf(abs(x), df)
#   x  - the t-score
#   df - the degrees of freedom
# sf is the survival function (1 - cdf), i.e. the upper-tail probability.

# Needed here: `import scipy.stats as stats` only binds the alias `stats`,
# while the calls below use the fully qualified name `scipy.stats`.
import scipy.stats

# Left-tailed test: t = -1.24 with 10 degrees of freedom.
# The t distribution is symmetric, so the area to the left of -1.24 equals the
# area to the right of +1.24, which is what sf(abs(x), df) returns.
# NOTE: the abs() shortcut assumes the statistic lies in the tail being tested.
pValue = scipy.stats.t.sf(abs(-1.24), 10)
pValue
# The p-value is 0.1216. At a significance level of α = 0.05 this is larger
# than α, so we fail to reject the null hypothesis.

0.1216417367479479

In [16]:
# Right-tailed test: t = 1.24 with 10 degrees of freedom.
# sf (1 - cdf) is the area to the right of the t-score.
# Relies on the name `scipy` bound by `import scipy.stats` in the previous cell.
pValue = scipy.stats.t.sf(abs(1.24), 10)
pValue
# The p-value is 0.1216. At a significance level of α = 0.05 this is larger
# than α, so we fail to reject the null hypothesis.


0.1216417367479479

In [17]:
# Z-score test
# Syntax: scipy.stats.norm.sf(abs(x)), where x is the z-score.
# norm.sf is the survival function (1 - cdf) of the standard normal distribution.

# One-tailed p-values: the area in a single tail beyond |z|.
z_lt = scipy.stats.norm.sf(abs(-0.60))  # left-tailed test, z = -0.60
z_rt = scipy.stats.norm.sf(abs(1.5))    # right-tailed test, z = 1.5

# Two-tailed p-value: both tails count, so the single-tail area is doubled.
z_2t = 2 * scipy.stats.norm.sf(abs(0.8))

print('Left tail test value is: ', z_lt)
print('Right tail test value is: ', z_rt)
print('Two tail test value is: ', z_2t)

Left tail test value is:  0.2742531177500736
Right tail test value is:  0.06680720126885807
Two tail test value is:  0.4237107971667934
