In [1]:
# Z-Score

In [2]:
# A z-score tells us how many standard deviations a value lies from the mean.
# We use the following formula to calculate a z-score:
# z = (X - μ) / σ   where X is the raw value, μ is the mean and σ is the standard deviation

# scipy.stats.zscore
# scipy.stats.zscore(a, axis=0, ddof=0, nan_policy='propagate')
# a: an array-like object containing the data
# axis: the axis along which to calculate the z-scores. Default is 0.
# ddof: degrees of freedom correction in the calculation of the standard deviation. Default is 0.
# nan_policy: how to handle input that contains nan. Default is 'propagate', which returns nan.
#             'raise' throws an error and 'omit' performs the calculations ignoring nan values.


In [1]:
import numpy as np
import pandas as pd
import scipy.stats
import scipy.stats as stats

In [2]:
# One-dimensional sample of observations to standardize.
sample_values = [8, 5, 5, 12, 12, 12, 10, 16]
data = np.array(sample_values)

# Cell output: the z-score of every element, measured against the mean and
# standard deviation of the whole array.
stats.zscore(data)

array([-0.56011203, -1.40028008, -1.40028008,  0.56011203,  0.56011203,
        0.56011203,  0.        ,  1.6803361 ])

In [3]:
# Arithmetic mean of the sample -- the μ in the z-score formula.
print(data.mean())

10.0


In [4]:
# Population standard deviation (ddof=0) of the sample -- the σ in the
# z-score formula; stats.zscore uses the same ddof=0 default.
print(data.std())

3.570714214271425


In [1]:
# Which means:
# The first value, 8, is 0.56011203 standard deviations below the mean
# The value 10 is 0 standard deviations away from the mean (it equals the mean)
# The value 16 is 1.6803361 standard deviations above the mean

In [6]:
# Multi-Dimensional Arrays

# 3 x 4 matrix holding the integers 1..12, filled row by row.
data = np.arange(1, 13).reshape(3, 4)

# axis=1 standardizes each row separately, using that row's own mean and
# standard deviation.
stats.zscore(data, axis=1)
# The first value "1" in the first row is 1.34164079 standard deviations below the mean of its row

array([[-1.34164079, -0.4472136 ,  0.4472136 ,  1.34164079],
       [-1.34164079, -0.4472136 ,  0.4472136 ,  1.34164079],
       [-1.34164079, -0.4472136 ,  0.4472136 ,  1.34164079]])

In [7]:
# Using Pandas DataFrames

# Fixed seed so the random sample -- and therefore every z-score derived from
# it -- is identical on each Restart & Run All. Without a seed the numbers
# change on every run and any commentary about specific values goes stale.
SEED = 42
rng = np.random.default_rng(SEED)

# 5 x 5 table of random integers in [0, 15) (upper bound exclusive), columns A-E.
data = pd.DataFrame(rng.integers(0, 15, size=(5, 5)), columns=['A', 'B', 'C', 'D', 'E'])
data

Unnamed: 0,A,B,C,D,E
0,3,14,10,1,13
1,1,10,7,9,7
2,0,3,1,2,0
3,7,2,7,6,7
4,5,4,6,1,12


In [8]:
# Standardize column by column: apply() hands each column to stats.zscore
# (axis=0 is the DataFrame.apply default, spelled out here for clarity).
data.apply(stats.zscore, axis=0)
# Each entry is the number of standard deviations its original value lies above (+)
# or below (-) the mean of its own column; for example, a result of 1.248040 means
# the original value is 1.248040 standard deviations above the mean of its column.

Unnamed: 0,A,B,C,D,E
0,-0.078087,1.598157,1.298813,-0.878438,1.12513
1,-0.858956,0.734288,0.273434,1.631385,-0.173097
2,-1.24939,-0.777482,-1.777323,-0.56471,-1.687695
3,1.483651,-0.993449,0.273434,0.690201,-0.173097
4,0.702782,-0.561514,-0.068359,-0.878438,0.908759


In [10]:
# Determining the p-value associated with a t-score that results from a hypothesis test
# Syntax:  scipy.stats.t.sf(abs(x), df)
#   x  : the t-score
#   df : the degrees of freedom
# sf is the survival function (1 - cdf), i.e. the upper-tail probability.

# Left-tailed test
# scipy.stats is imported once in the notebook's import cell rather than here,
# so this cell and the ones after it do not depend on a mid-notebook import.
t_score = -1.24
# find p-value: the t distribution is symmetric, so P(T <= -1.24) equals sf(1.24)
pValue = scipy.stats.t.sf(abs(t_score), df=10)
pValue
# The p-value is 0.1216. If we use a significance level of α = 0.05, we would fail to reject the null hypothesis

0.1216417367479479

In [11]:
# Right-tailed test
t_score = 1.24
dof = 10
# Upper-tail probability P(T >= t_score) for a t distribution with `dof`
# degrees of freedom.
pValue = scipy.stats.t.sf(abs(t_score), df=dof)
pValue
# The p-value is 0.1216. If we use a significance level of α = 0.05, we would fail to reject the null hypothesis

0.1216417367479479

In [12]:
# Two-tailed test
# Double the one-sided tail area so that both tails of the t distribution
# (22 degrees of freedom here) are counted.
2 * scipy.stats.t.sf(abs(1.24), df=22)

0.22803901531680093

In [13]:
# Z-Score test
# Syntax: scipy.stats.norm.sf(abs(x)) where x is the z-score

# One-sided tail areas under the standard normal curve; the two-tailed
# value doubles the one-sided area.
z_lt = scipy.stats.norm.sf(abs(-0.60))      # Left-tailed test
z_rt = scipy.stats.norm.sf(abs(1.5))        # Right-tailed test
z_2t = 2 * scipy.stats.norm.sf(abs(0.8))    # Two-tailed test

# Report the three p-values in the order they were computed.
for label, p_value in (
    ('Left tail test value is: ', z_lt),
    ('Right tail test value is: ', z_rt),
    ('Two tail test value is: ', z_2t),
):
    print(label, p_value)

Left tail test value is:  0.2742531177500736
Right tail test value is:  0.06680720126885807
Two tail test value is:  0.4237107971667934
