In [1]:
# Z- Score

In [2]:
# z-score tells us how many standard deviations away a value is from the mean
# We use the following formula to calculate a z-score:
# z = (X – μ) / σ

# scipy.stats.zscore
# scipy.stats.zscore(data, axis=0, ddof=0, nan_policy=’propagate’)
# data: an array like object containing data
# axis: the axis along which to calculate the z-scores. Default is 0.
# ddof: degrees of freedom correction in the calculation of the standard deviation. Default is 0.
# nan_policy: how to handle when input contains nan. Default is propagate, which returns nan. 
#             ‘raise’ throws an error and ‘omit’ performs calculations ignoring nan values.


In [3]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [4]:
data = np.array([8,5,12,12,12,10,16])
stats.zscore(data)


array([-0.83805407, -1.76432437,  0.39697298,  0.39697298,  0.39697298,
       -0.22054055,  1.63200004])

In [5]:
print(np.mean(data))


10.714285714285714


In [6]:
print(np.std(data))


3.2387954425013237


In [7]:
# which means....
# The first value 8 is 0.56011203 standard deviations below the mean
# The value 10 is 0 standard deviations away from the mean
# the value 16 is 1.6803361 statndard deviations above the mean

In [12]:
# Multi- Dimensinal Arrays

data = np.array([[1,2,3,4],
                [5,6,7,8],
                [9,10,11,12]])
stats.zscore(data, axis=1)
# The first value of "1" in the first array is 1.34164079 statndard deviations below the means of its array. 

array([[-1.34164079, -0.4472136 ,  0.4472136 ,  1.34164079],
       [-1.34164079, -0.4472136 ,  0.4472136 ,  1.34164079],
       [-1.34164079, -0.4472136 ,  0.4472136 ,  1.34164079]])

In [10]:
print(np.mean(data))

6.5


In [14]:
# using Pandas DataFrames
data= pd.DataFrame(np.random.randint(0,15,size =(5,5)), columns=['A','B','C','D','E'])
data

Unnamed: 0,A,B,C,D,E
0,3,10,2,2,0
1,5,1,10,13,12
2,10,0,3,8,3
3,13,5,5,8,9
4,0,14,0,5,11


In [15]:
data.apply(stats.zscore)
# The first value of"11" in the first column is 1.248040 standard deviation above the mean value of its column.

Unnamed: 0,A,B,C,D,E
0,-0.679775,0.750587,-0.58722,-1.422657,-1.492405
1,-0.254916,-0.938233,1.761661,1.586809,1.066004
2,0.807233,-1.12588,-0.29361,0.21887,-0.852803
3,1.444522,-0.187647,0.29361,0.21887,0.426401
4,-1.317064,1.501173,-1.17444,-0.601893,0.852803


In [16]:
print(np.mean(data))

A    6.2
B    6.0
C    4.0
D    7.2
E    7.0
dtype: float64


In [17]:
# Determining the p-value associated with a certain t-score that results from a hypothesis test.
# Syntax: scipy.stats.t.sf()
# scipy.stats.t.sf(abs(x), df) .... where x is t-score and df is degree of freedom

# Left-tailed test
import scipy.stats
# find p- value
pValue = scipy.stats.t.sf(abs(-1.24), df =10)
pValue
# The p-value is 0.1216. If we use a significance level of a= 0.05, we will fail to reject the null hypothesis.

0.1216417367479479

In [19]:
# Two- tailed test 
scipy.stats.t.sf(abs(1.24),df= 22)*2

0.22803901531680093

In [20]:
# Z-Score test
# Syntax: scipy.stats.norm.sf(abs(X)) where x is z-score

# left-tailed test
z_lt = scipy.stats.norm.sf(abs(-0.60))
print('Left tail text value is :', z_lt)
# Right-tailed test
z_rt = scipy.stats.norm.sf(abs(1.5))
print('Right tail test value is :', z_rt)
# Two-tailed test
z_2t = scipy.stats.norm.sf(abs(0.8))*2
print("Two tail test value is :",z_2t)

Left tail text value is : 0.2742531177500736
Right tail test value is : 0.06680720126885807
Two tail test value is : 0.4237107971667934
