In [1]:
import numpy as np
import pandas as pd
from scipy.stats import wasserstein_distance

from nannyml.chunk import SizeBasedChunker
from nannyml.drift.univariate.calculator import UnivariateDriftCalculator
from nannyml.drift.univariate.methods import WassersteinDistance, JensenShannonDistance



## Continuous

In [2]:
# Seed the global NumPy RNG so the sampled data (and therefore the drift
# values and plots below) are reproducible on Restart & Run All.
np.random.seed(42)

# Two normally distributed samples of 1000 rows in column x1:
# reference ~ N(0, 1), analysis ~ N(0.3, 1), i.e. the analysis mean is shifted by 0.3.
ref = pd.DataFrame(np.random.normal(0, 1, 1000), columns=['x1'])
ana = pd.DataFrame(np.random.normal(0.3, 1, 1000), columns=['x1'])

# Chunker configured with a chunk size of 100
chunker = SizeBasedChunker(100)
columns = ['x1']

# Drift calculator for x1 using both continuous methods under comparison
calc = UnivariateDriftCalculator(
    column_names=columns,
    continuous_methods=['wasserstein_distance', 'jensen_shannon'],
    chunker=chunker,
)

In [3]:
# Fit the calculator on the reference sample, then compute drift on the analysis sample.
calc.fit(ref)
results = calc.calculate(ana)
# Last expression of the cell: display the results table.
results.data


Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,x1,x1,x1,x1,x1,x1,x1,x1
Unnamed: 0_level_1,chunk,chunk,chunk,chunk,chunk,chunk,chunk,wasserstein_distance,wasserstein_distance,wasserstein_distance,wasserstein_distance,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon
Unnamed: 0_level_2,key,chunk_index,start_index,end_index,start_date,end_date,period,value,upper_threshold,lower_threshold,alert,value,upper_threshold,lower_threshold,alert
0,[0:99],0,0,99,,,reference,0.130556,0.253061,0,False,0.161921,0.1,,True
1,[100:199],1,100,199,,,reference,0.096073,0.253061,0,False,0.114481,0.1,,True
2,[200:299],2,200,299,,,reference,0.072879,0.253061,0,False,0.141272,0.1,,True
3,[300:399],3,300,399,,,reference,0.200464,0.253061,0,False,0.165482,0.1,,True
4,[400:499],4,400,499,,,reference,0.090882,0.253061,0,False,0.117074,0.1,,True
5,[500:599],5,500,599,,,reference,0.07176,0.253061,0,False,0.135301,0.1,,True
6,[600:699],6,600,699,,,reference,0.201305,0.253061,0,False,0.139086,0.1,,True
7,[700:799],7,700,799,,,reference,0.096461,0.253061,0,False,0.105128,0.1,,True
8,[800:899],8,800,899,,,reference,0.120036,0.253061,0,False,0.123791,0.1,,True
9,[900:999],9,900,999,,,reference,0.117897,0.253061,0,False,0.108572,0.1,,True


In [4]:
# Distribution plot for column x1 (Wasserstein method), with plot_reference enabled.
dist_fig = results.plot(
    kind="distribution",
    method="wasserstein_distance",
    column_name="x1",
    plot_reference=True,
)
dist_fig.show()

In [5]:
# Drift plot for column x1 using the Wasserstein distance, with plot_reference enabled.
drift_fig = results.plot(
    kind="drift",
    method="wasserstein_distance",
    column_name="x1",
    plot_reference=True,
)
drift_fig.show()

In [6]:
# Drift plot for column x1 using the Jensen-Shannon distance, with plot_reference enabled.
drift_fig = results.plot(
    kind="drift",
    method="jensen_shannon",
    column_name="x1",
    plot_reference=True,
)
drift_fig.show()

## Categorical

In [7]:
# Seed the global NumPy RNG so the sampled reference data (and the chi2
# values shown below) are reproducible on Restart & Run All.
np.random.seed(42)

# Two categorical samples of 1000 rows in column x1:
# reference: uniform random draws from {'a', 'b', 'c'};
# analysis: 500 'a' followed by 500 'b' -- category 'c' never occurs.
ref = pd.DataFrame(np.random.choice(['a', 'b', 'c'], 1000), columns=['x1'])
ana = pd.DataFrame(['a'] * 500 + ['b'] * 500, columns=['x1'])

# Chunker configured with a chunk size of 100
chunker = SizeBasedChunker(100)
columns = ['x1']

# Drift calculator for x1 using the chi2 categorical method
calc = UnivariateDriftCalculator(
    column_names=columns,
    categorical_methods=['chi2'],
    chunker=chunker,
)

# Fit on the reference sample, compute drift on the analysis sample, display the table.
calc.fit(ref)
results = calc.calculate(ana)
results.data

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,x1,x1,x1,x1
Unnamed: 0_level_1,chunk,chunk,chunk,chunk,chunk,chunk,chunk,chi2,chi2,chi2,chi2
Unnamed: 0_level_2,key,chunk_index,start_index,end_index,start_date,end_date,period,value,upper_threshold,lower_threshold,alert
0,[0:99],0,0,99,,,reference,0.188005,,0.05,False
1,[100:199],1,100,199,,,reference,0.47703,,0.05,False
2,[200:299],2,200,299,,,reference,0.47703,,0.05,False
3,[300:399],3,300,399,,,reference,2.229498,,0.05,False
4,[400:499],4,400,499,,,reference,2.640001,,0.05,False
5,[500:599],5,500,599,,,reference,2.514758,,0.05,False
6,[600:699],6,600,699,,,reference,1.525567,,0.05,False
7,[700:799],7,700,799,,,reference,2.730625,,0.05,False
8,[800:899],8,800,899,,,reference,5.572857,,0.05,False
9,[900:999],9,900,999,,,reference,1.175419,,0.05,False


In [8]:
# Drift plot for column x1 using the chi2 method, with plot_reference enabled.
drift_fig = results.plot(
    kind="drift",
    method="chi2",
    column_name="x1",
    plot_reference=True,
)
drift_fig.show()

## Unit test: WassersteinDistance vs. scipy's wasserstein_distance

In [50]:
# Sanity check: nannyml's WassersteinDistance should agree with
# scipy.stats.wasserstein_distance on the same pair of samples.
np.random.seed(1)

# Reference sample: 10000 draws from N(0, 1)
ref = pd.Series(np.random.normal(0, 1, 10000))

# Analysis scenarios, evaluated in this order so the random draws happen in a
# fixed sequence. None means "reuse the reference sample itself" (no new draw);
# a number is the mean of a fresh N(mean, 1) sample of 10000 draws.
analysis_means = [None, 1, 3, -4]

for case_number, mean in enumerate(analysis_means):
    ana = ref if mean is None else pd.Series(np.random.normal(mean, 1, 10000))

    scipy_distance = wasserstein_distance(ref, ana)
    nml_distance = WassersteinDistance(chunker=None).fit(ref).calculate(ana)

    # Blank line between scenarios (none after the last one)
    if case_number > 0:
        print()
    print("scipy:", np.round(scipy_distance, 2))
    print("nml", np.round(nml_distance, 2))

    # Fail loudly instead of relying on a visual comparison of the printed
    # values: the two implementations must agree to within 0.01.
    np.testing.assert_allclose(nml_distance, scipy_distance, rtol=0, atol=0.01)


scipy: 0.0
nml 0.0

scipy: 1.0
nml 1.0

scipy: 2.98
nml 2.98

scipy: 3.99
nml 3.99
