# Data Drift Workflow



In [1]:
import nannyml as nml
import pandas as pd
import numpy as np

In [2]:
reference, analysis, analysis_gt = nml.load_synthetic_sample()

In [None]:
features_for_drift = list(reference.columns)[:7] + ['y_pred_proba']
features_for_drift

In [3]:
md = nml.extract_metadata(data = reference, model_name='wfh_predictor')

Please review these to determine if they should be marked as ordinal instead.



In [4]:
print(md)

Metadata for model wfh_predictor

# Please identify column names for all '~ UNKNOWN ~' values

Model problem             binary_classification    

Identifier column         identifier               
Timestamp column          timestamp                
Partition column          partition                
Prediction column         ~ UNKNOWN ~              
Ground truth column       ~ UNKNOWN ~              

Features

Name                 Column               Type            Description
distance_from_office distance_from_office continuous      extracted feature: distance_from_office
salary_range         salary_range         categorical     extracted feature: salary_range
gas_price_per_litre  gas_price_per_litre  continuous      extracted feature: gas_price_per_litre
public_transportation_cost public_transportation_cost continuous      extracted feature: public_transportation_cost
wfh_prev_workday     wfh_prev_workday     categorical     extracted feature: wfh_prev_workday
workday         

In [5]:
md.prediction_column_name = 'y_pred_proba'
md.ground_truth_column_name = 'work_home_actual'

In [6]:
print(md)

Metadata for model wfh_predictor

# Please identify column names for all '~ UNKNOWN ~' values

Model problem             binary_classification    

Identifier column         identifier               
Timestamp column          timestamp                
Partition column          partition                
Prediction column         y_pred_proba             
Ground truth column       work_home_actual         

Features

Name                 Column               Type            Description
distance_from_office distance_from_office continuous      extracted feature: distance_from_office
salary_range         salary_range         categorical     extracted feature: salary_range
gas_price_per_litre  gas_price_per_litre  continuous      extracted feature: gas_price_per_litre
public_transportation_cost public_transportation_cost continuous      extracted feature: public_transportation_cost
wfh_prev_workday     wfh_prev_workday     categorical     extracted feature: wfh_prev_workday
workday         

In [7]:
univariate_calculator = nml.StatisticalDriftCalculator(model_metadata=md)

In [8]:
univariate_calculator.fit(reference_data=reference)

In [9]:
fdata = pd.concat([reference, analysis])
fdata

Unnamed: 0,distance_from_office,salary_range,gas_price_per_litre,public_transportation_cost,wfh_prev_workday,workday,tenure,identifier,work_home_actual,timestamp,y_pred_proba,partition
0,5.962247,40K - 60K €,2.119485,8.568058,False,Friday,0.212653,0,1.0,2014-05-09 22:27:20,0.99,reference
1,0.535872,40K - 60K €,2.357199,5.425382,True,Tuesday,4.927549,1,0.0,2014-05-09 22:59:32,0.07,reference
2,1.969519,40K - 60K €,2.366849,8.247158,False,Monday,0.520817,2,1.0,2014-05-09 23:48:25,1.00,reference
3,2.530410,20K - 20K €,2.318722,7.944251,False,Tuesday,0.453649,3,1.0,2014-05-10 01:12:09,0.98,reference
4,2.253635,60K+ €,2.221265,8.884478,True,Thursday,5.695263,4,1.0,2014-05-10 02:21:34,0.99,reference
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,6.043911,0 - 20K €,1.983026,5.891216,True,Thursday,6.411578,99995,,2021-01-01 02:42:38,0.17,analysis
49996,5.676665,20K - 20K €,2.048549,7.584098,True,Wednesday,3.863509,99996,,2021-01-01 04:04:01,0.55,analysis
49997,3.143112,0 - 20K €,2.208197,6.574665,True,Tuesday,6.462975,99997,,2021-01-01 04:12:57,0.22,analysis
49998,8.335141,40K - 60K €,2.394480,5.257455,True,Monday,6.407057,99998,,2021-01-01 04:17:41,0.02,analysis


In [11]:
univariate_results = univariate_calculator.calculate(data=fdata, chunk_size=5000)

In [12]:
list(univariate_results.columns)

['key',
 'start_index',
 'end_index',
 'start_date',
 'end_date',
 'partition',
 'wfh_prev_workday_chi2',
 'wfh_prev_workday_p_value',
 'wfh_prev_workday_alert',
 'salary_range_chi2',
 'salary_range_p_value',
 'salary_range_alert',
 'workday_chi2',
 'workday_p_value',
 'workday_alert',
 'gas_price_per_litre_dstat',
 'gas_price_per_litre_p_value',
 'gas_price_per_litre_alert',
 'distance_from_office_dstat',
 'distance_from_office_p_value',
 'distance_from_office_alert',
 'public_transportation_cost_dstat',
 'public_transportation_cost_p_value',
 'public_transportation_cost_alert',
 'tenure_dstat',
 'tenure_p_value',
 'tenure_alert']

In [13]:
univariate_results

Unnamed: 0,key,start_index,end_index,start_date,end_date,partition,wfh_prev_workday_chi2,wfh_prev_workday_p_value,wfh_prev_workday_alert,salary_range_chi2,...,gas_price_per_litre_alert,distance_from_office_dstat,distance_from_office_p_value,distance_from_office_alert,public_transportation_cost_dstat,public_transportation_cost_p_value,public_transportation_cost_alert,tenure_dstat,tenure_p_value,tenure_alert
0,[0:4999],0,4999,2014-05-09,2014-09-09 23:59:59,reference,0.414606,0.52,False,2.898781,...,False,0.01034,0.712,False,0.00998,0.752,False,0.00978,0.774,False
1,[5000:9999],5000,9999,2014-09-09,2015-01-09 23:59:59,reference,0.033486,0.855,False,3.144391,...,False,0.0075,0.959,False,0.01046,0.698,False,0.01192,0.534,False
2,[10000:14999],10000,14999,2015-01-09,2015-05-09 23:59:59,reference,0.168656,0.681,False,2.451881,...,False,0.0082,0.917,False,0.01706,0.14,False,0.01268,0.454,False
3,[15000:19999],15000,19999,2015-05-09,2015-09-07 23:59:59,reference,0.05627,0.812,False,4.06262,...,False,0.0086,0.887,False,0.0122,0.504,False,0.01074,0.667,False
4,[20000:24999],20000,24999,2015-09-07,2016-01-08 23:59:59,reference,0.242059,0.623,False,2.413988,...,False,0.0091,0.842,False,0.00662,0.988,False,0.00924,0.829,False
5,[25000:29999],25000,29999,2016-01-08,2016-05-09 23:59:59,reference,3.614573,0.057,False,3.796063,...,False,0.01458,0.286,False,0.01186,0.541,False,0.00794,0.935,False
6,[30000:34999],30000,34999,2016-05-09,2016-09-04 23:59:59,reference,0.075705,0.783,False,3.228836,...,False,0.0129,0.432,False,0.00636,0.992,False,0.0112,0.615,False
7,[35000:39999],35000,39999,2016-09-04,2017-01-03 23:59:59,reference,0.414606,0.52,False,1.3933,...,False,0.0138,0.349,False,0.00832,0.909,False,0.0074,0.963,False
8,[40000:44999],40000,44999,2017-01-03,2017-05-03 23:59:59,reference,0.012656,0.91,False,0.304785,...,False,0.01586,0.201,False,0.01176,0.552,False,0.01464,0.281,False
9,[45000:49999],45000,49999,2017-05-03,2017-08-31 23:59:59,reference,2.203832,0.138,False,2.987581,...,False,0.00924,0.829,False,0.0082,0.917,False,0.01306,0.417,False


In [14]:
ranking = nml.AlertCountRanking()

In [15]:
ranked_features_drifted = ranking.rank(univariate_results)

In [16]:
ranked_features_drifted

Unnamed: 0,feature,number_of_alerts,rank
0,wfh_prev_workday,5,1
1,salary_range,5,2
2,distance_from_office,5,3
3,public_transportation_cost,5,4
4,tenure,2,5
5,workday,0,6
6,gas_price_per_litre,0,7


In [17]:
nml.calculate_statistical_drift(reference_data=reference, analysis_data=analysis, model_metadata=md)

Unnamed: 0,key,start_index,end_index,start_date,end_date,partition,wfh_prev_workday_chi2,wfh_prev_workday_p_value,wfh_prev_workday_alert,salary_range_chi2,...,gas_price_per_litre_alert,distance_from_office_dstat,distance_from_office_p_value,distance_from_office_alert,public_transportation_cost_dstat,public_transportation_cost_p_value,public_transportation_cost_alert,tenure_dstat,tenure_p_value,tenure_alert
0,[0:899],0,899,2017-08-31,2017-09-21 23:59:59,analysis,2.402192,0.121,False,0.795467,...,False,0.037169,0.169,False,0.04364,0.067,False,0.036593,0.183,False
1,[900:1799],900,1799,2017-09-21,2017-10-14 23:59:59,analysis,0.25786,0.612,False,9.549308,...,False,0.028222,0.473,False,0.027827,0.492,False,0.029616,0.412,False
2,[1800:2699],1800,2699,2017-10-14,2017-11-07 23:59:59,analysis,0.306612,0.58,False,5.852044,...,False,0.045993,0.046,True,0.024642,0.647,False,0.038676,0.138,False
3,[2700:3599],2700,3599,2017-11-07,2017-11-29 23:59:59,analysis,1.483172,0.223,False,5.324653,...,False,0.031778,0.327,False,0.031191,0.349,False,0.029933,0.399,False
4,[3600:4499],3600,4499,2017-11-29,2017-12-21 23:59:59,analysis,0.471359,0.492,False,4.420238,...,False,0.025504,0.604,False,0.042851,0.076,False,0.0419,0.087,False
5,[4500:5399],4500,5399,2017-12-21,2018-01-12 23:59:59,analysis,3.296138,0.069,False,2.889654,...,False,0.02164,0.794,False,0.037644,0.159,False,0.025218,0.618,False
6,[5400:6299],5400,6299,2018-01-12,2018-02-01 23:59:59,analysis,0.306612,0.58,False,1.764991,...,False,0.0211,0.818,False,0.024858,0.636,False,0.022487,0.754,False
7,[6300:7199],6300,7199,2018-02-01,2018-02-23 23:59:59,analysis,0.007886,0.929,False,3.349158,...,False,0.017267,0.951,False,0.031507,0.337,False,0.041822,0.088,False
8,[7200:8099],7200,8099,2018-02-23,2018-03-18 23:59:59,analysis,0.058615,0.809,False,3.416404,...,False,0.041027,0.099,False,0.039909,0.116,False,0.05602,0.007,True
9,[8100:8999],8100,8999,2018-03-18,2018-04-08 23:59:59,analysis,0.024094,0.877,False,1.284651,...,False,0.02096,0.824,False,0.024927,0.633,False,0.031309,0.344,False


In [18]:
rcerror_calculator = nml.ReconstructionErrorDriftCalculator(model_metadata=md)

In [19]:
rcerror_calculator.fit(reference_data=reference)

In [20]:
rcerror_results = rcerror_calculator.calculate(data=fdata, chunk_size=5000)
rcerror_results

Unnamed: 0,key,start_index,end_index,start_date,end_date,partition,reconstruction_error,alert
0,[0:4999],0,4999,2014-05-09,2014-09-09 23:59:59,reference,1.120961,0
1,[5000:9999],5000,9999,2014-09-09,2015-01-09 23:59:59,reference,1.118071,0
2,[10000:14999],10000,14999,2015-01-09,2015-05-09 23:59:59,reference,1.117237,0
3,[15000:19999],15000,19999,2015-05-09,2015-09-07 23:59:59,reference,1.125514,0
4,[20000:24999],20000,24999,2015-09-07,2016-01-08 23:59:59,reference,1.109446,0
5,[25000:29999],25000,29999,2016-01-08,2016-05-09 23:59:59,reference,1.122759,0
6,[30000:34999],30000,34999,2016-05-09,2016-09-04 23:59:59,reference,1.107138,0
7,[35000:39999],35000,39999,2016-09-04,2017-01-03 23:59:59,reference,1.127134,0
8,[40000:44999],40000,44999,2017-01-03,2017-05-03 23:59:59,reference,1.114237,0
9,[45000:49999],45000,49999,2017-05-03,2017-08-31 23:59:59,reference,1.11045,0
