# Data Drift Workflow



In [1]:
import nannyml as nml
import pandas as pd
import numpy as np

# TODO: work out how to expose this loader so that `nml.datasets.load_synthetic_sample` works without a separate import
from nannyml.datasets import load_synthetic_sample

In [2]:
# Load the bundled synthetic sample. The loader returns three objects, unpacked
# here as the reference set, the analysis set, and `analysis_gt`
# (presumably the ground truth for the analysis period — TODO confirm; it is
# not used in the cells shown here).
reference, analysis, analysis_gt = load_synthetic_sample()

In [3]:
# Drift is evaluated on the first seven columns of the reference frame
# plus the model's output column.
features_for_drift = [*reference.columns[:7], 'y_pred_proba']
features_for_drift

['distance_from_office',
 'salary_range',
 'gas_price_per_litre',
 'public_transportation_cost',
 'wfh_prev_workday',
 'workday',
 'tenure',
 'y_pred_proba']

In [4]:
# Let NannyML infer the model metadata (column roles and feature types) from
# the reference data; fields it cannot resolve are filled in by hand in a later cell.
md = nml.extract_metadata(
    data=reference,
    model_name='wfh_predictor',
)

Please review these to determine if they should be marked as ordinal instead.



In [5]:
# Show the inferred metadata; fields reported as '~ UNKNOWN ~' still need a column name.
print(md)

Metadata for model wfh_predictor

# Please identify column names for all '~ UNKNOWN ~' values

Model problem             binary_classification    

Identifier column         identifier               
Timestamp column          ~ UNKNOWN ~              
Partition column          partition                
Prediction column         ~ UNKNOWN ~              
Ground truth column       ~ UNKNOWN ~              

Features

Name                 Column               Type            Description
distance_from_office distance_from_office continuous      extracted feature: distance_from_office
salary_range         salary_range         categorical     extracted feature: salary_range
gas_price_per_litre  gas_price_per_litre  continuous      extracted feature: gas_price_per_litre
public_transportation_cost public_transportation_cost continuous      extracted feature: public_transportation_cost
wfh_prev_workday     wfh_prev_workday     categorical     extracted feature: wfh_prev_workday
workday         

In [6]:
# Supply the column names that metadata extraction reported as '~ UNKNOWN ~':
# the prediction, ground-truth and timestamp columns.
md.prediction_column_name = 'y_pred_proba'
md.ground_truth_column_name = 'work_home_actual'
md.timestamp_column_name = 'DATE'

In [7]:
# Print the metadata again to verify that the three columns set above are now resolved.
print(md)

Metadata for model wfh_predictor

# Please identify column names for all '~ UNKNOWN ~' values

Model problem             binary_classification    

Identifier column         identifier               
Timestamp column          DATE                     
Partition column          partition                
Prediction column         y_pred_proba             
Ground truth column       work_home_actual         

Features

Name                 Column               Type            Description
distance_from_office distance_from_office continuous      extracted feature: distance_from_office
salary_range         salary_range         categorical     extracted feature: salary_range
gas_price_per_litre  gas_price_per_litre  continuous      extracted feature: gas_price_per_litre
public_transportation_cost public_transportation_cost continuous      extracted feature: public_transportation_cost
wfh_prev_workday     wfh_prev_workday     categorical     extracted feature: wfh_prev_workday
workday         

In [8]:
# Build the statistical drift calculator for the selected columns,
# using the completed model metadata.
univariate_calculator = nml.StatisticalDriftCalculator(
    model_metadata=md,
    features=features_for_drift,
)

In [9]:
# Fit the calculator on the reference data only.
univariate_calculator.fit(reference_data=reference)

In [10]:
# Stack the reference rows on top of the analysis rows in a single frame.
# The original index labels are kept (no re-indexing).
fdata = pd.concat(objs=[reference, analysis], axis=0)
fdata

Unnamed: 0,distance_from_office,salary_range,gas_price_per_litre,public_transportation_cost,wfh_prev_workday,workday,tenure,identifier,DATE,work_home_actual,y_pred_proba,partition
0,5.962247,40K - 60K €,2.119485,8.568058,False,Friday,0.212653,0,2015-05-06 13:12:20,1.0,0.99,reference
1,0.535872,40K - 60K €,2.357199,5.425382,True,Tuesday,4.927549,1,2020-08-05 14:58:14,0.0,0.07,reference
2,1.969519,40K - 60K €,2.366849,8.247158,False,Monday,0.520817,2,2014-03-02 06:31:16,1.0,1.00,reference
3,2.530410,20K - 20K €,2.318722,7.944251,False,Tuesday,0.453649,3,2020-12-14 18:06:37,1.0,0.98,reference
4,2.253635,60K+ €,2.221265,8.884478,True,Thursday,5.695263,4,2018-06-09 13:16:14,1.0,0.99,reference
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,6.043911,0 - 20K €,1.983026,5.891216,True,Thursday,6.411578,99995,2020-03-01 06:02:02,,0.17,analysis
49996,5.676665,20K - 20K €,2.048549,7.584098,True,Wednesday,3.863509,99996,2014-08-31 12:20:36,,0.55,analysis
49997,3.143112,0 - 20K €,2.208197,6.574665,True,Tuesday,6.462975,99997,2011-04-30 06:32:24,,0.22,analysis
49998,8.335141,40K - 60K €,2.394480,5.257455,True,Monday,6.407057,99998,2018-02-02 05:56:29,,0.02,analysis


In [11]:
# Hotfix: the drift calculator currently has a bug and reads the ground-truth
# column from the analysis rows, where it is NaN. Overwrite the column with
# random 0/1 placeholder labels so the calculation can run.
# The generator is seeded so the placeholder labels are identical on every
# "Restart Kernel -> Run All"; `assign` returns a new frame and leaves `fdata` untouched.
hotfix = fdata.assign(
    work_home_actual=np.random.default_rng(42).integers(0, 2, size=len(fdata))
)

In [12]:
# Compute the per-feature drift statistics on the patched frame,
# split into chunks of 5000 rows.
univariate_results = univariate_calculator.calculate(data=hotfix, chunk_size=5000)

In [13]:
# List the result columns: chunk bookkeeping first, then a
# statistic / p-value / alert triple for each feature.
univariate_results.columns.tolist()

['key',
 'start_index',
 'end_index',
 'start_date',
 'end_date',
 'partition',
 'wfh_prev_workday_chi2',
 'wfh_prev_workday_p_value',
 'wfh_prev_workday_alert',
 'salary_range_chi2',
 'salary_range_p_value',
 'salary_range_alert',
 'workday_chi2',
 'workday_p_value',
 'workday_alert',
 'distance_from_office_dstat',
 'distance_from_office_p_value',
 'distance_from_office_alert',
 'public_transportation_cost_dstat',
 'public_transportation_cost_p_value',
 'public_transportation_cost_alert',
 'gas_price_per_litre_dstat',
 'gas_price_per_litre_p_value',
 'gas_price_per_litre_alert',
 'tenure_dstat',
 'tenure_p_value',
 'tenure_alert']

In [14]:
# Display the per-chunk drift results.
univariate_results

Unnamed: 0,key,start_index,end_index,start_date,end_date,partition,wfh_prev_workday_chi2,wfh_prev_workday_p_value,wfh_prev_workday_alert,salary_range_chi2,...,distance_from_office_alert,public_transportation_cost_dstat,public_transportation_cost_p_value,public_transportation_cost_alert,gas_price_per_litre_dstat,gas_price_per_litre_p_value,gas_price_per_litre_alert,tenure_dstat,tenure_p_value,tenure_alert
0,[0:4999],0,4999,2011-01-02,2020-12-31 23:59:59,reference,0.414606,0.52,False,2.898781,...,False,0.00998,0.752,False,0.01122,0.612,False,0.00978,0.774,False
1,[5000:9999],5000,9999,2011-01-01,2020-12-31 23:59:59,reference,0.033486,0.855,False,3.144391,...,False,0.01046,0.698,False,0.01222,0.502,False,0.01192,0.534,False
2,[10000:14999],10000,14999,2011-01-01,2020-12-30 23:59:59,reference,0.168656,0.681,False,2.451881,...,False,0.01706,0.14,False,0.00886,0.865,False,0.01268,0.454,False
3,[15000:19999],15000,19999,2011-01-01,2020-12-31 23:59:59,reference,0.05627,0.812,False,4.06262,...,False,0.0122,0.504,False,0.00956,0.797,False,0.01074,0.667,False
4,[20000:24999],20000,24999,2011-01-03,2020-12-31 23:59:59,reference,0.242059,0.623,False,2.413988,...,False,0.00662,0.988,False,0.00758,0.955,False,0.00924,0.829,False
5,[25000:29999],25000,29999,2011-01-01,2020-12-30 23:59:59,reference,3.614573,0.057,False,3.796063,...,False,0.01186,0.541,False,0.01032,0.714,False,0.00794,0.935,False
6,[30000:34999],30000,34999,2011-01-02,2020-12-28 23:59:59,reference,0.075705,0.783,False,3.228836,...,False,0.00636,0.992,False,0.01094,0.644,False,0.0112,0.615,False
7,[35000:39999],35000,39999,2011-01-02,2021-01-01 23:59:59,reference,0.414606,0.52,False,1.3933,...,False,0.00832,0.909,False,0.01736,0.128,False,0.0074,0.963,False
8,[40000:44999],40000,44999,2011-01-01,2020-12-31 23:59:59,reference,0.012656,0.91,False,0.304785,...,False,0.01176,0.552,False,0.00842,0.901,False,0.01464,0.281,False
9,[45000:49999],45000,49999,2011-01-01,2020-12-31 23:59:59,reference,2.203832,0.138,False,2.987581,...,False,0.0082,0.917,False,0.00786,0.939,False,0.01306,0.417,False


In [15]:
# Create the ranker. Judging by its name and the `number_of_alerts` column in
# the ranked output, it orders features by how many drift alerts they raised
# — TODO confirm against the NannyML docs.
ranking = nml.AlertCountRanking()

In [16]:
# Rank the features using the univariate drift results.
ranked_features_drifted = ranking.rank(univariate_results)

In [17]:
# Display the ranking table (feature, number_of_alerts, rank).
ranked_features_drifted

Unnamed: 0,feature,number_of_alerts,rank
0,wfh_prev_workday,5,1
1,salary_range,5,2
2,distance_from_office,5,3
3,public_transportation_cost,5,4
4,tenure,2,5
5,workday,0,6
6,gas_price_per_litre,0,7


In [18]:
# One-call alternative to the calculator workflow: pass the reference data, the
# analysis data and the model metadata directly.
# NOTE(review): as the output below shows, this call currently raises
# KeyError: 'work_home_actual' — presumably because `analysis` has no
# ground-truth column; confirm, then either supply the column or fix the
# library before leaving this cell in the notebook.
nml.calculate_statistical_drift(reference_data=reference, analysis_data=analysis, model_metadata=md)

KeyError: 'work_home_actual'