# Data Drift Workflow



In [1]:
import nannyml as nml
import pandas as pd

In [2]:
# Load the synthetic sample shipped with NannyML. The call unpacks into three
# objects; `reference` and `analysis` are the frames used by the cells below.
# NOTE(review): `analysis_gt` presumably holds the ground truth for the
# analysis period -- it is not used in the visible cells; confirm.
reference, analysis, analysis_gt = nml.load_synthetic_sample()

In [3]:
# Let NannyML infer the model metadata (column roles and feature types)
# from the reference data; anything it cannot identify is reported as
# '~ UNKNOWN ~' when the metadata is printed.
md = nml.extract_metadata(
    data=reference,
    model_name='wfh_predictor',
)

In [4]:
# Inspect the extracted metadata: every entry shown as '~ UNKNOWN ~' still has
# to be filled in by hand before the calculators can be used.
print(md.print())

Metadata for model wfh_predictor

# Please identify column names for all '~ UNKNOWN ~' values

Model problem             binary_classification    

Identifier column         identifier               
Timestamp column          ~ UNKNOWN ~              
Partition column          partition                
Prediction column         y_pred_proba             
Ground truth column       ~ UNKNOWN ~              

Features

Name                 Column               Type            Description
distance_from_office distance_from_office continuous      extracted feature: distance_from_office
salary_range         salary_range         categorical     extracted feature: salary_range
gas_price_per_litre  gas_price_per_litre  continuous      extracted feature: gas_price_per_litre
public_transportation_cost public_transportation_cost continuous      extracted feature: public_transportation_cost
wfh_prev_workday     wfh_prev_workday     categorical     extracted feature: wfh_prev_workday
workday         

In [5]:
# Fill in the two columns that metadata extraction reported as '~ UNKNOWN ~'.
md.timestamp_column_name = 'timestamp'
# The prediction column was already detected as 'y_pred_proba', so it needs no override.
md.ground_truth_column_name = 'work_home_actual'

In [6]:
# Print the metadata again to verify that no '~ UNKNOWN ~' entries remain.
print(md.print())

Metadata for model wfh_predictor

# Please identify column names for all '~ UNKNOWN ~' values

Model problem             binary_classification    

Identifier column         identifier               
Timestamp column          timestamp                
Partition column          partition                
Prediction column         y_pred_proba             
Ground truth column       work_home_actual         

Features

Name                 Column               Type            Description
distance_from_office distance_from_office continuous      extracted feature: distance_from_office
salary_range         salary_range         categorical     extracted feature: salary_range
gas_price_per_litre  gas_price_per_litre  continuous      extracted feature: gas_price_per_litre
public_transportation_cost public_transportation_cost continuous      extracted feature: public_transportation_cost
wfh_prev_workday     wfh_prev_workday     categorical     extracted feature: wfh_prev_workday
workday         

In [7]:
# Univariate drift calculator: results are reported per chunk of 5000 rows.
univariate_calculator = nml.UnivariateStatisticalDriftCalculator(
    model_metadata=md,
    chunk_size=5000,
)

In [8]:
# Fit the calculator on the reference partition only.
# NOTE(review): presumably this establishes the baseline distributions that
# later chunks are compared against -- confirm against the NannyML docs.
univariate_calculator.fit(reference_data=reference)

In [9]:
# Stack the reference and analysis partitions into a single frame with a
# fresh 0..n-1 index so both periods can be evaluated in one pass.
fdata = pd.concat(
    objs=[reference, analysis],
    ignore_index=True,
)
fdata

Unnamed: 0,distance_from_office,salary_range,gas_price_per_litre,public_transportation_cost,wfh_prev_workday,workday,tenure,identifier,work_home_actual,timestamp,y_pred_proba,partition
0,5.962247,40K - 60K €,2.119485,8.568058,False,Friday,0.212653,0,1.0,2014-05-09 22:27:20,0.99,reference
1,0.535872,40K - 60K €,2.357199,5.425382,True,Tuesday,4.927549,1,0.0,2014-05-09 22:59:32,0.07,reference
2,1.969519,40K - 60K €,2.366849,8.247158,False,Monday,0.520817,2,1.0,2014-05-09 23:48:25,1.00,reference
3,2.530410,20K - 20K €,2.318722,7.944251,False,Tuesday,0.453649,3,1.0,2014-05-10 01:12:09,0.98,reference
4,2.253635,60K+ €,2.221265,8.884478,True,Thursday,5.695263,4,1.0,2014-05-10 02:21:34,0.99,reference
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6.043911,0 - 20K €,1.983026,5.891216,True,Thursday,6.411578,99995,,2021-01-01 02:42:38,0.17,analysis
99996,5.676665,20K - 20K €,2.048549,7.584098,True,Wednesday,3.863509,99996,,2021-01-01 04:04:01,0.55,analysis
99997,3.143112,0 - 20K €,2.208197,6.574665,True,Tuesday,6.462975,99997,,2021-01-01 04:12:57,0.22,analysis
99998,8.335141,40K - 60K €,2.394480,5.257455,True,Monday,6.407057,99998,,2021-01-01 04:17:41,0.02,analysis


In [10]:
# Run the univariate drift calculation over the combined reference + analysis
# data. The result is tabular: one row per chunk, with a statistic, p-value
# and alert column for each feature (see the column listing below).
univariate_results = univariate_calculator.calculate(data=fdata)

In [11]:
# Column names of the drift results: chunk bookkeeping columns first, then
# a (statistic, p_value, alert) triple for every feature.
univariate_results_columns = univariate_results.columns.tolist()
univariate_results_columns

['key',
 'start_index',
 'end_index',
 'start_date',
 'end_date',
 'partition',
 'salary_range_chi2',
 'salary_range_p_value',
 'salary_range_alert',
 'wfh_prev_workday_chi2',
 'wfh_prev_workday_p_value',
 'wfh_prev_workday_alert',
 'workday_chi2',
 'workday_p_value',
 'workday_alert',
 'gas_price_per_litre_dstat',
 'gas_price_per_litre_p_value',
 'gas_price_per_litre_alert',
 'distance_from_office_dstat',
 'distance_from_office_p_value',
 'distance_from_office_alert',
 'tenure_dstat',
 'tenure_p_value',
 'tenure_alert',
 'public_transportation_cost_dstat',
 'public_transportation_cost_p_value',
 'public_transportation_cost_alert']

In [12]:
# Last five chunks, restricted to the first nine columns: the chunk
# bookkeeping fields plus the salary_range statistic, p-value and alert.
univariate_results.tail(5).iloc[:, :9]

Unnamed: 0,key,start_index,end_index,start_date,end_date,partition,salary_range_chi2,salary_range_p_value,salary_range_alert
15,[75000:79999],75000,79999,2019-04-30,2019-09-01 23:59:59,analysis,455.622094,0.0,True
16,[80000:84999],80000,84999,2019-09-01,2019-12-31 23:59:59,analysis,428.633384,0.0,True
17,[85000:89999],85000,89999,2019-12-31,2020-04-30 23:59:59,analysis,453.247444,0.0,True
18,[90000:94999],90000,94999,2020-04-30,2020-09-01 23:59:59,analysis,438.25997,0.0,True
19,[95000:99999],95000,99999,2020-09-01,2021-01-01 23:59:59,analysis,474.891775,0.0,True


In [13]:
# Ranker used below to order features based on the univariate drift results.
ranking = nml.AlertCountRanking()

In [14]:
# Rank the features using the univariate drift results; the output has one
# row per feature with its `number_of_alerts` and `rank`.
ranked_features_drifted = ranking.rank(univariate_results)

In [15]:
# Display the feature ranking.
ranked_features_drifted

Unnamed: 0,feature,number_of_alerts,rank
0,salary_range,5,1
1,wfh_prev_workday,5,2
2,distance_from_office,5,3
3,public_transportation_cost,5,4
4,tenure,2,5
5,workday,0,6
6,gas_price_per_litre,0,7


In [16]:
# Data-reconstruction drift calculator, using the same metadata and the same
# 5000-row chunking as the univariate calculator.
rcerror_calculator = nml.DataReconstructionDriftCalculator(
    model_metadata=md,
    chunk_size=5000,
)

In [17]:
# Fit the reconstruction-drift calculator on the reference partition only.
# NOTE(review): presumably this learns the reconstruction baseline used to
# raise alerts on later chunks -- confirm against the NannyML docs.
rcerror_calculator.fit(reference_data=reference)

In [18]:
# Compute the reconstruction error over the combined reference + analysis
# data; the result has one row per chunk with its `reconstruction_error`
# and an `alert` flag.
rcerror_results = rcerror_calculator.calculate(data=fdata)
rcerror_results

Unnamed: 0,key,start_index,end_index,start_date,end_date,partition,reconstruction_error,alert
0,[0:4999],0,4999,2014-05-09,2014-09-09 23:59:59,reference,1.120961,0
1,[5000:9999],5000,9999,2014-09-09,2015-01-09 23:59:59,reference,1.118071,0
2,[10000:14999],10000,14999,2015-01-09,2015-05-09 23:59:59,reference,1.117237,0
3,[15000:19999],15000,19999,2015-05-09,2015-09-07 23:59:59,reference,1.125514,0
4,[20000:24999],20000,24999,2015-09-07,2016-01-08 23:59:59,reference,1.109446,0
5,[25000:29999],25000,29999,2016-01-08,2016-05-09 23:59:59,reference,1.122759,0
6,[30000:34999],30000,34999,2016-05-09,2016-09-04 23:59:59,reference,1.107138,0
7,[35000:39999],35000,39999,2016-09-04,2017-01-03 23:59:59,reference,1.127134,0
8,[40000:44999],40000,44999,2017-01-03,2017-05-03 23:59:59,reference,1.114237,0
9,[45000:49999],45000,49999,2017-05-03,2017-08-31 23:59:59,reference,1.11045,0
