In [1]:
import pandas as pd
import nannyml as nml
# Load the synthetic sample; the call returns three objects that are bound to
# reference, analysis and analysis_target.
reference, analysis, analysis_target = nml.load_synthetic_sample()

# Probability cut-off used to derive the binary y_pred column from y_pred_proba
# (y_pred is 1 when y_pred_proba >= threshold, otherwise 0).
PREDICTION_THRESHOLD = 0.8

# Vectorised comparison instead of a per-row Python lambda. The same threshold
# is applied to both frames, and the explicit int64 cast keeps the 0/1 integer
# column type on every platform.
reference['y_pred'] = (reference['y_pred_proba'] >= PREDICTION_THRESHOLD).astype('int64')
analysis['y_pred'] = (analysis['y_pred_proba'] >= PREDICTION_THRESHOLD).astype('int64')
reference.head()

Unnamed: 0,distance_from_office,salary_range,gas_price_per_litre,public_transportation_cost,wfh_prev_workday,workday,tenure,identifier,work_home_actual,timestamp,y_pred_proba,partition,y_pred
0,5.962247,40K - 60K €,2.119485,8.568058,False,Friday,0.212653,0,1,2014-05-09 22:27:20,0.99,reference,1
1,0.535872,40K - 60K €,2.357199,5.425382,True,Tuesday,4.927549,1,0,2014-05-09 22:59:32,0.07,reference,0
2,1.969519,40K - 60K €,2.366849,8.247158,False,Monday,0.520817,2,1,2014-05-09 23:48:25,1.0,reference,1
3,2.53041,20K - 20K €,2.318722,7.944251,False,Tuesday,0.453649,3,1,2014-05-10 01:12:09,0.98,reference,1
4,2.253635,60K+ €,2.221265,8.884478,True,Thursday,5.695263,4,1,2014-05-10 02:21:34,0.99,reference,1


In [2]:
# Rows per chunk; this value is passed to every estimator/calculator below.
chunk_size = 5000

# Extract the model metadata from the reference data, then set the target
# column name explicitly.
metadata = nml.extract_metadata(
    data=reference,
    model_name='wfh_predictor',
)
metadata.target_column_name = 'work_home_actual'

# Stack reference and analysis rows into one frame with a fresh 0..n-1 index.
data = pd.concat(
    [reference, analysis],
    ignore_index=True,
)

In [3]:
# Preview the first five rows of the analysis data. Its displayed columns do
# not include work_home_actual, unlike the reference preview.
analysis.head(n=5)

Unnamed: 0,distance_from_office,salary_range,gas_price_per_litre,public_transportation_cost,wfh_prev_workday,workday,tenure,identifier,timestamp,y_pred_proba,partition,y_pred
0,0.527691,0 - 20K €,1.800003,8.960724,False,Tuesday,4.224628,50000,2017-08-31 04:20:00,0.99,analysis,1
1,8.485134,20K - 20K €,2.222074,8.768792,False,Friday,4.963103,50001,2017-08-31 05:16:16,0.98,analysis,1
2,2.073876,40K - 60K €,2.310077,8.649979,True,Friday,4.588951,50002,2017-08-31 05:56:44,0.98,analysis,1
3,0.118456,20K - 20K €,2.171441,8.855418,False,Tuesday,4.711015,50003,2017-08-31 06:10:17,0.97,analysis,1
4,4.786705,0 - 20K €,2.368541,8.394966,False,Monday,0.906738,50004,2017-08-31 06:29:38,0.92,analysis,1


In [4]:
# Fit the CBPE performance estimator on the reference data, then estimate
# performance over the combined reference + analysis frame.
# NOTE(review): the recorded run of this cell raised
# "TypeError: __init__() got an unexpected keyword argument 'metrics'", so the
# installed nannyml presumably predates the `metrics` argument. Confirm the
# installed version (or drop the argument) before re-running.
estimator = nml.CBPE(
    model_metadata=metadata,
    chunk_size=chunk_size,
    metrics=['roc_auc', 'f1'],
).fit(reference)
estimated_performance = estimator.estimate(data=data)

# Show the estimated ROC AUC results.
figure = estimated_performance.plot(kind='performance', metric='roc_auc')
figure.show()

# Save the figure to an SVG file (not shown in the guide).
figure.write_image(file="perf-est-guide-syth-example.svg")

TypeError: __init__() got an unexpected keyword argument 'metrics'

In [None]:
# Build the univariate statistical drift calculator and fit it on the
# reference data.
univariate_calculator = nml.UnivariateStatisticalDriftCalculator(
    model_metadata=metadata,
    chunk_size=chunk_size,
).fit(reference_data=reference)

# Run the calculation over the combined reference + analysis frame.
univariate_results = univariate_calculator.calculate(data=data)

# One 'feature_drift' plot per feature listed in the metadata.
for feature in metadata.features:
    figure = univariate_results.plot(
        kind='feature_drift',
        metric='statistic',
        feature_label=feature.label,
    )
    figure.show()

In [None]:
# Rank the features with the 'alert_count' ranker.
# only_drifting=False presumably keeps features without alerts in the
# ranking; confirm against the nannyml documentation.
ranker = nml.Ranker.by('alert_count')
ranked_features = ranker.rank(
    univariate_results,
    model_metadata=metadata,
    only_drifting=False,
)
ranked_features

In [None]:
# Plot the 'prediction_drift' view of the univariate results using the
# 'statistic' metric.
figure = univariate_results.plot(
    metric='statistic',
    kind='prediction_drift',
)
figure.show()

In [None]:
# Build the data reconstruction (PCA) drift calculator and fit it on the
# reference data.
rcerror_calculator = nml.DataReconstructionDriftCalculator(
    model_metadata=metadata,
    chunk_size=chunk_size,
).fit(reference_data=reference)

# Compute the reconstruction error statistics for all available data and
# plot them.
rcerror_results = rcerror_calculator.calculate(data=data)
figure = rcerror_results.plot(kind='drift')
figure.show()