In [None]:
import nannyml as nml
from IPython.display import display

# Load the synthetic binary classification data with a single loader call.
# Previously the loader was invoked three times and indexed at positions
# 0, 1 and 2; unpacking one call yields the same three frames without
# loading the dataset twice more.
reference_df, analysis_df, analysis_target_df = (
    nml.load_synthetic_binary_classification_dataset()
)
# Attach the analysis targets to the analysis rows via the shared
# 'identifier' column so later cells can read 'work_home_actual'.
analysis_df = analysis_df.merge(analysis_target_df, on='identifier')

# Columns to check for univariate drift; 'y_pred_proba' and 'y_pred' are the
# prediction columns that the performance cells further down also use.
column_names = [
    'distance_from_office',
    'salary_range',
    'gas_price_per_litre',
    'public_transportation_cost',
    'wfh_prev_workday',
    'workday',
    'tenure',
    'y_pred_proba',
    'y_pred',
]
# Two drift methods are configured per column type, evaluated in chunks of
# 5000 rows.
univ_calc = nml.UnivariateDriftCalculator(
    column_names=column_names,
    timestamp_column_name='timestamp',
    continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
    categorical_methods=['chi2', 'jensen_shannon'],
    chunk_size=5000,
)

# Fit on the reference data, then compute drift on the analysis data.
univ_calc.fit(reference_df)
univariate_results = univ_calc.calculate(analysis_df)

# Show only the analysis-period rows for a single column to keep the table small.
display(
    univariate_results
    .filter(period='analysis', column_names=['distance_from_office'])
    .to_df()
)

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,distance_from_office,distance_from_office,distance_from_office,distance_from_office,distance_from_office,distance_from_office,distance_from_office,distance_from_office
Unnamed: 0_level_1,chunk,chunk,chunk,chunk,chunk,chunk,chunk,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon
Unnamed: 0_level_2,chunk_index,end_date,end_index,key,period,start_date,start_index,alert,lower_threshold,upper_threshold,value,alert,lower_threshold,upper_threshold,value
0,0,2018-01-02 00:45:44,4999,[0:4999],analysis,2017-08-31 04:20:00,0,False,,,0.012,False,,0.1,0.026101
1,1,2018-05-01 13:10:10,9999,[5000:9999],analysis,2018-01-02 01:13:11,5000,False,,,0.0102,False,,0.1,0.020297
2,2,2018-09-01 15:40:40,14999,[10000:14999],analysis,2018-05-01 14:25:25,10000,False,,,0.0166,False,,0.1,0.021096
3,3,2018-12-31 10:11:21,19999,[15000:19999],analysis,2018-09-01 16:19:07,15000,False,,,0.0138,False,,0.1,0.03621
4,4,2019-04-30 11:01:30,24999,[20000:24999],analysis,2018-12-31 10:38:45,20000,False,,,0.011,False,,0.1,0.028708
5,5,2019-09-01 00:24:27,29999,[25000:29999],analysis,2019-04-30 11:02:00,25000,True,,,0.4348,True,,0.1,0.464732
6,6,2019-12-31 09:09:12,34999,[30000:34999],analysis,2019-09-01 00:28:54,30000,True,,,0.4298,True,,0.1,0.460044
7,7,2020-04-30 11:46:53,39999,[35000:39999],analysis,2019-12-31 10:07:15,35000,True,,,0.4374,True,,0.1,0.466746
8,8,2020-09-01 02:46:02,44999,[40000:44999],analysis,2020-04-30 12:04:32,40000,True,,,0.4352,True,,0.1,0.4663
9,9,2021-01-01 04:29:32,49999,[45000:49999],analysis,2020-09-01 02:46:13,45000,True,,,0.4382,True,,0.1,0.467798


In [None]:
# Print the analysis-period drift results for 'distance_from_office' as a
# grid-formatted markdown table (plain text instead of the rich display).
print(
    univariate_results
    .filter(period='analysis', column_names=['distance_from_office'])
    .to_df()
    .to_markdown(tablefmt="grid")
)

+----+-------------------------------------+----------------------------------+-----------------------------------+-----------------------------+--------------------------------+------------------------------------+-------------------------------------+-----------------------------------------------------------+---------------------------------------------------------------------+---------------------------------------------------------------------+-----------------------------------------------------------+-------------------------------------------------------+-----------------------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------+
|    |   ('chunk', 'chunk', 'chunk_index') | ('chunk', 'chunk', 'end_date')   |   ('chunk', 'chunk', 'end_index') | ('chunk', 'chunk', 'key')   | ('chunk', 'chunk', 'period')   | ('chunk', 'chunk', 'start_date')   |   ('chunk', 'chunk', 'star

In [None]:
# Rank the monitored columns by the number of drift alerts they raised,
# using only the Jensen-Shannon results. With only_drifting=False the
# displayed ranking also lists columns that have zero alerts.
alert_count_ranker = nml.AlertCountRanker()
alert_count_ranked_features = alert_count_ranker.rank(
    univariate_results.filter(methods=['jensen_shannon']),
    only_drifting=False,
)
display(alert_count_ranked_features)

Unnamed: 0,number_of_alerts,column_name,rank
0,5,y_pred_proba,1
1,5,wfh_prev_workday,2
2,5,salary_range,3
3,5,public_transportation_cost,4
4,5,distance_from_office,5
5,0,y_pred,6
6,0,workday,7
7,0,tenure,8
8,0,gas_price_per_litre,9


In [None]:
# Print the alert-count ranking as a grid-formatted markdown table.
print(
    alert_count_ranked_features.to_markdown(tablefmt="grid")
)

+----+--------------------+----------------------------+--------+
|    |   number_of_alerts | column_name                |   rank |
|  0 |                  5 | y_pred_proba               |      1 |
+----+--------------------+----------------------------+--------+
|  1 |                  5 | wfh_prev_workday           |      2 |
+----+--------------------+----------------------------+--------+
|  2 |                  5 | salary_range               |      3 |
+----+--------------------+----------------------------+--------+
|  3 |                  5 | public_transportation_cost |      4 |
+----+--------------------+----------------------------+--------+
|  4 |                  5 | distance_from_office       |      5 |
+----+--------------------+----------------------------+--------+
|  5 |                  0 | y_pred                     |      6 |
+----+--------------------+----------------------------+--------+
|  6 |                  0 | workday                    |      7 |
+----+----

In [None]:
# Configure the CBPE performance estimator for the binary classifier:
# roc_auc and recall, evaluated in chunks of 5000 rows.
estimated_calc = nml.CBPE(
    problem_type='classification_binary',
    metrics=['roc_auc', 'recall'],
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='work_home_actual',
    timestamp_column_name='timestamp',
    chunk_size=5000,
)

# Fit on the reference data, then estimate on the analysis data.
estimated_calc.fit(reference_df)
estimated_perf_results = estimated_calc.estimate(analysis_df)

# Show the analysis-period rows only.
display(
    estimated_perf_results
    .filter(period='analysis')
    .to_df()
)

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,roc_auc,roc_auc,roc_auc,roc_auc,roc_auc,roc_auc,recall,recall,recall,recall,recall,recall,recall,recall
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,value,sampling_error,realized,...,lower_threshold,alert,value,sampling_error,realized,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:4999],0,0,4999,2017-08-31 04:20:00,2018-01-02 00:45:44,analysis,0.968631,0.001811,0.970962,...,0.963317,False,0.954644,0.004223,0.957077,0.967311,0.941976,0.965726,0.940831,False
1,[5000:9999],1,5000,9999,2018-01-02 01:13:11,2018-05-01 13:10:10,analysis,0.969044,0.001811,0.970248,...,0.963317,False,0.950074,0.004223,0.949959,0.962742,0.937407,0.965726,0.940831,False
2,[10000:14999],2,10000,14999,2018-05-01 14:25:25,2018-09-01 15:40:40,analysis,0.969444,0.001811,0.976282,...,0.963317,False,0.953431,0.004223,0.959654,0.966098,0.940763,0.965726,0.940831,False
3,[15000:19999],3,15000,19999,2018-09-01 16:19:07,2018-12-31 10:11:21,analysis,0.969047,0.001811,0.967721,...,0.963317,False,0.950695,0.004223,0.945205,0.963363,0.938028,0.965726,0.940831,False
4,[20000:24999],4,20000,24999,2018-12-31 10:38:45,2019-04-30 11:01:30,analysis,0.968873,0.001811,0.969886,...,0.963317,False,0.952322,0.004223,0.948269,0.96499,0.939655,0.965726,0.940831,False
5,[25000:29999],5,25000,29999,2019-04-30 11:02:00,2019-09-01 00:24:27,analysis,0.960478,0.001811,0.96005,...,0.963317,True,0.931746,0.004223,0.945134,0.944414,0.919078,0.965726,0.940831,True
6,[30000:34999],6,30000,34999,2019-09-01 00:28:54,2019-12-31 09:09:12,analysis,0.961134,0.001811,0.95853,...,0.963317,True,0.933032,0.004223,0.94297,0.945699,0.920364,0.965726,0.940831,True
7,[35000:39999],7,35000,39999,2019-12-31 10:07:15,2020-04-30 11:46:53,analysis,0.960536,0.001811,0.959041,...,0.963317,True,0.932623,0.004223,0.940471,0.94529,0.919955,0.965726,0.940831,True
8,[40000:44999],8,40000,44999,2020-04-30 12:04:32,2020-09-01 02:46:02,analysis,0.961869,0.001811,0.963094,...,0.963317,True,0.931093,0.004223,0.9444,0.94376,0.918425,0.965726,0.940831,True
9,[45000:49999],9,45000,49999,2020-09-01 02:46:13,2021-01-01 04:29:32,analysis,0.960537,0.001811,0.957556,...,0.963317,True,0.935494,0.004223,0.943337,0.948162,0.922827,0.965726,0.940831,True


In [None]:
# Print the analysis-period estimated performance as a grid-formatted
# markdown table.
print(
    estimated_perf_results
    .filter(period='analysis')
    .to_df()
    .to_markdown(tablefmt="grid")
)

+----+--------------------+----------------------------+----------------------------+--------------------------+---------------------------+-------------------------+-----------------------+------------------------+---------------------------------+---------------------------+--------------------------------------------+--------------------------------------------+----------------------------------+----------------------------------+------------------------+-----------------------+--------------------------------+--------------------------+-------------------------------------------+-------------------------------------------+---------------------------------+---------------------------------+-----------------------+
|    | ('chunk', 'key')   |   ('chunk', 'chunk_index') |   ('chunk', 'start_index') |   ('chunk', 'end_index') | ('chunk', 'start_date')   | ('chunk', 'end_date')   | ('chunk', 'period')   |   ('roc_auc', 'value') |   ('roc_auc', 'sampling_error') |   ('roc_auc', 'realized

In [None]:
# Configure the realized-performance calculator with the same columns,
# metrics and chunk size as the estimator cell, so results line up per chunk.
realized_calc = nml.PerformanceCalculator(
    problem_type='classification_binary',
    metrics=['roc_auc', 'recall'],
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='work_home_actual',
    timestamp_column_name='timestamp',
    chunk_size=5000,
)

# Fit on the reference data, then calculate on the analysis data
# (which already carries the merged 'work_home_actual' targets).
realized_calc.fit(reference_df)
realized_perf_results = realized_calc.calculate(analysis_df)

# Show the analysis-period rows only.
display(
    realized_perf_results
    .filter(period='analysis')
    .to_df()
)

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,chunk,roc_auc,roc_auc,roc_auc,roc_auc,roc_auc,recall,recall,recall,recall,recall
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,targets_missing_rate,sampling_error,value,upper_threshold,lower_threshold,alert,sampling_error,value,upper_threshold,lower_threshold,alert
0,[0:4999],0,0,4999,2017-08-31 04:20:00,2018-01-02 00:45:44,analysis,0.0,0.001811,0.970962,0.97866,0.963317,False,0.004223,0.957077,0.965726,0.940831,False
1,[5000:9999],1,5000,9999,2018-01-02 01:13:11,2018-05-01 13:10:10,analysis,0.0,0.001811,0.970248,0.97866,0.963317,False,0.004223,0.949959,0.965726,0.940831,False
2,[10000:14999],2,10000,14999,2018-05-01 14:25:25,2018-09-01 15:40:40,analysis,0.0,0.001811,0.976282,0.97866,0.963317,False,0.004223,0.959654,0.965726,0.940831,False
3,[15000:19999],3,15000,19999,2018-09-01 16:19:07,2018-12-31 10:11:21,analysis,0.0,0.001811,0.967721,0.97866,0.963317,False,0.004223,0.945205,0.965726,0.940831,False
4,[20000:24999],4,20000,24999,2018-12-31 10:38:45,2019-04-30 11:01:30,analysis,0.0,0.001811,0.969886,0.97866,0.963317,False,0.004223,0.948269,0.965726,0.940831,False
5,[25000:29999],5,25000,29999,2019-04-30 11:02:00,2019-09-01 00:24:27,analysis,0.0,0.001811,0.96005,0.97866,0.963317,True,0.004223,0.945134,0.965726,0.940831,False
6,[30000:34999],6,30000,34999,2019-09-01 00:28:54,2019-12-31 09:09:12,analysis,0.0,0.001811,0.95853,0.97866,0.963317,True,0.004223,0.94297,0.965726,0.940831,False
7,[35000:39999],7,35000,39999,2019-12-31 10:07:15,2020-04-30 11:46:53,analysis,0.0,0.001811,0.959041,0.97866,0.963317,True,0.004223,0.940471,0.965726,0.940831,True
8,[40000:44999],8,40000,44999,2020-04-30 12:04:32,2020-09-01 02:46:02,analysis,0.0,0.001811,0.963094,0.97866,0.963317,True,0.004223,0.9444,0.965726,0.940831,False
9,[45000:49999],9,45000,49999,2020-09-01 02:46:13,2021-01-01 04:29:32,analysis,0.0,0.001811,0.957556,0.97866,0.963317,True,0.004223,0.943337,0.965726,0.940831,False


In [None]:
# Print the analysis-period realized performance as a grid-formatted
# markdown table.
print(
    realized_perf_results
    .filter(period='analysis')
    .to_df()
    .to_markdown(tablefmt="grid")
)

+----+--------------------+----------------------------+----------------------------+--------------------------+---------------------------+-------------------------+-----------------------+-------------------------------------+---------------------------------+------------------------+----------------------------------+----------------------------------+------------------------+--------------------------------+-----------------------+---------------------------------+---------------------------------+-----------------------+
|    | ('chunk', 'key')   |   ('chunk', 'chunk_index') |   ('chunk', 'start_index') |   ('chunk', 'end_index') | ('chunk', 'start_date')   | ('chunk', 'end_date')   | ('chunk', 'period')   |   ('chunk', 'targets_missing_rate') |   ('roc_auc', 'sampling_error') |   ('roc_auc', 'value') |   ('roc_auc', 'upper_threshold') |   ('roc_auc', 'lower_threshold') | ('roc_auc', 'alert')   |   ('recall', 'sampling_error') |   ('recall', 'value') |   ('recall', 'upper_threshol

In [None]:
# Correlation ranking of columns against *estimated* roc_auc.
ranker1 = nml.CorrelationRanker()

# The ranker is fitted on a single metric, restricted to the reference period.
ranker1.fit(
    estimated_perf_results.filter(period='reference', metrics=['roc_auc'])
)

# Ranking pairs one drift method (Jensen-Shannon) with one performance
# metric (roc_auc). No period filter is applied to either result here.
correlation_ranked_features1 = ranker1.rank(
    univariate_results.filter(methods=['jensen_shannon']),
    estimated_perf_results.filter(metrics=['roc_auc']),
    only_drifting=False,
)
display(correlation_ranked_features1)

Unnamed: 0,column_name,pearsonr_correlation,pearsonr_pvalue,has_drifted,rank
0,wfh_prev_workday,0.99829,1.1777060000000001e-23,True,1
1,y_pred_proba,0.998072,3.4745760000000003e-23,True,2
2,salary_range,0.996512,7.162919e-21,True,3
3,public_transportation_cost,0.996439,8.622848e-21,True,4
4,distance_from_office,0.996147,1.7532699999999998e-20,True,5
5,y_pred,0.847827,2.359415e-06,False,6
6,workday,0.307497,0.18722,False,7
7,gas_price_per_litre,0.276486,0.2379826,False,8
8,tenure,-0.134447,0.5719876,False,9


In [None]:
# Print the estimated-roc_auc correlation ranking as a grid-formatted
# markdown table.
print(
    correlation_ranked_features1.to_markdown(tablefmt="grid")
)

+----+----------------------------+------------------------+-------------------+---------------+--------+
|    | column_name                |   pearsonr_correlation |   pearsonr_pvalue | has_drifted   |   rank |
|  0 | wfh_prev_workday           |               0.99829  |       1.17771e-23 | True          |      1 |
+----+----------------------------+------------------------+-------------------+---------------+--------+
|  1 | y_pred_proba               |               0.998072 |       3.47458e-23 | True          |      2 |
+----+----------------------------+------------------------+-------------------+---------------+--------+
|  2 | salary_range               |               0.996512 |       7.16292e-21 | True          |      3 |
+----+----------------------------+------------------------+-------------------+---------------+--------+
|  3 | public_transportation_cost |               0.996439 |       8.62285e-21 | True          |      4 |
+----+----------------------------+-----------

In [None]:
# Correlation ranking of columns against *realized* recall.
ranker2 = nml.CorrelationRanker()

# The ranker is fitted on a single metric, restricted to the reference period.
ranker2.fit(
    realized_perf_results.filter(period='reference', metrics=['recall'])
)

# Ranking pairs one drift method (Jensen-Shannon) with one performance
# metric (recall); both inputs are limited to the analysis period.
correlation_ranked_features2 = ranker2.rank(
    univariate_results.filter(period='analysis', methods=['jensen_shannon']),
    realized_perf_results.filter(period='analysis', metrics=['recall']),
    only_drifting=False,
)
display(correlation_ranked_features2)

Unnamed: 0,column_name,pearsonr_correlation,pearsonr_pvalue,has_drifted,rank
0,public_transportation_cost,0.826876,0.003171,True,1
1,distance_from_office,0.821152,0.003585,True,2
2,y_pred_proba,0.819765,0.003691,True,3
3,wfh_prev_workday,0.817879,0.003839,True,4
4,salary_range,0.804469,0.005013,True,5
5,y_pred,0.566484,0.087759,False,6
6,gas_price_per_litre,0.109673,0.762956,False,7
7,workday,-0.049311,0.892393,False,8
8,tenure,-0.565407,0.088499,False,9


In [None]:
# Print the realized-recall correlation ranking as a grid-formatted
# markdown table.
print(
    correlation_ranked_features2.to_markdown(tablefmt="grid")
)

+----+----------------------------+------------------------+-------------------+---------------+--------+
|    | column_name                |   pearsonr_correlation |   pearsonr_pvalue | has_drifted   |   rank |
|  0 | public_transportation_cost |              0.826876  |        0.00317112 | True          |      1 |
+----+----------------------------+------------------------+-------------------+---------------+--------+
|  1 | distance_from_office       |              0.821152  |        0.00358531 | True          |      2 |
+----+----------------------------+------------------------+-------------------+---------------+--------+
|  2 | y_pred_proba               |              0.819765  |        0.00369127 | True          |      3 |
+----+----------------------------+------------------------+-------------------+---------------+--------+
|  3 | wfh_prev_workday           |              0.817879  |        0.00383887 | True          |      4 |
+----+----------------------------+-----------