In [6]:
import nannyml as nml
from IPython.display import display

reference_df, analysis_df, analysis_target_df = nml.load_synthetic_car_loan_dataset()

analysis_full_df = analysis_df.merge(analysis_target_df, left_index=True, right_index=True)

column_names = [
    'car_value', 'salary_range', 'debt_to_income_ratio', 'loan_length', 'repaid_loan_on_prev_car', 'size_of_downpayment', 'driver_tenure', 'y_pred_proba', 'y_pred', 'repaid'
]

univ_calc = nml.UnivariateDriftCalculator(
    column_names=column_names,
    timestamp_column_name='timestamp',
    continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
    categorical_methods=['chi2', 'jensen_shannon'],
    chunk_size=5000
)

univ_calc.fit(reference_df)
univariate_results = univ_calc.calculate(analysis_full_df)
display(univariate_results.filter(period='analysis', column_names=['debt_to_income_ratio']).to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio
Unnamed: 0_level_1,chunk,chunk,chunk,chunk,chunk,chunk,chunk,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon
Unnamed: 0_level_2,chunk_index,end_date,end_index,key,period,start_date,start_index,alert,lower_threshold,upper_threshold,value,alert,lower_threshold,upper_threshold,value
0,0,2018-11-30 00:27:16.848,4999,[0:4999],analysis,2018-10-30 18:00:00,0,False,,,0.01576,False,,0.1,0.031661
1,1,2018-12-30 07:03:16.848,9999,[5000:9999],analysis,2018-11-30 00:36:00,5000,False,,,0.01272,False,,0.1,0.030011
2,2,2019-01-29 13:39:16.848,14999,[10000:14999],analysis,2018-12-30 07:12:00,10000,False,,,0.01746,False,,0.1,0.031129
3,3,2019-02-28 20:15:16.848,19999,[15000:19999],analysis,2019-01-29 13:48:00,15000,False,,,0.01282,False,,0.1,0.029464
4,4,2019-03-31 02:51:16.848,24999,[20000:24999],analysis,2019-02-28 20:24:00,20000,False,,,0.01922,False,,0.1,0.030809
5,5,2019-04-30 09:27:16.848,29999,[25000:29999],analysis,2019-03-31 03:00:00,25000,False,,,0.00824,False,,0.1,0.028681
6,6,2019-05-30 16:03:16.848,34999,[30000:34999],analysis,2019-04-30 09:36:00,30000,False,,,0.01068,False,,0.1,0.043628
7,7,2019-06-29 22:39:16.848,39999,[35000:39999],analysis,2019-05-30 16:12:00,35000,False,,,0.01002,False,,0.1,0.029253
8,8,2019-07-30 05:15:16.848,44999,[40000:44999],analysis,2019-06-29 22:48:00,40000,False,,,0.0107,False,,0.1,0.030628
9,9,2019-08-29 11:51:16.848,49999,[45000:49999],analysis,2019-07-30 05:24:00,45000,False,,,0.007,False,,0.1,0.02833


In [4]:
print(univariate_results.filter(period='analysis', column_names=['debt_to_income_ratio']).to_df().to_markdown(tablefmt="grid"))

+----+-------------------------------------+----------------------------------+-----------------------------------+-----------------------------+--------------------------------+------------------------------------+-------------------------------------+-----------------------------------------------------------+---------------------------------------------------------------------+---------------------------------------------------------------------+-----------------------------------------------------------+-------------------------------------------------------+-----------------------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------+
|    |   ('chunk', 'chunk', 'chunk_index') | ('chunk', 'chunk', 'end_date')   |   ('chunk', 'chunk', 'end_index') | ('chunk', 'chunk', 'key')   | ('chunk', 'chunk', 'period')   | ('chunk', 'chunk', 'start_date')   |   ('chunk', 'chunk', 'star

In [7]:
alert_count_ranker = nml.AlertCountRanker()
alert_count_ranked_features = alert_count_ranker.rank(
    univariate_results.filter(methods=['jensen_shannon']),
    only_drifting = False)
display(alert_count_ranked_features)

Unnamed: 0,number_of_alerts,column_name,rank
0,5,y_pred_proba,1
1,5,salary_range,2
2,5,repaid_loan_on_prev_car,3
3,5,loan_length,4
4,5,car_value,5
5,0,y_pred,6
6,0,size_of_downpayment,7
7,0,repaid,8
8,0,driver_tenure,9
9,0,debt_to_income_ratio,10


In [8]:
print(alert_count_ranked_features.to_markdown(tablefmt="grid"))

+----+--------------------+-------------------------+--------+
|    |   number_of_alerts | column_name             |   rank |
|  0 |                  5 | y_pred_proba            |      1 |
+----+--------------------+-------------------------+--------+
|  1 |                  5 | salary_range            |      2 |
+----+--------------------+-------------------------+--------+
|  2 |                  5 | repaid_loan_on_prev_car |      3 |
+----+--------------------+-------------------------+--------+
|  3 |                  5 | loan_length             |      4 |
+----+--------------------+-------------------------+--------+
|  4 |                  5 | car_value               |      5 |
+----+--------------------+-------------------------+--------+
|  5 |                  0 | y_pred                  |      6 |
+----+--------------------+-------------------------+--------+
|  6 |                  0 | size_of_downpayment     |      7 |
+----+--------------------+-------------------------+--

In [10]:
estimated_calc = nml.CBPE(
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='repaid',
    timestamp_column_name='timestamp',
    metrics=['roc_auc', 'recall'],
    chunk_size=5000,
    problem_type='classification_binary',
)
estimated_calc.fit(reference_df)
estimated_perf_results = estimated_calc.estimate(analysis_full_df)
display(estimated_perf_results.filter(period='analysis').to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,roc_auc,roc_auc,roc_auc,roc_auc,roc_auc,roc_auc,recall,recall,recall,recall,recall,recall,recall,recall
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,sampling_error,realized,value,...,lower_threshold,alert,sampling_error,realized,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:4999],0,0,4999,2018-10-30 18:00:00,2018-11-30 00:27:16.848,analysis,0.001811,0.970962,0.968631,...,0.963317,False,0.005137,0.930394,0.928723,0.944133,0.913313,0.941033,0.9171,False
1,[5000:9999],1,5000,9999,2018-11-30 00:36:00,2018-12-30 07:03:16.848,analysis,0.001811,0.970248,0.969044,...,0.963317,False,0.005137,0.923922,0.925261,0.940671,0.909851,0.941033,0.9171,False
2,[10000:14999],2,10000,14999,2018-12-30 07:12:00,2019-01-29 13:39:16.848,analysis,0.001811,0.976282,0.969444,...,0.963317,False,0.005137,0.938246,0.929317,0.944727,0.913907,0.941033,0.9171,False
3,[15000:19999],3,15000,19999,2019-01-29 13:48:00,2019-02-28 20:15:16.848,analysis,0.001811,0.967721,0.969047,...,0.963317,False,0.005137,0.92506,0.929713,0.945123,0.914303,0.941033,0.9171,False
4,[20000:24999],4,20000,24999,2019-02-28 20:24:00,2019-03-31 02:51:16.848,analysis,0.001811,0.969886,0.968873,...,0.963317,False,0.005137,0.927577,0.930604,0.946014,0.915194,0.941033,0.9171,False
5,[25000:29999],5,25000,29999,2019-03-31 03:00:00,2019-04-30 09:27:16.848,analysis,0.001811,0.96005,0.960478,...,0.963317,True,0.005137,0.905086,0.88399,0.8994,0.86858,0.941033,0.9171,True
6,[30000:34999],6,30000,34999,2019-04-30 09:36:00,2019-05-30 16:03:16.848,analysis,0.001811,0.95853,0.961134,...,0.963317,True,0.005137,0.89901,0.883528,0.898938,0.868118,0.941033,0.9171,True
7,[35000:39999],7,35000,39999,2019-05-30 16:12:00,2019-06-29 22:39:16.848,analysis,0.001811,0.959041,0.960536,...,0.963317,True,0.005137,0.901718,0.885501,0.900911,0.870091,0.941033,0.9171,True
8,[40000:44999],8,40000,44999,2019-06-29 22:48:00,2019-07-30 05:15:16.848,analysis,0.001811,0.963094,0.961869,...,0.963317,True,0.005137,0.906124,0.885978,0.901388,0.870568,0.941033,0.9171,True
9,[45000:49999],9,45000,49999,2019-07-30 05:24:00,2019-08-29 11:51:16.848,analysis,0.001811,0.957556,0.960537,...,0.963317,True,0.005137,0.905823,0.889808,0.905218,0.874398,0.941033,0.9171,True


In [11]:
print(estimated_perf_results.filter(period='analysis').to_df().to_markdown(tablefmt="grid"))

+----+--------------------+----------------------------+----------------------------+--------------------------+---------------------------+----------------------------+-----------------------+---------------------------------+---------------------------+------------------------+--------------------------------------------+--------------------------------------------+----------------------------------+----------------------------------+------------------------+--------------------------------+--------------------------+-----------------------+-------------------------------------------+-------------------------------------------+---------------------------------+---------------------------------+-----------------------+
|    | ('chunk', 'key')   |   ('chunk', 'chunk_index') |   ('chunk', 'start_index') |   ('chunk', 'end_index') | ('chunk', 'start_date')   | ('chunk', 'end_date')      | ('chunk', 'period')   |   ('roc_auc', 'sampling_error') |   ('roc_auc', 'realized') |   ('roc_auc', 

In [12]:
realized_calc = nml.PerformanceCalculator(
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='repaid',
    timestamp_column_name='timestamp',
    problem_type='classification_binary',
    metrics=['roc_auc', 'recall',],
    chunk_size=5000)
realized_calc.fit(reference_df)
realized_perf_results = realized_calc.calculate(analysis_full_df)
display(realized_perf_results.filter(period='analysis').to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,chunk,roc_auc,roc_auc,roc_auc,roc_auc,roc_auc,recall,recall,recall,recall,recall
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,targets_missing_rate,sampling_error,value,upper_threshold,lower_threshold,alert,sampling_error,value,upper_threshold,lower_threshold,alert
0,[0:4999],0,0,4999,2018-10-30 18:00:00,2018-11-30 00:27:16.848,analysis,0.0,0.001811,0.970962,0.97866,0.963317,False,0.005137,0.930394,0.941033,0.9171,False
1,[5000:9999],1,5000,9999,2018-11-30 00:36:00,2018-12-30 07:03:16.848,analysis,0.0,0.001811,0.970248,0.97866,0.963317,False,0.005137,0.923922,0.941033,0.9171,False
2,[10000:14999],2,10000,14999,2018-12-30 07:12:00,2019-01-29 13:39:16.848,analysis,0.0,0.001811,0.976282,0.97866,0.963317,False,0.005137,0.938246,0.941033,0.9171,False
3,[15000:19999],3,15000,19999,2019-01-29 13:48:00,2019-02-28 20:15:16.848,analysis,0.0,0.001811,0.967721,0.97866,0.963317,False,0.005137,0.92506,0.941033,0.9171,False
4,[20000:24999],4,20000,24999,2019-02-28 20:24:00,2019-03-31 02:51:16.848,analysis,0.0,0.001811,0.969886,0.97866,0.963317,False,0.005137,0.927577,0.941033,0.9171,False
5,[25000:29999],5,25000,29999,2019-03-31 03:00:00,2019-04-30 09:27:16.848,analysis,0.0,0.001811,0.96005,0.97866,0.963317,True,0.005137,0.905086,0.941033,0.9171,True
6,[30000:34999],6,30000,34999,2019-04-30 09:36:00,2019-05-30 16:03:16.848,analysis,0.0,0.001811,0.95853,0.97866,0.963317,True,0.005137,0.89901,0.941033,0.9171,True
7,[35000:39999],7,35000,39999,2019-05-30 16:12:00,2019-06-29 22:39:16.848,analysis,0.0,0.001811,0.959041,0.97866,0.963317,True,0.005137,0.901718,0.941033,0.9171,True
8,[40000:44999],8,40000,44999,2019-06-29 22:48:00,2019-07-30 05:15:16.848,analysis,0.0,0.001811,0.963094,0.97866,0.963317,True,0.005137,0.906124,0.941033,0.9171,True
9,[45000:49999],9,45000,49999,2019-07-30 05:24:00,2019-08-29 11:51:16.848,analysis,0.0,0.001811,0.957556,0.97866,0.963317,True,0.005137,0.905823,0.941033,0.9171,True


In [13]:
print(realized_perf_results.filter(period='analysis').to_df().to_markdown(tablefmt="grid"))

+----+--------------------+----------------------------+----------------------------+--------------------------+---------------------------+----------------------------+-----------------------+-------------------------------------+---------------------------------+------------------------+----------------------------------+----------------------------------+------------------------+--------------------------------+-----------------------+---------------------------------+---------------------------------+-----------------------+
|    | ('chunk', 'key')   |   ('chunk', 'chunk_index') |   ('chunk', 'start_index') |   ('chunk', 'end_index') | ('chunk', 'start_date')   | ('chunk', 'end_date')      | ('chunk', 'period')   |   ('chunk', 'targets_missing_rate') |   ('roc_auc', 'sampling_error') |   ('roc_auc', 'value') |   ('roc_auc', 'upper_threshold') |   ('roc_auc', 'lower_threshold') | ('roc_auc', 'alert')   |   ('recall', 'sampling_error') |   ('recall', 'value') |   ('recall', 'upper_th

In [14]:
ranker1 = nml.CorrelationRanker()

# ranker fits on one metric and reference period data only
ranker1.fit(
    estimated_perf_results.filter(period='reference', metrics=['roc_auc']))
# ranker ranks on one drift method and one performance metric
correlation_ranked_features1 = ranker1.rank(
    univariate_results.filter(methods=['jensen_shannon']),
    estimated_perf_results.filter(metrics=['roc_auc']),
    only_drifting = False)

display(correlation_ranked_features1)

Unnamed: 0,column_name,pearsonr_correlation,pearsonr_pvalue,has_drifted,rank
0,y_pred_proba,0.998566,1.847628e-11,True,1
1,repaid_loan_on_prev_car,0.998399,2.865955e-11,True,2
2,salary_range,0.996189,9.187675e-10,True,3
3,loan_length,0.995933,1.190732e-09,True,4
4,car_value,0.994598,3.7016e-09,True,5
5,size_of_downpayment,0.307878,0.3868074,False,6
6,debt_to_income_ratio,0.147411,0.6844537,False,7
7,y_pred,-0.249251,0.4873934,False,8
8,repaid,-0.395939,0.2573678,False,9
9,driver_tenure,-0.645446,0.04384665,False,10


In [15]:
print(correlation_ranked_features1.to_markdown(tablefmt="grid"))

+----+-------------------------+------------------------+-------------------+---------------+--------+
|    | column_name             |   pearsonr_correlation |   pearsonr_pvalue | has_drifted   |   rank |
|  0 | y_pred_proba            |               0.998566 |       1.84763e-11 | True          |      1 |
+----+-------------------------+------------------------+-------------------+---------------+--------+
|  1 | repaid_loan_on_prev_car |               0.998399 |       2.86596e-11 | True          |      2 |
+----+-------------------------+------------------------+-------------------+---------------+--------+
|  2 | salary_range            |               0.996189 |       9.18768e-10 | True          |      3 |
+----+-------------------------+------------------------+-------------------+---------------+--------+
|  3 | loan_length             |               0.995933 |       1.19073e-09 | True          |      4 |
+----+-------------------------+------------------------+----------------

In [16]:
ranker2 = nml.CorrelationRanker()

# ranker fits on one metric and reference period data only
ranker2.fit(
    estimated_perf_results.filter(period='reference', metrics=['recall']))
# ranker ranks on one drift method and one performance metric
correlation_ranked_features2 = ranker2.rank(
    univariate_results.filter(period='analysis', methods=['jensen_shannon']),
    realized_perf_results.filter(period='analysis', metrics=['recall']),
    only_drifting = False)

display(correlation_ranked_features2)

Unnamed: 0,column_name,pearsonr_correlation,pearsonr_pvalue,has_drifted,rank
0,repaid_loan_on_prev_car,0.969871,3e-06,True,1
1,y_pred_proba,0.967009,5e-06,True,2
2,loan_length,0.966623,5e-06,True,3
3,car_value,0.965028,6e-06,True,4
4,salary_range,0.964699,7e-06,True,5
5,size_of_downpayment,0.312291,0.379679,False,6
6,debt_to_income_ratio,0.304855,0.391726,False,7
7,y_pred,-0.36374,0.301498,False,8
8,repaid,-0.40692,0.243189,False,9
9,driver_tenure,-0.577185,0.080623,False,10


In [17]:
print(correlation_ranked_features2.to_markdown(tablefmt="grid"))

+----+-------------------------+------------------------+-------------------+---------------+--------+
|    | column_name             |   pearsonr_correlation |   pearsonr_pvalue | has_drifted   |   rank |
|  0 | repaid_loan_on_prev_car |               0.969871 |       3.4762e-06  | True          |      1 |
+----+-------------------------+------------------------+-------------------+---------------+--------+
|  1 | y_pred_proba            |               0.967009 |       4.98026e-06 | True          |      2 |
+----+-------------------------+------------------------+-------------------+---------------+--------+
|  2 | loan_length             |               0.966623 |       5.21534e-06 | True          |      3 |
+----+-------------------------+------------------------+-------------------+---------------+--------+
|  3 | car_value               |               0.965028 |       6.27329e-06 | True          |      4 |
+----+-------------------------+------------------------+----------------