In [149]:
import nannyml as nml

In [150]:


import pandas as pd

url = "https://raw.githubusercontent.com/NannyML/examples/main/webinars/inforshare_workshop/medicalcov-2015_2016_2017_2018-IL-reference.csv"
reference_df = pd.read_csv(url)

url_analysis = "https://raw.githubusercontent.com/NannyML/examples/main/webinars/inforshare_workshop/medicalcov-2015_2016_2017_2018-IL-analysis.csv"
analysis_df = pd.read_csv(url_analysis)


url_targets = "https://raw.githubusercontent.com/NannyML/examples/main/webinars/inforshare_workshop/medicalcov-2015_2016_2017_2018-IL-analysis_targets.csv"
analysis_with_targets_df = pd.read_csv(url_targets)


# Display the first few rows of the DataFrame
reference_df.head()


Unnamed: 0,AGEP,SCHL,MAR,SEX,DIS,ESP,CIT,MIG,MIL,ANC,...,DEYE,DREM,PINCP,ESR,ST,FER,RAC1P,y_true,y_pred,y_pred_proba
0,62.0,13.0,5.0,2.0,2.0,0.0,1.0,1.0,4.0,1.0,...,2.0,2.0,11700.0,3.0,17.0,0.0,2.0,1,1,0.84
1,36.0,19.0,5.0,2.0,2.0,0.0,1.0,1.0,4.0,1.0,...,2.0,2.0,5270.0,3.0,17.0,2.0,2.0,1,1,0.9
2,55.0,16.0,3.0,2.0,2.0,0.0,1.0,1.0,4.0,1.0,...,2.0,2.0,0.0,6.0,17.0,0.0,1.0,1,0,0.34
3,32.0,16.0,5.0,1.0,2.0,0.0,5.0,1.0,4.0,1.0,...,2.0,2.0,30000.0,1.0,17.0,0.0,8.0,0,0,0.02
4,34.0,11.0,5.0,2.0,2.0,0.0,5.0,1.0,4.0,1.0,...,2.0,2.0,0.0,6.0,17.0,2.0,8.0,0,0,0.14


Let's start with estimating performance

In [151]:

estimator = nml.CBPE(
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='y_true',
    metrics=['roc_auc', 'accuracy', 'f1'],
    chunk_size=5000,
    problem_type='classification_binary',
)

estimator.fit(reference_df)

results_performance_estimation = estimator.estimate(analysis_df)

In [152]:
display(results_performance_estimation.filter(period='analysis').to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,roc_auc,roc_auc,roc_auc,...,accuracy,accuracy,f1,f1,f1,f1,f1,f1,f1,f1
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,value,sampling_error,realized,...,lower_threshold,alert,value,sampling_error,realized,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:4999],0,0,4999,,,analysis,0.788807,0.005953,,...,0.748636,False,0.551502,0.015854,,0.599063,0.50394,0.60112,0.550039,False
1,[5000:9999],1,5000,9999,,,analysis,0.790094,0.005953,,...,0.748636,False,0.565774,0.015854,,0.613336,0.518213,0.60112,0.550039,False
2,[10000:14999],2,10000,14999,,,analysis,0.790476,0.005953,,...,0.748636,False,0.555427,0.015854,,0.602989,0.507866,0.60112,0.550039,False
3,[15000:19999],3,15000,19999,,,analysis,0.790159,0.005953,,...,0.748636,False,0.560128,0.015854,,0.60769,0.512566,0.60112,0.550039,False
4,[20000:24999],4,20000,24999,,,analysis,0.787923,0.005953,,...,0.748636,False,0.549558,0.015854,,0.59712,0.501996,0.60112,0.550039,True
5,[25000:29999],5,25000,29999,,,analysis,0.788422,0.005953,,...,0.748636,False,0.557196,0.015854,,0.604758,0.509634,0.60112,0.550039,False
6,[30000:34999],6,30000,34999,,,analysis,0.790067,0.005953,,...,0.748636,False,0.559262,0.015854,,0.606824,0.5117,0.60112,0.550039,False
7,[35000:39999],7,35000,39999,,,analysis,0.78621,0.005953,,...,0.748636,False,0.550504,0.015854,,0.598066,0.502942,0.60112,0.550039,False
8,[40000:44999],8,40000,44999,,,analysis,0.820722,0.005953,,...,0.748636,False,0.665634,0.015854,,0.713195,0.618072,0.60112,0.550039,True
9,[45000:49999],9,45000,49999,,,analysis,0.813102,0.005953,,...,0.748636,False,0.648305,0.015854,,0.695867,0.600744,0.60112,0.550039,True


In [153]:
metric_fig = results_performance_estimation.plot()
metric_fig.show()

Now we're moving to Root Cause analysis.

In [154]:
#multivariate drift detection first
non_feature_columns = ['y_pred_proba', 'y_pred', 'y_true']

# Define feature columns
feature_column_names = [
    col for col in reference_df.columns
    if col not in non_feature_columns
]



In [155]:
calc = nml.DataReconstructionDriftCalculator(
    column_names=feature_column_names,
    chunk_size=5000
)
calc.fit(reference_df)


<nannyml.drift.multivariate.data_reconstruction.calculator.DataReconstructionDriftCalculator at 0x1b2838670a0>

In [156]:
results_multivariate = calc.calculate(analysis_df)

display(results_multivariate.filter(period='analysis').to_df())

display(results_multivariate.filter(period='reference').to_df())



Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,sampling_error,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:4999],0,0,4999,,,analysis,0.016139,2.035165,2.083581,1.986748,2.137876,2.018938,False
1,[5000:9999],1,5000,9999,,,analysis,0.016139,2.058933,2.107349,2.010516,2.137876,2.018938,False
2,[10000:14999],2,10000,14999,,,analysis,0.016139,2.052833,2.10125,2.004417,2.137876,2.018938,False
3,[15000:19999],3,15000,19999,,,analysis,0.016139,2.061934,2.11035,2.013518,2.137876,2.018938,False
4,[20000:24999],4,20000,24999,,,analysis,0.016139,2.064632,2.113048,2.016216,2.137876,2.018938,False
5,[25000:29999],5,25000,29999,,,analysis,0.016139,2.055686,2.104102,2.00727,2.137876,2.018938,False
6,[30000:34999],6,30000,34999,,,analysis,0.016139,2.080748,2.129165,2.032332,2.137876,2.018938,False
7,[35000:39999],7,35000,39999,,,analysis,0.016139,2.073218,2.121634,2.024802,2.137876,2.018938,False
8,[40000:44999],8,40000,44999,,,analysis,0.016139,2.243393,2.29181,2.194977,2.137876,2.018938,True
9,[45000:49999],9,45000,49999,,,analysis,0.016139,2.227454,2.27587,2.179037,2.137876,2.018938,True


Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error,reconstruction_error
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,sampling_error,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:4999],0,0,4999,,,reference,0.016139,2.103466,2.151882,2.05505,2.137876,2.018938,False
1,[5000:9999],1,5000,9999,,,reference,0.016139,2.099804,2.148221,2.051388,2.137876,2.018938,False
2,[10000:14999],2,10000,14999,,,reference,0.016139,2.089489,2.137905,2.041073,2.137876,2.018938,False
3,[15000:19999],3,15000,19999,,,reference,0.016139,2.105074,2.153491,2.056658,2.137876,2.018938,False
4,[20000:24999],4,20000,24999,,,reference,0.016139,2.05026,2.098676,2.001844,2.137876,2.018938,False
5,[25000:29999],5,25000,29999,,,reference,0.016139,2.063211,2.111627,2.014794,2.137876,2.018938,False
6,[30000:34999],6,30000,34999,,,reference,0.016139,2.068484,2.116901,2.020068,2.137876,2.018938,False
7,[35000:39999],7,35000,39999,,,reference,0.016139,2.060058,2.108474,2.011642,2.137876,2.018938,False
8,[40000:45404],8,40000,45404,,,reference,0.015522,2.065816,2.112383,2.019249,2.137876,2.018938,False


In [157]:
figure = results_multivariate.plot()
figure.show()

Now let's do univariate drift detection

In [158]:
calc = nml.UnivariateDriftCalculator(
    column_names=feature_column_names ,
    treat_as_categorical=['y_pred'],
    continuous_methods=['jensen_shannon'],
    categorical_methods=['jensen_shannon'],
)

calc.fit(reference_df)
results_univariate = calc.calculate(analysis_df)
display(results_univariate.filter(period='analysis').to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,AGEP,AGEP,AGEP,...,ST,ST,FER,FER,FER,FER,RAC1P,RAC1P,RAC1P,RAC1P
Unnamed: 0_level_1,chunk,chunk,chunk,chunk,chunk,chunk,chunk,jensen_shannon,jensen_shannon,jensen_shannon,...,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon
Unnamed: 0_level_2,key,chunk_index,start_index,end_index,start_date,end_date,period,value,upper_threshold,lower_threshold,...,lower_threshold,alert,value,upper_threshold,lower_threshold,alert,value,upper_threshold,lower_threshold,alert
0,[0:8719],0,0,8719,,,analysis,0.048033,0.1,,...,,False,0.012084,0.1,,False,0.023274,0.1,,False
1,[8720:17439],1,8720,17439,,,analysis,0.052767,0.1,,...,,False,0.008243,0.1,,False,0.028987,0.1,,False
2,[17440:26159],2,17440,26159,,,analysis,0.048369,0.1,,...,,False,0.003052,0.1,,False,0.041896,0.1,,False
3,[26160:34879],3,26160,34879,,,analysis,0.052805,0.1,,...,,False,0.016354,0.1,,False,0.036355,0.1,,False
4,[34880:43599],4,34880,43599,,,analysis,0.106383,0.1,,...,,False,0.033512,0.1,,False,0.054767,0.1,,False
5,[43600:52319],5,43600,52319,,,analysis,0.148193,0.1,,...,,False,0.050316,0.1,,False,0.094847,0.1,,False
6,[52320:61039],6,52320,61039,,,analysis,0.057004,0.1,,...,,False,0.006668,0.1,,False,0.037457,0.1,,False
7,[61040:69759],7,61040,69759,,,analysis,0.048294,0.1,,...,,False,0.012159,0.1,,False,0.042904,0.1,,False
8,[69760:78479],8,69760,78479,,,analysis,0.049087,0.1,,...,,False,0.014407,0.1,,False,0.033735,0.1,,False
9,[78480:87204],9,78480,87204,,,analysis,0.049993,0.1,,...,,False,0.015855,0.1,,False,0.03328,0.1,,False


In [159]:
figure = results_univariate.filter( methods=['jensen_shannon']).plot(kind='drift')
figure.show()

Now let's try to use multivariate drift again to see if columns that didn't drift from the univariate perspective still exhibit drift from multivariate perspective.

In [160]:
alert_count_ranker = nml.AlertCountRanker()
alert_count_ranked_features = alert_count_ranker.rank(
    results_univariate.filter(methods=['jensen_shannon']),
    only_drifting = True)
display(alert_count_ranked_features)

Unnamed: 0,number_of_alerts,column_name,rank
0,2,AGEP,1
1,1,PINCP,2
2,1,MIG,3
3,1,MAR,4
4,1,ESR,5
5,1,DREM,6


In [161]:
drifting_features = alert_count_ranked_features.column_name.to_list()

In [162]:
stable_features = [
    col for col in feature_column_names
    if col not in drifting_features
]

In [163]:
multivariate_drift_on_stable_features_calc = nml.DataReconstructionDriftCalculator(
    column_names=stable_features,
    chunk_size=5000
)
multivariate_drift_on_stable_features_calc.fit(reference_df)

<nannyml.drift.multivariate.data_reconstruction.calculator.DataReconstructionDriftCalculator at 0x1b28aad3790>

In [164]:
results_multivariate_on_stable_features = multivariate_drift_on_stable_features_calc.calculate(analysis_df)

In [165]:
figure = results_multivariate.plot()
figure.show()

Let's examine how the mean of all columns is changing over time

In [166]:
calc = nml.SummaryStatsAvgCalculator(
    column_names=feature_column_names,
)

calc.fit(reference_df)
results = calc.calculate(analysis_df)
display(results.filter(period='all').to_df())

for column_name in results.column_names:
    results.filter(column_names=column_name).plot().show()

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,AGEP,AGEP,AGEP,...,FER,FER,FER,RAC1P,RAC1P,RAC1P,RAC1P,RAC1P,RAC1P,RAC1P
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,value,sampling_error,upper_confidence_boundary,...,upper_threshold,lower_threshold,alert,value,sampling_error,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:4539],0,0,4539,,,reference,37.061013,0.233596,37.761802,...,0.866459,0.724725,False,1.998458,0.032119,2.094815,1.902101,2.115051,1.841381,False
1,[4540:9079],1,4540,9079,,,reference,36.820044,0.233596,37.520832,...,0.866459,0.724725,False,2.073789,0.032119,2.170146,1.977431,2.115051,1.841381,False
2,[9080:13619],2,9080,13619,,,reference,36.884581,0.233596,37.58537,...,0.866459,0.724725,False,1.937885,0.032119,2.034243,1.841528,2.115051,1.841381,False
3,[13620:18159],3,13620,18159,,,reference,36.993612,0.233596,37.694401,...,0.866459,0.724725,False,1.969824,0.032119,2.066181,1.873467,2.115051,1.841381,False
4,[18160:22699],4,18160,22699,,,reference,36.088106,0.233596,36.788894,...,0.866459,0.724725,False,1.993392,0.032119,2.089749,1.897035,2.115051,1.841381,False
5,[22700:27239],5,22700,27239,,,reference,36.801322,0.233596,37.50211,...,0.866459,0.724725,False,1.992731,0.032119,2.089089,1.896374,2.115051,1.841381,False
6,[27240:31779],6,27240,31779,,,reference,36.46696,0.233596,37.167749,...,0.866459,0.724725,False,1.99207,0.032119,2.088428,1.895713,2.115051,1.841381,False
7,[31780:36319],7,31780,36319,,,reference,36.453304,0.233596,37.154092,...,0.866459,0.724725,False,1.918943,0.032119,2.0153,1.822585,2.115051,1.841381,False
8,[36320:40859],8,36320,40859,,,reference,36.694714,0.233596,37.395502,...,0.866459,0.724725,False,1.907269,0.032119,2.003626,1.810911,2.115051,1.841381,False
9,[40860:45404],9,40860,45404,,,reference,36.373817,0.233468,37.07422,...,0.866459,0.724725,False,1.9978,0.032101,2.094104,1.901496,2.115051,1.841381,False


In [167]:
calc_std = nml.SummaryStatsStdCalculator(column_names=feature_column_names)

calc_std.fit(reference_df)
results = calc_std.calculate(analysis_df)
display(results.filter(period='all').to_df())


Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.



CalculatorException: failed while fitting <nannyml.stats.std.calculator.SummaryStatsStdCalculator object at 0x000001B285318DF0>.
float division by zero

Finally, let's calculate realized performance to see whether performance estimation worked correctly.

In [None]:
calculator = nml.PerformanceCalculator(
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='y_true',
    metrics=['roc_auc','f1','accuracy'],
    chunk_size=5000,
    problem_type='classification_binary',
).fit(reference_df)
realized_results = calculator.calculate(analysis_with_targets_df)
display(realized_results.filter(period='analysis').to_df())

In [None]:
# Show comparison plot
results_performance_estimation.filter(period='analysis', metrics=['roc_auc']).compare(realized_results.filter(period='analysis', metrics=['roc_auc'])).plot().show()

In [None]:
results_performance_estimation.filter(period='analysis', metrics=['f1']).compare(realized_results.filter(period='analysis', metrics=['f1'])).plot().show()

In [None]:
results_performance_estimation.filter(period='analysis', metrics=['accuracy']).compare(realized_results.filter(period='analysis', metrics=['accuracy'])).plot().show()