In [None]:
import nannyml as nml
from IPython.display import display

# Fetch the synthetic car loan dataset shipped with NannyML. The loader
# hands back three objects: the reference set, the analysis set and the
# targets belonging to the analysis set.
(
    reference,
    analysis,
    analysis_target,
) = nml.load_synthetic_car_loan_dataset()

# Preview the first rows of the reference and analysis sets.
display(reference.head(), analysis.head())

Unnamed: 0,car_value,salary_range,debt_to_income_ratio,loan_length,repaid_loan_on_prev_car,size_of_downpayment,driver_tenure,repaid,timestamp,y_pred_proba,y_pred
0,39811.0,40K - 60K €,0.63295,19.0,False,40%,0.212653,True,2018-01-01 00:00:00.000,0.99,True
1,12679.0,40K - 60K €,0.718627,7.0,True,10%,4.927549,False,2018-01-01 00:08:43.152,0.07,False
2,19847.0,40K - 60K €,0.721724,17.0,False,0%,0.520817,True,2018-01-01 00:17:26.304,1.0,True
3,22652.0,20K - 20K €,0.705992,16.0,False,10%,0.453649,True,2018-01-01 00:26:09.456,0.98,True
4,21268.0,60K+ €,0.671888,21.0,True,30%,5.695263,True,2018-01-01 00:34:52.608,0.99,True


Unnamed: 0,car_value,salary_range,debt_to_income_ratio,loan_length,repaid_loan_on_prev_car,size_of_downpayment,driver_tenure,timestamp,y_pred_proba,y_pred
0,12638.0,0 - 20K €,0.487926,21.0,False,10%,4.224628,2018-10-30 18:00:00.000,0.99,True
1,52425.0,20K - 20K €,0.672183,20.0,False,40%,4.963103,2018-10-30 18:08:43.152,0.98,True
2,20369.0,40K - 60K €,0.70309,19.0,True,40%,4.588951,2018-10-30 18:17:26.304,0.98,True
3,10592.0,20K - 20K €,0.653258,21.0,False,10%,4.711015,2018-10-30 18:26:09.456,0.97,True
4,33933.0,0 - 20K €,0.722263,18.0,False,0%,0.906738,2018-10-30 18:34:52.608,0.92,True


In [None]:
# Dump the first reference rows as a plain-text grid table.
print(
    reference.head().to_markdown(tablefmt="grid")
)

+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+
|    |   car_value | salary_range   |   debt_to_income_ratio |   loan_length | repaid_loan_on_prev_car   | size_of_downpayment   |   driver_tenure | repaid   | timestamp               |   y_pred_proba | y_pred   |
|  0 |       39811 | 40K - 60K €    |               0.63295  |            19 | False                     | 40%                   |        0.212653 | True     | 2018-01-01 00:00:00.000 |           0.99 | True     |
+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+----------+-------------------------+----------------+----------+
|  1 |       12679 | 40K - 60K €    |               0.718627 |             7 | True                      | 10%                   |        4.9275

In [None]:
# Dump the first analysis rows as a plain-text grid table.
print(
    analysis.head().to_markdown(tablefmt="grid")
)

+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+-------------------------+----------------+----------+
|    |   car_value | salary_range   |   debt_to_income_ratio |   loan_length | repaid_loan_on_prev_car   | size_of_downpayment   |   driver_tenure | timestamp               |   y_pred_proba | y_pred   |
|  0 |       12638 | 0 - 20K €      |               0.487926 |            21 | False                     | 10%                   |        4.22463  | 2018-10-30 18:00:00.000 |           0.99 | True     |
+----+-------------+----------------+------------------------+---------------+---------------------------+-----------------------+-----------------+-------------------------+----------------+----------+
|  1 |       52425 | 20K - 20K €    |               0.672183 |            20 | False                     | 40%                   |        4.9631   | 2018-10-30 18:08:43.152 |           0.9

In [None]:
# Chunking configuration shared by every estimator/calculator below:
# either pick a chunker or, as done here, fix the chunk size.
chunk_size = 5_000

In [None]:
# Build the CBPE estimator for a binary classification problem: point it at
# the prediction, predicted-probability and target columns, select the metric
# to estimate and the chunk size, then fit it on the reference data.
estimator = nml.CBPE(
    problem_type='classification_binary',
    y_true='repaid',
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    metrics=['roc_auc'],
    chunk_size=chunk_size,
).fit(reference)

# Run the fitted estimator over the analysis data.
estimated_performance = estimator.estimate(analysis)

# Visualise the estimation results.
figure = estimated_performance.plot()
figure.show()

In [None]:
# Save the current `figure` (whatever the most recently executed plotting cell
# bound to that name) as an SVG.
figure.write_image(
    '../_static/quick-start-perf-est.svg'
)

In [None]:
# Columns to monitor: everything in the reference set except the timestamp
# and the 'repaid' column.
feature_column_names = [
    name
    for name in reference.columns
    if name not in ('timestamp', 'repaid')
]

# Cast y_pred to a categorical dtype in both sets (mutates the frames in place).
# NOTE(review): the original note states y_pred would otherwise be designated
# as continuous because of its int dtype -- confirm against the actual dtype.
reference['y_pred'] = reference['y_pred'].astype("category")
analysis['y_pred'] = analysis['y_pred'].astype("category")

# Configure the univariate drift calculator (two methods for continuous
# columns, two for categorical ones) and fit it on the reference data.
univariate_calculator = nml.UnivariateDriftCalculator(
    column_names=feature_column_names,
    continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
    categorical_methods=['chi2', 'jensen_shannon'],
    chunk_size=chunk_size,
).fit(reference)

# Compute drift on the analysis data.
univariate_results = univariate_calculator.calculate(analysis)

# Continuous columns: show the Jensen-Shannon drift for the analysis period.
figure = univariate_results.filter(
    period='analysis',
    methods=['jensen_shannon'],
    column_names=univariate_results.continuous_column_names,
).plot(kind='drift')
figure.show()

# Categorical columns: show the chi2 drift for the analysis period.
figure = univariate_results.filter(
    period='analysis',
    methods=['chi2'],
    column_names=univariate_results.categorical_column_names,
).plot(kind='drift')
figure.show()

In [None]:
# Re-create both univariate drift figures and save each one as an SVG.
# The output paths are plain string literals: there is nothing to interpolate,
# so no f-string prefix is used.

# Continuous columns, Jensen-Shannon results, analysis period only.
figure = univariate_results.filter(
    column_names=univariate_results.continuous_column_names,
    period='analysis',
    methods=['jensen_shannon']).plot(kind='drift')
figure.write_image('../_static/quick-start-drift-continuous.svg')

# Categorical columns, chi2 results, analysis period only.
figure = univariate_results.filter(
    column_names=univariate_results.categorical_column_names,
    period='analysis',
    methods=['chi2']).plot(kind='drift')
figure.write_image('../_static/quick-start-drift-categorical.svg')

In [None]:
# Rank columns with a correlation ranker.
ranker = nml.CorrelationRanker()

# Fitting uses a single metric and only the reference-period results.
ranker.fit(
    estimated_performance.filter(
        metrics=['roc_auc'],
        period='reference',
    )
)

# Ranking pairs one drift method with one performance metric;
# only_drifting=False is passed explicitly.
ranked_features = ranker.rank(
    univariate_results.filter(methods=['jensen_shannon']),
    estimated_performance.filter(metrics=['roc_auc']),
    only_drifting=False,
)
display(ranked_features)

Unnamed: 0,column_name,pearsonr_correlation,pearsonr_pvalue,has_drifted,rank
0,repaid_loan_on_prev_car,0.99829,1.1777060000000001e-23,True,1
1,y_pred_proba,0.998072,3.4745760000000003e-23,True,2
2,loan_length,0.996876,2.661458e-21,True,3
3,salary_range,0.996512,7.162919e-21,True,4
4,car_value,0.996148,1.74676e-20,True,5
5,size_of_downpayment,0.307497,0.18722,False,6
6,debt_to_income_ratio,0.250211,0.2873424,False,7
7,y_pred,0.075282,0.7524257,False,8
8,driver_tenure,-0.134447,0.5719876,False,9


In [None]:
# Dump the ranking as a plain-text grid table.
print(
    ranked_features.to_markdown(tablefmt="grid")
)

+----+-------------------------+------------------------+-------------------+---------------+--------+
|    | column_name             |   pearsonr_correlation |   pearsonr_pvalue | has_drifted   |   rank |
|  0 | repaid_loan_on_prev_car |              0.99829   |       1.17771e-23 | True          |      1 |
+----+-------------------------+------------------------+-------------------+---------------+--------+
|  1 | y_pred_proba            |              0.998072  |       3.47458e-23 | True          |      2 |
+----+-------------------------+------------------------+-------------------+---------------+--------+
|  2 | loan_length             |              0.996876  |       2.66146e-21 | True          |      3 |
+----+-------------------------+------------------------+-------------------+---------------+--------+
|  3 | salary_range            |              0.996512  |       7.16292e-21 | True          |      4 |
+----+-------------------------+------------------------+----------------

In [None]:
# Configure the data-reconstruction drift calculator over the monitored
# columns, using the shared chunk size.
rcerror_calculator = nml.DataReconstructionDriftCalculator(
    chunk_size=chunk_size,
    column_names=feature_column_names,
)
# Fit on the reference data; the fitted calculator is what fit() returns.
rcerror_calculator = rcerror_calculator.fit(reference_data=reference)

# Compute the reconstruction-error results on the analysis data and plot them.
rcerror_results = rcerror_calculator.calculate(analysis)
figure = rcerror_results.plot()
figure.show()

In [None]:
# Save the current `figure` (whatever the most recently executed plotting cell
# bound to that name) as an SVG.
figure.write_image(
    '../_static/quick-start-drift-multivariate.svg'
)
