In [None]:
import nannyml as nml
from IPython.display import display

# Load the synthetic car loan dataset. The loader hands back three objects:
# the reference data, the analysis data, and the analysis targets.
reference_df, analysis_df, analysis_targets_df = nml.load_synthetic_car_loan_dataset()
# display(reference_df.head())  # uncomment to preview the reference data

# Columns passed to the drift calculators further down: presumably the model
# features, followed by the two prediction columns (y_pred_proba, y_pred).
column_names = [
    'car_value',
    'salary_range',
    'debt_to_income_ratio',
    'loan_length',
    'repaid_loan_on_prev_car',
    'size_of_downpayment',
    'driver_tenure',
    'y_pred_proba',
    'y_pred',
]

# Multivariate drift through data reconstruction, using a chunk size of 5000.
rce = nml.DataReconstructionDriftCalculator(
    chunk_size=5000,
    timestamp_column_name='timestamp',
    column_names=column_names,
)
# Fit on the reference data, then compute results for the analysis data.
rce.fit(reference_df)
rcerr = rce.calculate(analysis_df)

# Estimated ROC AUC via CBPE. The estimator is fitted on the reference data and
# then applied to analysis_df, which has not been merged with the targets.
estimator = nml.CBPE(
    problem_type='classification_binary',
    metrics=['roc_auc'],
    y_true='repaid',
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    timestamp_column_name='timestamp',
    chunk_size=5000,
    # NOTE(review): presumably only relevant when a confusion-matrix metric is
    # requested; kept unchanged here -- confirm whether it is still needed.
    normalize_confusion_matrix='pred',
)
estimator.fit(reference_df)
estimated = estimator.estimate(analysis_df)

# Join the analysis targets onto the analysis data by row index so that the
# realized ROC AUC can be calculated on the combined frame.
analysis_with_targets_df = analysis_df.merge(
    analysis_targets_df,
    left_index=True,
    right_index=True,
)

# Realized performance, configured with the same columns, metric and chunk size
# as the CBPE estimator.
realize = nml.PerformanceCalculator(
    problem_type='classification_binary',
    metrics=['roc_auc'],
    y_true='repaid',
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    timestamp_column_name='timestamp',
    chunk_size=5000,
)
realize.fit(reference_df)
realized = realize.calculate(analysis_with_targets_df)


# Univariate drift (Jensen-Shannon distance for both continuous and categorical
# columns). y_pred is explicitly treated as categorical.
drift = nml.UnivariateDriftCalculator(
    column_names=column_names,
    treat_as_categorical=['y_pred'],
    timestamp_column_name='timestamp',
    continuous_methods=['jensen_shannon'],
    categorical_methods=['jensen_shannon'],
    # Same explicit chunk size as the other calculators in this notebook, so the
    # chunks line up when these results are compared against theirs instead of
    # depending on the library's default chunking.
    chunk_size=5000,
)
drift.fit(reference_df)
drifted = drift.calculate(analysis_df)



error uploading: HTTPSConnectionPool(host='api.segment.io', port=443): Max retries exceeded with url: /v1/batch (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fed45f358e0>: Failed to establish a new connection: [Errno 111] Connection refused'))


In [None]:
# Estimated vs. realized performance, plotted in both comparison orders.
for first, second in ((estimated, realized), (realized, estimated)):
    first.compare(second).plot().show()

In [None]:
# Reconstruction drift against estimated and realized performance; every pair
# is plotted in both comparison orders.
for first, second in (
    (estimated, rcerr),
    (rcerr, estimated),
    (rcerr, realized),
    (realized, rcerr),
):
    first.compare(second).plot().show()

In [None]:
# Univariate drift narrowed to 'salary_range', compared with reconstruction
# drift in both orders.
salary_range_drift = drifted.filter(column_names=['salary_range'])
salary_range_drift.compare(rcerr).plot().show()
rcerr.compare(salary_range_drift).plot().show()

In [None]:
# Univariate drift narrowed to 'salary_range', compared with estimated and then
# realized performance; every pair is plotted in both comparison orders.
salary_range_drift = drifted.filter(column_names=['salary_range'])
for first, second in (
    (salary_range_drift, estimated),
    (estimated, salary_range_drift),
    (salary_range_drift, realized),
    (realized, salary_range_drift),
):
    first.compare(second).plot().show()