In [1]:
import nannyml as nml
from IPython.display import display

# Load NannyML's synthetic binary classification dataset; the call unpacks
# into the reference frame, the analysis frame and analysis_target.
(
    reference,
    analysis,
    analysis_target,
) = nml.load_synthetic_binary_classification_dataset()

# Preview the first rows of the reference and analysis frames
display(reference.head(), analysis.head())

ModuleNotFoundError: No module named 'apscheduler'

In [None]:
# Print the first reference rows as a grid-formatted markdown table
reference_markdown = reference.head().to_markdown(tablefmt="grid")
print(reference_markdown)

In [None]:
# Print the first analysis rows as a grid-formatted markdown table
analysis_markdown = analysis.head().to_markdown(tablefmt="grid")
print(analysis_markdown)

In [None]:
# Chunk size handed to the estimator and drift calculators further down
chunk_size = 5_000

In [None]:
# Configure CBPE with the prediction, probability and target columns plus the
# metric to estimate, fit it on the reference data, then estimate performance
# on the analysis data.
cbpe_options = dict(
    y_pred_proba='y_pred_proba',
    y_pred='y_pred',
    y_true='work_home_actual',
    metrics=['roc_auc'],
    chunk_size=chunk_size,
    problem_type='classification_binary',
)
estimator = nml.CBPE(**cbpe_options).fit(reference)
estimated_performance = estimator.estimate(analysis)

# Plot the estimated ROC AUC with the reference data included
figure = estimated_performance.plot(
    kind='performance',
    metric='roc_auc',
    plot_reference=True,
)
figure.show()

In [None]:
# Save the most recently created figure as an SVG file
performance_svg_path = '../_static/quick-start-perf-est.svg'
figure.write_image(performance_svg_path)

In [None]:
# Treat every column of the reference data as a feature, except the ones
# listed here
non_feature_columns = {
    'timestamp', 'y_pred_proba', 'period', 'y_pred', 'work_home_actual', 'identifier'
}
feature_column_names = [
    column for column in reference.columns if column not in non_feature_columns
]

# Build the univariate drift calculator, fit it on the reference data and
# run it on the analysis data
univariate_calculator = nml.UnivariateStatisticalDriftCalculator(
    feature_column_names=feature_column_names,
    chunk_size=chunk_size,
).fit(reference)
univariate_results = univariate_calculator.calculate(analysis)

# Show one drift plot per model input; the plot options are identical for
# every feature, so they are defined once
feature_drift_plot_options = dict(
    kind='feature_drift',
    metric='statistic',
    plot_reference=True,
)
for feature in univariate_calculator.feature_column_names:
    figure = univariate_results.plot(
        feature_column_name=feature,
        **feature_drift_plot_options,
    )
    figure.show()

In [None]:
# Re-create the drift plot of every feature and save each one as an SVG file
for feature in univariate_calculator.feature_column_names:
    drift_svg_path = f'../_static/quick-start-drift-{feature}.svg'
    figure = univariate_results.plot(
        feature_column_name=feature,
        kind='feature_drift',
        metric='statistic',
        plot_reference=True,
    )
    figure.write_image(drift_svg_path)

In [None]:
# Rank the features by alert count using the univariate drift results.
# NOTE(review): only_drifting=False presumably keeps features without alerts
# in the ranking -- confirm against the Ranker documentation.
ranker = nml.Ranker.by('alert_count')
ranked_features = ranker.rank(
    univariate_results,
    only_drifting=False,
)
display(ranked_features)

In [None]:
# Print the feature ranking as a grid-formatted markdown table
ranked_markdown = ranked_features.to_markdown(tablefmt="grid")
print(ranked_markdown)

In [None]:
# Drift calculator for the model outputs (the y_pred and y_pred_proba columns).
# chunk_size is passed explicitly so this step uses the same chunking as the
# CBPE estimator and the univariate / reconstruction drift calculators in this
# notebook; without it the calculator falls back to its own default chunking
# and the resulting plot is not comparable chunk-for-chunk with the others.
calc = nml.StatisticalOutputDriftCalculator(
    y_pred='y_pred',
    y_pred_proba='y_pred_proba',
    chunk_size=chunk_size,
    problem_type='classification_binary'
)
# Fit on the reference data, then compute output drift on the analysis data
calc.fit(reference)
results = calc.calculate(analysis)

# Plot the prediction drift with the reference data included
figure = results.plot(kind='prediction_drift', plot_reference=True)
figure.show()

In [None]:
# Save the most recently created figure as an SVG file
score_drift_svg_path = '../_static/quick-start-score-drift.svg'
figure.write_image(score_drift_svg_path)

In [None]:
# Set up the data-reconstruction drift calculator over the feature columns
rcerror_calculator = nml.DataReconstructionDriftCalculator(
    feature_column_names=feature_column_names,
    chunk_size=chunk_size,
)
# Fit on the reference data, then compute the results for the analysis data
rcerror_calculator = rcerror_calculator.fit(reference_data=reference)
rcerror_results = rcerror_calculator.calculate(analysis)

# Plot the drift results with the reference data included
figure = rcerror_results.plot(
    kind='drift',
    plot_reference=True,
)
figure.show()

In [None]:
# Save the most recently created figure as an SVG file
multivariate_drift_svg_path = '../_static/quick-start-drift-multivariate.svg'
figure.write_image(multivariate_drift_svg_path)
