In [1]:
import nannyml as nml
import pandas as pd
from IPython.display import display

In [None]:
from docs.utils import print_some_of_the_columns_only_markdown, print_table

In [None]:
# Load the US census (MA employment) example data. The loader returns three
# objects; the third one is discarded in this cell.
df_reference, df_analysis, _ = nml.load_us_census_ma_employment_data()

# Preview the first rows of both frames.
display(df_reference.head(), df_analysis.head())

In [None]:
# Emit a markdown rendering of part of `df_reference` for the docs.
# NOTE(review): the meaning of the positional arguments 2 and 5 is defined in
# docs.utils — confirm there before changing them.
print_some_of_the_columns_only_markdown(df_reference, 2, 5)

In [None]:
# Emit a markdown rendering of part of `df_analysis` for the docs.
# NOTE(review): the meaning of the positional arguments 2 and 5 is defined in
# docs.utils — confirm there before changing them.
print_some_of_the_columns_only_markdown(df_analysis, 2, 5)

In [None]:
# Shared chunk size passed to every NannyML estimator/calculator below, so
# that all results are computed over the same chunking.
chunk_size = 5_000

In [None]:
# Configure the CBPE performance estimator for a binary classifier.
# Column names refer to columns of the reference/analysis frames.
estimator = nml.CBPE(
    problem_type='classification_binary',
    # Metric(s) to estimate.
    metrics=['roc_auc'],
    # Target, predicted label and predicted score columns.
    y_true='employed',
    y_pred='prediction',
    y_pred_proba='predicted_probability',
    # Same chunking as the rest of the notebook.
    chunk_size=chunk_size,
)

In [None]:
# Fit the estimator on the reference data; `estimator` is rebound to whatever
# `fit` returns.
estimator = estimator.fit(df_reference)
# Estimate performance on the analysis data.
estimated_performance = estimator.estimate(df_analysis)

In [None]:
# Plot the estimated performance results and render the figure inline.
# `figure` is reused by later cells, so the export cell that follows must run
# before `figure` is reassigned.
figure = estimated_performance.plot()
figure.show()

In [None]:
# Export the current `figure` (the estimated-performance plot when cells are
# run in order) as an SVG into the docs' _static folder.
figure.write_image('../_static/quick-start-perf-est.svg', width=1000)

In [None]:
# Columns to monitor for univariate drift.
features = [
    'AGEP', 'SCHL', 'MAR', 'RELP',
    'DIS', 'ESP', 'CIT', 'MIG',
    'MIL', 'ANC', 'NATIVITY', 'DEAR',
    'DEYE', 'DREM', 'SEX', 'RAC1P',
]

# Univariate drift calculator over those columns, using the shared chunk size.
univariate_calculator = nml.UnivariateDriftCalculator(
    column_names=features,
    chunk_size=chunk_size,
)

# Fit on the reference data, then compute drift on the analysis data.
univariate_calculator.fit(df_reference)
univariate_drift = univariate_calculator.calculate(df_analysis)

In [None]:
# Rank the univariate drift results with NannyML's AlertCountRanker and show
# the first rows of the ranking.
# NOTE(review): presumably features are ordered by number of drift alerts —
# confirm against the NannyML ranking documentation.
alert_count_ranker = nml.AlertCountRanker()
alert_count_ranked_features = alert_count_ranker.rank(univariate_drift)
display(alert_count_ranked_features.head())

In [None]:
# Print the first rows of the ranking with the docs helper `print_table`
# (imported from docs.utils).
print_table(alert_count_ranked_features.head())

In [None]:
# Plot the univariate drift results restricted to three selected features.
figure = (
    univariate_drift
    .filter(column_names=['RELP', 'AGEP', 'SCHL'])
    .plot()
)
figure.show()

In [None]:
# Export the current `figure` (the drift plot when cells are run in order) as
# an SVG into the docs' _static folder. The path has no placeholders, so it is
# a plain string literal rather than an f-string.
figure.write_image('../_static/quick-start-drift.svg', width=1000)

In [None]:
# Compare estimated performance against the univariate drift of one feature
# over the analysis period. The filter selects the 'RELP' column, so the
# variable is named after that column.
uni_drift_RELP_analysis = univariate_drift.filter(column_names=['RELP'], period='analysis')
figure = estimated_performance.compare(uni_drift_RELP_analysis).plot()
figure.show()

In [None]:
# Export the current `figure` (the drift-vs-performance comparison when cells
# are run in order) as an SVG into the docs' _static folder. The path has no
# placeholders, so it is a plain string literal rather than an f-string.
figure.write_image('../_static/quick-start-drift-n-performance.svg', width=1000)

In [None]:
# Distribution plot of the same three features, limited to the analysis period.
figure = univariate_drift.filter(
    column_names=['RELP', 'AGEP', 'SCHL'],
    period='analysis',
).plot(kind='distribution')
figure.show()

In [None]:
# Export the current `figure` (the distribution plot when cells are run in
# order) as an SVG into the docs' _static folder. The path has no
# placeholders, so it is a plain string literal rather than an f-string.
figure.write_image('../_static/quick-start-univariate-distribution.svg', width=1000)

In [None]:
# Call the dataset loader again, keeping only the third returned object as
# the analysis targets; the first two are discarded.
_, _, analysis_targets = nml.load_us_census_ma_employment_data()

In [None]:
# Attach the targets to the analysis data by concatenating the two frames
# column-wise, then preview the result.
df_analysis_with_targets = pd.concat(
    [df_analysis, analysis_targets],
    axis='columns',
)
display(df_analysis_with_targets.head())

In [None]:
# Emit a markdown rendering of part of the first rows of the joined frame for
# the docs.
# NOTE(review): the meaning of the positional arguments 2 and 5 is defined in
# docs.utils — confirm there before changing them.
print_some_of_the_columns_only_markdown(df_analysis_with_targets.head(), 2, 5)

In [None]:
# Performance calculator configured with the same problem type, columns,
# metric and chunk size as the estimator earlier in the notebook.
performance_calculator = nml.PerformanceCalculator(
    problem_type='classification_binary',
    metrics=['roc_auc'],
    y_true='employed',
    y_pred='prediction',
    y_pred_proba='predicted_probability',
    chunk_size=chunk_size,
)

# Fit on the reference data, then calculate on the analysis data with targets.
performance_calculator.fit(df_reference)
calculated_performance = performance_calculator.calculate(df_analysis_with_targets)

# Compare the analysis-period estimate with the calculated performance.
figure = (
    estimated_performance
    .filter(period='analysis')
    .compare(calculated_performance)
    .plot()
)
figure.show()

In [None]:
# Export the current `figure` (the estimated-vs-calculated comparison when
# cells are run in order) as an SVG into the docs' _static folder. The path
# has no placeholders, so it is a plain string literal rather than an f-string.
figure.write_image('../_static/quick-start-estimated-and-realized.svg', width=1000)