In [None]:
def showrst(df):
    """Print ``df`` as a grid-formatted text table.

    The text is produced by ``df.to_markdown(tablefmt="grid")`` and written to
    standard output; the function itself returns ``None``.
    (Presumably used to paste tables into reStructuredText docs, given the name.)
    """
    grid_table = df.to_markdown(tablefmt="grid")
    print(grid_table)

In [None]:
import pandas as pd
import nannyml as nml
# Load NannyML's synthetic binary classification dataset. The loader's return value is
# unpacked into three objects: reference data, analysis data, and analysis ground truth.
reference, analysis, analysis_gt = nml.datasets.load_synthetic_binary_classification_dataset()
# Preview the first three rows of the reference data (rendered as the cell output).
reference.head(3)

In [None]:
# Preview the first three rows of the analysis data (rendered as the cell output).
analysis.head(3)

In [None]:
# Build the model metadata from the reference data for a binary classification model.
# The 'identifier' column is excluded from the extraction.
metadata = nml.extract_metadata(
    reference,
    model_type=nml.ModelType.CLASSIFICATION_BINARY,
    exclude_columns=['identifier']
)
# The target column name is set by hand
# (presumably it is not picked up by the extraction above -- verify against nannyml docs).
metadata.target_column_name = 'work_home_actual'

In [None]:
# Configure the CBPE performance estimator: chunks of 5000 and two metrics
# (ROC AUC and F1), using the metadata built earlier in the notebook.
cbpe = nml.CBPE(
    model_metadata=metadata,
    chunk_size=5000,
    metrics=['roc_auc', 'f1'],
)
# Fit the estimator on the reference data; kept as the last expression so its
# return value is still rendered as the cell output.
cbpe.fit(reference_data=reference)

In [None]:
# Estimate performance on the analysis data only.
est_perf_analysis = cbpe.estimate(analysis)
# A bare expression is rendered only when it is the last statement of a cell, so the
# preview needs an explicit display() call (the display function Jupyter provides in
# every cell) to show up next to the printed grid table.
display(est_perf_analysis.data.head(3))
showrst(est_perf_analysis.data.head(3))

In [None]:
# Show one estimated-performance figure per metric configured on the estimator,
# based on the analysis-only estimates.
for metric in cbpe.metrics:
    fig = est_perf_analysis.plot(kind='performance', metric=metric)
    fig.show()

In [None]:
# Estimate performance over reference followed by analysis data. ignore_index=True
# gives the concatenated frame a fresh 0..n-1 index.
est_perf_with_ref = cbpe.estimate(
    pd.concat([reference, analysis], ignore_index=True)
)

In [None]:
# Preview the first three rows of the estimates computed on reference + analysis data.
est_perf_with_ref.data.head(3)

In [None]:
# Print the same three rows as a grid-formatted text table via the showrst helper.
showrst(est_perf_with_ref.data.head(3))

In [None]:
# Show one estimated-performance figure per configured metric, this time using the
# estimates that cover both reference and analysis data.
for metric in cbpe.metrics:
    fig = est_perf_with_ref.plot(kind='performance', metric=metric)
    fig.show()

In [None]:
# Image export engine handed to write_image; also read by the next export cell.
# NOTE(review): plotly documents orca as a legacy engine -- consider kaleido; confirm
# against the plotly version used to build the docs.
engine = 'orca'
# Write one SVG per metric for the analysis-only estimates into ../_static.
for metric in cbpe.metrics:
    fig = est_perf_analysis.plot(kind='performance', metric=metric)
    fig.write_image(
        file=f"../_static/tutorial-perf-est-guide-analysis-{metric}.svg",
        engine=engine,
    )

In [None]:
# Write one SVG per metric for the reference + analysis estimates into ../_static.
# Relies on `engine` having been set by the preceding export cell.
for metric in cbpe.metrics:
    fig = est_perf_with_ref.plot(kind='performance', metric=metric)
    fig.write_image(
        file=f"../_static/tutorial-perf-est-guide-with-ref-{metric}.svg",
        engine=engine,
    )

In [None]:
# Preview the first three rows of the analysis ground-truth data.
analysis_gt.head(3)

In [None]:
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
# Attach the ground-truth targets to the analysis rows via the shared 'identifier' key.
analysis_full = pd.merge(analysis, analysis_gt, on='identifier')
# The chunk indices used below are positions in reference + analysis, so the merge must
# not add or drop analysis rows; fail loudly instead of silently misaligning chunks.
assert len(analysis_full) == len(analysis), "merge with ground truth changed the number of analysis rows"

# Stack reference and analysis with a fresh 0..n-1 index, mirroring the frame that was
# passed to cbpe.estimate() to produce est_perf_with_ref.
df_all = pd.concat([reference, analysis_full]).reset_index(drop=True)
target_col = 'work_home_actual'
pred_score_col = 'y_pred_proba'

# Realized ROC AUC per chunk, in the row order of est_perf_with_ref.data.
# (The original looped over `est_perf`, which is never defined in this notebook;
# est_perf_with_ref is the result whose chunks span the same rows as df_all.)
actual_performance = []
chunk_bounds = zip(est_perf_with_ref.data['start_index'], est_perf_with_ref.data['end_index'])
for start_index, end_index in chunk_bounds:
    # .loc slicing includes both endpoints.
    # NOTE(review): assumes end_index is the inclusive last row of the chunk -- confirm.
    sub = df_all.loc[start_index:end_index]
    actual_performance.append(roc_auc_score(sub[target_col], sub[pred_score_col]))

# Build a new frame rather than writing into the estimator result, so re-running this
# cell is idempotent and earlier previews of est_perf_with_ref.data stay accurate.
perf_comparison = est_perf_with_ref.data.assign(actual_roc_auc=actual_performance)

# Plot estimated vs realized ROC AUC per chunk.
roc_fig, roc_ax = plt.subplots()
perf_comparison[['estimated_roc_auc', 'actual_roc_auc']].plot(ax=roc_ax)
roc_ax.set_xlabel('chunk')
roc_ax.set_ylabel('ROC AUC')
plt.show()