# Data Drift Workflow



In [None]:
import nannyml as nml
import pandas as pd

In [None]:
# Load the synthetic sample dataset that ships with NannyML. The call returns
# three objects, unpacked here as the reference set, the analysis set, and
# `analysis_gt` (presumably ground truth for the analysis period - TODO confirm).
reference, analysis, analysis_gt = nml.load_synthetic_sample()

In [None]:
# Derive model metadata from the reference data and register the model
# under the name 'wfh_predictor'.
md = nml.extract_metadata(
    data=reference,
    model_name='wfh_predictor',
)

In [None]:
# Show the metadata summary produced by the extraction step.
# NOTE(review): if `md.print()` writes to stdout itself and returns None, the
# outer print() will additionally emit "None"; in that case call `md.print()`
# on its own instead.
print(md.print())

In [None]:
# Manually assign the metadata column names for this dataset.
md.timestamp_column_name = 'timestamp'
# The prediction-column override below is disabled (kept for reference):
# md.prediction_column_name = 'y_pred_proba'
md.ground_truth_column_name = 'work_home_actual'

In [None]:
# Print the metadata summary again, e.g. to check the column names that were
# assigned manually.
print(md.print())

In [None]:
# Univariate statistical drift calculator, configured with the model metadata
# and a chunk size of 5000 (presumably rows per chunk - TODO confirm).
univariate_calculator = nml.UnivariateStatisticalDriftCalculator(
    model_metadata=md,
    chunk_size=5000,
)

In [None]:
# Fit the univariate calculator on the reference data.
univariate_calculator.fit(reference_data=reference)

In [None]:
# Stack the reference and analysis sets into one object with a fresh
# 0..n-1 index; the trailing bare name renders it as the cell output.
fdata = pd.concat(
    objs=[reference, analysis],
    ignore_index=True,
)
fdata

In [None]:
# Run the univariate calculator over the combined reference + analysis data.
univariate_results = univariate_calculator.calculate(data=fdata)

In [None]:
# Collect the result column names into a plain list and display them as the
# cell output.
univariate_results_columns = list(univariate_results.columns)
univariate_results_columns

In [None]:
# Display the full univariate results as the cell output. To preview only the
# last 5 rows and first 9 columns instead (assuming a pandas DataFrame),
# append: .iloc[-5:, :9]
univariate_results

In [None]:
# Create an AlertCountRanking ranker (presumably orders features by their
# number of drift alerts - TODO confirm).
ranking = nml.AlertCountRanking()

In [None]:
# Rank the features based on the univariate drift results.
ranked_features_drifted = ranking.rank(univariate_results)

In [None]:
# Display the feature ranking as the cell output.
ranked_features_drifted

In [None]:
# Data reconstruction drift calculator, configured with the model metadata
# and a chunk size of 5000 (presumably rows per chunk - TODO confirm).
rcerror_calculator = nml.DataReconstructionDriftCalculator(
    model_metadata=md,
    chunk_size=5000,
)

In [None]:
# Fit the reconstruction drift calculator on the reference data.
rcerror_calculator.fit(reference_data=reference)

In [None]:
# Run the reconstruction drift calculator over the combined reference +
# analysis data and display the result as the cell output.
rcerror_results = rcerror_calculator.calculate(data=fdata)
rcerror_results

In [None]:
# Plotting helper built from the univariate calculator.
# NOTE(review): this same object is later used to plot the reconstruction
# drift results, which come from `rcerror_calculator` - confirm DriftPlots
# does not depend on which calculator it was constructed with.
plots = nml.DriftPlots(univariate_calculator)

In [None]:
# One univariate drift plot (metric='statistic') per model feature: shown
# inline, then exported as "drift-guide-<label>.svg".
for feature in md.features:
    label = feature.label
    fig = plots.plot_univariate_statistical_drift(
        univariate_results,
        metric='statistic',
        feature_label=label,
    )
    fig.show()
    fig.write_image(file=f"drift-guide-{label}.svg")

In [None]:
# Distribution-over-time plot for every continuous feature: shown inline,
# then exported as "drift-guide-joyplot-<label>.svg".
# The combined reference + analysis data is the same for every feature, so it
# is concatenated once up front instead of on each loop iteration.
combined_data = pd.concat([reference, analysis], ignore_index=True)
for itm in md.continuous_features:
    fig = plots.plot_continuous_feature_distribution_over_time(
        data=combined_data,
        drift_results=univariate_results,
        feature_label=itm.label
    )
    fig.show()
    fig.write_image(file=f"drift-guide-joyplot-{itm.label}.svg")

In [None]:
# Distribution-over-time plot for every categorical feature: shown inline,
# then exported as "drift-guide-stacked-<label>.svg".
# The combined reference + analysis data is the same for every feature, so it
# is concatenated once up front instead of on each loop iteration.
combined_data = pd.concat([reference, analysis], ignore_index=True)
for itm in md.categorical_features:
    fig = plots.plot_categorical_feature_distribution_over_time(
        data=combined_data,
        drift_results=univariate_results,
        feature_label=itm.label
    )
    fig.show()
    fig.write_image(file=f"drift-guide-stacked-{itm.label}.svg")

In [None]:
# Plot the data reconstruction drift results, show the figure inline, and
# export it to an SVG file.
fig = plots.plot_data_reconstruction_drift(rcerror_results)
fig.show()
# Plain string literal: the file name has no placeholders, so no f-prefix.
fig.write_image(file="drift-guide-multivariate.svg")