# Data drift

In [None]:
import nannyml as nml
import pandas as pd

# Load the synthetic sample bundled with NannyML; the loader's three return
# values are unpacked as reference, analysis and analysis_gt.
reference, analysis, analysis_gt = nml.load_synthetic_sample()

# Derive the model metadata from the reference data, then record which column
# holds the target.
metadata = nml.extract_metadata(data=reference, model_name='wfh_predictor')
metadata.target_column_name = 'work_home_actual'

# Preview the first rows of the reference data.
reference.head()

In [None]:
# Set up the object that performs the Univariate Drift calculations; drift
# statistics are computed per chunk of 5000 data points.
univariate_calculator = nml.UnivariateStatisticalDriftCalculator(
    model_metadata=metadata,
    chunk_size=5000,
)
# NannyML compares drift versus the full reference dataset.
univariate_calculator.fit(reference_data=reference)

# Compute drift statistics for all available data. ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating the row labels of
# each input, matching how the plotting cells build their input.
data = pd.concat([reference, analysis], ignore_index=True)
univariate_results = univariate_calculator.calculate(data=data)

# View a small subset of the results: first five rows, first nine columns.
univariate_results.iloc[:5, :9]

In [None]:
# Last five rows of the drift results, limited to the first nine columns.
univariate_results.tail(5).iloc[:, :9]

In [None]:
# Build the plotting helper, reusing the model metadata and chunker held by
# the univariate calculator.
plots = nml.DriftPlots(
    model_metadata=univariate_calculator.model_metadata,
    chunker=univariate_calculator.chunker,
)

# Show one drift plot (metric: 'statistic') per model input.
for model_input in metadata.features:
    drift_figure = plots.plot_univariate_statistical_drift(
        univariate_results,
        metric='statistic',
        feature_label=model_input.label,
    )
    drift_figure.show()

In [None]:
# Plot distribution drift results for continuous model inputs.
# The combined frame is the same for every feature, so it is built once here
# instead of being re-concatenated on each iteration (assumes the plotting
# call does not modify the frame it is given).
continuous_plot_data = pd.concat([reference, analysis], ignore_index=True)
for feature in metadata.continuous_features:
    figure = plots.plot_continuous_feature_distribution_over_time(
        data=continuous_plot_data,
        drift_results=univariate_results,
        feature_label=feature.label,
    )
    figure.show()

In [None]:
# Plot distribution drift results for categorical model inputs.
# The combined frame is the same for every feature, so it is built once here
# instead of being re-concatenated on each iteration (assumes the plotting
# call does not modify the frame it is given).
categorical_plot_data = pd.concat([reference, analysis], ignore_index=True)
for feature in metadata.categorical_features:
    figure = plots.plot_categorical_feature_distribution_over_time(
        data=categorical_plot_data,
        drift_results=univariate_results,
        feature_label=feature.label,
    )
    figure.show()

In [None]:
# Rank the features using the 'alert_count' ranker. only_drifting=False is
# passed explicitly -- presumably this keeps non-drifting features in the
# ranking; confirm against the Ranker documentation.
ranker = nml.Ranker.by('alert_count')
ranked_features = ranker.rank(
    univariate_results,
    model_metadata=metadata,
    only_drifting=False,
)
ranked_features

In [None]:
# Plot the univariate statistical drift of the predictions (metric:
# 'statistic') from the univariate results.
prediction_drift_figure = plots.plot_univariate_statistical_prediction_drift(
    univariate_results,
    metric='statistic',
)
prediction_drift_figure.show()

In [None]:
# Set up the object that performs Data Reconstruction with PCA; drift
# statistics are computed per chunk of 5000 data points.
rcerror_calculator = nml.DataReconstructionDriftCalculator(
    model_metadata=metadata,
    chunk_size=5000,
)

# NannyML compares drift versus the full reference dataset.
rcerror_calculator.fit(reference_data=reference)

# Compute the RC error statistics for all available data and display them.
rcerror_results = rcerror_calculator.calculate(data=data)
rcerror_results

In [None]:
# Plot the data reconstruction drift computed from the RC error results.
reconstruction_figure = plots.plot_data_reconstruction_drift(rcerror_results)
reconstruction_figure.show()
