# Butterfly Dataset example

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.spatial.transform import Rotation
from sklearn.datasets import make_classification

import nannyml as nml

# Creating Butterfly Dataset

By butterfly dataset we mean a dataset that does not drift at the univariate level but changes in its overall structure.

In [None]:
# Dataset layout: 20 weekly periods in total (10 reference followed by 10 analysis).
# DPP = data points per period = days/week * hours/day * events/hour
# (12 events per hour matches the 5-minute timestamp frequency used below).
DPP = 7 * 24 * 12
n_rows = 20 * DPP

np.random.seed(13)

# f1 and f2 are noisy copies of the same latent signal s1; f3 is independent noise.
s1 = np.random.randn(n_rows)
x1 = s1 + np.random.randn(n_rows) / 8
x2 = s1 + np.random.randn(n_rows) / 8
x3 = np.random.randn(n_rows) / 8
xdat = np.vstack((x1, x2, x3)).T

# 90 degree rotation around the z axis.
rot = Rotation.from_euler('z', 90, degrees=True)

# The rotation matrix is 3x3, so the data is transposed to 3xN for the product
# and transposed back to Nx3 afterwards.
ydat = (rot.as_matrix() @ xdat.T).T

# Keep the original points for everything except the last 5 periods, which are
# taken from the rotated (drifted) data instead.
dataar = np.vstack((xdat[:-5 * DPP], ydat[-5 * DPP:]))

# Wrap the features in a dataframe and attach a 5-minute "timestamp" column.
datadf = pd.DataFrame(dataar, columns=['f1', 'f2', 'f3'])
datadf['ordered'] = pd.date_range(start='1/6/2020', freq='5min', periods=n_rows)

# Helper column: ISO week number shifted by one (duplicates date range functionality).
datadf['week'] = datadf['ordered'].dt.isocalendar().week - 1

# Partition column: everything is 'reference' except weeks 11 and later.
datadf['partition'] = 'reference'
datadf.loc[datadf['week'] >= 11, 'partition'] = 'analysis'

# Random predictions and targets (we won't be using them but they are needed for NannyML).
datadf = datadf.assign(
    y_pred=np.random.rand(n_rows),
    y_true=np.random.randint(2, size=n_rows),
)

In [None]:
# Show the assembled dataframe as the cell output.
datadf

In [None]:
# Let's Visualize: overlay the first 1000 points of week 10 (before the rotated
# data starts) and week 16 (first week taken from the rotated data) in the f1/f2 plane.
for week_no in (10, 16):
    sample = datadf.loc[datadf.week == week_no, ['f1', 'f2']].head(1000)
    sns.scatterplot(x=sample['f1'], y=sample['f2'], label=f'Week {week_no}')

plt.title('Data Distributions before and after rotation drift')
plt.xlabel('f1')
plt.ylabel('f2', rotation=0)
plt.legend()
plt.savefig('butterfly-scatterplot.svg')

In [None]:
# Split the data by partition. The helper 'week' column is removed from both
# frames; the analysis frame additionally drops 'y_true'.
reference = (
    datadf[datadf['partition'] == 'reference']
    .reset_index(drop=True)
    .drop(columns=['week'])
)
analysis = (
    datadf[datadf['partition'] == 'analysis']
    .reset_index(drop=True)
    .drop(columns=['y_true', 'week'])
)
analysis

In [None]:
# Extract model metadata from the reference data, then set the column roles explicitly.
md = nml.extract_metadata(data = reference, model_name='wfh_predictor')
# The 'ordered' column serves as both the identifier and the timestamp.
md.identifier_column_name = 'ordered'
md.timestamp_column_name = 'ordered'
md.ground_truth_column_name = 'y_true'
# NOTE(review): presumably md.print() returns the summary as a string, hence the outer print() - confirm.
print(md.print())

In [None]:
# Univariate drift calculator using chunks of DPP rows (one period per chunk).
univariate_calculator = nml.UnivariateStatisticalDriftCalculator(
    model_metadata=md,
    chunk_size=DPP,
)

In [None]:
# Fit on the reference partition only, then compute drift over reference and
# analysis rows stacked into a single frame.
univariate_calculator.fit(reference_data=reference)
reference_and_analysis = pd.concat([reference, analysis], ignore_index=True)
univariate_results = univariate_calculator.calculate(data=reference_and_analysis)

In [None]:
# Plotting helper constructed from the univariate drift calculator; reused by the plotting cells below.
plots = nml.DriftPlots(univariate_calculator)

In [None]:
# Show and export the univariate drift statistic plot for every feature in the metadata.
for feature in md.features:
    label = feature.label
    drift_fig = plots.plot_univariate_statistical_drift(
        univariate_results,
        metric='statistic',
        feature_label=label,
    )
    drift_fig.show()
    drift_fig.write_image(file=f"butterfly-univariate-drift-{label}.svg")

In [None]:
# Show and export the distribution-over-time plot for every feature in the metadata,
# using reference and analysis rows stacked into a single frame.
for feature in md.features:
    label = feature.label
    stacked = pd.concat([reference, analysis], ignore_index=True)
    joy_fig = plots.plot_continuous_feature_distribution_over_time(
        data=stacked,
        drift_results=univariate_results,
        feature_label=label,
    )
    joy_fig.show()
    joy_fig.write_image(file=f"butterfly-univariate-drift-joyplot-{label}.svg")

In [None]:
# Data reconstruction drift calculator with the same DPP-row chunks: fit on the
# reference partition, then evaluate on reference and analysis rows stacked together.
rcerror_calculator = nml.DataReconstructionDriftCalculator(
    model_metadata=md,
    chunk_size=DPP,
)
rcerror_calculator.fit(reference_data=reference)
reference_and_analysis = pd.concat([reference, analysis], ignore_index=True)
rcerror_results = rcerror_calculator.calculate(data=reference_and_analysis)

In [None]:
# Plot the data reconstruction drift results, show the figure and export it as SVG.
fig = plots.plot_data_reconstruction_drift(rcerror_results)
fig.show()
# Plain string literal: the file name has no placeholders, so the f-prefix was redundant.
fig.write_image(file="butterfly-multivariate-drift.svg")