# Data reconstruction with PCA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nannyml as nml

from scipy.spatial.transform import Rotation
from sklearn.datasets import make_classification

# Synthetic "butterfly" dataset.
# f1 and f2 are two noisy copies of the same latent signal (strongly
# correlated), f3 is independent low-amplitude noise. For the last
# N_DRIFT_WEEKS the data is rotated by 90 degrees around the z axis, which
# maps (f1, f2, f3) to (-f2, f1, f3) up to floating-point error.

# Rows per period (one week): days/week * hours/day * events/hour
DPP = 7*24*12

N_REFERENCE_WEEKS = 10  # leading periods labelled 'reference'
N_ANALYSIS_WEEKS = 10   # trailing periods labelled 'analysis'
N_DRIFT_WEEKS = 5       # last analysis periods affected by the sudden drift
N_WEEKS = N_REFERENCE_WEEKS + N_ANALYSIS_WEEKS
N_ROWS = DPP * N_WEEKS

# NOTE: the order of the random draws below is part of the reproducible
# output for this seed - do not reorder them.
np.random.seed(23)
s1 = np.random.randn(N_ROWS)
x1 = s1 + np.random.randn(N_ROWS)/8
x2 = s1 + np.random.randn(N_ROWS)/8
x3 = np.random.randn(N_ROWS)/8
xdat = np.array([x1, x2, x3]).T
rot = Rotation.from_euler('z', 90, degrees=True)

# following matrix multiplication implementation, we need a 3xN data matrix hence we transpose
ydat = np.matmul(rot.as_matrix(), xdat.T).T

# create overall array that has drifted and not drifted subsets.
# drift is sudden and affects the last N_DRIFT_WEEKS weeks.
# A positive split index is used on purpose: a negative slice such as
# xdat[:-0] would be empty if the drift window were ever set to zero weeks.
drift_start = N_ROWS - N_DRIFT_WEEKS * DPP
dataar = np.concatenate(
    (xdat[:drift_start], ydat[drift_start:]),
    axis=0
)

# convert data to dataframe
datadf = pd.DataFrame(dataar, columns=['f1', 'f2', 'f3'])

# add "timestamp" column: one event every 5 minutes, i.e. DPP events per week
datadf = datadf.assign(ordered=pd.date_range(start='1/6/2020', freq='5min', periods=N_ROWS))

# Adding helper column: 1-based week number of each row. It is derived from
# the row position (every week holds exactly DPP rows) so it does not depend
# on which ISO calendar week the start date happens to fall in.
datadf['week'] = np.arange(N_ROWS) // DPP + 1
# Adding partition column
datadf['partition'] = 'reference'
datadf.loc[datadf.week > N_REFERENCE_WEEKS, 'partition'] = 'analysis'

# Assign random predictions and targets (we won't be using them but they are needed for NannyML)
datadf = datadf.assign(y_pred=np.random.rand(N_ROWS))
datadf = datadf.assign(y_true=np.random.randint(2, size=N_ROWS))

In [None]:
# Show the assembled dataframe (a bare expression: rendered as the cell output in a notebook).
datadf

In [None]:
# Take the first 1500 events of week 10 (before the drift) and of week 16
# (first drifted week), tagging every row with a plain integer week number
# that the plot uses as hue.
weekly_samples = [
    datadf.loc[datadf.week == week_no, ['f1', 'f2']].iloc[:1500].assign(week=week_no)
    for week_no in (10, 16)
]
data_sample = pd.concat(weekly_samples, ignore_index=True)

# Joint scatter plot of f1 against f2, one NannyML colour per week.
colors = nml.plots.colors.Colors
week_palette = [colors.BLUE_SKY_CRAYOLA.value, colors.RED_IMPERIAL.value]
figure = sns.jointplot(data=data_sample, x="f1", y="f2", hue="week", palette=week_palette)
figure.fig.suptitle('Data Distributions before and after rotation drift')
figure.savefig('butterfly-scatterplot.svg')

In [None]:
# Let's first create the analysis and reference datasets NannyML needs.
reference = datadf.loc[datadf['partition'] == 'reference'].reset_index(drop=True)
reference.drop(['week'], axis=1, inplace=True)
analysis = datadf.loc[datadf['partition'] == 'analysis'].reset_index(drop=True)
analysis.drop(['y_true', 'week'], axis=1, inplace=True)
data = pd.concat([reference, analysis], ignore_index=True)

# Let's create the model metadata object
metadata = nml.extract_metadata(data = reference, model_name='3d_rotation')
metadata.identifier_column_name = 'ordered'
metadata.timestamp_column_name = 'ordered'
metadata.target_column_name = 'y_true'

# Let's compute univariate drift
univariate_calculator = nml.UnivariateStatisticalDriftCalculator(model_metadata=metadata, chunk_size=DPP)
univariate_calculator.fit(reference_data=reference)
# let's compute (and visualize) results across all the dataset.
univariate_results = univariate_calculator.calculate(data=data)

# let's create plot with results
plots = nml.DriftPlots(model_metadata=univariate_calculator.model_metadata, chunker=univariate_calculator.chunker)
for feature in metadata.features:
    figure = plots.plot_univariate_statistical_drift(univariate_results, metric='statistic', feature_label=feature.label)
    figure.show()
    figure.write_image(file=f"butterfly-univariate-drift-{feature.label}.svg")

In [None]:
# For every continuous feature, plot its distribution over time using the
# univariate drift results, then show the figure and save it as SVG.
for feature in metadata.continuous_features:
    label = feature.label
    figure = plots.plot_continuous_feature_distribution_over_time(
        data=data, drift_results=univariate_results, feature_label=label
    )
    figure.show()
    figure.write_image(file=f"butterfly-univariate-drift-joyplot-{label}.svg")

In [None]:
# Let's compute multivariate drift with the data reconstruction calculator
# (fitted on the reference set, one chunk per DPP rows).
rcerror_calculator = nml.DataReconstructionDriftCalculator(model_metadata=metadata, chunk_size=DPP)
rcerror_calculator.fit(reference_data=reference)
# let's compute results across all the dataset (reference followed by analysis).
rcerror_results = rcerror_calculator.calculate(data=data)

# let's create plot with results, show it and save it as SVG
figure = plots.plot_data_reconstruction_drift(rcerror_results)
figure.show()
# plain string literal: the file name has no placeholders, so no f-string is needed
figure.write_image(file="butterfly-multivariate-drift.svg")