In [35]:
import tensorflow_data_validation as tfdv
import pandas as pd
import pathlib
import pickle

In [36]:
# Load the pre-computed benchmark bundle from the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# must only ever be pointed at a trusted, locally produced file.
with open("benchmark_data.pickle", "rb") as f:
    data = pickle.load(f)

# The bundle is read through these two keys; later cells index both results by
# position to pick out a single column.
column_history = data["column_history"]
perturbations = data["column_perturbations"]

In [41]:
# Narrow the benchmark down to one column.
# NOTE(review): position 0 is presumably where "numeric_0" sits inside each run
# and inside the perturbation list -- confirm against whatever wrote the pickle.
column_index = 0
column = "numeric_0"
history = [per_run[column_index] for per_run in column_history]
column_perturbations = perturbations[column_index]

In [42]:
def parse_prediction(anomalies) -> bool:
    """
    Report whether a TFDV validation result flags at least one entry.

    Each value in ``anomalies.anomaly_info`` is inspected. The result is True
    as soon as one of them has a ``short_description`` different from
    'No anomalies found', and False otherwise (including for an empty mapping).
    """
    return any(
        info.short_description != 'No anomalies found'
        for info in anomalies.anomaly_info.values()
    )

In [53]:
# Sliding-window check over the recorded history: at every offset the next
# `window` runs are pooled into a baseline, and the run right after them is
# validated against that baseline. One boolean per offset is collected in `fp`.
# NOTE(review): `fp` presumably means "false positives", i.e. the history is
# assumed to be anomaly-free -- confirm.
window = 30     # runs pooled into each baseline
n_windows = 30  # offsets evaluated; requires len(history) >= window + n_windows

fp = []
for i in range(n_windows):
    baseline_runs = history[i: i + window]
    train_df = pd.concat(baseline_runs)[[column]].reset_index(drop=True)
    test_df = history[i + window][[column]].reset_index(drop=True)

    # Statistics for both sides; the two calls are independent of each other.
    train_stats = tfdv.generate_statistics_from_dataframe(train_df)
    test_stats = tfdv.generate_statistics_from_dataframe(test_df)

    # The schema comes from the baseline only and then gets a drift comparator.
    train_schema = tfdv.infer_schema(statistics=train_stats)
    feature = tfdv.get_feature(train_schema, column)
    # NOTE(review): a threshold of 1.0 presumably means the drift check can
    # never fire (a later recorded output reports a divergence of exactly 1 as
    # the value, and anomalies are described as "above the threshold") --
    # confirm this is intentional.
    feature.drift_comparator.jensen_shannon_divergence.threshold = 1.0

    prediction = tfdv.validate_statistics(
        statistics=test_stats,
        schema=train_schema,
        previous_statistics=train_stats,
    )

    fp.append(parse_prediction(prediction))

In [54]:
fp

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]

In [11]:
# Baseline for the single-feature experiment: statistics computed straight from
# the "numeric_4" column of the first entry of h_stuff, plus a schema inferred
# from those statistics.
# NOTE(review): h_stuff is not defined in any cell visible here, so a
# Restart & Run All would presumably stop at this cell -- confirm its origin.
baseline_df = h_stuff[0][["numeric_4"]]
stats = tfdv.generate_statistics_from_dataframe(baseline_df)
schema = tfdv.infer_schema(statistics=stats)

In [19]:
# Attach a Jensen-Shannon drift comparator to the schema entry for "numeric_4".
# The threshold is set to 0.0 here; the validation output recorded further down
# reports a divergence of 0.463437 as being above that threshold.
feature = tfdv.get_feature(schema, 'numeric_4')
jsd_comparator = feature.drift_comparator.jensen_shannon_divergence
jsd_comparator.threshold = 0.0

In [29]:
# Statistics for the comparison side: the "numeric_4" column of the second
# entry of h_stuff.
candidate_df = h_stuff[1][["numeric_4"]]
new_stats = tfdv.generate_statistics_from_dataframe(candidate_df)

In [30]:
# Validate the new statistics against the schema built from the original data;
# the original statistics are supplied as the previous span so that the drift
# comparator configured on the schema has something to compare against.
anomalies = tfdv.validate_statistics(
    statistics=new_stats,
    schema=schema,
    previous_statistics=stats,
)

In [31]:
anomalies

baseline {
  feature {
    name: "numeric_4"
    type: FLOAT
    presence {
      min_fraction: 1.0
      min_count: 1
    }
    drift_comparator {
      jensen_shannon_divergence {
        threshold: 0.0
      }
    }
    shape {
      dim {
        size: 1
      }
    }
  }
}
anomaly_info {
  key: "numeric_4"
  value {
    description: "The approximate Jensen-Shannon divergence between current and previous is 0.463437 (up to six significant digits), above the threshold 0."
    severity: ERROR
    short_description: "High approximate Jensen-Shannon divergence between current and previous"
    reason {
      type: COMPARATOR_JENSEN_SHANNON_DIVERGENCE_HIGH
      short_description: "High approximate Jensen-Shannon divergence between current and previous"
      description: "The approximate Jensen-Shannon divergence between current and previous is 0.463437 (up to six significant digits), above the threshold 0."
    }
    path {
      step: "numeric_4"
    }
  }
}
anomaly_name_format: SERIALIZED_PATH

In [32]:
def has_anomalies(anomalies):
    """Return True when any entry of ``anomalies.anomaly_info`` reports a problem.

    An entry counts as a problem unless its ``short_description`` is exactly
    'No anomalies found'. An empty mapping yields False.
    """
    # NOTE(review): same logic as parse_prediction defined earlier in this
    # notebook -- one of the two could be dropped.
    for entry in anomalies.anomaly_info.values():
        if entry.short_description == 'No anomalies found':
            continue
        return True
    return False

In [33]:
has_anomalies(anomalies)

True

In [91]:
# Drift test: validate new_stats against schema, passing stats as previous_statistics.


In [92]:
# Same validation call as the earlier `anomalies` cell, stored under a new name.
# NOTE(review): the output recorded for this cell shows threshold 0.01 and a
# divergence of 1, while the visible cells set the threshold to 0.0 and the
# identical call reported 0.463437 -- the output was presumably produced from
# different kernel state; re-run from a fresh kernel to confirm.
drift_anomalies = tfdv.validate_statistics(
    statistics=new_stats,
    schema=schema,
    previous_statistics=stats,
)

In [93]:
drift_anomalies

baseline {
  feature {
    name: "numeric_4"
    type: FLOAT
    presence {
      min_fraction: 1.0
      min_count: 1
    }
    drift_comparator {
      jensen_shannon_divergence {
        threshold: 0.01
      }
    }
    shape {
      dim {
        size: 1
      }
    }
  }
}
anomaly_info {
  key: "numeric_4"
  value {
    description: "The approximate Jensen-Shannon divergence between current and previous is 1 (up to six significant digits), above the threshold 0.01."
    severity: ERROR
    short_description: "High approximate Jensen-Shannon divergence between current and previous"
    reason {
      type: COMPARATOR_JENSEN_SHANNON_DIVERGENCE_HIGH
      short_description: "High approximate Jensen-Shannon divergence between current and previous"
      description: "The approximate Jensen-Shannon divergence between current and previous is 1 (up to six significant digits), above the threshold 0.01."
    }
    path {
      step: "numeric_4"
    }
  }
}
anomaly_name_format: SERIALIZED_