generate_summaries

In [None]:
# source
from whylogs.viz.utils import profile_viz_calculations

def generate_summaries(
    target_view: DatasetProfileView, ref_view: Optional[DatasetProfileView], config: Optional[SummaryConfig]
) -> Optional[Dict[str, Any]]:
    """Build JSON summaries for a target profile and a reference profile.

    A column is summarized only when it exists in both views, both column views
    are truthy, and ``is_image_compound_metric`` is falsy for the target column.
    For each such column the summaries carry histograms when both sides have a
    non-empty "distribution" KLL sketch, or frequent items when both sides have
    a "frequent_items" metric; otherwise those fields stay ``None``.

    Parameters
    ----------
    target_view : DatasetProfileView
        Target profile view.
    ref_view : Optional[DatasetProfileView]
        Reference profile view. Required in practice: a falsy value raises.
    config : Optional[SummaryConfig]
        Forwarded to ``frequent_items_from_view``. ``SummaryConfig()`` is used
        when ``None``.

    Returns
    -------
    Dict[str, str]
        Keys ``"profile_from_whylogs"`` and ``"reference_profile_from_whylogs"``,
        each holding the JSON-encoded summary of the respective profile.

    Raises
    ------
    ValueError
        If either ``target_view`` or ``ref_view`` is falsy.
    """

    def _blank_column_summary() -> ColumnSummary:
        # Key order is kept stable because it is reflected in the JSON output.
        return {
            "histogram": None,
            "frequentItems": None,
            "drift_from_ref": None,
            "isDiscrete": None,
            "featureStats": None,
        }

    if config is None:
        config = SummaryConfig()

    if not (target_view and ref_view):
        raise ValueError("This method has to get both target and reference profiles")

    overall_stats: OverallStats = add_overall_statistics(target_view)
    drift_values = calculate_drift_values(target_view, ref_view)

    target_columns = target_view.get_columns()
    ref_columns = ref_view.get_columns()

    # NOTE(review): the overall statistics are computed from the target view but
    # stored on the reference summary, while the target summary gets None.
    # Presumably the consumer of this JSON expects that layout - confirm before changing.
    ref_summary: DatasetSummary = {"columns": {}, "properties": overall_stats}
    target_summary: DatasetSummary = {"columns": {}, "properties": None}

    for name in target_columns:
        if name not in ref_columns or is_image_compound_metric(target_columns[name]):
            continue

        target_col = target_columns[name]
        ref_col = ref_columns[name]
        if not (target_col and ref_col):
            continue

        target_entry = _blank_column_summary()
        ref_entry = _blank_column_summary()

        target_entry["featureStats"] = add_feature_statistics(name, target_col)[name]

        # The drift p-value is recorded on the reference side only.
        drift = drift_values.get(name)
        if drift:
            ref_entry["drift_from_ref"] = drift["p_value"]

        target_dist = target_col.get_metric("distribution")
        ref_dist = ref_col.get_metric("distribution")
        if (
            target_dist
            and ref_dist
            and not target_dist.kll.value.is_empty()
            and not ref_dist.kll.value.is_empty()
        ):
            # Both sides have numeric data: summarize with histograms.
            target_entry["isDiscrete"] = ref_entry["isDiscrete"] = False
            target_entry["histogram"] = histogram_from_view(target_col, name)
            ref_entry["histogram"] = histogram_from_view(ref_col, name)
        elif target_col.get_metric("frequent_items") and ref_col.get_metric("frequent_items"):
            # Otherwise fall back to frequent items when both sides track them.
            target_entry["isDiscrete"] = ref_entry["isDiscrete"] = True
            target_entry["frequentItems"] = frequent_items_from_view(target_col, name, config)
            ref_entry["frequentItems"] = frequent_items_from_view(ref_col, name, config)

        target_summary["columns"][name] = target_entry
        ref_summary["columns"][name] = ref_entry

    return {
        "profile_from_whylogs": json.dumps(target_summary),
        "reference_profile_from_whylogs": json.dumps(ref_summary),
    }

calculate drift values

In [None]:
# source
from whylogs.viz.utils import drift_calculations

def calculate_drift_values(
    target_view: DatasetProfileView, reference_view: DatasetProfileView, statistic=False
) -> Dict[str, Optional[Union[ColumnDriftValue, ColumnDriftStatistic]]]:
    """Calculate per-column drift between a target and a reference profile.

    Only columns present in both profiles are evaluated; columns missing from
    the reference profile are skipped and do not appear in the result.

    Parameters
    ----------
    target_view : DatasetProfileView
        Target Profile View
    reference_view : DatasetProfileView
        Reference Profile View
    statistic: bool
        If false, each value comes from ``_get_ks_p_value``, falling back to
        ``_get_chi2_p_value`` when the former returns a falsy result.
        If true, each value comes from ``_get_hellinger_distance``.

    Returns
    -------
    drift_values: Dict[str, Optional[Union[ColumnDriftValue, ColumnDriftStatistic]]]
        Mapping from column name to the drift result for that column. An entry
        holds whatever falsy value the underlying helper returned when no test
        produced a result.
    """
    target_columns = target_view.get_columns()
    reference_columns = reference_view.get_columns()

    results: Dict[str, Optional[Union[ColumnDriftValue, ColumnDriftStatistic]]] = {}
    for name in target_columns:
        if name not in reference_columns:
            continue

        column_pair = {
            "target_view_column": target_columns[name],
            "reference_view_column": reference_columns[name],
        }
        if statistic:
            results[name] = _get_hellinger_distance(**column_pair)
        else:
            # The chi2 helper is consulted only when the KS helper yields nothing.
            results[name] = _get_ks_p_value(**column_pair) or _get_chi2_p_value(**column_pair)
    return results

_get_ks_p_value

In [None]:
# source
from whylogs.viz.utils import drift_calculations

def _get_ks_p_value(target_view_column, reference_view_column) -> Optional[ColumnDriftValue]:
    """Return the KS-test drift value for a pair of column views.

    Parameters
    ----------
    target_view_column
        Column view of the target profile; must expose ``get_metric``.
    reference_view_column
        Column view of the reference profile; must expose ``get_metric``.

    Returns
    -------
    Optional[ColumnDriftValue]
        The result of ``_compute_ks_test_p_value`` on the two KLL sketches, or
        ``None`` when either column has no "distribution" metric.
    """
    target_dist_metric = target_view_column.get_metric("distribution")
    ref_dist_metric = reference_view_column.get_metric("distribution")

    # Without a distribution metric on both sides there is nothing to compare.
    if target_dist_metric is None or ref_dist_metric is None:
        return None

    return _compute_ks_test_p_value(target_dist_metric.kll.value, ref_dist_metric.kll.value)

_compute_ks_test_p_value

In [None]:
# source
from whylogs.viz.utils import drift_calculations

def _compute_ks_test_p_value(
    target_distribution: kll_doubles_sketch,
    reference_distribution: kll_doubles_sketch,
    quantiles: Optional[List[float]] = None,
) -> Optional[ColumnDriftValue]:
    """Approximate the two-sample Kolmogorov-Smirnov p-value from two KLL sketches.

    The sketches are probed at the values found at the requested quantile ranks
    of both distributions. At each probe value the two CDFs are compared, and
    the largest absolute difference is used as the KS statistic.
    The null hypothesis expects the samples to come from the same distribution.

    Parameters
    ----------
    target_distribution : kll_doubles_sketch
        Quantiles sketch of the target distribution's values.
    reference_distribution : kll_doubles_sketch
        Quantiles sketch of the reference (expected) distribution's values.
    quantiles: Optional[List[float]], optional
        Quantile ranks used to pick the probe values. When empty or None,
        ``KSTestConfig().quantiles`` is used.

    Returns
    -------
    Optional[ColumnDriftValue]
        ``{"p_value": <p-value>, "test": "ks"}``, or ``None`` when either
        sketch is empty.
    """
    probe_ranks = quantiles if quantiles else KSTestConfig().quantiles

    if reference_distribution.is_empty() or target_distribution.is_empty():
        return None

    target_points = target_distribution.get_quantiles(probe_ranks)
    reference_points = reference_distribution.get_quantiles(probe_ranks)
    rank_count = len(probe_ranks)

    # The statistic is a maximum over every probe value from both sketches, so
    # the order in which the values are visited does not affect the result.
    largest_gap = 0
    for points in (target_points, reference_points):
        for k in range(rank_count):
            point = points[k]
            cdf_target = target_distribution.get_cdf([point])[0]
            cdf_reference = reference_distribution.get_cdf([point])[0]
            gap = abs(cdf_target - cdf_reference)
            if gap > largest_gap:
                largest_gap = gap

    # Effective sample size m*n/(m+n); the expression is symmetric in m and n.
    target_n = target_distribution.get_n()
    reference_n = reference_distribution.get_n()
    effective_n = target_n * reference_n / (target_n + reference_n)

    p_value = stats.distributions.kstwo.sf(largest_gap, np.round(effective_n))

    return {"p_value": p_value, "test": "ks"}

quantiles for KSTestConfig

In [None]:
# source
from whylogs.viz import configs

# KSTestConfig --> quantiles
from dataclasses import dataclass, field
from typing import List
import numpy as np
# Notebook excerpt of the `quantiles` field declaration (presumably copied from
# KSTestConfig in whylogs.viz.configs - verify against the library).
# NOTE(review): outside a @dataclass body, `field(...)` only returns a
# dataclasses Field object, so `quantiles` is not a list of floats here.
quantiles: List[float] = field(default_factory=lambda: list(np.linspace(0, 1, 100)))

# The list the default factory above would build: 100 evenly spaced values from 0 to 1.
list(np.linspace(0,1,100))

## Histogram creation

histogram_from_view

In [None]:
# source
from whylogs.viz.utils import histogram_calculations

def histogram_from_view(column_view: ColumnProfileView, feature_name: str) -> HistogramSummary:
    """Build a histogram summary from a column view's "distribution" metric.

    Parameters
    ----------
    column_view : ColumnProfileView
        Column view to read the "distribution" metric from.
    feature_name : str
        Name of the feature, used only in the error message.

    Returns
    -------
    HistogramSummary
        The result of ``_histogram_from_sketch`` on the metric's KLL sketch.

    Raises
    ------
    ValueError
        If the column view has no (truthy) "distribution" metric.
    """
    distribution: Optional[DistributionMetric] = column_view.get_metric("distribution")
    if distribution:
        return _histogram_from_sketch(distribution.kll.value)
    raise ValueError(f"Distribution Metrics not found for feature {feature_name}.")

_histogram_from_sketch

In [None]:
# source
from whylogs.viz.utils import histogram_calculations

def _histogram_from_sketch(
    sketch: kll_doubles_sketch, max_buckets: int = None, avg_per_bucket: Optional[float] = None
) -> HistogramSummary:
    """
    Summarize a KLL quantiles sketch as a histogram.

    Parameters
    ----------
    sketch : kll_doubles_sketch
        Data sketch
    max_buckets : int
        Override the default maximum number of buckets (``MAX_HIST_BUCKETS``)
    avg_per_bucket : float
        Override the default target number of items per bucket
        (``HIST_AVG_NUMBER_PER_BUCKET``).

    Returns
    -------
    histogram : HistogramSummary
        Dict with keys ``start``, ``end``, ``width``, ``counts``, ``max``,
        ``min``, ``bins`` and ``n``. ``width`` is always reported as 0.
    """
    n = sketch.get_n()
    start = sketch.get_min_value()
    max_val = sketch.get_max_value()
    end = max_val

    bucket_limit = MAX_HIST_BUCKETS if max_buckets is None else max_buckets
    per_bucket = HIST_AVG_NUMBER_PER_BUCKET if avg_per_bucket is None else avg_per_bucket

    if n < 2 or start == end:
        # Degenerate data (fewer than two items, or a single distinct value):
        # emit one bin holding every item, widened by a relative epsilon.
        end = start + abs(start) * 1e-7
        bins = [start, end]
        counts = [n]
    else:
        bins, end = _calculate_bins(end, start, n, per_bucket, bucket_limit)
        scaled = [round(mass * n) for mass in sketch.get_pmf(bins)]
        # Drop the first and last entries - presumably the mass outside the
        # [start, end) bin range returned by get_pmf; confirm against the sketch API.
        counts = scaled[1:-1]

    return {
        "start": start,
        "end": end,
        "width": 0,
        "counts": counts,
        "max": max_val,
        "min": start,
        "bins": bins,
        "n": n,
    }

calculate bins

In [None]:
# source
from whylogs.viz.utils import quantile_stats

def _calculate_bins(
    end: float, start: float, n: int, avg_per_bucket: float, max_buckets: int
) -> Tuple[List[float], float]:
    """Compute evenly spaced histogram bin edges between ``start`` and ``end``.

    Parameters
    ----------
    end : float
        Largest value to cover; it is nudged upwards by a relative epsilon.
    start : float
        Left edge of the first bin.
    n : int
        Number of items being binned.
    avg_per_bucket : float
        Target number of items per bucket.
    max_buckets : int
        Upper bound on the number of buckets.

    Returns
    -------
    Tuple[List[float], float]
        The ``n_buckets + 1`` bin edges and the adjusted ``end`` value.
    """
    # Include the max value in the right-most bin
    end += abs(end) * 1e-7

    # Aim for avg_per_bucket items per bucket, capped at max_buckets
    n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
    width = (end - start) / n_buckets

    # If the bin width is smaller than min_interval, we need bigger bins
    min_interval = _get_min_interval(abs(start), abs(end))
    if width < min_interval:
        n_buckets, width = _resize_bins(start, end, min_interval, width, n_buckets)

    # Include the right edge in the bin edges, hence n_buckets + 1 values
    bins = [start + step * width for step in range(n_buckets + 1)]
    logger.debug(f"about to get pmf using start: {start} end:{end} width:{width} and n_buckets:{n_buckets}")
    logger.debug(f"bin: {bins}")
    return bins, end