In [1]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to the local `avh` package are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
# import uuid
import time
from typing import List, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.utils.validation import check_is_fitted
from statsmodels.tsa.stattools import adfuller
from IPython.display import Code, display
import inspect

import avh.utility_functions as utils
from avh.data_issues import (
    IssueTransfomer,
    NumericIssueTransformer,
    CategoricalIssueTransformer,
    IncreasedNulls,
    SchemaChange,
    DistributionChange,
    UnitChange,
    CasingChange,
    DQIssueDatasetGenerator,
    VolumeChangeUpsample,
    VolumeChangeDownsample,
    NumericPerturbation,
)

from avh.data_generation import (
    DataColumn,
    NumericColumn,
    CategoricalColumn,
    NormalNumericColumn,
    UniformNumericColumn,
    BetaNumericColumn,
    StaticCategoricalColumn,
    RandomCategoricalColumn,
    DataGenerationPipeline,
)

from avh.metrics import (
    Metric,
    SingleDistributionMetric,
    TwoDistributionMetric,
    RowCount,
    DistinctCount,
    DistinctRatio,
    CompleteRatio,
    Mean,
    Median,
    Range,
    Min,
    Max,
    Sum,
    MeanDigitLength,
    MeanPunctuationLength,
    MeanStringLength,
    EMD,
    KsDist,
    CohenD,
    KlDivergence,
    JsDivergence,
)

from avh.constraints import (
    Constraint,
    ConstantConstraint,
    ChebyshevConstraint,
    CantelliConstraint,
    CLTConstraint,
    ConjuctivDQProgram,
)

from avh.auto_validate_by_history import AVH

import plotly.express as px

In [3]:
import jupyter_black

# Activate jupyter_black for this session (formats code cells with Black).
jupyter_black.load()

In [4]:
# Notebook-wide NumPy random generator; the fixed seed keeps its draws repeatable.
SEED = 42
rng = np.random.default_rng(seed=SEED)

In [5]:
from avh.data_generation import DataGenerationPipeline, NormalNumericColumn, BetaNumericColumn

from avh.data_issues import IncreasedNulls
from avh.auto_validate_by_history import AVH

In [6]:
# Column definitions for the synthetic table: one normal and one beta column.
synthetic_columns = [
    NormalNumericColumn("money", mean=300, std=10),
    BetaNumericColumn("height", alfa=18, beta=2),
]

# No issues are configured; an entry would presumably look like ("money", [...]).
configured_issues = []

pipeline = DataGenerationPipeline(
    columns=synthetic_columns,
    issues=configured_issues,
    random_state=42,
)

In [7]:
# Synthetic history: 30 successive calls to the pipeline's generate_normal.
# NOTE(review): the meaning of the (30000, 30) arguments is defined in
# avh.data_generation and is not visible from this notebook - confirm there.
N_SYNTHETIC_RUNS = 30
H = [pipeline.generate_normal(30000, 30) for _run in range(N_SYNTHETIC_RUNS)]

In [25]:
# Run AVH.generate over the history H, restricted to the "money" column,
# with fpr_target set to 0.05.
monitored_columns = ["money"]
avh = AVH(columns=monitored_columns)
ps = avh.generate(H, fpr_target=0.05)

creating D(C)...: 23it [00:01, 20.04it/s] 
Generating P(S for columns...: 100%|██████████| 1/1 [00:01<00:00,  1.58s/it]


In [8]:
import pickle

# NOTE: pickle.load can execute arbitrary code while deserialising; only open
# benchmark files that were produced by this project.
benchmark_pickle_path = "../benchmark/benchmark_data.pickle"
with open(benchmark_pickle_path, mode="rb") as pickle_file:
    data = pickle.load(pickle_file)

In [9]:
# Rebinds H (previously the synthetic history) to element 6 of every entry in
# the benchmark's "column_history".
# NOTE(review): what index 6 selects depends on how the pickle was written - confirm.
column_history = data["column_history"]
H = [run[6] for run in column_history]

In [10]:
# Sliding-window configuration: the number of windows is the part of the
# history left over once one training window has been taken out.
train_history_window_size = 30
total_history_size = 60

total_windows = total_history_size - train_history_window_size

In [11]:
# Take a training-window-sized slice of H starting at offset 3, pull out the
# "numeric_6" entry of each run, and pass the result to Median.calculate.
slice_start = 3
history_slice = H[slice_start : slice_start + train_history_window_size]
metric_h = [run["numeric_6"] for run in history_slice]
metric_history = Median.calculate(metric_h)

In [12]:
# Slide a training window across H and call AVH.generate once per position.
avh = AVH(verbose=0, columns=["numeric_6"], time_differencing="auto")

for window_start in range(total_windows):
    window_end = window_start + train_history_window_size
    print("Now processing window: ", window_start)

    train_h = H[window_start:window_end]
    # The run right after the training window; assigned but not used in this cell.
    test_h = H[window_end]

    PS = avh.generate(train_h, fpr_target=0.05)
    print(PS)

Now processing window:  0


In [None]:

# The original call used `a`, which is not assigned in any visible cell and so
# raises NameError on a fresh kernel. `avh` is the AVH instance built above.
# NOTE(review): confirm `avh` (and the full H) is the intended configuration.
PS = avh.generate(H, fpr_target=0.05)

creating D(C)...: 23it [00:00, 27.94it/s]
Generating P(S for columns...: 100%|██████████| 1/1 [00:01<00:00,  1.73s/it]


### Synthetic benchmark data generation

**Benchmark setup**  
columns:
- Numeric:
    - count: 1000
    - dtypes:
        - int: 30%
        - float: 70%
    - distributions:
        - uniform: 10%
        - normal: 20%
        - beta: 70%
- Categorical:
    - count: 1000

### First attempts

In [27]:
from typing import List, Tuple, Iterable, Optional
from scipy.stats import ks_2samp
from joblib import Parallel, delayed
from avh.auto_validate_by_history import AVH
from avh.constraints import Constraint, ConjuctivDQProgram
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

import pathlib

In [79]:
# Load the benchmark bundle. The path has no placeholders, so it is a plain
# string rather than an f-string.
# NOTE: pickle.load can execute arbitrary code; only open files produced by
# this project.
with open("../benchmark/benchmark_data.pickle", "rb") as f:
    benchmark_data = pickle.load(f)

H_FULL = benchmark_data["column_history"]

In [80]:
# Keep element 6 of every run. Reading from benchmark_data rather than from
# H_FULL itself makes this cell safe to re-run: the self-referential form
# would index [6] again on every extra execution.
H_FULL = [run[6] for run in benchmark_data["column_history"]]

In [9]:
# Run AVH.generate on the first 30 runs of H_FULL, restricted to "numeric_6".
training_history = H_FULL[:30]
avh = AVH(columns=["numeric_6"])
ps = avh.generate(training_history, fpr_target=0.05)

creating D(C)...: 23it [00:00, 24.13it/s] 
Generating P(S for columns...: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]


In [87]:
import pickle
import plotly.graph_objects as go
from copy import deepcopy

In [110]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    NOTE: pickle.load can execute arbitrary code while deserialising, so only
    point this at benchmark files produced by this project.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


# One metrics object per benchmarked method; the figures below read the
# "recall" and "precision" entries of each.
metrics_avh = _load_pickle("../benchmark/benchmark_avh_metrics.pickle")
metrics_ks = _load_pickle("../benchmark/benchmark_ks_test_metrics.pickle")
metrics_ks_full = _load_pickle("../benchmark/benchmark_ks_test_full_metrics.pickle")
metrics_lof = _load_pickle("../benchmark/benchmark_lof_metrics.pickle")
metrics_tfdv = _load_pickle("../benchmark/benchmark_tfdv_metrics.pickle")
metrics_tfdv_full = _load_pickle("../benchmark/benchmark_tfdv_full_metrics.pickle")

In [116]:
# Plot precision (y) against recall (x) for every benchmarked method.
# NOTE(review): both KS-test traces skip their first point ([1:]); the reason
# is not recorded in this notebook.
fig = go.Figure()

fig.add_scatter(x=metrics_avh["recall"], y=metrics_avh["precision"], name="AVH")
fig.add_scatter(x=metrics_ks["recall"][1:], y=metrics_ks["precision"][1:], name="KS-test")
fig.add_scatter(
    x=metrics_ks_full["recall"][1:], y=metrics_ks_full["precision"][1:], name="KS-test-full"
)
fig.add_scatter(x=metrics_lof["recall"], y=metrics_lof["precision"], name="LOF")
fig.add_scatter(x=metrics_tfdv["recall"], y=metrics_tfdv["precision"], name="TFDV")
fig.add_scatter(x=metrics_tfdv_full["recall"], y=metrics_tfdv_full["precision"], name="TFDV-full")
# Axis titles added so the figure can be read without the surrounding code.
fig.update_layout(
    width=700,
    height=700,
    xaxis_title_text="Recall",
    yaxis_title_text="Precision",
)

In [98]:
# Plot precision (y) against recall (x) for AVH, KS-test, LOF and TFDV.
fig = go.Figure()

fig.add_scatter(x=metrics_avh["recall"], y=metrics_avh["precision"], name="AVH")
fig.add_scatter(x=metrics_ks["recall"], y=metrics_ks["precision"], name="KS-test")
fig.add_scatter(x=metrics_lof["recall"], y=metrics_lof["precision"], name="LOF")
fig.add_scatter(x=metrics_tfdv["recall"], y=metrics_tfdv["precision"], name="TFDV")
# Axis titles added so the figure can be read without the surrounding code.
fig.update_layout(
    width=700,
    height=700,
    xaxis_title_text="Recall",
    yaxis_title_text="Precision",
)

In [50]:
# Plot precision (y) against recall (x) for AVH, KS-test, LOF and TFDV.
fig = go.Figure()

fig.add_scatter(x=metrics_avh["recall"], y=metrics_avh["precision"], name="AVH")
fig.add_scatter(x=metrics_ks["recall"], y=metrics_ks["precision"], name="KS-test")
fig.add_scatter(x=metrics_lof["recall"], y=metrics_lof["precision"], name="LOF")
fig.add_scatter(x=metrics_tfdv["recall"], y=metrics_tfdv["precision"], name="TFDV")
# Axis titles added so the figure can be read without the surrounding code.
fig.update_layout(
    width=700,
    height=700,
    xaxis_title_text="Recall",
    yaxis_title_text="Precision",
)

In [23]:
# Plot precision (y) against recall (x) for AVH, KS-test, LOF and TFDV.
fig = go.Figure()

fig.add_scatter(x=metrics_avh["recall"], y=metrics_avh["precision"], name="AVH")
fig.add_scatter(x=metrics_ks["recall"], y=metrics_ks["precision"], name="KS-test")
fig.add_scatter(x=metrics_lof["recall"], y=metrics_lof["precision"], name="LOF")
fig.add_scatter(x=metrics_tfdv["recall"], y=metrics_tfdv["precision"], name="TFDV")
# Axis titles added so the figure can be read without the surrounding code.
fig.update_layout(
    width=700,
    height=700,
    xaxis_title_text="Recall",
    yaxis_title_text="Precision",
)

In [None]:
# Plot precision (y) against recall (x) for AVH, KS-test, LOF and TFDV.
fig = go.Figure()

fig.add_scatter(x=metrics_avh["recall"], y=metrics_avh["precision"], name="AVH")
fig.add_scatter(x=metrics_ks["recall"], y=metrics_ks["precision"], name="KS-test")
fig.add_scatter(x=metrics_lof["recall"], y=metrics_lof["precision"], name="LOF")
fig.add_scatter(x=metrics_tfdv["recall"], y=metrics_tfdv["precision"], name="TFDV")
# Axis titles added so the figure can be read without the surrounding code.
fig.update_layout(
    width=700,
    height=700,
    xaxis_title_text="Recall",
    yaxis_title_text="Precision",
)

In [None]:
# Plot precision (y) against recall (x) for AVH, KS-test, LOF, Health-ESN and TFDV.
# FIXME(review): `metrics_hs` is not assigned in any visible cell of this
# notebook; this cell raises NameError unless it is loaded beforehand.
fig = go.Figure()

fig.add_scatter(x=metrics_avh["recall"], y=metrics_avh["precision"], name="AVH")
fig.add_scatter(x=metrics_ks["recall"], y=metrics_ks["precision"], name="KS-test")
fig.add_scatter(x=metrics_lof["recall"], y=metrics_lof["precision"], name="LOF")
fig.add_scatter(x=metrics_hs["recall"], y=metrics_hs["precision"], name="Health-ESN")
fig.add_scatter(x=metrics_tfdv["recall"], y=metrics_tfdv["precision"], name="TFDV")
# Axis titles added so the figure can be read without the surrounding code.
fig.update_layout(
    width=700,
    height=700,
    xaxis_title_text="Recall",
    yaxis_title_text="Precision",
)

In [None]:
# Plot precision (y) against recall (x) for the KS-test and LOF metrics only.
# The traces are left unnamed and no layout is applied in the lines shown here.
fig = go.Figure()

fig.add_scatter(x=metrics_ks["recall"], y=metrics_ks["precision"])
fig.add_scatter(x=metrics_lof["recall"], y=metrics_lof["precision"])