In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell is executed, so edits to imported source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import uuid
import time
from typing import List

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.utils.validation import check_is_fitted
from statsmodels.tsa.stattools import adfuller
from IPython.display import Code, display
import inspect

import avh.utility_functions as utils
from avh.data_issues import (
    IssueTransfomer,
    NumericIssueTransformer,
    CategoricalIssueTransformer,
    IncreasedNulls,
    SchemaChange,
    DistributionChange,
    UnitChange,
    CasingChange,
    DQIssueDatasetGenerator,
    VolumeChangeUpsample,
    VolumeChangeDownsample,
    NumericPerturbation,
    DistributionChangeV2,
    DistributionChangeV3,
)

from avh.data_generation import (
    DataColumn,
    NumericColumn,
    CategoricalColumn,
    NormalNumericColumn,
    UniformNumericColumn,
    BetaNumericColumn,
    StaticCategoricalColumn,
    RandomCategoricalColumn,
    DataGenerationPipeline,
)

from avh.metrics import (
    Metric,
    SingleDistributionMetric,
    TwoDistributionMetric,
    RowCount,
    DistinctCount,
    DistinctRatio,
    CompleteRatio,
    Mean,
    Median,
    Range,
    Min,
    Max,
    Sum,
    MeanDigitLength,
    MeanPunctuationLength,
    MeanStringLength,
    EMD,
    KsDist,
    CohenD,
    KlDivergence,
    JsDivergence,
)

from avh.constraints import (
    Constraint,
    ConstantConstraint,
    ChebyshevConstraint,
    CantelliConstraint,
    CLTConstraint,
    ConjuctivDQProgram,
)

from avh.auto_validate_by_history import AVH

In [3]:
import jupyter_black

# Activate the jupyter_black extension for this notebook session
# (presumably formats cells with Black on execution — confirm in the package docs).
jupyter_black.load()

In [4]:
# Seeded NumPy generator shared by the data-generation pipeline below, so the
# toy data set is reproducible across runs.
rng = np.random.default_rng(seed=42)

In [5]:
# Toy schema: two normal columns, one uniform int32 column and one random
# categorical column.
toy_columns = [
    NormalNumericColumn("money", 300, 10),
    NormalNumericColumn("height", 18, 2),
    UniformNumericColumn("houses", 30, 50, dtype=np.int32),
    RandomCategoricalColumn("pets", ["dog", "cat", "snail"]),
]

# Data-quality issues keyed by column name; only "money" has one configured.
toy_issues = [
    ("money", [IncreasedNulls(p=0.1)]),
]

pipeline = DataGenerationPipeline(
    columns=toy_columns,
    issues=toy_issues,
    random_state=rng,
)

In [6]:
# Metric space M: every metric class AVH is allowed to build constraints on.
M = [
    # counts and ratios
    RowCount, DistinctRatio, DistinctCount, CompleteRatio,
    # summary statistics
    Min, Max, Mean, Median, Sum, Range,
    # distance / divergence style metrics
    EMD, JsDivergence, KlDivergence, KsDist, CohenD,
]

# Constraint estimators E. Their order is irrelevant: the Q space is built
# from the cartesian product of E and M.
E = [CLTConstraint, ChebyshevConstraint, CantelliConstraint]

# History H: 31 independently generated batches from the toy pipeline.
# NOTE(review): the two arguments are presumably the mean row count and its
# spread — confirm against DataGenerationPipeline.generate_normal.
H = [pipeline.generate_normal(100000, 10) for _ in range(31)]

In [7]:
# All three DistributionChange implementations receive identical settings so
# that their outputs and runtimes can be compared like for like.
issue_kwargs = dict(p=0.5, take_last=True, random_state=42)

issue = DistributionChange(**issue_kwargs)
issuev2 = DistributionChangeV2(**issue_kwargs)
issuev3 = DistributionChangeV3(**issue_kwargs)

In [15]:
%%time
v1 = issue.fit_transform(H_FULL[-1])

CPU times: user 9.68 s, sys: 118 ms, total: 9.8 s
Wall time: 9.85 s


In [37]:
# %%time
# v2 = issuev2.fit_transform(H[-1])

In [17]:
%%time
v3 = issuev3.fit_transform(H_FULL[-1])

CPU times: user 15.6 s, sys: 537 ms, total: 16.2 s
Wall time: 2.46 s


In [39]:
# Display the frame returned by the baseline DistributionChange transformer.
v1

Unnamed: 0,height,houses,pets,money
0,17.990353,39,dog,300.007935
1,17.990496,39,dog,300.007965
2,17.990515,39,dog,300.008850
3,17.990595,39,dog,300.009308
4,17.990631,39,dog,300.009430
...,...,...,...,...
99994,26.078815,49,snail,338.093079
99995,26.210125,49,snail,342.582733
99996,26.423977,49,snail,342.686188
99997,26.635475,49,snail,342.765228


In [40]:
# Display the frame returned by DistributionChangeV3 for visual comparison.
v3

Unnamed: 0,height,houses,pets,money
0,17.990353,39,dog,300.007935
1,17.990496,39,dog,300.007965
2,17.990515,39,dog,300.008850
3,17.990595,39,dog,300.009308
4,17.990631,39,dog,300.009430
...,...,...,...,...
99994,26.078815,49,snail,338.093079
99995,26.210125,49,snail,342.582733
99996,26.423977,49,snail,342.686188
99997,26.635475,49,snail,342.765228


In [19]:
# Regression check: V3 must reproduce the V1 output exactly
# (DataFrame.equals compares shape and elements).
v1.equals(v3)

True

In [17]:
# NOTE(review): the only assignment to `v2` in the visible notebook is
# commented out, so this cell raises NameError on a fresh kernel — re-enable
# that assignment or drop this cell.
v2

Unnamed: 0,numeric_0,numeric_1,numeric_2,numeric_3,numeric_4,numeric_5,numeric_6,numeric_7,numeric_8,numeric_9,...,numeric_990,numeric_991,numeric_992,numeric_993,numeric_994,numeric_995,numeric_996,numeric_997,numeric_998,numeric_999
0,0.053327,-84432.109375,55.313244,8736,-67792.000000,11455,-79503.015625,2599.174072,9059.712891,25761,...,4266,-34221.335938,-43313,-17846,0,-9459,-73845.523438,0.359601,381.975586,0
1,0.054230,-84418.062500,89.205055,9017,-67792.000000,12130,-79462.835938,2625.439697,12931.257812,25761,...,7761,-33815.214844,-43313,-17746,0,-9099,-73806.671875,0.546967,491.534515,0
2,0.116228,-84132.828125,97.113022,9713,-67792.000000,14428,-79433.789062,2633.321777,14370.169922,25761,...,8842,-33648.117188,-43312,-17664,0,-9097,-73606.750000,0.553102,786.924255,0
3,0.138798,-83650.843750,127.328445,11962,-67792.000000,15439,-79402.968750,2682.683594,14685.614258,25761,...,8973,-33642.355469,-43312,-17315,0,-8991,-73501.164062,0.812114,830.549255,0
4,0.283126,-83237.507812,133.694061,13023,-67792.000000,16356,-79302.656250,2732.687988,15610.158203,25761,...,9353,-33637.082031,-43312,-17285,0,-8984,-73442.898438,0.820959,901.833069,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107247,503.928070,-65310.804688,4404.182129,40752,-67632.390625,45462,-69736.117188,9442.489258,54259.707031,26680,...,35895,-27781.166016,-42286,-12702,279,-5393,-54774.566406,170.221481,11350.974609,19
107248,504.010620,-65310.761719,4404.510254,40753,-67632.367188,45462,-69735.828125,9442.504883,54260.531250,26681,...,35896,-27780.759766,-42286,-12702,279,-5393,-54773.859375,170.234299,11351.440430,19
107249,504.045380,-65310.519531,4404.707520,40753,-67632.343750,45464,-69735.617188,9442.945312,54261.417969,26681,...,35896,-27780.707031,-42286,-12701,280,-5393,-54773.687500,170.254623,11351.444336,19
107250,0.053327,-84432.109375,55.313244,8736,-67792.000000,11455,-79503.015625,2599.174072,9059.712891,25761,...,4266,-34221.335938,-43313,-17846,0,-9459,-73845.523438,0.359601,381.975586,0


In [36]:
# NOTE(review): the only assignment to `v2` in the visible notebook is
# commented out, so this cell raises NameError on a fresh kernel — re-enable
# that assignment or drop this cell.
v2

Unnamed: 0,height,houses,pets,money
0,9.667356,30,cat,260.409363
1,9.814260,30,cat,260.885437
2,9.963401,30,cat,261.014618
3,10.126824,30,cat,261.351440
4,10.140945,30,cat,262.117249
...,...,...,...,...
99994,10.140945,30,cat,287.166504
99995,10.378928,30,cat,287.167297
99996,10.407301,30,cat,287.167480
99997,10.485166,30,cat,287.167999


In [7]:
# Build the AVH generator over metric space M and estimators E, limited to the
# "money" column via `columns`; verbose=6 presumably enables debug-level
# logging — confirm against the AVH signature.
avh = AVH(M, E, random_state=42, verbose=6, columns=["money"])

In [8]:
# Derive the per-column DQ program from the history H with an FPR target of 0.05.
PS = avh.generate(H, fpr_target=0.05)

creating D(C)...: 23it [00:00, 50.89it/s] 
Generating P(S for columns...:   0%|          | 0/1 [00:00<?, ?it/s]

2024-05-01 13:59:32|DEBUG|utility_functions.wrapper: avh.auto_validate_by_history._generate_constraint_space Took 668.4990 ms to execute.


Generating P(S for columns...: 100%|██████████| 1/1 [00:05<00:00,  5.41s/it]

2024-05-01 13:59:37|DEBUG|utility_functions.wrapper: avh.auto_validate_by_history._precalculate_constraint_recalls_fast Took 4725.4651 ms to execute.
2024-05-01 13:59:37|DEBUG|utility_functions.wrapper: avh.auto_validate_by_history._find_optimal_singleton_conjuctive_dq_program Took 0.1028 ms to execute.
2024-05-01 13:59:37|DEBUG|utility_functions.wrapper: avh.auto_validate_by_history._find_optimal_conjunctive_dq_program Took 0.3271 ms to execute.
2024-05-01 13:59:37|DEBUG|utility_functions.wrapper: avh.auto_validate_by_history._generate_conjuctive_dq_program Took 4730.6402 ms to execute.
2024-05-01 13:59:37|DEBUG|utility_functions.wrapper: avh.auto_validate_by_history.generate Took 5881.4399 ms to execute.





In [9]:
# Inspect the generated program: a mapping from column name to its constraint(s).
PS

{'money': ChebyshevConstraint(2686488.8684 <= Sum <= 2714968.6316, FPR = 0.0400), FPR = 0.040000}

### Synthetic benchmark data generation

**Benchmark setup**  
columns:
- Numeric:
    - count: 1000
    - dtypes:
        - int: 30%
        - float: 70%
    - distributions:
        - uniform: 10%
        - normal: 10%
        - beta: 80%
- Categorical:
    - count: 1000

In [12]:
rng = np.random.default_rng(42)
n = 1000


def _make_numeric_column(rng, column_name):
    """Draw one randomly parameterised numeric column definition.

    The random draws happen in a fixed order (dtype, distribution, scale,
    sign, then the distribution-specific parameters), so a given generator
    state always yields the same column.

    Parameters
    ----------
    rng : numpy.random.Generator
        Source of randomness; it is advanced by this call.
    column_name : str
        Name given to the generated column.

    Returns
    -------
    UniformNumericColumn, NormalNumericColumn or BetaNumericColumn
    """
    dtype = rng.choice([np.int32, np.float32], p=[0.3, 0.7])
    distribution = rng.choice(["uniform", "normal", "beta"], p=[0.1, 0.1, 0.8])

    scale = rng.integers(1, 100000)
    sign = rng.choice([-1, 1], p=[0.2, 0.8])

    if distribution == "uniform":
        low = rng.uniform(0, scale)
        high = rng.uniform(low, scale)
        if sign == -1:
            # Mirror the interval onto the negative axis while keeping low <= high.
            low, high = high * sign, low * sign
        return UniformNumericColumn(column_name, low, high, dtype=dtype)

    if distribution == "normal":
        mean = rng.uniform(0, scale) * sign
        std = rng.uniform(0, 10)
        return NormalNumericColumn(column_name, mean, std, dtype=dtype)

    if distribution == "beta":
        alfa = rng.uniform(0.1, 10)
        beta = rng.uniform(0.1, 10)
        return BetaNumericColumn(
            column_name, alfa, beta, scale=scale * sign, dtype=dtype
        )


# One definition per benchmark column, named numeric_0 ... numeric_{n-1}.
numeric_columns = [_make_numeric_column(rng, f"numeric_{i}") for i in range(n)]

pipeline = DataGenerationPipeline(numeric_columns, random_state=rng)

In [10]:
# Generate two 10-row batches and display only the second; the first result is
# discarded. NOTE(review): presumably a sanity check that consecutive calls
# draw fresh data — confirm. Be aware that binding `_` shadows IPython's
# "last output" variable for the rest of the session.
_ = pipeline.generate(10)
pipeline.generate(10)

Unnamed: 0,numeric_0,numeric_1,numeric_2,numeric_3,numeric_4,numeric_5,numeric_6,numeric_7,numeric_8,numeric_9,...,numeric_990,numeric_991,numeric_992,numeric_993,numeric_994,numeric_995,numeric_996,numeric_997,numeric_998,numeric_999
0,5821.10498,-34834.832031,11238.599609,60716,-66111.226562,63082,-51763.378906,15023.037109,81354.015625,31916,...,40273,-18550.923828,-35820,-12002,4568,-2344,-16475.402344,925.599182,30427.248047,2100
1,10925.53125,-26888.644531,7864.008301,54449,-67443.421875,64605,-35486.953125,10741.845703,64389.609375,29698,...,39738,-22038.558594,-39154,-13365,33,-3469,-12637.672852,986.680237,14986.859375,24766
2,501.136169,-37594.253906,3560.182373,63364,-66828.265625,71020,-51227.363281,10221.790039,82744.335938,31576,...,39960,-21410.837891,-29715,-10334,19782,-2780,-46713.929688,1121.689941,24671.345703,6055
3,8450.057617,-43200.699219,21597.923828,50026,-35321.585938,58457,-57433.160156,14897.063477,72058.53125,26262,...,31435,-17814.777344,-42209,-10778,7125,-784,-33630.265625,317.320984,4732.359863,92
4,853.856262,-68572.703125,14830.930664,61871,-67020.4375,55112,-29222.072266,13150.475586,74611.960938,32657,...,40299,-27168.732422,-41292,-11566,9054,-1135,-49678.265625,894.154236,8903.21582,1332
5,5146.479492,-56280.953125,6069.199219,54402,-52296.609375,68768,-34627.914062,14542.632812,72402.140625,26402,...,40125,-26632.451172,-33060,-8514,4918,-5780,-38391.890625,241.083893,14546.629883,895
6,78.213112,-23492.179688,2075.531738,34764,-55568.453125,55883,-52167.539062,9609.795898,75772.273438,28715,...,40273,-17016.589844,-41198,-7225,1124,-1048,-42372.964844,559.309387,38492.242188,2613
7,2068.486816,-62414.558594,9188.073242,63151,-67206.203125,68311,-39544.320312,8298.933594,73562.179688,27038,...,39464,-21363.919922,-42758,-10456,8527,-4460,-52713.9375,347.82077,24417.640625,18088
8,2648.368652,-59181.84375,22179.609375,49566,-55144.070312,46833,-52097.671875,15849.641602,66682.640625,32410,...,38680,-27883.966797,-35883,-8601,1364,-227,-58724.40625,1006.673279,8738.956055,7082
9,228.171494,-29470.943359,9205.548828,68273,-64909.34375,70608,-45762.242188,16670.693359,71053.734375,31205,...,30016,-17459.476562,-36026,-10307,130,-2129,-56958.410156,1366.932861,39745.515625,10437


In [13]:
# Benchmark history: 31 independently generated batches from the numeric
# benchmark pipeline.
H_FULL = [pipeline.generate_normal(100000, 5000) for _ in range(31)]

In [36]:
# Metric space M for the benchmark run.
M = [
    # counts and ratios
    RowCount, DistinctRatio, DistinctCount, CompleteRatio,
    # summary statistics
    Min, Max, Mean, Median, Sum, Range,
]

# Constraint estimators E. Their order is irrelevant: the Q space is built
# from the cartesian product of E and M.
E = [
    CLTConstraint,
    ChebyshevConstraint,
]

avh = AVH(M, E, random_state=42)

In [37]:
# Fit on the benchmark history H_FULL. The previous argument `H` is the
# 4-column toy history defined near the top of the notebook, whereas the
# recorded progress output of this cell iterates over 1000 columns.
# Expect a long runtime: the recorded run was interrupted after ~50 minutes.
PS = avh.generate(H_FULL, fpr_target=0.01)

creating D(C)...: 23it [51:58, 135.60s/it]
Generating P(S for columns...:   7%|▋         | 72/1000 [01:16<16:32,  1.07s/it]


KeyboardInterrupt: 

In [7]:
import time

In [12]:
def minmax(x, delay=1):
    """Return the range (max - min) of ``x`` after an artificial delay.

    The delay simulates an expensive per-row computation so that sequential
    and partitioned execution can be compared by wall-clock time.

    Parameters
    ----------
    x : object exposing ``max()`` and ``min()``
        For example a pandas Series or a NumPy array.
    delay : float, default 1
        Seconds to sleep before computing the result. Values <= 0 skip the
        sleep entirely.

    Returns
    -------
    The difference ``x.max() - x.min()``.
    """
    if delay > 0:
        time.sleep(delay)
    return x.max() - x.min()

In [13]:
# NOTE(review): `df` is not assigned anywhere in the visible notebook, so this
# cell depends on hidden kernel state — add the cell that builds it (the output
# shows 10 rows with columns "money" and "height").
df

Unnamed: 0,money,height
0,306.025787,16.935593
1,307.398956,17.327097
2,323.848206,18.795792
3,293.234497,17.923433
4,303.093597,18.69458
5,308.189514,16.846575
6,298.945801,18.116465
7,302.219177,12.018182
8,313.597168,18.783092
9,294.416443,14.882791


In [14]:
%%time
# Sequential baseline: minmax is invoked once per row (axis=1), so the wall
# time grows linearly with the number of rows.
df.apply(minmax, axis=1)

CPU times: user 4.56 ms, sys: 1.32 ms, total: 5.88 ms
Wall time: 10 s


0    289.090210
1    290.071869
2    305.052429
3    275.311066
4    284.399017
5    291.342926
6    280.829346
7    290.200989
8    294.814087
9    279.533661
dtype: float32

In [9]:
import dask.dataframe as dd

In [10]:
# Wrap the pandas frame in a Dask DataFrame split into 5 partitions.
ddf = dd.from_pandas(df, npartitions=5)

In [11]:
def minmax2(df):
    """Apply ``minmax`` to every row of ``df`` and return the per-row results.

    Intended as the per-partition function for ``map_partitions``: each
    partition arrives as a regular pandas DataFrame.
    """
    row_ranges = df.apply(minmax, axis="columns")
    return row_ranges

In [18]:
%%time
# Run minmax2 on each partition; `meta` declares the result as an unnamed
# float32 series so Dask does not have to infer it.
p = ddf.map_partitions(minmax2, meta=(None, "float32"))
# Dask is lazy: compute() triggers the actual execution and returns the
# pandas result.
p.compute()

CPU times: user 11 ms, sys: 3.08 ms, total: 14.1 ms
Wall time: 2.02 s


0    289.090210
1    290.071869
2    305.052429
3    275.311066
4    284.399017
5    291.342926
6    280.829346
7    290.200989
8    294.814087
9    279.533661
dtype: float32