In [1]:
# Reload imported modules (e.g. `avh`) automatically before each cell runs, so
# edits to their source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import uuid
import time
from typing import List

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.utils.validation import check_is_fitted
from statsmodels.tsa.stattools import adfuller
from IPython.display import Code, display
import inspect

import avh.utility_functions as utils
from avh.data_issues import (
    IssueTransfomer,
    NumericIssueTransformer,
    CategoricalIssueTransformer,
    IncreasedNulls,
    SchemaChange,
    DistributionChange,
    UnitChange,
    CasingChange,
    DQIssueDatasetGenerator,
    VolumeChangeUpsample,
    VolumeChangeDownsample,
    NumericPerturbation,
)

from avh.data_generation import (
    DataColumn,
    NumericColumn,
    CategoricalColumn,
    NormalNumericColumn,
    UniformNumericColumn,
    BetaNumericColumn,
    StaticCategoricalColumn,
    RandomCategoricalColumn,
    DataGenerationPipeline,
)

from avh.metrics import (
    Metric,
    SingleDistributionMetric,
    TwoDistributionMetric,
    RowCount,
    DistinctCount,
    DistinctRatio,
    CompleteRatio,
    Mean,
    Median,
    Range,
    Min,
    Max,
    Sum,
    MeanDigitLength,
    MeanPunctuationLength,
    MeanStringLength,
    EMD,
    KsDist,
    CohenD,
    KlDivergence,
    JsDivergence,
)

from avh.constraints import (
    Constraint,
    ConstantConstraint,
    ChebyshevConstraint,
    CantelliConstraint,
    CLTConstraint,
    ConjuctivDQProgram,
)

from avh.auto_validate_by_history import AVH

In [3]:
import jupyter_black

# Activate the jupyter_black extension for this notebook session.
jupyter_black.load()

In [4]:
# Seeded NumPy generator; it is handed to the data-generation pipeline below as
# its random_state.
rng = np.random.default_rng(42)

In [5]:
# Two-column synthetic data source for the AVH run below.
# Both columns are normal; the positional arguments are (name, mean, std).
# Column kinds tried earlier and currently switched off: a uniform int32
# "houses" column and random categorical "pets", "pets2" and "text" columns.
pipeline = DataGenerationPipeline(
    columns=[
        NormalNumericColumn("money", 300, 10),
        NormalNumericColumn("height", 18, 2),
    ],
    # Only "money" carries an issue spec. Similar IncreasedNulls specs for
    # "height" (p=0.99) and "pets" (p=0.5) are currently switched off.
    # NOTE(review): when and how the pipeline applies these issues is decided
    # inside avh.data_generation — confirm before assuming clean history.
    issues=[("money", [IncreasedNulls(p=0.9)])],
    random_state=rng,
)

In [6]:
# Metric space M: every metric class a constraint may be built on.
M = [
    RowCount, DistinctRatio, DistinctCount, CompleteRatio,
    Min, Max, Mean, Median, Sum, Range,
    EMD, JsDivergence, KlDivergence, KsDist, CohenD,
]

# Constraint estimators E. Their order does not matter: the Q space is the
# cartesian product of E and M.
E = [CLTConstraint, ChebyshevConstraint, CantelliConstraint]

# NOTE(review): the AVH(...) call in the next cell passes neither M nor E —
# confirm whether these lists are meant to be used there.

# History: 31 independently generated batches of 1000 rows each.
H = [pipeline.generate(1000) for _ in range(31)]

In [7]:
# AVH instance limited to the "money" column (columns=["money"]).
# NOTE(review): no metric or constraint lists are passed here, unlike the later
# AVH(M, E, ...) call — presumably AVH falls back to its own defaults; confirm.
avh = AVH(random_state=42, verbose=3, columns=["money"], n_jobs=-1)

In [8]:
# Build the constraint program from history H with a 0.05 FPR target; the
# result is displayed in the following cells.
PS = avh.generate(H, fpr_target=0.05)

creating D(C)...: 23it [00:02,  8.75it/s]
creating P(S) (with joblib)...: 100%|██████████| 1/1 [00:00<00:00,  5.06it/s]


2024-05-05 18:29:29|DEBUG|utility_functions.wrapper: avh.auto_validate_by_history._generate_parallel Took 2856.9200 ms to execute.
2024-05-05 18:29:29|DEBUG|utility_functions.wrapper: avh.auto_validate_by_history.generate Took 2857.1632 ms to execute.


In [9]:
PS

{'money': ChebyshevConstraint(28666.9549 <= Sum <= 31364.6545, FPR = 0.0051)
 CantelliConstraint(0.0000 <= EMD <= 10.9308, FPR = 0.0051), FPR = 0.010178}

In [12]:
PS

{'money': ChebyshevConstraint(29590.4123 <= Sum <= 30411.5838, FPR = 0.0400), FPR = 0.040000}

### Synthetic benchmark data generation

**Benchmark setup**  
columns:
- Numeric:
    - count: 1000
    - dtypes:
        - int: 30%
        - float: 70%
    - distributions:
        - uniform: 10%
        - normal: 10%
        - beta: 80%
- Categorical:
    - count: 1000

In [8]:
rng = np.random.default_rng(42)
n = 1000


def _random_numeric_column(rng, index):
    """Build one randomly parameterised numeric column named ``numeric_<index>``.

    The generator is consumed in a fixed order (dtype, distribution, scale,
    sign, then the distribution-specific parameters), so the same seed always
    produces the same sequence of column definitions.
    """
    name = f"numeric_{index}"
    # 30% int32 columns, 70% float32 columns.
    dtype = rng.choice([np.int32, np.float32], p=[0.3, 0.7])
    # 10% uniform, 10% normal, 80% beta.
    kind = rng.choice(["uniform", "normal", "beta"], p=[0.1, 0.1, 0.8])

    scale = rng.integers(1, 100000)
    # 20% of the columns are flipped onto the negative side.
    sign = rng.choice([-1, 1], p=[0.2, 0.8])

    if kind == "uniform":
        lower = rng.uniform(0, scale)
        upper = rng.uniform(lower, scale)
        if sign == -1:
            # Negating both bounds reverses their order, so swap them as well.
            lower, upper = upper * sign, lower * sign
        return UniformNumericColumn(name, lower, upper, dtype=dtype)

    if kind == "normal":
        mean = rng.uniform(0, scale) * sign
        std = rng.uniform(0, 10)
        return NormalNumericColumn(name, mean, std, dtype=dtype)

    # "beta" is the only remaining choice.
    alpha = rng.uniform(0.1, 10)
    beta = rng.uniform(0.1, 10)
    return BetaNumericColumn(name, alpha, beta, scale=scale * sign, dtype=dtype)


numeric_columns = [_random_numeric_column(rng, i) for i in range(n)]

pipeline = DataGenerationPipeline(numeric_columns, random_state=rng)

In [9]:
# Generate one 10-row batch and discard it, then display a second one.
# NOTE(review): presumably the throw-away call checks that consecutive batches
# differ or warms the pipeline up — confirm, or drop it.
_ = pipeline.generate(10)
pipeline.generate(10)

Unnamed: 0,numeric_0,numeric_1,numeric_2,numeric_3,numeric_4,numeric_5,numeric_6,numeric_7,numeric_8,numeric_9,...,numeric_990,numeric_991,numeric_992,numeric_993,numeric_994,numeric_995,numeric_996,numeric_997,numeric_998,numeric_999
0,5821.10498,-34834.832031,11238.599609,60716,-66111.226562,63082,-51763.378906,15023.037109,81354.015625,31916,...,40273,-18550.923828,-35820,-12002,4568,-2344,-16475.402344,925.599182,30427.248047,2100
1,10925.53125,-26888.644531,7864.008301,54449,-67443.421875,64605,-35486.953125,10741.845703,64389.609375,29698,...,39738,-22038.558594,-39154,-13365,33,-3469,-12637.672852,986.680237,14986.859375,24766
2,501.136169,-37594.253906,3560.182373,63364,-66828.265625,71020,-51227.363281,10221.790039,82744.335938,31576,...,39960,-21410.837891,-29715,-10334,19782,-2780,-46713.929688,1121.689941,24671.345703,6055
3,8450.057617,-43200.699219,21597.923828,50026,-35321.585938,58457,-57433.160156,14897.063477,72058.53125,26262,...,31435,-17814.777344,-42209,-10778,7125,-784,-33630.265625,317.320984,4732.359863,92
4,853.856262,-68572.703125,14830.930664,61871,-67020.4375,55112,-29222.072266,13150.475586,74611.960938,32657,...,40299,-27168.732422,-41292,-11566,9054,-1135,-49678.265625,894.154236,8903.21582,1332
5,5146.479492,-56280.953125,6069.199219,54402,-52296.609375,68768,-34627.914062,14542.632812,72402.140625,26402,...,40125,-26632.451172,-33060,-8514,4918,-5780,-38391.890625,241.083893,14546.629883,895
6,78.213112,-23492.179688,2075.531738,34764,-55568.453125,55883,-52167.539062,9609.795898,75772.273438,28715,...,40273,-17016.589844,-41198,-7225,1124,-1048,-42372.964844,559.309387,38492.242188,2613
7,2068.486816,-62414.558594,9188.073242,63151,-67206.203125,68311,-39544.320312,8298.933594,73562.179688,27038,...,39464,-21363.919922,-42758,-10456,8527,-4460,-52713.9375,347.82077,24417.640625,18088
8,2648.368652,-59181.84375,22179.609375,49566,-55144.070312,46833,-52097.671875,15849.641602,66682.640625,32410,...,38680,-27883.966797,-35883,-8601,1364,-227,-58724.40625,1006.673279,8738.956055,7082
9,228.171494,-29470.943359,9205.548828,68273,-64909.34375,70608,-45762.242188,16670.693359,71053.734375,31205,...,30016,-17459.476562,-36026,-10307,130,-2129,-56958.410156,1366.932861,39745.515625,10437


In [10]:
# Full benchmark history: 31 batches of 100,000 rows from the 1000-column pipeline.
# That is 3.1e9 cells in total, so expect this cell to be slow and memory-hungry.
H_FULL = [pipeline.generate(100000) for _ in range(31)]

In [11]:
import pickle

In [13]:
# Persist the generated history so it does not have to be regenerated.
# Only load this file back from a trusted location: unpickling executes code.
with open("bruh.pickle", "wb") as fh:
    pickle.dump(H_FULL, fh)

In [36]:
# Reduced metric space: the earlier list without EMD, JsDivergence,
# KlDivergence, KsDist and CohenD.
M = [
    RowCount, DistinctRatio, DistinctCount, CompleteRatio,
    Min, Max, Mean, Median, Sum, Range,
]

# Constraint estimators (Cantelli left out this time). Order is irrelevant:
# the Q space is the cartesian product of E and M.
E = [CLTConstraint, ChebyshevConstraint]

avh = AVH(M, E, random_state=42)

In [37]:
# Build the constraint program with a 0.01 FPR target.
# NOTE(review): the recorded progress bar reports 1000 columns, but the only
# visible assignment of `H` uses the two-column pipeline; the 1000-column
# history is stored in `H_FULL`. Confirm which history is intended here —
# as written this cell depends on kernel state not shown in the notebook.
PS = avh.generate(H, fpr_target=0.01)

creating D(C)...: 23it [51:58, 135.60s/it]
Generating P(S for columns...:   7%|▋         | 72/1000 [01:16<16:32,  1.07s/it]


KeyboardInterrupt: 

In [7]:
import time

In [12]:
def minmax(x, delay=1):
    """Return the spread ``x.max() - x.min()`` after an artificial pause.

    The pause stands in for an expensive per-row computation, which makes the
    wall-clock cost of applying this function row by row easy to observe.

    Parameters
    ----------
    x : object exposing ``.max()`` and ``.min()``
        For example a pandas Series (one DataFrame row) or a NumPy array.
    delay : float, default 1
        Seconds to sleep before computing the result. The default keeps the
        original one-second pause; pass 0 to skip it.

    Returns
    -------
    The difference between the largest and the smallest value of ``x``.
    """
    time.sleep(delay)
    return x.max() - x.min()

In [13]:
# NOTE(review): `df` is not assigned in any cell visible in this notebook. The
# displayed frame has "money" and "height" columns and 10 rows, so it was
# presumably produced by the two-column pipeline's generate(10) — confirm and
# add the assignment so the notebook survives Restart & Run All.
df

Unnamed: 0,money,height
0,306.025787,16.935593
1,307.398956,17.327097
2,323.848206,18.795792
3,293.234497,17.923433
4,303.093597,18.69458
5,308.189514,16.846575
6,298.945801,18.116465
7,302.219177,12.018182
8,313.597168,18.783092
9,294.416443,14.882791


In [14]:
%%time
# Baseline: pandas calls minmax once per row (axis=1), one after another, so
# 10 rows with a 1 s pause each take about 10 s of wall time.
df.apply(minmax, axis=1)

CPU times: user 4.56 ms, sys: 1.32 ms, total: 5.88 ms
Wall time: 10 s


0    289.090210
1    290.071869
2    305.052429
3    275.311066
4    284.399017
5    291.342926
6    280.829346
7    290.200989
8    294.814087
9    279.533661
dtype: float32

In [9]:
import dask.dataframe as dd

In [10]:
# Wrap the pandas frame in a Dask DataFrame split into 5 partitions, so that
# per-partition work can be scheduled independently.
ddf = dd.from_pandas(df, npartitions=5)

In [11]:
def minmax2(df):
    """Apply ``minmax`` to every row of ``df`` and return the per-row results.

    Written as a frame-level function so it can be handed to Dask's
    ``map_partitions``, which calls it once per partition.
    """
    per_row = df.apply(minmax, axis=1)
    return per_row

In [18]:
%%time
# Run minmax2 on each partition. meta=(None, "float32") declares the result as
# an unnamed float32 Series, so Dask does not have to infer the output type.
p = ddf.map_partitions(minmax2, meta=(None, "float32"))
# compute() triggers the actual work; with the 10 rows spread over 5
# partitions, the recorded wall time drops from about 10 s to about 2 s.
p.compute()

CPU times: user 11 ms, sys: 3.08 ms, total: 14.1 ms
Wall time: 2.02 s


0    289.090210
1    290.071869
2    305.052429
3    275.311066
4    284.399017
5    291.342926
6    280.829346
7    290.200989
8    294.814087
9    279.533661
dtype: float32