In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp

import timeit
import pickle
from tqdm import tqdm

import src.coverage as cov
import src.occurence_estimation as occ

In [None]:
# Re-execute the already imported src.coverage module so that edits made to the
# file on disk take effect in this kernel without a restart.
from importlib import reload
reload(cov)

In [None]:
# Read the encoded ACSIncome table, pass it through occ.factorize_data and
# write the result to the "_num" CSV (a later cell reads that file back).
# `acs_income` is rebound from the raw to the factorized frame, and `mapping`
# is overwritten by every later factorize cell.
acs_income = pd.read_csv("data/df_ACSIncome_enc.csv")
acs_income, mapping = occ.factorize_data(acs_income)
acs_income.to_csv("data/df_ACSIncome_enc_num.csv", index=False)

In [18]:
# Read the encoded diamonds (BlueNile) table, pass it through
# occ.factorize_data and write the result to the "_num" CSV.
# `mapping` from the previous factorize cell is overwritten here.
bluenile = pd.read_csv("data/df_diamonds_enc.csv")
bluenile, mapping = occ.factorize_data(bluenile)
bluenile.to_csv("data/df_diamonds_enc_num.csv", index=False)

100%|██████████| 7/7 [00:01<00:00,  6.61it/s]


In [288]:
# Load the numerically encoded UK road-accident table and display the value
# returned by occ.get_cardinalities for it.
uk_roadsafety = pd.read_csv("data/df_uk_road_accident_enc_num.csv")
occ.get_cardinalities(uk_roadsafety)

[6, 10, 7, 7, 10, 3, 3, 6, 13]

### Compare Coverage Algorithms

Runtime Comparison

In [None]:
# Load the cached row-based frequency output for UKRoadSafety and keep the
# (freq_counts, keys) pair stored under the last key of the pickled dict.
with open("results_server/UKRoadSafety_NUM_rowbased_results_freq_output.pkl", "rb") as f:
    rowbased_output = pickle.load(f)
freq_counts, keys = list(rowbased_output.values())[-1]
categories = occ.get_categories(uk_roadsafety)

In [None]:
# Time two independent single runs (repeat=2, number=1) of
# cov.freqwalk_frequency_weight at threshold 1 and report the mean and the
# standard deviation of the measured seconds.
mups_pwalk_runtime = timeit.repeat(lambda: cov.freqwalk_frequency_weight(categories, freq_counts, threshold=1), repeat=2, number=1)
mups_pwalk_runtime_mean = np.mean(mups_pwalk_runtime)
mups_pwalk_runtime_std = np.std(mups_pwalk_runtime)
print(f"Mean runtime: {mups_pwalk_runtime_mean} seconds with std: {mups_pwalk_runtime_std} seconds")

In [None]:
# Pick up on-disk edits to src.coverage again (requires the earlier cell that
# imports `reload` to have been run).
reload(cov)

In [47]:
from jpype import *
import time
def get_coverage_java(threshold, df):
    """Find maximal uncovered patterns (MUPs) with the CoverageJava hybrid search.

    Non-object columns are first turned into string categories, every column
    with 2 to 10 distinct values is label-encoded and written to ``temp.csv``,
    and the Java class ``search.HybridSearch`` is run on that file via JPype.

    Side effects: ``df`` is modified in place (pass a copy to keep the
    original), a JVM is started if none is running, ``temp.csv`` is
    (over)written in the working directory and progress is printed.

    Args:
        threshold: Value forwarded to ``findMaxUncoveredPatternSet``.
        df (pd.DataFrame): Data to analyse.

    Returns:
        tuple: ``(seconds, mups, valid_cols)`` - wall-clock seconds spent on
        building the Java data set, searching and stringifying the result,
        the string form of every returned pattern, and the names of the
        columns that were handed to the search.
    """
    # Explicit names instead of relying on the notebook-level star import.
    from jpype import JClass, getDefaultJVMPath, isJVMStarted, startJVM

    dff = df.dtypes

    print(getDefaultJVMPath())
    # get classes
    class_path = "CoverageJava/target/classes/"
    print(class_path)
    cpopt = "-Djava.class.path=%s" % (class_path)
    if not isJVMStarted():
        startJVM(getDefaultJVMPath(), "-ea", cpopt)

    dataset = JClass("io.DataSet")
    hybrid = JClass("search.HybridSearch")

    valid_cols, cardinality = [], []

    # Turn every non-object column into string categories.
    for col in list(df):
        if dff[col] != object:
            cur_col = list(df[col])
            dimension = len(set(cur_col))
            if dimension <= 10:
                # Element-wise conversion. ``str(df[col])`` would assign the
                # repr of the whole Series to every row, collapsing the column
                # to a single value that the cardinality filter then drops.
                df[col] = df[col].astype(str)
            else:
                # NOTE(review): the number of bins equals the number of
                # distinct values, so the column usually stays above the
                # 10-value limit below - confirm this is intended.
                df[col] = [str(bucket) for bucket in pd.cut(cur_col, dimension)]

    for col in list(df):
        # restrict cardinality to 2..10 distinct values
        distinct_values = set(list(df[col]))

        if 2 <= len(distinct_values) <= 10:
            valid_cols.append(col)
            cardinality.append(len(distinct_values))

            # Label-encode the column: each distinct value becomes its rank
            # in sorted order (this is not a one-hot encoding).
            labels, _ = pd.factorize(df[col], sort=True)
            df[col] = labels

    temp = df[valid_cols].astype(str)
    temp.to_csv("temp.csv", index=False)

    t_ = time.time()
    dataset1 = dataset(
        "temp.csv", cardinality, [i for i in range(len(valid_cols))], temp.shape[0]
    )

    hybrid1 = hybrid(dataset1)
    a = hybrid1.findMaxUncoveredPatternSet(threshold)

    mups = [i.getStr() for i in a]
    t = time.time() - t_
    print("time: ", t)
    print("mups: ", len(mups))

    return t, mups, valid_cols

In [None]:
# NOTE(review): these two lists are not referenced anywhere else in the
# visible notebook; the test functions are called with explicit arguments.
test_datasets = [bluenile]
test_names = ["BlueNile"]

def cov_threshold_test(test_dataset, test_name):
    """Time every coverage method on ``test_dataset`` for a range of thresholds.

    Relative thresholds are converted to absolute row counts; relative
    thresholds that yield fewer than 2 rows are skipped. The run with absolute
    threshold 1 is recorded under key ``0``.

    Args:
        test_dataset (pd.DataFrame): Data set to analyse.
        test_name (str): Name used to locate
            ``results/{test_name}_rowbased_results_freq_output_2023_10_17.pkl``
            and to build the key of the result dict.

    Returns:
        dict: ``{f"{test_name}_{k}": {method: {threshold_key: seconds}}}``
        where ``k`` is the last key of the pickled row-based output and
        ``seconds`` is the mean over ``repeat`` single runs (CoverageJava:
        the time reported by ``get_coverage_java``).
    """
    repeat = 1
    cov_runtime_dict = {}
    for t_ in [1, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]:
        if t_ == 1:
            # absolute threshold 1 is stored under key 0
            t_ = 0
            t = 1
        else:
            t = int(t_ * test_dataset.shape[0])
            if t < 2:
                continue
        print(f"{test_name} - threshold:", t)
        # Kept inside the loop as in the original so every threshold starts
        # from freshly unpickled objects; hoist it if the coverage routines
        # are known not to mutate their inputs.
        with open(
            f"results/{test_name}_rowbased_results_freq_output_2023_10_17.pkl", "rb"
        ) as f:
            rowbased_output = pickle.load(f)
        k, v = list(rowbased_output.items())[-1]
        runtimes = cov_runtime_dict.setdefault(f"{test_name}_{k}", dict())
        freq_counts, keys = v
        categories = occ.get_categories(test_dataset)

        # CoverageJava works on a copy because get_coverage_java re-encodes
        # its input in place. It returns (seconds, mups, valid_cols); the
        # original two-name unpacking raised ValueError.
        df = test_dataset.copy()
        cov_java_time, _cov_java_mups, _valid_cols = get_coverage_java(t, df)
        runtimes.setdefault("CoverageJava", dict()).setdefault(t_, cov_java_time)

        # The Python implementations are timed identically; insertion order
        # matches the original (all combs, search based, pwalk).
        timed_methods = (
            (
                "Baseline (All Combs)",
                lambda: cov.baseline_coverage_with_keys_all_combs(
                    keys, categories, freq_counts, max_level=None, threshold=t
                ),
            ),
            (
                "Baseline (MUPs)",
                lambda: cov.baseline_coverage_with_keys_searchbased(
                    keys, categories, freq_counts, threshold=t
                ),
            ),
            (
                "MUPs pwalk",
                lambda: cov.freqwalk_frequency_weight(
                    categories, freq_counts, threshold=t
                ),
            ),
        )
        for method, run in timed_methods:
            mean_runtime = np.mean(timeit.repeat(run, repeat=repeat, number=1))
            runtimes.setdefault(method, dict()).setdefault(t_, mean_runtime)
    return cov_runtime_dict

In [None]:
# Run the threshold experiment on the BlueNile data.
cov_runtime_dict = cov_threshold_test(bluenile, "BlueNile")

In [26]:
# Display the runtimes collected by the threshold experiment.
cov_runtime_dict

{'BlueNile_129194': {'CoverageJava': {0: 3.6072330474853516,
   0.0001: 1.601257085800171,
   0.001: 0.6001029014587402,
   0.01: 0.11608290672302246,
   0.1: 0.12131905555725098},
  'Baseline (All Combs)': {0: 0.623004193068482,
   0.0001: 7.3784675599308684,
   0.001: 7.536592236952856,
   0.01: 7.645518705016002,
   0.1: 7.490900462958962},
  'Baseline (MUPs)': {0: 6.055784592987038,
   0.0001: 6.800910323043354,
   0.001: 6.222628304036334,
   0.01: 6.876512652030215,
   0.1: 6.364523399039172},
  'MUPs pwalk': {0: 0.002110998029820621,
   0.0001: 0.0026702970499172807,
   0.001: 0.002317264094017446,
   0.01: 0.0015551440883427858,
   0.1: 0.0016591920284554362}}}

In [29]:
def cov_combsize_test(test_dataset, test_name):
    """Time every coverage method once per entry of the cached row-based output.

    The threshold is fixed at 1. Each entry of the pickled dict supplies its
    own ``(freq_counts, keys)`` pair for the Python implementations.

    Args:
        test_dataset (pd.DataFrame): Data set to analyse.
        test_name (str): Name used to locate
            ``results_server/{test_name}_rowbased_results_freq_output.pkl``
            and as the top-level key of the result.

    Returns:
        dict: ``{test_name: {method: {k: seconds}}}`` where ``k`` iterates
        over the keys of the pickled row-based output and ``seconds`` is the
        mean over ``repeat`` single runs (CoverageJava: the time reported by
        ``get_coverage_java``).
    """
    cov_combsize_dict = {}
    repeat = 1
    t = 1
    with open(
        f"results_server/{test_name}_rowbased_results_freq_output.pkl", "rb"
    ) as f:
        rowbased_output = pickle.load(f)
    for k, v in rowbased_output.items():
        # The original initialised the global ``cov_runtime_dict`` here, so
        # the look-ups on ``cov_combsize_dict`` below raised KeyError.
        runtimes = cov_combsize_dict.setdefault(f"{test_name}", dict())
        freq_counts, keys = v
        categories = occ.get_categories(test_dataset)

        # NOTE(review): the CoverageJava run receives the full data set and
        # does not depend on ``k``/``v`` - confirm this is intended.
        # get_coverage_java returns (seconds, mups, valid_cols); the original
        # two-name unpacking raised ValueError.
        df = test_dataset.copy()
        cov_java_time, _cov_java_mups, _valid_cols = get_coverage_java(t, df)
        runtimes.setdefault("CoverageJava", dict()).setdefault(k, cov_java_time)

        # The Python implementations are timed identically; insertion order
        # matches the original (all combs, search based, pwalk).
        timed_methods = (
            (
                "Baseline (All Combs)",
                lambda: cov.baseline_coverage_with_keys_all_combs(
                    keys, categories, freq_counts, max_level=None, threshold=t
                ),
            ),
            (
                "Baseline (MUPs)",
                lambda: cov.baseline_coverage_with_keys_searchbased(
                    keys, categories, freq_counts, threshold=t
                ),
            ),
            (
                "MUPs pwalk",
                lambda: cov.freqwalk_frequency_weight(
                    categories, freq_counts, threshold=t
                ),
            ),
        )
        for method, run in timed_methods:
            mean_runtime = np.mean(timeit.repeat(run, repeat=repeat, number=1))
            runtimes.setdefault(method, dict()).setdefault(k, mean_runtime)
    return cov_combsize_dict

In [None]:
# Store the result under its own name: assigning it to `cov_combsize_test`
# replaced the function with a dict, so the cell could not be run twice.
cov_combsize_dict = cov_combsize_test(bluenile, "BlueNile")

Load results

In [None]:
# Load the pickled threshold-runtime results for UKRoadSafety.
# pickle.load can execute arbitrary code: only open result files produced by
# this project.
with open("results/UKRoadSafety_cov_threshold_runtime_dict_2023-10-17.pkl", "rb") as f:
    cov_runtime_dict_uk = pickle.load(f)

In [None]:
# Tabulate the loaded UKRoadSafety results and display the frame.
cov_runtime_uk_df = pd.DataFrame(cov_runtime_dict_uk)
cov_runtime_uk_df

In [None]:
# Load the pickled threshold-runtime results for BlueNile (server run).
with open(f"results_server/BlueNile_cov_threshold_runtime_dict_2023-10-17.pkl", "rb") as f:
    cov_runtime_dict_bn = pickle.load(f)

In [None]:
# Tabulate the loaded BlueNile results and display the frame.
cov_runtime_bn_df = pd.DataFrame(cov_runtime_dict_bn)
cov_runtime_bn_df

In [None]:
def visualize_cov_t(df, t):
    """Plot the value stored under key ``t`` in every cell of ``df``.

    Args:
        df (pd.DataFrame): Frame whose cells are subscriptable by ``t``
            (e.g. dicts mapping a threshold key to a runtime).
        t: Key looked up in every cell; a missing key raises ``KeyError``.
    """
    # Element-wise look-up, column by column. Series.map avoids
    # DataFrame.applymap, which is deprecated since pandas 2.1.
    cov_t_df = df.apply(lambda column: column.map(lambda x: x[t]))
    fig = px.line(cov_t_df, title=f"Coverage runtime for threshold {t}", labels={"index": "Dataset", "value": "Runtime (s)"})
    fig.show()

# NOTE(review): cov_threshold_test records the threshold-1 run under key 0
# (see the displayed dict), so looking up key 1 presumably raises KeyError -
# confirm which key is meant.
visualize_cov_t(pd.DataFrame(cov_runtime_dict), 1)

In [None]:
# Select the per-method runtimes of one run; the key is hard-coded and must
# match a key produced by cov_threshold_test.
max_runtime = cov_runtime_dict["BlueNile_129194"]

In [None]:
def visualize_cov_runtime_for_every_t(max_runtime):
    """Show one runtime-over-threshold line per coverage method.

    Args:
        max_runtime (dict): ``{method: {threshold_key: runtime}}``. Every
            threshold key other than 1 is multiplied by the row count of the
            notebook-level ``bluenile`` frame before plotting.
    """
    palette = px.colors.qualitative.Plotly
    n_rows = bluenile.shape[0]
    fig = go.Figure()
    for idx, (method, runtimes) in enumerate(max_runtime.items()):
        thresholds = [k if k == 1 else k * n_rows for k in runtimes.keys()]
        trace = go.Scatter(
            x=thresholds,
            y=list(runtimes.values()),
            name=method,
            mode='lines+markers',
            line=dict(color=palette[idx]),
        )
        fig.add_trace(trace)
    fig.update_layout(
        title="Coverage runtime for every threshold",
        xaxis_title="Threshold",
        yaxis_title="Runtime (s)",
    )
    fig.show()

In [None]:
# Plot the runtimes selected in `max_runtime`.
visualize_cov_runtime_for_every_t(max_runtime)

In [None]:
# Regroup the results by method: {method: {int(suffix after "_" in the run key): entry stored under key 1}}.
# NOTE(review): cov_threshold_test records the threshold-1 run under key 0,
# so the `[1]` look-up presumably fails on its output - confirm.
cov_max_t_dict = {}
for size in cov_runtime_dict.keys():
    s = size.split("_")[1]
    for method in cov_runtime_dict[size].keys():
        cov_max_t_dict.setdefault(method, dict()).setdefault(int(s), cov_runtime_dict[size][method][1])
    

In [59]:
def visualize_cov_runtime_for_every_dimension(cov_max_t_dict):
    """Show one runtime-over-dimension line per coverage method.

    Args:
        cov_max_t_dict (dict): ``{method: {dimension: entry}}``; the first
            element of every ``entry`` is plotted on the y axis.
    """
    palette = px.colors.qualitative.Plotly
    fig = go.Figure()
    for idx, (method, per_dimension) in enumerate(cov_max_t_dict.items()):
        trace = go.Scatter(
            x=list(per_dimension.keys()),
            y=[entry[0] for entry in per_dimension.values()],
            name=method,
            mode='lines+markers',
            line=dict(color=palette[idx]),
        )
        fig.add_trace(trace)
    fig.update_layout(
        title="Coverage runtime for every combination size (t=1)",
        xaxis_title="Dimension",
        yaxis_title="Runtime (s)",
    )
    fig.show()
    

In [None]:
# Plot the regrouped runtimes.
visualize_cov_runtime_for_every_dimension(cov_max_t_dict)

#### Directly compare to CoverageJava for BlueNile data

In [None]:
# Input for the direct comparison with CoverageJava. The commented lines are
# an alternative source (the categorical BlueNile CSV without its id column).
# data = pd.read_csv('CoverageJava/data/BlueNile_categorical.csv')
# data = data.drop(columns=['id'])
data = pd.read_csv("temp_input.csv")

In [None]:
def factorize_data(df_raw):
    """Prefix every value with the positional index of its column.

    Each cell becomes the string ``"<column index>:<value>"``, so equal values
    in different columns no longer compare equal. No integer factorization
    is performed and no mapping is built.

    Example (column at position 0):
    Input:  a   a   b   c
    Output: 0:a 0:a 0:b 0:c

    NOTE(review): this shadows nothing by itself, but ``occ.factorize_data``
    used earlier in the notebook is a different function - do not confuse
    the two.

    Args:
        df_raw (pd.DataFrame): The dataframe to convert; it is not modified.

    Returns:
        df: Copy of ``df_raw`` with every value replaced by its prefixed string.
        mapping: Always an empty dict (kept so callers can unpack two values).
    """
    df = df_raw.copy()
    mapping = {}
    # Wrap the columns, not the enumerate object, so tqdm knows the total and
    # can show a percentage.
    for i, col in enumerate(tqdm(df.columns)):
        df[col] = [f"{i}:{value!s}" for value in df[col]]
    return df, mapping

In [None]:
# Apply the notebook-local factorize_data; `mapping` is rebound once more.
data_, mapping = factorize_data(data)

In [None]:
# Compare the first rows before and after the conversion.
display(data.head())
display(data_.head())

In [None]:
# Persist the converted frame without the index column.
data_.to_csv("temp_input_num.csv", index=False)

In [None]:
# Rebinds the notebook-level `categories` to the value for the converted frame.
categories = occ.get_categories(data_)

In [None]:
# Rebinds `freq_counts` and `keys` from the row-based count-min traversal of
# the converted frame. NOTE(review): the meaning of the positional arguments
# `None, True` is defined in src.occurence_estimation - confirm there.
freq_counts, keys = occ.calc_occurences_countmin_rowbased_traversal(data_, None, True)
# Threshold used by the coverage call below.
t = 1

In [None]:
# Display the value reported by the sketch's quality() method.
freq_counts.quality()

In [None]:
# Convert freq_counts.size() to MiB (assumes size() returns bytes - TODO confirm).
freq_counts.size() / 1024 / 1024

In [None]:
# Bar chart of the `freq_count_cols` dict (keys on x, values on y).
# NOTE(review): `freq_count_cols` is not defined in the visible notebook, so
# this cell presumably fails on a fresh kernel - confirm.
fig = px.bar(x=list(freq_count_cols.keys()), y=list(freq_count_cols.values()), title="Frequency counts for each column")
# "total descending" orders the x categories by their bar total, largest
# first (it does not sort by the length of the column name).
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [None]:
# Run the frequency-walk coverage detection with the threshold set above.
mups = cov.freqwalk_frequency_weight(categories, freq_counts, threshold=t)

In [None]:
# Total number of patterns over all groups of the `mups` dict.
sum_mups = sum(len(group) for group in mups.values())
sum_mups

In [None]:
# Load the pattern list stored in data/mups.json.
# NOTE(review): `import json` belongs with the other imports in the first cell.
import json
with open('data/mups.json') as json_file:
    mups_json = json.load(json_file)

In [None]:
# Turn every loaded pattern into a tuple (JSON arrays load as lists).
mups_json = [tuple(mup) for mup in mups_json]

In [None]:
# Sort the patterns in place by their length, shortest first.
mups_json.sort(key=lambda x: len(x))

In [None]:
# Flatten the per-group patterns into one list. A nested comprehension is
# linear, whereas sum(list_of_lists, []) copies the accumulated list on every
# addition.
mups_list = [pattern for group in mups.values() for pattern in group]
mups_list

### Create Visualizations and Graphs

In [166]:
def visualize_runtime_experiment_threshold(name, data):
    """Build a line plot of runtime per dictionary key for three methods.

    Plots the 'CoverageJava', 'Baseline (All Combs)' and 'MUPs pwalk' series
    of ``data[name]``; the 'Baseline (MUPs)' series is not plotted. A
    secondary y axis is created and titled but receives no trace.

    Args:
        name: Key of the run inside ``data``; also used in the title.
        data (dict): ``{name: {method: {key: entry}}}`` where ``entry[0]`` is
            the value drawn on the y axis.

    Returns:
        The plotly figure; callers typically override titles and axis types.
    """
    def _runtime_trace(series, label, colour):
        # x: the keys of the series, y: first element of every entry
        return go.Scatter(
            x=list(series.keys()),
            y=[entry[0] for entry in series.values()],
            mode='lines+markers',
            name=label,
            line=dict(color=colour),
        )

    per_method = data[name]
    fig = sp.make_subplots(specs=[[{"secondary_y": True}]])

    coverage_java = per_method['CoverageJava']
    fig.add_trace(_runtime_trace(coverage_java, 'CoverageJava*', 'blue'))

    baseline_all_combs = per_method['Baseline (All Combs)']
    fig.add_trace(
        _runtime_trace(baseline_all_combs, 'Baseline (All Combs)', 'red'),
        secondary_y=False,
    )

    mups_pwalk = per_method['MUPs pwalk']
    fig.add_trace(
        _runtime_trace(mups_pwalk, 'MUPs pwalk', 'orange'),
        secondary_y=False,
    )

    fig.update_layout(title=f"Runtime of Coverage Detection for {name} dataset", xaxis_title="No. of keys")
    fig.update_yaxes(title_text="Runtime (s)", secondary_y=False)
    fig.update_yaxes(title_text="total amount of patterns", secondary_y=True)
    return fig

In [154]:
def visualize_runtime_experiment(name, data):   
    """Build a line plot of runtime against ``len(entry[1])`` for three methods.

    NOTE(review): a later cell defines a function with the same name; when the
    notebook is run top to bottom that definition replaces this one.

    Args:
        name: Key of the run inside ``data``; also used in the title.
        data (dict): ``{name: {method: {key: entry}}}``; ``entry[0]`` is drawn
            on the y axis and ``len(entry[1])`` on the x axis.

    Returns:
        The plotly figure (secondary y axis is titled but has no trace).
    """
    data = data[name]
    fig = sp.make_subplots(specs=[[{"secondary_y": True}]])
    # Disabled: bar plot of the number of keys on the secondary y axis.
    # fig.add_trace(go.Bar(x=[len(i[1]) for i in data['MUPs pwalk'].values()], y=[i for i in data['MUPs pwalk'].keys()], name='No. of keys', marker_color='lightgrey'), secondary_y=True)
    # CoverageJava runtimes; the x values are taken from the
    # 'Baseline (All Combs)' entries, not from the CoverageJava entries.
    data_coveragejava = data['CoverageJava']
    fig.add_trace(go.Scatter(x=[len(i[1]) for i in data['Baseline (All Combs)'].values()], y=[i[0] for i in data_coveragejava.values()], mode='lines+markers', name='CoverageJava*', line=dict(color='blue')))
    # Baseline (All Combs) runtimes
    data_baseline_allcombs = data['Baseline (All Combs)']
    fig.add_trace(go.Scatter(x=[len(i[1]) for i in data['Baseline (All Combs)'].values()], y=[i[0] for i in data_baseline_allcombs.values()], mode='lines+markers', name='Baseline (All Combs)', line=dict(color='red')), secondary_y=False)
    # Disabled: Baseline (MUPs) runtimes
    # data_baseline_mups = data['Baseline (MUPs)']
    # fig.add_trace(go.Scatter(x=[i for i in data_baseline_mups.keys()], y=[i for i in data_baseline_mups.values()], mode='lines+markers', name='Baseline (MUPs)', line=dict(color='green')))
    # MUPs pwalk runtimes
    data_mups_pwalk = data['MUPs pwalk']
    fig.add_trace(go.Scatter(x=[len(i[1]) for i in data_mups_pwalk.values()], y=[i[0] for i in data_mups_pwalk.values()], mode='lines+markers', name='MUPs pwalk', line=dict(color='orange')), secondary_y=False)
    # Titles; the secondary axis title is set although no trace uses that axis.
    fig.update_layout(title=f"Runtime of Coverage Detection for {name} dataset", xaxis_title="No. of keys")
    fig.update_yaxes(title_text="Runtime (s)", secondary_y=False)
    fig.update_yaxes(title_text="total amount of patterns", secondary_y=True)
    return fig

In [247]:
# Create a dict with len(i[1]) of each value as key and i[0] of that value as the dict value; if several values share the same len(i[1]), keep the one with the highest i[0].
def get_pattern_length_viz_data(dict_output):
    """Reduce every method's runs to the largest first element per length.

    For each method the runs are grouped by ``len(run[1])``; within a group
    the run with the strictly largest ``run[0]`` wins (the first one on ties).
    The original key of every winning run is printed as a side effect.

    Args:
        dict_output (dict): ``{method: {key: run}}`` where ``run[0]`` is
            comparable and ``run[1]`` has a length.

    Returns:
        dict: ``{method: {len(run[1]): max run[0]}}``.
    """
    runtime_by_length = {}
    source_key_by_length = {}
    for method, runs in dict_output.items():
        best_runtime = runtime_by_length.setdefault(method, {})
        best_key = source_key_by_length.setdefault(method, {})
        for run_key, run in runs.items():
            n_items = len(run[1])
            if n_items not in best_runtime or run[0] > best_runtime[n_items]:
                best_runtime[n_items] = run[0]
                best_key[n_items] = run_key
    print(source_key_by_length)
    return runtime_by_length

In [208]:
def get_combination_length_viz_data(dict_output):
    """Keep only the first element of every run.

    Args:
        dict_output (dict): ``{method: {key: run}}`` with indexable runs.

    Returns:
        dict: ``{method: {key: run[0]}}`` in the original key order.
    """
    return {
        method: {run_key: run[0] for run_key, run in runs.items()}
        for method, runs in dict_output.items()
    }

In [241]:
def visualize_runtime_experiment(name, data):
    """Build a line plot of the per-length maximum runtime for two methods.

    ``data[name]`` is reduced with ``get_pattern_length_viz_data`` and the
    reduced dict is printed; the 'Baseline (All Combs)' and 'MUPs pwalk'
    series are then drawn. This definition replaces the earlier function of
    the same name.

    Args:
        name: Key of the experiment inside ``data``; also used in the title.
        data (dict): ``{name: {method: {key: run}}}``.

    Returns:
        The plotly figure.
    """
    per_length = get_pattern_length_viz_data(data[name])
    print(per_length)
    fig = go.Figure()
    for method, colour in (('Baseline (All Combs)', 'red'), ('MUPs pwalk', 'orange')):
        series = per_length[method]
        trace = go.Scatter(
            x=list(series.keys()),
            y=list(series.values()),
            mode='lines+markers',
            name=method,
            line=dict(color=colour),
        )
        fig.add_trace(trace)
    fig.update_layout(title=f"Runtime of Coverage Detection for {name} dataset", xaxis_title="No. of keys")
    return fig

BlueNile

In [152]:
# Load the pickled combination-size runtime results for BlueNile.
with open("results/BlueNile_cov_combsize_runtime_dict_2023-10-17.pkl", "rb") as f:
    cov_runtime_output_bn = pickle.load(f)

In [224]:
# Load the pickled MUP-size results for BlueNile.
with open("results/BlueNile_cov_mups_size_dict_2023-10-17.pkl", "rb") as f:
    cov_mups_size_bn = pickle.load(f)

In [251]:
# Inspect the CoverageJava entries; per the output each value is a list of
# an integer, a list of attribute names and a float.
cov_mups_size_bn['BlueNile']['CoverageJava']

{15: [0, ['Polish', 'Symmetry'], 0.02826523780822754],
 28: [1, ['Cut', 'Fluorescence'], 0.02918720245361328],
 31: [0, ['Color', 'Polish'], 0.028392314910888672],
 39: [0, ['Color', 'Cut'], 0.028886079788208008],
 87: [0, ['Shape', 'Color'], 0.0290224552154541],
 152: [11, ['Cut', 'Clarity', 'Polish'], 0.0379328727722168],
 256: [4, ['Shape', 'Symmetry', 'Fluorescence'], 0.03967428207397461],
 407: [13, ['Color', 'Clarity', 'Fluorescence'], 0.042226552963256836],
 740: [39,
  ['Clarity', 'Polish', 'Symmetry', 'Fluorescence'],
  0.06398510932922363],
 1316: [43, ['Shape', 'Color', 'Polish', 'Symmetry'], 0.062047719955444336],
 1809: [150,
  ['Shape', 'Color', 'Polish', 'Fluorescence'],
  0.06966376304626465],
 2475: [101, ['Shape', 'Color', 'Cut', 'Clarity'], 0.0788414478302002],
 6155: [335,
  ['Color', 'Cut', 'Clarity', 'Symmetry', 'Fluorescence'],
  0.13981270790100098],
 12541: [1569,
  ['Shape', 'Color', 'Clarity', 'Symmetry', 'Fluorescence'],
  0.2578554153442383],
 35138: [3156,

In [None]:
fig_run = visualize_runtime_experiment("BlueNile", cov_runtime_output_bn)
# Overlay the CoverageJava runtimes. Every entry is a list whose second
# element is the attribute list and whose last element is the runtime, so the
# y values must be the last element - passing the whole entries (as before)
# hands plotly nested lists instead of numbers.
# NOTE(review): `cov_mups_size_bn` is rebound to UKRoadSafety data by a later
# cell; this cell needs the BlueNile pickle to be the one currently loaded.
data_coveragejava_bn = cov_mups_size_bn['BlueNile']['CoverageJava']
fig_run.add_trace(go.Scatter(x=[len(i[1]) for i in data_coveragejava_bn.values()], y=[i[-1] for i in data_coveragejava_bn.values()], mode='lines+markers', name='CoverageJava*', line=dict(color='blue')))
fig_run.update_layout(yaxis_type="log")
# fig_run.update_layout(title=f'Runtime of Coverage Detection for BlueNile', xaxis_title='Pattern Level', yaxis_title='Runtime (s)')
fig_run.show()

In [156]:
# Load the pickled threshold-runtime results for BlueNile (server run).
with open(f"results_server/BlueNile_cov_threshold_runtime_dict_2023-10-17.pkl", "rb") as f:
    cov_threshold_output_bn = pickle.load(f)

In [231]:
# Threshold plot for one BlueNile run; the run key is hard-coded.
# NOTE(review): with a logarithmic x axis a threshold key of 0 (used by the
# experiment code for threshold 1) cannot be drawn - confirm the keys.
fig_t = visualize_runtime_experiment_threshold('BlueNile_109562', cov_threshold_output_bn)
fig_t.update_layout(title=f'Runtime of Coverage Detection for BlueNile by Threshold', xaxis_title='Threshold', yaxis_title='Runtime (s)', xaxis_type='log')

UKRoadSafety

In [None]:
# Number of distinct values per column of the UK road-safety table.
uk_roadsafety.nunique()

In [69]:
# Load the pickled combination-size runtime results for UKRoadSafety.
with open(f"results_server/UKRoadSafety_cov_combsize_runtime_dict_2023-10-16.pkl", "rb") as f:
    cov_runtime_output_uk = pickle.load(f)

In [221]:
# Load the pickled MUP-size results for UKRoadSafety.
with open(f"results_server/UKRoadSafety_cov_mups_size_dict_2023-10-17.pkl", "rb") as f:
    cov_runtime_covjava = pickle.load(f)

In [304]:
# Runtime plot for UKRoadSafety with a logarithmic y axis.
# NOTE(review): the x values are the lengths used as keys by
# get_pattern_length_viz_data (2..9 in the printed dicts), which look like
# attribute counts rather than a "Number of Patterns" - confirm the label.
fig_run = visualize_runtime_experiment("UKRoadSafety", cov_runtime_output_uk)
fig_run.update_layout(title=f'Runtime of Coverage Detection for UKRoadSafety', xaxis_title='Number of Patterns', yaxis_title='Runtime (s)', yaxis_type='log')
# Disabled: overlay of the CoverageJava runtimes
# data_coveragejava_uk = cov_runtime_covjava['UKRoadSafety']['CoverageJava']
# fig_run.add_trace(go.Scatter(x=[i for i in data_coveragejava_uk.keys()], y=[i[-1] for i in data_coveragejava_uk.values()], mode='lines+markers', name='CoverageJava*', line=dict(color='blue')))
fig_run.show()

{'CoverageJava': {8: 31}, 'Baseline (All Combs)': {2: 29, 3: 722, 4: 7398, 5: 12343, 6: 114779, 8: 1383556, 9: 3723646}, 'MUPs pwalk': {2: 29, 3: 722, 4: 7398, 5: 12343, 6: 114779, 8: 1383556, 9: 3723646}}
{'CoverageJava': {8: 23.9112811088562}, 'Baseline (All Combs)': {2: 0.0005079119000583887, 3: 0.00090568489395082, 4: 0.003931944957002997, 5: 0.010545777156949043, 6: 0.19801846100017428, 8: 13.145856244955212, 9: 63.110174955101684}, 'MUPs pwalk': {2: 0.00030912901274859905, 3: 0.005445211892947555, 4: 0.0475206000264734, 5: 0.1065364449750632, 6: 1.2611411870457232, 8: 22.20008739992045, 9: 67.88506794092245}}


In [72]:
# Load the pickled threshold-runtime results for UKRoadSafety.
with open("results_server/UKRoadSafety_cov_threshold_runtime_dict_2023-10-16.pkl", "rb") as f:
    cov_threshold_output_uk = pickle.load(f)

In [236]:
# Threshold plot for one UKRoadSafety run (hard-coded run key), logarithmic
# y axis; rebinds `fig_t`.
fig_t = visualize_runtime_experiment_threshold('UKRoadSafety_3723646', cov_threshold_output_uk)
fig_t.update_layout(title=f'Runtime of Coverage Detection for UKRoadSafety', xaxis_title='Threshold', yaxis_title='Runtime (s)', yaxis_type='log')

In [43]:
# Reload the factorized ACSIncome table and sum the values returned by
# occ.get_cardinalities.
acs_income = pd.read_csv("data/df_ACSIncome_enc_num.csv")
sum(occ.get_cardinalities(acs_income))

145

In [79]:
# Load the pickled combination-size runtime results for ACSIncome.
with open(f"results_server/ACSIncome_cov_combsize_runtime_dict_2023-10-16.pkl", "rb") as f:
    cov_runtime_output_acs = pickle.load(f)

In [303]:
# Runtime plot for ACSIncome at threshold 1 with a logarithmic y axis; the
# figure is displayed as the cell's last expression.
fig_acs = visualize_runtime_experiment("ACSIncome", cov_runtime_output_acs)
fig_acs.update_layout(title=f'Runtime of Coverage Detection for ACSIncome (Threshold=1)', xaxis_title='Number of Patterns', yaxis_title='Runtime (s)', yaxis_type='log')

{'CoverageJava': {6: 4522}, 'Baseline (All Combs)': {2: 795, 3: 203, 4: 4522, 5: 72501, 7: 1428222, 8: 3830436, 9: 9339813}, 'MUPs pwalk': {2: 795, 3: 203, 4: 4522, 5: 72501, 7: 1428222, 8: 3830436, 9: 9339813}}
{'CoverageJava': {6: 8.960147142410278}, 'Baseline (All Combs)': {2: 0.0008603830356150866, 3: 0.0007304551545530558, 4: 0.003704494098201394, 5: 0.16112011088989675, 7: 50.45904640899971, 8: 287.1465045711957, 9: 1120.6086388651747}, 'MUPs pwalk': {2: 0.0060557888355106115, 3: 0.0012337020598351955, 4: 0.03592499205842614, 5: 2.1189936969894916, 7: 93.52500780206174, 8: 237.03961986489594, 9: 403.7739740598481}}


In [76]:
# Load the pickled threshold-runtime results for ACSIncome.
with open("results_server/ACSIncome_cov_threshold_runtime_dict_2023-10-16.pkl", "rb") as f:
    cov_threshold_output_acs = pickle.load(f)

In [301]:
# Threshold plot for one ACSIncome run (hard-coded run key), logarithmic x axis.
# NOTE(review): visualize_runtime_experiment_threshold requires a
# 'CoverageJava' series in the selected run - confirm this pickle has one.
fig_acs_t = visualize_runtime_experiment_threshold('ACSIncome_9339813', cov_threshold_output_acs)
fig_acs_t.update_layout(title=f'Runtime of Coverage Detection for ACSIncome', xaxis_title='Threshold', yaxis_title='Runtime (s)', xaxis_type='log')
# fig_acs_t.update_xaxes(range=[0, 0.1])

#### Number of MUPs

In [283]:
# Load the pickled MUP-size results for UKRoadSafety.
# NOTE(review): the target variable carries the "_bn" (BlueNile) suffix and
# overwrites the BlueNile dict loaded earlier; cells that expect the BlueNile
# data must not be run after this one.
with open("results_server/UKRoadSafety_cov_mups_size_dict_2023-10-17.pkl", "rb") as f:
    cov_mups_size_bn = pickle.load(f)

In [284]:
# One row per CoverageJava entry (the dict keys become the index after the
# transpose); positional columns 0 and 1 are renamed, column 2 keeps its
# numeric name.
cov_java = pd.DataFrame(cov_mups_size_bn['UKRoadSafety']['CoverageJava']).T
cov_java = cov_java.rename(columns={0: "MUPs_Java", 1: "Attributes_Java"})

In [285]:
# Same layout for the 'MUPs pwalk' entries.
pwalk = pd.DataFrame(cov_mups_size_bn['UKRoadSafety']['MUPs pwalk']).T
pwalk = pwalk.rename(columns={0: "MUPs", 1: "Attributes"})
# Each "MUPs" cell is a dict of collections; replace it by the total number
# of items over all of its values.
pwalk["MUPs"] = pwalk["MUPs"].apply(lambda x: sum(len(v) for v in x.values()))
# Index-aligned (left) join with the CoverageJava frame. This rebinds `mups`,
# which earlier held the result of the frequency-walk call.
mups = pwalk.join(cov_java)

In [286]:
# Display the side-by-side comparison of both methods.
mups

Unnamed: 0,MUPs,Attributes,MUPs_Java,Attributes_Java,2
14,1,"(Urban_or_Rural_Area, Accident_Severity)",1,"[Urban_or_Rural_Area, Accident_Severity]",0.860978
29,2,"(Road_Type, Accident_Severity)",2,"[Road_Type, Accident_Severity]",1.149429
31,0,"(Day_of_Week, Accident_Severity)",0,"[Day_of_Week, Accident_Severity]",0.88232
129,15,"(Speed_limit, Urban_or_Rural_Area, Accident_Se...",15,"[Speed_limit, Urban_or_Rural_Area, Accident_Se...",1.485255
447,0,"(Day_of_Week, Accident_Severity, Year)",0,"[Day_of_Week, Accident_Severity]",1.014553
722,110,"(Road_Type, Speed_limit, Year)",21,"[Road_Type, Speed_limit]",0.821445
1157,158,"(Road_Type, Speed_limit, Urban_or_Rural_Area, ...",158,"[Road_Type, Speed_limit, Urban_or_Rural_Area, ...",1.381274
2601,259,"(Speed_limit, Accident_Severity, Road_Surface_...",26,"[Speed_limit, Accident_Severity, Road_Surface_...",1.306149
7398,594,"(Weather_Conditions, Road_Type, Day_of_Week, Y...",29,"[Weather_Conditions, Road_Type, Day_of_Week]",1.468484
10677,1023,"(Light_Conditions, Weather_Conditions, Road_Ty...",1024,"[Light_Conditions, Weather_Conditions, Road_Ty...",1.341898


----

### Coverage based on No. of Attributes of Interest & Mean of 3x Repeat

The input files loaded in this section can be produced with test_coverage.py.

BlueNile

In [306]:
# Copy the per-column distinct-value counts of `acs_income` to the system
# clipboard (side effect only, nothing is displayed).
# NOTE(review): this sits under the BlueNile heading but uses the ACSIncome frame.
pd.DataFrame(acs_income.nunique()).to_clipboard()

In [316]:
# Load the "mean" result pickles for BlueNile: combination-size runs and
# threshold runs.
with open("results_server/BlueNile_cov_combsize_runtime_dict_mean_2023-10-20.pkl", "rb") as f:
    cov_runtime_output_bn_mean = pickle.load(f)
with open("results_server/BlueNile_cov_threshold_runtime_dict_mean_2023-10-20.pkl", "rb") as f:
    cov_runtime_output_bn_t = pickle.load(f)

In [317]:
# Per method, print "(length of the entry's last element: entry's first
# element)" for every run - presumably attribute count and mean runtime in
# seconds; verify against test_coverage.py.
for method,result in cov_runtime_output_bn_mean["BlueNile"].items():
    print(f"{method}")
    for k,v in result.items():
        print(f"({len(v[-1])}: {v[0]})")

CoverageJava
(2: 0.20654773712158203)
(3: 0.14939355850219727)
(4: 0.18068146705627441)
(5: 0.31819605827331543)
(6: 0.7756261825561523)
(7: 2.3958311080932617)
Baseline (All Combs)
(2: 0.001599363051354885)
(3: 0.00045726137856642407)
(4: 0.0014645533325771491)
(5: 0.005763036043693622)
(6: 0.022834554004172485)
(7: 0.1407714964201053)
MUPs pwalk
(2: 0.0002704731499155362)
(3: 0.0027824616990983486)
(4: 0.025114494531104963)
(5: 0.08577689295634627)
(6: 0.31781935039907694)
(7: 1.1251168426436682)


In [264]:
# Per method, print "(threshold key: first element of the entry)" for the
# hard-coded BlueNile run.
for method,result in cov_runtime_output_bn_t["BlueNile_129194"].items():
    print(f"{method}")
    for k,v in result.items():
        print(f"({k}: {v[0]})")

CoverageJava
(0: 2.7561709880828857)
(0.0001: 1.3967676162719727)
(0.001: 0.5792684555053711)
(0.01: 0.23100996017456055)
(0.1: 0.132094144821167)
Baseline (All Combs)
(0: 0.12561611109413207)
(0.0001: 2.8880874668247998)
(0.001: 2.8161951198708266)
(0.01: 2.7628919379785657)
(0.1: 2.7380496819969267)
MUPs pwalk
(0: 1.1281105759553611)
(0.0001: 0.6890750308521092)
(0.001: 0.24902757909148932)
(0.01: 0.06202251394279301)
(0.1: 0.006471215048804879)


In [318]:
# NOTE(review): this cell repeats the previous cell's code verbatim; the
# differing output suggests it was run against a different load of the pickle.
for method,result in cov_runtime_output_bn_t["BlueNile_129194"].items():
    print(f"{method}")
    for k,v in result.items():
        print(f"({k}: {v[0]})")

CoverageJava
(0: 2.2461795806884766)
(0.0001: 1.2817492485046387)
(0.001: 0.5677435398101807)
(0.01: 0.2361433506011963)
(0.1: 0.12356281280517578)
Baseline (All Combs)
(0: 0.14090658367301026)
(0.0001: 2.786594835886111)
(0.001: 2.7307901840346553)
(0.01: 2.7183195897378027)
(0.1: 2.6788418628275394)
MUPs pwalk
(0: 1.1219812120931845)
(0.0001: 0.6887152420046428)
(0.001: 0.25066059657062095)
(0.01: 0.06028954063852628)
(0.1: 0.006060697603970766)


UK Road Safety

In [265]:
# Load the "mean" result pickles for UKRoadSafety: combination-size runs and
# threshold runs.
with open("results_server/UKRoadSafety_cov_combsize_runtime_dict_mean_2023-10-17.pkl", "rb") as f:
    cov_runtime_output_uk_mean = pickle.load(f)
with open("results_server/UKRoadSafety_cov_threshold_runtime_dict_mean_2023-10-17.pkl", "rb") as f:
    cov_runtime_output_uk_t = pickle.load(f)

In [267]:
# Per method, print "(length of the entry's last element, entry's first
# element)" for every UKRoadSafety run.
for method,result in cov_runtime_output_uk_mean["UKRoadSafety"].items():
    print(f"{method}")
    for k,v in result.items():
        print(f"({len(v[-1])}, {v[0]})")

CoverageJava
(1, 0.8207216262817383)
(2, 1.0374367237091064)
(3, 2.1196517944335938)
(4, 1.8341691493988037)
(5, 2.7312991619110107)
(6, 3.8924992084503174)
(7, 6.846422910690308)
(8, 16.3694486618042)
Baseline (All Combs)
(2, 0.0004771933890879154)
(3, 0.0007373583503067493)
(4, 0.0038990166503936052)
(5, 0.02577376862366994)
(6, 0.2869309502808998)
(7, 2.677912824321538)
(8, 12.43063148126627)
(9, 61.21041132897759)
MUPs pwalk
(2, 0.0009316693370540937)
(3, 0.006826431102429827)
(4, 0.05023986132194599)
(5, 0.38891160433801514)
(6, 1.9636025559157133)
(7, 7.8727918806641055)
(8, 22.07932007832763)
(9, 67.03065078534807)


In [272]:
# Per method, print "(threshold key, first element of the entry)" for the
# hard-coded UKRoadSafety run.
for method,result in cov_runtime_output_uk_t["UKRoadSafety_3723646"].items():
    print(f"{method}")
    for k,v in result.items():
        print(f"({k}, {v[0]})")

CoverageJava
(0, 17.26330852508545)
(1e-06, 13.969251871109009)
(1e-05, 8.13606595993042)
(0.0001, 4.697667360305786)
(0.001, 2.776423215866089)
(0.01, 1.8614158630371094)
(0.1, 1.8880629539489746)
Baseline (All Combs)
(0, 62.02678593387827)
(1e-06, 545.9503073061351)
(1e-05, 493.0468776091002)
(0.0001, 476.02420189999975)
(0.001, 472.31484121782705)
(0.01, 474.21298974286765)
(0.1, 467.6620404231362)
MUPs pwalk
(0, 66.80094218417071)
(1e-06, 51.47853068308905)
(1e-05, 21.916362643940374)
(0.0001, 8.054226189153269)
(0.001, 2.3131300089880824)
(0.01, 0.46334648900665343)
(0.1, 0.035612029023468494)


ACSIncome

In [274]:
# Load the "mean" result pickles for ACSIncome: combination-size runs and
# threshold runs.
with open("results_server/ACSIncome_cov_combsize_runtime_dict_mean_2023-10-18.pkl", "rb") as f:
    cov_runtime_output_acs_mean = pickle.load(f)
with open("results_server/ACSIncome_cov_threshold_runtime_dict_mean_2023-10-18.pkl", "rb") as f:    
    cov_runtime_output_acs_t = pickle.load(f)       

In [275]:
# Per method, print "(length of the entry's last element, entry's first
# element)" for every ACSIncome run.
for method,result in cov_runtime_output_acs_mean["ACSIncome"].items():
    print(f"{method}")
    for k,v in result.items():
        print(f"({len(v[-1])}, {v[0]})")

Baseline (All Combs)
(2, 0.003249630914069712)
(3, 0.011317450436763465)
(4, 0.16660850250627846)
(5, 1.9309262784663588)
(6, 13.144666976993904)
(7, 59.51114845147822)
(8, 285.46504188352264)
(9, 1289.883854711079)
MUPs pwalk
(2, 0.02126442512962967)
(3, 0.4579013620968908)
(4, 2.2529745020437986)
(5, 9.84007430949714)
(6, 34.19822951592505)
(7, 106.48765819892287)
(8, 241.23116014548577)
(9, 404.9731708118925)


In [281]:
# Per method, print "(threshold key, first element of the entry)" for the
# hard-coded ACSIncome run.
for method,result in cov_runtime_output_acs_t["ACSIncome_9339813"].items():
    print(f"{method}")
    for k,v in result.items():
        print(f"({k}, {v[0]})")

Baseline (All Combs)
(0, 1294.5326224539895)
(1e-05, 2653.121801445959)
(0.0001, 2638.2896890139673)
(0.001, 2635.378029016778)
(0.01, 2606.4885705111083)
(0.1, 2626.2586118630134)
MUPs pwalk
(0, 407.00151078915223)
(1e-05, 86.39760038605891)
(0.0001, 25.134359463118017)
(0.001, 6.200797878904268)
(0.01, 1.235835409956053)
(0.1, 0.20219181990250945)
