# K-Level Approach

Li, S.-C., Tai, B.-C., & Huang, Y. (2019). Evaluating Variational Autoencoder as a Private Data Release Mechanism for Tabular Data. In 2019 IEEE 24th Pacific Rim International Symposium on Dependable Computing (PRDC) (pp. 198–1988). IEEE. https://doi.org/10.1109/PRDC47002.2019.00050

In [1]:
import pandas as pd
import numpy as np

In [2]:
def calculate_k_levels(df):
    """
    Calculate K-level for each column and for every pair of columns in a DataFrame.

    The K-level of a column (or column pair) is the size of its smallest group
    of identical values (or value pairs). Counting relies on pandas' defaults
    for ``value_counts`` and ``groupby``.

    Parameters:
    - df (DataFrame): Input DataFrame

    Returns:
    - k_levels (dict): Maps each column label to the count of its rarest value.
    - k_levels_combinations (dict): Maps "<col1>_<col2>" to a tuple
      ``(min_group_size, min_groups)``, where ``min_groups`` is the list of
      ``(value1, value2)`` tuples whose group has exactly that minimum size.
    """
    # Local import keeps this cell self-contained.
    from itertools import combinations

    def calculate_k_level(column):
        # Frequency of the rarest value in the column.
        return column.value_counts().min()

    # Calculate K-level for each column
    k_levels = {column: calculate_k_level(df[column]) for column in df.columns}

    # Calculate K-level for combinations of columns
    k_levels_combinations = {}
    for first, second in combinations(df.columns, 2):
        # Format the labels instead of concatenating them so that non-string
        # column labels (e.g. integers) do not raise a TypeError.
        combination_name = f"{first}_{second}"

        group_sizes = df.groupby([first, second]).size()
        min_group_size = group_sizes.min()
        # Keep every value pair that ties for the smallest group, not just one.
        min_groups = group_sizes[group_sizes == min_group_size].index.tolist()
        k_levels_combinations[combination_name] = (min_group_size, min_groups)

    return k_levels, k_levels_combinations

In [3]:
def k_level_comparsion(df_original, k_levels_combinations, verbose=True):
    """
    Collect the K-levels of minimum groups that also occur in the original data.

    For every column pair in ``k_levels_combinations`` and every value pair
    listed as one of its minimum groups, the pair's K-level is appended to the
    result if at least one row of ``df_original`` holds exactly those two
    values in the two columns.

    Values are compared with ``==`` (so ``0`` matches ``0.0``), not by
    substring containment: the earlier ``str(a) in str(b)`` test also matched
    e.g. ``0.2`` against ``10.2`` and ``0`` against ``1.0``. Round both data
    sets the same way before calling so that float values compare equal.

    Parameters:
    - df_original (DataFrame): Data set to search for the value pairs.
    - k_levels_combinations (dict): Maps "<col1>_<col2>" to
      ``(min_group_size, min_groups)``, with ``min_groups`` a list of
      ``(value1, value2)`` tuples.
    - verbose (bool): Print each column pair as it is processed (default True,
      matching the previous behaviour).

    Returns:
    - k_level (list): One ``min_group_size`` entry per minimum group that was
      found in ``df_original``.

    Raises:
    - KeyError: If a key cannot be split into two columns of ``df_original``.
    """
    # Keys are built from the string form of the labels, so look columns up by
    # that form and translate back to the real label.
    label_lookup = {str(label): label for label in df_original.columns}

    def resolve_pair(key):
        # Try every underscore as the separator so that column names which
        # themselves contain "_" still resolve.
        pieces = key.split("_")
        for cut in range(1, len(pieces)):
            left = "_".join(pieces[:cut])
            right = "_".join(pieces[cut:])
            if left in label_lookup and right in label_lookup:
                return label_lookup[left], label_lookup[right]
        raise KeyError(
            f"Cannot split {key!r} into two columns of the original DataFrame"
        )

    k_level = []
    for key, entry in k_levels_combinations.items():
        column1, column2 = resolve_pair(key)
        if verbose:
            print(column1, column2)
        count, values = entry[0], entry[1]
        first_values = df_original[column1]
        second_values = df_original[column2]
        for pair in values:
            # Vectorised existence check replaces the row-by-row iterrows scan.
            if ((first_values == pair[0]) & (second_values == pair[1])).any():
                k_level.append(count)

    return k_level

In [14]:
# Show the (Time, V3) value pairs that form the smallest group.
# NOTE(review): k_levels_combinations is only assigned in a later cell, so this
# cell fails on a fresh Restart & Run All; move it below that cell or remove it.
k_levels_combinations['Time_V3'][1]

[(0.2, 0.8)]

In [4]:
# Reference data set that the synthetic data is compared against.
df_train_cut = pd.read_csv("../data/interim/cut_data.csv")

# Synthetic data set(s) to evaluate.
file_name_short = "64_128_sample"
synthetic_data_paths = [f"../data/interim/privacy_testing/{file_name_short}.csv"]

# Load each synthetic file, keyed by its file name without directory or
# extension.
file_names = []
synth_data_dict = {}
for path in synthetic_data_paths:
    file_name = path.split("/")[-1].split(".")[0]
    file_names.append(file_name)
    synth_data_dict[file_name] = pd.read_csv(path)

In [5]:
# Round to one decimal so that continuous values fall into countable groups,
# then compute the K-levels of the synthetic data.
synth_rounded = synth_data_dict[file_name_short].round(1)
k_levels, k_levels_combinations = calculate_k_levels(synth_rounded)

In [447]:
# Start the results row from the file name parts, e.g. "64_128_sample" ->
# ['64', '128', 'sample'] (presumably latent_dim, nodes, generation; verify
# against the columns of analysis.csv). The parts stay strings.
k_list = file_names[0].split('_')

In [448]:
# K-levels of the synthetic minimum groups that reappear in the training data,
# which is rounded the same way as the synthetic data.
k_level = k_level_comparsion(df_train_cut.round(1), k_levels_combinations)
k_level_total = sum(k_level)
print(k_level_total)

# Rebuild k_list from the file name instead of appending to it, so re-running
# this cell cannot add the total a second time.
k_list = [*file_names[0].split('_'), k_level_total]

Time V1
0.2 0.9 0.2 0.9 22
Time V2
0.2 0.8 0.2 0.8 74
Time V3
0.2 0.8 0.2 0.8 16
Time V4
0.2 0.4 0.2 0.4 1
Time V5
0.1 0.7 0.1 0.7 11
Time V6
0.2 0.3 0.2 0.3 74
Time V7
0.1 0.2 0.1 0.2 3
Time V8
0.1 0.7 0.1 0.7 3
Time V9
0.1 0.4 0.1 0.4 48
Time V10
0.2 0.6 0.2 0.6 1
Time V11
0.1 0.5 0.1 0.5 5
Time V12
0.1 0.5 0.1 0.5 9
Time V13
0.1 0.6 0.1 0.6 1
Time V14
0.1 0.5 0.1 0.5 2
Time V15
0.2 0.3 0.2 0.3 19
Time V16
0.2 0.4 0.2 0.4 9
Time V17
0.1 0.6 0.1 0.6 3
Time V18
0.2 0.7 0.2 0.7 35
Time V19
0.2 0.5 0.2 0.5 25
Time V20
0.1 0.5 0.1 0.5 7
Time V21
0.2 0.5 0.2 0.5 1
Time V22
0.2 0.5 0.2 0.5 74
Time V23
0.2 0.7 0.2 0.7 74
Time V24
0.2 0.4 0.2 0.4 74
Time V25
0.2 0.6 0.2 0.6 74
Time V26
0.2 0.5 0.2 0.5 10
Time V27
0.2 0.4 0.2 0.4 74
Time V28
0.2 0.3 0.2 0.3 74
Time Amount
Time Class
0.2 0 0.2 0.0 74
V1 V2
1.0 0.7 1.0 0.7 37
V1 V3
1.0 0.8 1.0 0.8 78
V1 V4
0.9 0.4 0.9 0.4 19
V1 V5
0.9 0.7 0.9 0.7 11
V1 V6
1.0 0.2 1.0 0.2 28
V1 V7
0.9 0.2 0.9 0.2 3
V1 V8
0.9 0.7 0.9 0.7 3
V1 V9
1.0 0.4 1.0 0.4 4


In [454]:
# Load the accumulated results table that the new K-level row is added to.
df = pd.read_csv("../data/interim/privacy_testing/analysis.csv")

In [450]:
# Turn k_list into a one-row frame with the same columns as the results table;
# k_list must therefore have exactly one entry per column of df.
new_row_df = pd.DataFrame([k_list], columns=df.columns)
# NOTE: df is reassigned from itself, so re-running this cell appends the same
# row again; reload analysis.csv first if the cell has to be repeated.
df = pd.concat([df, new_row_df], ignore_index=True)

In [458]:
# Display the results table to check the appended row before it is saved.
df

Unnamed: 0,latent_dim,nodes,generation,klevel
0,2,128,data,933411
1,2,128,sample,578356
2,4,128,data,183125
3,4,128,sample,90349
4,8,128,data,36953
5,8,128,sample,359017
6,16,128,data,18589
7,16,128,sample,154160
8,32,128,data,529590
9,32,128,data,257146


In [459]:
# Write the table back to the file it was loaded from (overwrites it);
# index=False keeps the row index out of the CSV.
df.to_csv("../data/interim/privacy_testing/analysis.csv", index=False)