In [None]:
# Install the SynthEval package into the notebook runtime (no version is pinned here).
!pip install syntheval

In [None]:
from syntheval import SynthEval
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
import io
import sys
import time

In [None]:
# Mount Google Drive at /content/drive; the data and result paths used later in
# this notebook all live under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Sample sizes are kept as strings because they are used as dictionary keys
# and interpolated into file names.
data_sizes = [str(row_count) for row_count in (1000, 2500, 5000, 10000, 20000, 50000)]

# Geographic scopes of the samples.
data_regions = [
    'national',
    'ca',
    'tx',
]

# Names of the synthesizer output folders on Drive.
synthesizers = [
    'CTGAN',
    'LLM',
    'synthpop',
    'TVAE',
]

In [None]:
# Load one CSV and normalise it for evaluation.
def load_and_process_data(file_path):
    """Read a CSV and return a cleaned DataFrame.

    Steps, in order: drop every row containing a missing value, reset the
    index, round numeric values to 0 decimals, remove the index-like columns
    'X', 'row_index' and 'Unnamed: 0' when present, and finally remove every
    column that holds exactly one distinct value.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path)
    frame = frame.dropna().reset_index(drop=True)
    frame = frame.round(0)

    # Leftover index columns written by earlier export steps.
    index_like = [name for name in ('X', 'row_index', 'Unnamed: 0') if name in frame.columns]
    frame = frame.drop(columns=index_like)

    # Columns with a single distinct value carry no information.
    constant_cols = [name for name in frame.columns if frame[name].nunique() == 1]
    return frame.drop(columns=constant_cols)

# Load every (region, size) CSV of one data type into a nested dictionary.
def load_all_data(data_sizes, data_regions, load_func, data_type):
    """Load the CSVs of ``data_type`` for every region/size combination.

    Args:
        data_sizes: Iterable of size labels used in the file names.
        data_regions: Iterable of region labels used in the file names.
        load_func: Callable taking a file path and returning the loaded data.
        data_type: Folder name and file-name prefix (e.g. "testing").

    Returns:
        ``{region: {size: load_func(path)}}`` with one entry per combination.
    """
    by_region = dict((region_name, {}) for region_name in data_regions)
    for region_name in data_regions:
        per_size = by_region[region_name]
        for size_label in data_sizes:
            csv_path = f'/content/drive/My Drive/06_kdd/02_data/{data_type}/{data_type}_{region_name}_{size_label}.csv'
            per_size[size_label] = load_func(csv_path)
    return by_region

In [None]:
# Real hold-out and training frames, both keyed as [region][size].
testing_dataframes = load_all_data(data_sizes, data_regions, load_and_process_data, "testing")
training_dataframes = load_all_data(data_sizes, data_regions, load_and_process_data, "training")

# Empty [synthesizer][region][size] skeleton; the loop below stores one
# cleaned frame per run index (0-4) in each innermost dictionary.
synthesized_dataframes = {
    synth: {
        region: {size: {} for size in data_sizes}
        for region in data_regions
    }
    for synth in synthesizers
}

for synth in synthesizers:
    for region in data_regions:
        for size in data_sizes:
            for run in range(5):
                file_path = f'/content/drive/My Drive/06_kdd/02_data/{synth}/training_{region}_{size}_Run_{run}.csv'
                synthesized_dataframes[synth][region][size][run] = load_and_process_data(file_path)


In [None]:
# Categorical columns passed to SynthEval for the national samples.
categorical_columns_national = [
    'state',
    'Parties_Description',
    'EthnicGroups_EthnicGroup1Desc',
    'Ethnic_Description',
    'Residence_HHParties_Description',
    'CommercialData_PropertyType',
    'voted',
    'Voters_Gender',
    'nonpartisan_donation',
]

# Same list without 'state', used for the single-state (ca / tx) samples.
categorical_columns_no_state = [
    column for column in categorical_columns_national if column != 'state'
]

In [None]:
# Categorical columns for the synthpop comparisons; unlike the general lists
# these do not contain 'Ethnic_Description'.
categorical_columns_national_synthpop = [
    'state',
    'Parties_Description',
    'EthnicGroups_EthnicGroup1Desc',
    'Residence_HHParties_Description',
    'CommercialData_PropertyType',
    'voted',
    'Voters_Gender',
    'nonpartisan_donation',
]

# Same list without 'state', used for the single-state (ca / tx) samples.
categorical_columns_no_state_synthpop = [
    column for column in categorical_columns_national_synthpop if column != 'state'
]

In [None]:
# Column names used as prediction targets in the target-specific evaluations.
voted = 'voted'
donation = 'nonpartisan_donation'
party = 'Residence_HHParties_Description'

# Full metric set: utility plus privacy metrics.
metrics_all = {
    "corr_diff": {"mixed_corr": True},
    "mi_diff": {},
    "ks_test": {"sig_lvl": 0.05, "n_perms": 1000},
    "p_mse": {"k_folds": 5, "max_iter": 1000, "solver": "liblinear"},
    "cls_acc": {"F1_type": "micro", "k_folds": 5},
    "dcr": {},
    "eps_risk": {},
    "mia_risk": {"num_eval_iter": 5},
    "att_discl": {},
}

# General (target-independent) utility metrics.
metrics_gen_util = {
    "corr_diff": {"mixed_corr": True},
    "mi_diff": {},
    "ks_test": {"sig_lvl": 0.05, "n_perms": 1000},
    "p_mse": {"k_folds": 5, "max_iter": 1000, "solver": "liblinear"},
}

# Target-specific utility: classification accuracy only (3 folds here, 5 in metrics_all).
metrics_target = {
    "cls_acc": {"F1_type": "micro", "k_folds": 3},
}

# Privacy metrics; only eps_risk is currently enabled.
metrics_privacy = {
    # "dcr"       : {},
    "eps_risk": {},
    # "mia_risk"  : {"num_eval_iter": 5},
    # "att_discl" : {}
}

In [None]:
# Function to evaluate and save results for synthesized data
def evaluate_synthesized_data(real_df, test_df, synth_df, cat_cols, metrics, base_filename, size, run, metric_type, target='empty'):
    """Run SynthEval on one synthetic dataset and persist its report and figures.

    Everything SynthEval prints is captured and written to a text file in the
    working directory and again under the Drive results folder. Every
    matplotlib figure that is open afterwards is saved as a PNG in both places.

    Args:
        real_df: Real (training) data handed to ``SynthEval``.
        test_df: Hold-out data, passed as ``holdout_dataframe``.
        synth_df: Synthetic data to evaluate.
        cat_cols: Categorical column names, passed as ``cat_cols``.
        metrics: Mapping of metric name to option dict, expanded as keyword
            arguments of ``SynthEval.evaluate``.
        base_filename: Prefix of every output file name.
        size: Sample-size label, used in file names only.
        run: Run index, used in file names only.
        metric_type: Label inserted into the report file name.
        target: Target column passed as the second positional argument of
            ``evaluate``; the sentinel ``'empty'`` means no target is passed.

    Raises:
        Any exception raised by SynthEval or by the file writes. ``sys.stdout``
        is restored and all figures are closed before it propagates.
    """
    drive_dir = '/content/drive/My Drive/06_kdd/'
    run_tag = f'{base_filename}_{size}_Run_{run}'
    print(f"Evaluating {run_tag}...")

    try:
        # Capture SynthEval's printed report. The finally clause guarantees the
        # notebook's stdout comes back even when the evaluation fails; without
        # it every later print (including the callers' error messages) would be
        # swallowed by the buffer.
        captured = io.StringIO()
        original_stdout = sys.stdout
        sys.stdout = captured
        try:
            evaluator = SynthEval(real_df, holdout_dataframe=test_df, cat_cols=cat_cols)
            if target == 'empty':
                evaluator.evaluate(synth_df, **metrics)
            else:
                evaluator.evaluate(synth_df, target, **metrics)
        finally:
            sys.stdout = original_stdout

        evaluation_results = captured.getvalue()

        if target == 'empty':
            text_file_name = f'{run_tag}_{metric_type}_evaluation_results.txt'
        else:
            text_file_name = f'{run_tag}_{metric_type}_{target}_evaluation_results.txt'

        # Write the report locally and to Google Drive.
        for report_path in (text_file_name, drive_dir + text_file_name):
            with open(report_path, 'w') as f:
                f.write(evaluation_results)

        sys.stdout.flush()
        sys.stderr.flush()

        # Save all the figures generated by syntheval, locally and on Drive.
        # NOTE(review): figure names contain neither metric_type nor target, so
        # different evaluations of the same (base_filename, size, run) overwrite
        # each other's PNGs. Kept unchanged to preserve existing file names.
        for j, figure_number in enumerate(plt.get_fignums()):
            fig = plt.figure(figure_number)
            plot_file_name = f'{run_tag}_figure_{j + 1}.png'
            fig.savefig(plot_file_name)
            # The Drive copy must be the image itself, not the text report.
            fig.savefig(drive_dir + plot_file_name)

            sys.stdout.flush()
            sys.stderr.flush()
    finally:
        # Close all figures, also after a failure, so they cannot leak into the
        # next run's output.
        plt.close('all')

## CTGAN

### General - CTGAN

In [None]:
# General-utility metrics for CTGAN synthetic data, national sample.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_national', 'general_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_gen_util,
                    base_filename, size, run, metric_type,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# General-utility metrics for CTGAN synthetic data, ca sample.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_ca', 'general_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_gen_util,
                    base_filename, size, run, metric_type,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# General-utility metrics for CTGAN synthetic data, tx sample.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_tx', 'general_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_gen_util,
                    base_filename, size, run, metric_type,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

### Target - CTGAN

In [None]:
# Target-specific utility for CTGAN synthetic data, national sample, target column: voted.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_national', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for CTGAN synthetic data, national sample, target column: donation.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_national', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for CTGAN synthetic data, national sample, target column: party.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_national', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for CTGAN synthetic data, ca sample, target column: voted.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_ca', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for CTGAN synthetic data, ca sample, target column: donation.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_ca', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for CTGAN synthetic data, ca sample, target column: party.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_ca', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for CTGAN synthetic data, tx sample, target column: voted.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_tx', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for CTGAN synthetic data, tx sample, target column: donation.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_tx', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for CTGAN synthetic data, tx sample, target column: party.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'ctgan_tx', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['CTGAN']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

## synthpop

### General - synthpop

In [None]:
# General-utility metrics for synthpop synthetic data, national sample.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_national', 'general_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national_synthpop, metrics_gen_util,
                    base_filename, size, run, metric_type,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# General-utility metrics for synthpop synthetic data, ca sample.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_ca', 'general_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state_synthpop, metrics_gen_util,
                    base_filename, size, run, metric_type,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# General-utility metrics for synthpop synthetic data, tx sample.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_tx', 'general_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state_synthpop, metrics_gen_util,
                    base_filename, size, run, metric_type,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

### Target - synthpop

In [None]:
# Target-specific utility for synthpop synthetic data, national sample, target column: voted.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_national', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]

        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national_synthpop, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for synthpop synthetic data, national sample, target column: donation.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_national', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]

        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national_synthpop, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for synthpop synthetic data, national sample, target column: party.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_national', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]

        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national_synthpop, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for synthpop synthetic data, ca sample, target column: voted.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_ca', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]

        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state_synthpop, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for synthpop synthetic data, ca sample, target column: donation.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_ca', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]

        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state_synthpop, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for synthpop synthetic data, ca sample, target column: party.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_ca', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]

        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state_synthpop, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for synthpop synthetic data, tx sample, target column: voted.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_tx', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]

        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state_synthpop, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for synthpop synthetic data, tx sample, target column: donation.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_tx', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]

        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state_synthpop, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for synthpop synthetic data, tx sample, target column: party.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'synthpop_tx', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]

        # Ethnic_Description is removed from both real frames for the synthpop
        # comparison (it is also absent from the synthpop categorical lists).
        real_df = real_df.drop(columns=['Ethnic_Description'])
        test_df = test_df.drop(columns=['Ethnic_Description'])

        for run in range(5):
            try:
                synth_df = synthesized_dataframes['synthpop']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state_synthpop, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

## LLM

### General - LLM

In [None]:
# General-utility metrics for LLM synthetic data, national sample.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'LLM_national', 'general_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_gen_util,
                    base_filename, size, run, metric_type,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# General-utility metrics for LLM synthetic data, ca sample.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'LLM_ca', 'general_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_gen_util,
                    base_filename, size, run, metric_type,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# LLM tx Level GENERAL UTILITY Evaluation
# Evaluates every (size, run) pair of the LLM-synthesized tx data with the
# general-utility metrics. A failing run is printed and skipped so the rest of
# the sweep still executes.
base_filename = 'LLM_tx'
metric_type = 'general_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['tx'][size][run]
                # This call must sit inside the inner try at the same depth as
                # the line above; it was previously dedented, which made the
                # whole cell a SyntaxError.
                evaluate_synthesized_data(real_df, test_df, synth_df, categorical_columns_no_state, metrics_gen_util, base_filename, size, run, metric_type)
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
                continue
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")
        continue

### Target - LLM

In [None]:
# Target-specific utility for LLM synthetic data, national sample, target column: voted.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'LLM_national', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for LLM synthetic data, national sample, target column: donation.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'LLM_national', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# Target-specific utility for LLM synthetic data, national sample, target column: party.
# Each (size, run) pair is evaluated on its own; a failure is printed and the sweep moves on.
base_filename, metric_type = 'LLM_national', 'target_util'

for size in data_sizes:
    try:
        # Real training frame and hold-out frame for this sample size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# LLM ca Level TARGET-SPECIFIC UTILITY Evaluation - VOTED
# For every training-set size and each of the five LLM runs, call
# evaluate_synthesized_data on the 'ca' frames with `voted` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'LLM_ca'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# LLM ca Level TARGET-SPECIFIC UTILITY Evaluation - DONATION
# For every training-set size and each of the five LLM runs, call
# evaluate_synthesized_data on the 'ca' frames with `donation` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'LLM_ca'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# LLM ca Level TARGET-SPECIFIC UTILITY Evaluation - PARTY
# For every training-set size and each of the five LLM runs, call
# evaluate_synthesized_data on the 'ca' frames with `party` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'LLM_ca'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# LLM tx Level TARGET-SPECIFIC UTILITY Evaluation - VOTED
# For every training-set size and each of the five LLM runs, call
# evaluate_synthesized_data on the 'tx' frames with `voted` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'LLM_tx'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# LLM tx Level TARGET-SPECIFIC UTILITY Evaluation - DONATION
# For every training-set size and each of the five LLM runs, call
# evaluate_synthesized_data on the 'tx' frames with `donation` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'LLM_tx'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# LLM tx Level TARGET-SPECIFIC UTILITY Evaluation - PARTY
# For every training-set size and each of the five LLM runs, call
# evaluate_synthesized_data on the 'tx' frames with `party` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'LLM_tx'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['LLM']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

### Target - SMOTE

In [None]:
# SMOTE National Level TARGET-SPECIFIC UTILITY Evaluation - VOTED
# For every training-set size and each of the five SMOTE runs, call
# evaluate_synthesized_data on the national frames with `voted` as target.
# Any exception is printed and the loops carry on (best effort).
#
# 'SMOTE' is not in the `synthesizers` list that the loading cell uses to
# build `synthesized_dataframes`, so a plain
# synthesized_dataframes['SMOTE'][...] lookup raises KeyError on every run.
# The SMOTE frames are therefore loaded on first use and cached in
# `synthesized_dataframes`; frames that are already present are reused.
# NOTE(review): the path assumes SMOTE outputs follow the same Drive layout
# as the other synthesizers (02_data/SMOTE/training_<region>_<size>_Run_<run>.csv)
# - confirm.
base_filename = 'SMOTE_national'
metric_type = 'target_util'

# Create the missing nesting levels without disturbing existing entries.
smote_frames = synthesized_dataframes.setdefault('SMOTE', {}).setdefault('national', {})

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                runs_for_size = smote_frames.setdefault(size, {})
                if run not in runs_for_size:
                    # Lazy load; a missing file surfaces in the message below.
                    runs_for_size[run] = load_and_process_data(
                        f'/content/drive/My Drive/06_kdd/02_data/SMOTE/training_national_{size}_Run_{run}.csv'
                    )
                synth_df = runs_for_size[run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# SMOTE National Level TARGET-SPECIFIC UTILITY Evaluation - DONATION
# For every training-set size and each of the five SMOTE runs, call
# evaluate_synthesized_data on the national frames with `donation` as target.
# Any exception is printed and the loops carry on (best effort).
#
# 'SMOTE' is not in the `synthesizers` list that the loading cell uses to
# build `synthesized_dataframes`, so a plain
# synthesized_dataframes['SMOTE'][...] lookup raises KeyError on every run.
# The SMOTE frames are therefore loaded on first use and cached in
# `synthesized_dataframes`; frames that are already present are reused.
# NOTE(review): the path assumes SMOTE outputs follow the same Drive layout
# as the other synthesizers (02_data/SMOTE/training_<region>_<size>_Run_<run>.csv)
# - confirm.
base_filename = 'SMOTE_national'
metric_type = 'target_util'

# Create the missing nesting levels without disturbing existing entries.
smote_frames = synthesized_dataframes.setdefault('SMOTE', {}).setdefault('national', {})

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                runs_for_size = smote_frames.setdefault(size, {})
                if run not in runs_for_size:
                    # Lazy load; a missing file surfaces in the message below.
                    runs_for_size[run] = load_and_process_data(
                        f'/content/drive/My Drive/06_kdd/02_data/SMOTE/training_national_{size}_Run_{run}.csv'
                    )
                synth_df = runs_for_size[run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# SMOTE National Level TARGET-SPECIFIC UTILITY Evaluation - PARTY
# For every training-set size and each of the five SMOTE runs, call
# evaluate_synthesized_data on the national frames with `party` as target.
# Any exception is printed and the loops carry on (best effort).
#
# 'SMOTE' is not in the `synthesizers` list that the loading cell uses to
# build `synthesized_dataframes`, so a plain
# synthesized_dataframes['SMOTE'][...] lookup raises KeyError on every run.
# The SMOTE frames are therefore loaded on first use and cached in
# `synthesized_dataframes`; frames that are already present are reused.
# NOTE(review): the path assumes SMOTE outputs follow the same Drive layout
# as the other synthesizers (02_data/SMOTE/training_<region>_<size>_Run_<run>.csv)
# - confirm.
base_filename = 'SMOTE_national'
metric_type = 'target_util'

# Create the missing nesting levels without disturbing existing entries.
smote_frames = synthesized_dataframes.setdefault('SMOTE', {}).setdefault('national', {})

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                runs_for_size = smote_frames.setdefault(size, {})
                if run not in runs_for_size:
                    # Lazy load; a missing file surfaces in the message below.
                    runs_for_size[run] = load_and_process_data(
                        f'/content/drive/My Drive/06_kdd/02_data/SMOTE/training_national_{size}_Run_{run}.csv'
                    )
                synth_df = runs_for_size[run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# SMOTE ca Level TARGET-SPECIFIC UTILITY Evaluation - VOTED
# For every training-set size and each of the five SMOTE runs, call
# evaluate_synthesized_data on the 'ca' frames with `voted` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
#
# 'SMOTE' is not in the `synthesizers` list that the loading cell uses to
# build `synthesized_dataframes`, so a plain
# synthesized_dataframes['SMOTE'][...] lookup raises KeyError on every run.
# The SMOTE frames are therefore loaded on first use and cached in
# `synthesized_dataframes`; frames that are already present are reused.
# NOTE(review): the path assumes SMOTE outputs follow the same Drive layout
# as the other synthesizers (02_data/SMOTE/training_<region>_<size>_Run_<run>.csv)
# - confirm.
base_filename = 'SMOTE_ca'
metric_type = 'target_util'

# Create the missing nesting levels without disturbing existing entries.
smote_frames = synthesized_dataframes.setdefault('SMOTE', {}).setdefault('ca', {})

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                runs_for_size = smote_frames.setdefault(size, {})
                if run not in runs_for_size:
                    # Lazy load; a missing file surfaces in the message below.
                    runs_for_size[run] = load_and_process_data(
                        f'/content/drive/My Drive/06_kdd/02_data/SMOTE/training_ca_{size}_Run_{run}.csv'
                    )
                synth_df = runs_for_size[run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# SMOTE ca Level TARGET-SPECIFIC UTILITY Evaluation - DONATION
# For every training-set size and each of the five SMOTE runs, call
# evaluate_synthesized_data on the 'ca' frames with `donation` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
#
# 'SMOTE' is not in the `synthesizers` list that the loading cell uses to
# build `synthesized_dataframes`, so a plain
# synthesized_dataframes['SMOTE'][...] lookup raises KeyError on every run.
# The SMOTE frames are therefore loaded on first use and cached in
# `synthesized_dataframes`; frames that are already present are reused.
# NOTE(review): the path assumes SMOTE outputs follow the same Drive layout
# as the other synthesizers (02_data/SMOTE/training_<region>_<size>_Run_<run>.csv)
# - confirm.
base_filename = 'SMOTE_ca'
metric_type = 'target_util'

# Create the missing nesting levels without disturbing existing entries.
smote_frames = synthesized_dataframes.setdefault('SMOTE', {}).setdefault('ca', {})

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                runs_for_size = smote_frames.setdefault(size, {})
                if run not in runs_for_size:
                    # Lazy load; a missing file surfaces in the message below.
                    runs_for_size[run] = load_and_process_data(
                        f'/content/drive/My Drive/06_kdd/02_data/SMOTE/training_ca_{size}_Run_{run}.csv'
                    )
                synth_df = runs_for_size[run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# SMOTE ca Level TARGET-SPECIFIC UTILITY Evaluation - PARTY
# For every training-set size and each of the five SMOTE runs, call
# evaluate_synthesized_data on the 'ca' frames with `party` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
#
# 'SMOTE' is not in the `synthesizers` list that the loading cell uses to
# build `synthesized_dataframes`, so a plain
# synthesized_dataframes['SMOTE'][...] lookup raises KeyError on every run.
# The SMOTE frames are therefore loaded on first use and cached in
# `synthesized_dataframes`; frames that are already present are reused.
# NOTE(review): the path assumes SMOTE outputs follow the same Drive layout
# as the other synthesizers (02_data/SMOTE/training_<region>_<size>_Run_<run>.csv)
# - confirm.
base_filename = 'SMOTE_ca'
metric_type = 'target_util'

# Create the missing nesting levels without disturbing existing entries.
smote_frames = synthesized_dataframes.setdefault('SMOTE', {}).setdefault('ca', {})

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                runs_for_size = smote_frames.setdefault(size, {})
                if run not in runs_for_size:
                    # Lazy load; a missing file surfaces in the message below.
                    runs_for_size[run] = load_and_process_data(
                        f'/content/drive/My Drive/06_kdd/02_data/SMOTE/training_ca_{size}_Run_{run}.csv'
                    )
                synth_df = runs_for_size[run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# SMOTE tx Level TARGET-SPECIFIC UTILITY Evaluation - VOTED
# For every training-set size and each of the five SMOTE runs, call
# evaluate_synthesized_data on the 'tx' frames with `voted` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
#
# 'SMOTE' is not in the `synthesizers` list that the loading cell uses to
# build `synthesized_dataframes`, so a plain
# synthesized_dataframes['SMOTE'][...] lookup raises KeyError on every run.
# The SMOTE frames are therefore loaded on first use and cached in
# `synthesized_dataframes`; frames that are already present are reused.
# NOTE(review): the path assumes SMOTE outputs follow the same Drive layout
# as the other synthesizers (02_data/SMOTE/training_<region>_<size>_Run_<run>.csv)
# - confirm.
base_filename = 'SMOTE_tx'
metric_type = 'target_util'

# Create the missing nesting levels without disturbing existing entries.
smote_frames = synthesized_dataframes.setdefault('SMOTE', {}).setdefault('tx', {})

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                runs_for_size = smote_frames.setdefault(size, {})
                if run not in runs_for_size:
                    # Lazy load; a missing file surfaces in the message below.
                    runs_for_size[run] = load_and_process_data(
                        f'/content/drive/My Drive/06_kdd/02_data/SMOTE/training_tx_{size}_Run_{run}.csv'
                    )
                synth_df = runs_for_size[run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# SMOTE tx Level TARGET-SPECIFIC UTILITY Evaluation - DONATION
# For every training-set size and each of the five SMOTE runs, call
# evaluate_synthesized_data on the 'tx' frames with `donation` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
#
# 'SMOTE' is not in the `synthesizers` list that the loading cell uses to
# build `synthesized_dataframes`, so a plain
# synthesized_dataframes['SMOTE'][...] lookup raises KeyError on every run.
# The SMOTE frames are therefore loaded on first use and cached in
# `synthesized_dataframes`; frames that are already present are reused.
# NOTE(review): the path assumes SMOTE outputs follow the same Drive layout
# as the other synthesizers (02_data/SMOTE/training_<region>_<size>_Run_<run>.csv)
# - confirm.
base_filename = 'SMOTE_tx'
metric_type = 'target_util'

# Create the missing nesting levels without disturbing existing entries.
smote_frames = synthesized_dataframes.setdefault('SMOTE', {}).setdefault('tx', {})

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                runs_for_size = smote_frames.setdefault(size, {})
                if run not in runs_for_size:
                    # Lazy load; a missing file surfaces in the message below.
                    runs_for_size[run] = load_and_process_data(
                        f'/content/drive/My Drive/06_kdd/02_data/SMOTE/training_tx_{size}_Run_{run}.csv'
                    )
                synth_df = runs_for_size[run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# SMOTE tx Level TARGET-SPECIFIC UTILITY Evaluation - PARTY
# For every training-set size and each of the five SMOTE runs, call
# evaluate_synthesized_data on the 'tx' frames with `party` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
#
# 'SMOTE' is not in the `synthesizers` list that the loading cell uses to
# build `synthesized_dataframes`, so a plain
# synthesized_dataframes['SMOTE'][...] lookup raises KeyError on every run.
# The SMOTE frames are therefore loaded on first use and cached in
# `synthesized_dataframes`; frames that are already present are reused.
# NOTE(review): the path assumes SMOTE outputs follow the same Drive layout
# as the other synthesizers (02_data/SMOTE/training_<region>_<size>_Run_<run>.csv)
# - confirm.
base_filename = 'SMOTE_tx'
metric_type = 'target_util'

# Create the missing nesting levels without disturbing existing entries.
smote_frames = synthesized_dataframes.setdefault('SMOTE', {}).setdefault('tx', {})

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                runs_for_size = smote_frames.setdefault(size, {})
                if run not in runs_for_size:
                    # Lazy load; a missing file surfaces in the message below.
                    runs_for_size[run] = load_and_process_data(
                        f'/content/drive/My Drive/06_kdd/02_data/SMOTE/training_tx_{size}_Run_{run}.csv'
                    )
                synth_df = runs_for_size[run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

### Target - TVAE

In [None]:
# TVAE National Level TARGET-SPECIFIC UTILITY Evaluation - VOTED
# For every training-set size and each of the five TVAE runs, call
# evaluate_synthesized_data on the national frames with `voted` as target.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'TVAE_national'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['TVAE']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")


In [None]:
# TVAE National Level TARGET-SPECIFIC UTILITY Evaluation - DONATION
# For every training-set size and each of the five TVAE runs, call
# evaluate_synthesized_data on the national frames with `donation` as target.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'TVAE_national'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['TVAE']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")


In [None]:
# TVAE National Level TARGET-SPECIFIC UTILITY Evaluation - PARTY
# For every training-set size and each of the five TVAE runs, call
# evaluate_synthesized_data on the national frames with `party` as target.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'TVAE_national'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['national'][size]
        test_df = testing_dataframes['national'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['TVAE']['national'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_national, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")


In [None]:
# TVAE ca Level TARGET-SPECIFIC UTILITY Evaluation - VOTED
# For every training-set size and each of the five TVAE runs, call
# evaluate_synthesized_data on the 'ca' frames with `voted` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'TVAE_ca'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['TVAE']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")


In [None]:
# TVAE ca Level TARGET-SPECIFIC UTILITY Evaluation - DONATION
# For every training-set size and each of the five TVAE runs, call
# evaluate_synthesized_data on the 'ca' frames with `donation` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'TVAE_ca'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['TVAE']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# TVAE ca Level TARGET-SPECIFIC UTILITY Evaluation - PARTY
# For every training-set size and each of the five TVAE runs, call
# evaluate_synthesized_data on the 'ca' frames with `party` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'TVAE_ca'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['ca'][size]
        test_df = testing_dataframes['ca'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['TVAE']['ca'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# TVAE tx Level TARGET-SPECIFIC UTILITY Evaluation - VOTED
# For every training-set size and each of the five TVAE runs, call
# evaluate_synthesized_data on the 'tx' frames with `voted` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'TVAE_tx'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['TVAE']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=voted,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")


In [None]:
# TVAE tx Level TARGET-SPECIFIC UTILITY Evaluation - DONATION
# For every training-set size and each of the five TVAE runs, call
# evaluate_synthesized_data on the 'tx' frames with `donation` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'TVAE_tx'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['TVAE']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=donation,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")

In [None]:
# TVAE tx Level TARGET-SPECIFIC UTILITY Evaluation - PARTY
# For every training-set size and each of the five TVAE runs, call
# evaluate_synthesized_data on the 'tx' frames with `party` as target,
# using the no-state categorical column list.
# Any exception is printed and the loops carry on (best effort).
base_filename = 'TVAE_tx'
metric_type = 'target_util'

for size in data_sizes:
    try:
        # Real training / testing frames for this size.
        real_df = training_dataframes['tx'][size]
        test_df = testing_dataframes['tx'][size]
        for run in range(5):
            try:
                synth_df = synthesized_dataframes['TVAE']['tx'][size][run]
                evaluate_synthesized_data(
                    real_df, test_df, synth_df,
                    categorical_columns_no_state, metrics_target,
                    base_filename, size, run, metric_type,
                    target=party,
                )
            except Exception as e:
                # Report the failed run; the loop proceeds to the next run.
                print(f"Error during evaluation for size {size}, run {run}: {e}")
    except Exception as e:
        # Reached when the real/test lookup for this size fails.
        print(f"Error accessing data for size {size}: {e}")