In [1]:
import pandas as pd
import numpy as np
import torch
import os
import sys
from tqdm import tqdm, trange

from sdv.single_table import GaussianCopulaSynthesizer, TVAESynthesizer, CTGANSynthesizer

sys.path.append("../../")
import biked_commons
from biked_commons.design_evaluation.design_evaluation import *
from biked_commons.resource_utils import split_datasets_path, models_and_scalers_path
from biked_commons.conditioning import conditioning
from biked_commons.design_evaluation.scoring import *
from biked_commons.transformation.one_hot_encoding import ONE_HOT_ENCODED_CLIPS_COLUMNS, BOOLEAN_COLUMNS, encode_to_continuous
from biked_commons.benchmark_models import benchmarking_utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Resolve the benchmark CSV through the project helper, then load it using the
# file's first column as the DataFrame index.
dataset_csv_path = split_datasets_path("bike_bench_mixed_modality.csv")
data = pd.read_csv(dataset_csv_path, index_col=0)

In [3]:
import sdv
from sdv.metadata import Metadata

# Have SDV infer the table metadata straight from the loaded DataFrame.
# Note that detection happens before the float32 cast further down.
metadata = Metadata.detect_from_dataframe(data=data, table_name='my_table')

# Column groups as declared by the project's one-hot-encoding module.
categorical_cols = ONE_HOT_ENCODED_CLIPS_COLUMNS
boolean_cols = BOOLEAN_COLUMNS

# Whatever is neither categorical nor boolean is treated as continuous and is
# stored as float32 from here on.
non_continuous_cols = categorical_cols + boolean_cols
continuous_cols = data.columns.difference(non_continuous_cols).tolist()
data[continuous_cols] = data[continuous_cols].astype(np.float32)

In [None]:
# Settings shared by the conditional and unconditional benchmark runs.
MODEL_NAMES = ["CTGAN", "TVAE"]
TRAIN_EPOCHS = 500
COND_NUM_ROWS = 10000     # rows sampled for the conditional evaluation
UNCOND_NUM_ROWS = 1000    # rows sampled per unconditional repetition
UNCOND_REPETITIONS = 10   # number of independent unconditional repetitions
EVAL_DEVICE = "cpu"

# Maps the benchmark model name onto the SDV synthesizer class that implements it.
_SYNTHESIZER_CLASSES = {
    "CTGAN": CTGANSynthesizer,
    "TVAE": TVAESynthesizer,
}


def _fit_and_sample(model_name, num_rows):
    """Fit a fresh synthesizer on ``data`` and sample encoded designs from it.

    Reads the notebook-level ``metadata`` and ``data`` objects.

    Args:
        model_name: Key into ``_SYNTHESIZER_CLASSES`` ("CTGAN" or "TVAE").
        num_rows: Number of synthetic rows to draw from the fitted model.

    Returns:
        A ``(synthetic_tens, synthetic_cont)`` pair: the float32 tensor built
        from the encoded samples, and the encoded DataFrame it was built from.

    Raises:
        ValueError: If ``model_name`` is not a known model type.
    """
    if model_name not in _SYNTHESIZER_CLASSES:
        raise ValueError("Unknown model type")

    synthesizer = _SYNTHESIZER_CLASSES[model_name](
        metadata, verbose=True, epochs=TRAIN_EPOCHS
    )
    synthesizer.fit(data)
    print(f"Fitted {model_name}")

    synthetic_collapsed = synthesizer.sample(num_rows=num_rows)

    # Use the function imported by name at the top of the notebook. The module
    # name `one_hot_encoding` is never bound by an explicit import here, so the
    # qualified call only works if a wildcard import happens to leak it.
    synthetic_cont = encode_to_continuous(synthetic_collapsed)
    synthetic_tens = torch.tensor(synthetic_cont.values, dtype=torch.float32)
    return synthetic_tens, synthetic_cont


# Conditional benchmark: one fit per model, one large sample.
for model_name in MODEL_NAMES:
    synthetic_tens, synthetic_cont = _fit_and_sample(model_name, COND_NUM_ROWS)
    benchmarking_utils.evaluate_cond(
        synthetic_tens, model_name, synthetic_cont.columns, EVAL_DEVICE
    )
    print(f"Evaluated {model_name}")

# Unconditional benchmark: every repetition refits each model from scratch, so
# this loop trains UNCOND_REPETITIONS * len(MODEL_NAMES) synthesizers.
# NOTE(review): presumably the refit per repetition is deliberate (to capture
# training variance) - confirm before caching the fitted models.
for i in range(UNCOND_REPETITIONS):
    for model_name in MODEL_NAMES:
        synthetic_tens, synthetic_cont = _fit_and_sample(model_name, UNCOND_NUM_ROWS)
        benchmarking_utils.evaluate_uncond(
            synthetic_tens, model_name, i, synthetic_cont.columns, EVAL_DEVICE
        )
        print(f"Evaluated {model_name}")



Gen. (-0.75) | Discrim. (0.22): 100%|██████████| 5/5 [00:01<00:00,  2.57it/s] 


Fitted CTGAN
Evaluated CTGAN


Loss: -39.714:   3%|▎         | 15/500 [00:06<03:26,  2.35it/s]


KeyboardInterrupt: 