# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import torch
import random
import pyreadr
from synthcity.plugins import Plugins
import os

# synthcity absolute
from synthcity.plugins.core.dataloader import DataLoader, GenericDataLoader
from synthcity.plugins.core.distribution import Distribution
from synthcity.plugins.core.plugin import Plugin
from synthcity.plugins.core.schema import Schema

    The default C++ compiler could not be found on your system.
    You need to either define the CXX environment variable or a symlink to the g++ command.
    For example if g++-8 is the command you can do
      import os
      os.environ['CXX'] = 'g++-8'
    


# Load dataset

In [6]:
# Read data_small from its .Rds file: pyreadr returns a mapping of the stored
# R objects, and the data frame is looked up under the key None.
rds_objects = pyreadr.read_r("D:/Master Statistik/Masterarbeit/masterarbeit/IST-3 Data/Raw Data/data_small.Rds")
data_small = rds_objects[None]

# Read real_train from its CSV file
real_train = pd.read_csv('D:/Master Statistik/Masterarbeit/masterarbeit/IST-3 Data/Raw Data/real_train.csv')

# Preview the first rows of both frames, data_small first
for frame in (data_small, real_train):
    print(frame.head())

  outcome itt_treat  nihss  randdelay vis_infarct   age
0       1     rt-PA   10.0   3.616667          No  81.0
1       0   Placebo   18.0   1.866667          No  92.0
2       0   Placebo    4.0   5.083333          No  75.0
3       0     rt-PA   13.0   4.333333         Yes  60.0
4       1     rt-PA    6.0   1.916667          No  88.0
   outcome itt_treat  nihss  randdelay vis_infarct   age
0        1     rt-PA     15   4.250000         Yes  76.0
1        1     rt-PA     17   4.166667         Yes  56.0
2        1   Placebo     10   3.750000         Yes  82.0
3        1   Placebo     25   5.416667         Yes  85.0
4        0   Placebo     11   4.200000          No  86.0


In [7]:
# Cast the pandas "category" columns of data_small to plain object dtype.
# Only data_small is converted here; real_train is not touched by this cell.
categorical_columns = data_small.select_dtypes(include="category").columns
data_small[categorical_columns] = data_small[categorical_columns].astype(object)

# cuDNN flags: switch off the benchmark autotuner and request deterministic kernels
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Synthesize dataset

In [8]:
# Look up the "ctgan" plugin in synthcity's plugin registry, with n_iter set to 100
plugin_registry = Plugins()
ctgan_model = plugin_registry.get("ctgan", n_iter=100)

[2025-03-17T09:16:20.787630+0100][12088][CRITICAL] module disabled: C:\Users\Julia Hoepler\anaconda3\Lib\site-packages\synthcity\plugins\generic\plugin_goggle.py


## m = 50 synthetic datasets generated from data_small

In [None]:
# Directory to save the datasets
save_directory = "D:/Master Statistik/Masterarbeit/masterarbeit/IST-3 Data/Data/ctgan/"

# Ensure the save directory exists
os.makedirs(save_directory, exist_ok=True)


def setSeed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed passed unchanged to ``random``, ``numpy.random``,
            ``torch`` and ``torch.cuda`` (current CUDA device).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


for i in range(50):  # one synthetic dataset per iteration, 50 in total
    # Seed BEFORE training as well: otherwise the fit for dataset i starts from
    # whatever RNG state the previous iteration (or earlier kernel activity)
    # left behind, and a single dataset cannot be regenerated in isolation.
    # NOTE(review): synthcity may additionally seed internally through the
    # plugin's own random_state; confirm whether that overrides this call.
    setSeed(i)

    # Train the CTGAN model on data_small.
    # NOTE(review): the same plugin instance is re-fitted on every iteration;
    # confirm that fit() starts from a fresh network rather than continuing.
    ctgan_model.fit(data_small)

    # Re-seed so the sampling step starts from the same RNG state regardless
    # of how much randomness training consumed.
    setSeed(i)
    synthetic_data_small = ctgan_model.generate(count=3035).dataframe()

    # Files are numbered 1..50 (i + 1) rather than 0..49
    filename = f"syn_data_small_ctgan_{i + 1}.csv"

    # Save the synthetic dataset without the pandas index column
    synthetic_data_small.to_csv(os.path.join(save_directory, filename), index=False)

    print(f"Saved: {filename}")

## m = 50 synthetic datasets generated from real_train, used to evaluate the membership disclosure risk

In [11]:
# Directory to save the datasets
save_directory = "D:/Master Statistik/Masterarbeit/masterarbeit/IST-3 Data/Data/train_data/ctgan/"

# Ensure the save directory exists
os.makedirs(save_directory, exist_ok=True)


def setSeed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed passed unchanged to ``random``, ``numpy.random``,
            ``torch`` and ``torch.cuda`` (current CUDA device).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


for i in range(50):  # one synthetic dataset per iteration, 50 in total
    # Seed BEFORE training as well: otherwise the fit for dataset i starts from
    # whatever RNG state the previous iteration (or earlier kernel activity)
    # left behind, and a single dataset cannot be regenerated in isolation.
    # NOTE(review): synthcity may additionally seed internally through the
    # plugin's own random_state; confirm whether that overrides this call.
    setSeed(i)

    # Train the CTGAN model on real_train.
    # NOTE(review): the same plugin instance is re-fitted on every iteration;
    # confirm that fit() starts from a fresh network rather than continuing.
    ctgan_model.fit(real_train)

    # Re-seed so the sampling step starts from the same RNG state regardless
    # of how much randomness training consumed.
    setSeed(i)
    # The variable name is kept from the data_small cell; it holds the
    # synthetic counterpart of real_train here.
    synthetic_data_small = ctgan_model.generate(count=2428).dataframe()

    # Files are numbered 1..50 (i + 1) rather than 0..49
    filename = f"syn_real_train_ctgan_{i + 1}.csv"

    # Save the synthetic dataset without the pandas index column
    synthetic_data_small.to_csv(os.path.join(save_directory, filename), index=False)

    print(f"Saved: {filename}")

100%|██████████| 100/100 [02:23<00:00,  1.43s/it]


Saved: syn_real_train_ctgan_1.csv


100%|██████████| 100/100 [02:15<00:00,  1.35s/it]


Saved: syn_real_train_ctgan_2.csv


100%|██████████| 100/100 [02:01<00:00,  1.22s/it]


Saved: syn_real_train_ctgan_3.csv


100%|██████████| 100/100 [02:04<00:00,  1.24s/it]


Saved: syn_real_train_ctgan_4.csv


100%|██████████| 100/100 [02:04<00:00,  1.24s/it]


Saved: syn_real_train_ctgan_5.csv


100%|██████████| 100/100 [02:04<00:00,  1.25s/it]


Saved: syn_real_train_ctgan_6.csv


100%|██████████| 100/100 [02:02<00:00,  1.22s/it]


Saved: syn_real_train_ctgan_7.csv


100%|██████████| 100/100 [02:45<00:00,  1.65s/it]


Saved: syn_real_train_ctgan_8.csv


100%|██████████| 100/100 [02:14<00:00,  1.35s/it]


Saved: syn_real_train_ctgan_9.csv


100%|██████████| 100/100 [02:43<00:00,  1.63s/it]


Saved: syn_real_train_ctgan_10.csv


100%|██████████| 100/100 [02:50<00:00,  1.71s/it]


Saved: syn_real_train_ctgan_11.csv


100%|██████████| 100/100 [02:19<00:00,  1.40s/it]


Saved: syn_real_train_ctgan_12.csv


100%|██████████| 100/100 [02:01<00:00,  1.21s/it]


Saved: syn_real_train_ctgan_13.csv


100%|██████████| 100/100 [01:58<00:00,  1.19s/it]


Saved: syn_real_train_ctgan_14.csv


100%|██████████| 100/100 [01:57<00:00,  1.18s/it]


Saved: syn_real_train_ctgan_15.csv


100%|██████████| 100/100 [01:58<00:00,  1.19s/it]


Saved: syn_real_train_ctgan_16.csv


100%|██████████| 100/100 [01:56<00:00,  1.17s/it]


Saved: syn_real_train_ctgan_17.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_18.csv


100%|██████████| 100/100 [01:56<00:00,  1.17s/it]


Saved: syn_real_train_ctgan_19.csv


100%|██████████| 100/100 [01:58<00:00,  1.18s/it]


Saved: syn_real_train_ctgan_20.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_21.csv


100%|██████████| 100/100 [02:00<00:00,  1.21s/it]


Saved: syn_real_train_ctgan_22.csv


100%|██████████| 100/100 [02:00<00:00,  1.21s/it]


Saved: syn_real_train_ctgan_23.csv


100%|██████████| 100/100 [01:59<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_24.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_25.csv


100%|██████████| 100/100 [01:59<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_26.csv


100%|██████████| 100/100 [01:59<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_27.csv


100%|██████████| 100/100 [02:02<00:00,  1.23s/it]


Saved: syn_real_train_ctgan_28.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_29.csv


100%|██████████| 100/100 [02:01<00:00,  1.21s/it]


Saved: syn_real_train_ctgan_30.csv


100%|██████████| 100/100 [02:01<00:00,  1.21s/it]


Saved: syn_real_train_ctgan_31.csv


100%|██████████| 100/100 [01:59<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_32.csv


100%|██████████| 100/100 [02:01<00:00,  1.22s/it]


Saved: syn_real_train_ctgan_33.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_34.csv


100%|██████████| 100/100 [02:03<00:00,  1.23s/it]


Saved: syn_real_train_ctgan_35.csv


100%|██████████| 100/100 [02:01<00:00,  1.21s/it]


Saved: syn_real_train_ctgan_36.csv


100%|██████████| 100/100 [02:00<00:00,  1.21s/it]


Saved: syn_real_train_ctgan_37.csv


100%|██████████| 100/100 [02:00<00:00,  1.21s/it]


Saved: syn_real_train_ctgan_38.csv


100%|██████████| 100/100 [02:00<00:00,  1.21s/it]


Saved: syn_real_train_ctgan_39.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_40.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_41.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_42.csv


100%|██████████| 100/100 [02:00<00:00,  1.21s/it]


Saved: syn_real_train_ctgan_43.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_44.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_45.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_46.csv


100%|██████████| 100/100 [02:02<00:00,  1.23s/it]


Saved: syn_real_train_ctgan_47.csv


100%|██████████| 100/100 [02:00<00:00,  1.20s/it]


Saved: syn_real_train_ctgan_48.csv


100%|██████████| 100/100 [01:59<00:00,  1.19s/it]


Saved: syn_real_train_ctgan_49.csv


100%|██████████| 100/100 [02:00<00:00,  1.21s/it]

Saved: syn_real_train_ctgan_50.csv



