### Imports
Run this script in xu_venv

In [1]:
import json
from sklearn import model_selection
import numpy as np
print("Numpy version should be >= 1.20:", np.__version__) 
import pandas as pd
print("Pandas version should be >=1.1.3:", pd.__version__)
#from sdv.tabular import CTGAN, TVAE
from ctgan.synthesizers.ctgan import CTGANSynthesizer
from ctgan.synthesizers.tvae import TVAESynthesizer
import time
pd.options.mode.chained_assignment = None  # default='warn' # disable slicing warning

Numpy version should be >= 1.20: 1.21.5
Pandas version should be >=1.1.3: 1.3.5


In [3]:
! pip list

Package                      Version    
---------------------------- -----------
absl-py                      1.0.0      
argcomplete                  1.12.3     
argon2-cffi                  20.1.0     
astunparse                   1.6.3      
async-generator              1.10       
attrs                        21.4.0     
backcall                     0.2.0      
bleach                       4.1.0      
brotlipy                     0.7.0      
cached-property              1.5.2      
cachetools                   4.2.4      
certifi                      2021.10.8  
cffi                         1.15.0     
charset-normalizer           2.0.10     
colorama                     0.4.4      
copulas                      0.6.0      
cryptography                 36.0.0     
ctgan                        0.5.0      
cycler                       0.11.0     
cymem                        2.0.5      
cytoolz                      0.9.0.1    
debugpy                      1.5.1      
decorator       

In [2]:
import torch
print(torch.cuda.is_available()) # check if CUDA available
print(torch.cuda.device_count()) # should be 1 GPU
print(torch.cuda.current_device()) # which device is used by PyTorch: should be 0
print(torch.cuda.device(0))
print(torch.cuda.get_device_name(0))

False
0


AssertionError: Torch not compiled with CUDA enabled

### Loading and selecting the data

In [6]:
num_exp =  3
num_epoch =  300
dataset_name = "Sachs" #  "Asia" # "Census" #  "Adult" #     "Sachs2" #  "Cancer2" #  "Cancer" #  "Intrusion" # 
fake_file_root = "./Fake_Datasets"

if dataset_name in ["Adult", "Census"]: # "Asia"
    real_file_root = "./Real_Datasets/"
else:
    real_file_root = "./BN_Datasets/"
real_path = real_file_root + dataset_name+"/"+dataset_name
#real_path = real_file_root +dataset_name

with open(real_path+"_meta.json") as json_file:
    meta = json.load(json_file)

target = meta["target"]
categorical_cols = meta["categorical_cols"]
categorical_cols.append(target)
mixed_cols = meta["mixed_cols"]
probl_type = {"Classification": target}
log_cols = meta["log_cols"]
integer_cols = meta["numerical_cols"]

real_path = real_path +".csv"
real_data = pd.read_csv(real_path)

real_data.head()

Unnamed: 0,Erk,Mek,P38,PKA,PKC,Plcg,Jnk,Raf,PIP2,PIP3,Akt
0,0,0,1,1,1,0,-16.244,-8.934,2.063,10.151,0
1,1,1,0,1,1,0,-2.276,-11.339,-10.911,-12.699,0
2,0,1,0,1,2,0,-2.313,-11.638,-13.709,0.554,0
3,1,0,0,1,0,0,-1.32,11.439,-12.647,12.881,0
4,1,0,0,1,1,0,-11.802,-12.972,-12.683,-13.319,1


In [7]:
# Perform stratified train/test split with same random seed as in other experiments:
test_ratio=0.2
y_real = real_data[target]
X_real = real_data.drop(columns=[target])
X_train_real, _, y_train_real, _ = model_selection.train_test_split(X_real ,y_real, test_size=test_ratio, stratify=y_real,random_state=42)
X_train_real[target]= y_train_real
X_train_real.head()

Unnamed: 0,Erk,Mek,P38,PKA,PKC,Plcg,Jnk,Raf,PIP2,PIP3,Akt
49060,0,1,1,1,1,0,-0.942,9.645,-11.869,0.312,0
31295,2,2,2,0,0,0,12.327,15.282,-11.275,-0.311,1
12761,1,0,0,2,0,0,-12.131,-15.48,-13.085,1.076,0
33886,2,0,0,1,0,0,-13.053,14.824,-11.997,-11.41,1
1777,1,1,0,1,1,0,-8.079,-9.043,-13.146,-0.848,0


In [5]:
X_train_real.shape

(40000, 8)

# CTGAN Model

In [8]:
times = []
print("Running ", num_exp, "experiment(s) for ", dataset_name, "with ", num_epoch, " epochs")
for i in list(range(1,num_exp+1)):
    print("PERFORMING EXPERIMENT", i)
    start_time = time.time()
    ctgan = CTGANSynthesizer(epochs=num_epoch)
    ctgan.fit(X_train_real, categorical_cols)
    syn = ctgan.sample(X_train_real.shape[0])
    syn.to_csv(fake_file_root+"/"+dataset_name+"/"+ dataset_name+"_fake_ctgan_"+str(num_epoch)+"epochs_"+str(i)+".csv", index= False)
    end_time = time.time()
    print("FINISHED TRAINING AFTER", end_time-start_time, "seconds")
    times.append(end_time-start_time)
# save training times to csv:
pd.DataFrame(times, columns=["Comp.times"]).to_csv("./Evaluation/Comp_time/times_"+dataset_name+"_ctgan_"+str(num_epoch)+"epochs.csv", index= False)
# ConvergenceWarning is normal & happens randomly so can ignore

Running  3 experiment(s) for  Sachs with  300  epochs
PERFORMING EXPERIMENT 1




FINISHED TRAINING AFTER 1883.6685988903046 seconds
PERFORMING EXPERIMENT 2




FINISHED TRAINING AFTER 1910.919795513153 seconds
PERFORMING EXPERIMENT 3




FINISHED TRAINING AFTER 1859.3617787361145 seconds


# TVAE Model

In [9]:
times = []
print("Running ", num_exp, "experiment(s) for ", dataset_name, "with ", num_epoch, " epochs")
for i in list(range(1,num_exp+1)):
    print("PERFORMING EXPERIMENT", i)
    start_time = time.time()
    ctgan = TVAESynthesizer(epochs=num_epoch)
    ctgan.fit(X_train_real, categorical_cols)
    syn = ctgan.sample(X_train_real.shape[0])
    syn.to_csv(fake_file_root+"/"+dataset_name+"/"+ dataset_name+"_fake_tvae_"+str(num_epoch)+"epochs_"+str(i)+".csv", index= False)
    end_time = time.time()
    print("FINISHED TRAINING AFTER", end_time-start_time, "seconds")
    times.append(end_time-start_time)
# save training times to csv:
pd.DataFrame(times, columns=["Comp.times"]).to_csv("./Evaluation/Comp_time/times_"+dataset_name+"_tvae_"+str(num_epoch)+"epochs.csv", index= False)
print("Running ", num_exp, "experiment(s) for ", dataset_name, "with ", num_epoch, " epochs")

Running  3 experiment(s) for  Sachs with  300  epochs
PERFORMING EXPERIMENT 1




FINISHED TRAINING AFTER 762.3770334720612 seconds
PERFORMING EXPERIMENT 2




FINISHED TRAINING AFTER 860.5662453174591 seconds
PERFORMING EXPERIMENT 3




FINISHED TRAINING AFTER 858.245632648468 seconds
Running  3 experiment(s) for  Sachs with  300  epochs
