In [1]:
from data_transformer import DataTransformer
import pandas as pd 
import numpy as np
from tqdm import tqdm

In [2]:
# Load the raw table; the path is relative to the notebook's working directory.
rawdata = pd.read_csv('adult.csv')

# Column roles handed to DataTransformer further down.
# Note that the target column ('income') is also listed as categorical.
cat_cols = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'native-country',
    'income',
]

# No columns are placed in the log list for this dataset.
log_cols = []

# Each mixed column is mapped to a list of values ([0.0] here).
# NOTE(review): presumably these are the special/categorical modes of an
# otherwise continuous column -- confirm against DataTransformer.
mixed_cols = {
    'capital-loss': [0.0],
    'capital-gain': [0.0],
}

num_cols = [
    'age',
    'fnlwgt',
    'educational-num',
    'hours-per-week',
]

# Name of the column later used to locate the classifier's target slice.
target_col = 'income'

In [3]:
# Build the project's DataTransformer from the raw frame and the column-role
# lists. All arguments are positional, so their order must match the
# DataTransformer signature (defined in data_transformer.py, not visible here).
transformer = DataTransformer(rawdata, cat_cols, num_cols,mixed_cols, log_cols, target_col)

In [4]:
# transformData() produces the frame that the transformer is then fitted on.
# NOTE: `df` is a notebook-level global that get_st_ed() reads (df.columns),
# so this cell must run before that helper is called.
df = transformer.transformData()
transformer.fit(df)

In [5]:
# Apply the fitted transformer to `df`, then read the transformer's
# output_info_list attribute. Both results feed the DataSampler; output_info
# is also what get_st_ed() walks to locate the target column.
transformed_data = transformer.transform(df)
output_info = transformer.output_info_list

In [6]:
# The sampler is built from the transformed data and its column layout; it is
# later passed to the discriminator/generator builders and to Train.
from data_sampler import DataSampler
sampler = DataSampler(transformed_data, output_info)

In [7]:
from ctabgan import CTABGAN
# NOTE(review): classifier_dim presumably sets the classifier's hidden-layer
# widths (the classifier summary lists Dense layers of 100 and 200 units) --
# confirm against ctabgan.py.
ctabgan = CTABGAN(classifier_dim=(100, 200, 300, 400))
# make_discriminator returns three objects; all three are handed to Train later.
discriminator, discriminator_rep, dside = ctabgan.make_discriminator(sampler, transformer)

2023-05-08 23:58:17.740975: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-05-08 23:58:18.473303: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-05-08 23:58:18.474203: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-05-08 23:58:18.499919: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:3b:00.0 name: Quadro RTX 8000 computeCapability: 7.5
coreClock: 1.77GHz coreCount: 72 deviceMemorySize: 47.45GiB deviceMemoryBandwidth: 625.94GiB/s
2023-05-08 23:58:18.500112: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 1 with properties: 
pciBusID: 0000:af:00.0 name: Quadro RTX 8000 computeCapability: 7.5
coreClock: 1.77GHz coreCount: 72 deviceMemorySize: 47.45GiB deviceMemoryBandwidth: 625.94GiB/s
2023-05-08 23:5

In [8]:
# Show the discriminator's layer-by-layer summary (output shapes and parameter counts).
discriminator.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 24, 24, 1)]       0         
_________________________________________________________________
conv_two (ConvTwo)           (None, 12, 12, 64)        1088      
_________________________________________________________________
batch_normalization (BatchNo (None, 12, 12, 64)        256       
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 12, 12, 64)        0         
_________________________________________________________________
conv_two_1 (ConvTwo)         (None, 6, 6, 128)         131200    
_________________________________________________________________
batch_normalization_1 (Batch (None, 6, 6, 128)         512       
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 6, 6, 128)         0     

In [9]:
# make_generator returns the generator together with `gside`; both go to Train.
# NOTE(review): batchSize=500 is hard-coded here -- consider a named constant
# in a configuration cell so it can be tuned in one place.
generator, gside = ctabgan.make_generator(sampler, transformer, batchSize=500)

In [10]:
# Show the generator's layer-by-layer summary (output shapes and parameter counts).
generator.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 1, 1, 258)]       0         
_________________________________________________________________
conv_two_transpose (ConvTwoT (None, 2, 2, 256)         264192    
_________________________________________________________________
batch_normalization_3 (Batch (None, 2, 2, 256)         1024      
_________________________________________________________________
re_lu (ReLU)                 (None, 2, 2, 256)         0         
_________________________________________________________________
conv_two_transpose_1 (ConvTw (None, 4, 4, 128)         524288    
_________________________________________________________________
batch_normalization_4 (Batch (None, 4, 4, 128)         512       
_________________________________________________________________
re_lu_1 (ReLU)               (None, 4, 4, 128)         0   

In [11]:
def get_st_ed(target_col, output_info, columns=None):
    """
    Used to obtain the start and ending positions of the target column as per the transformed data to be used by the classifier
    Inputs:
    1) target_col -> name of the target column used for machine learning tasks (binary/multi-classification) in the raw data
    2) output_info -> column information corresponding to the data after applying the data transformer:
       one entry per column, each entry being an iterable of items that expose a `dim` attribute
    3) columns -> optional ordered column names, in the same order as the entries of `output_info`;
       when omitted, the notebook-level `df.columns` is used (the original behaviour)
    Outputs:
    1) starting (st) and ending (ed) positions of the target column as per the transformed data,
       where st is the summed `dim` of every preceding column and ed = st + the summed `dim` of the target column
    Raises:
    KeyError -> `target_col` is not one of the column names
    ValueError -> `output_info` has no entry at the target column's position

    """
    if columns is None:
        # Backward-compatible fallback: rely on the frame produced by
        # transformer.transformData() earlier in the notebook.
        columns = df.columns
    column_names = list(columns)
    if target_col not in column_names:
        raise KeyError(target_col)
    target_col_index = column_names.index(target_col)

    output_info = list(output_info)
    if target_col_index >= len(output_info):
        # Previously this case fell through the loop and failed with an
        # UnboundLocalError on `target_info`; report the real problem instead.
        raise ValueError(
            "output_info has %d entries but target column %r is at position %d"
            % (len(output_info), target_col, target_col_index)
        )

    # Width of everything that precedes the target column in the transformed data.
    st = sum(item.dim for info in output_info[:target_col_index] for item in info)
    # Width of the target column itself.
    length = sum(item.dim for item in output_info[target_col_index])
    ed = st + length
    return (st, ed)

In [12]:
# (start, end) positions of the target column inside the transformed data,
# as returned by get_st_ed; passed to the classifier builder.
st_ed = get_st_ed(target_col, output_info)

In [13]:
# Build the auxiliary classifier from the transformer and the target-column span.
classifier = ctabgan.make_classifier(transformer, st_ed)

In [14]:
# Show the classifier's layer-by-layer summary (output shapes and parameter counts).
classifier.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 162)]             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               16300     
_________________________________________________________________
leaky_re_lu_3 (LeakyReLU)    (None, 100)               0         
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 200)               20200     
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 200)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0   

## Train

In [15]:
from train import Train
# Train receives every component built above, positionally: the transformer,
# the sampler, the generator pair, the discriminator triple and the classifier.
trainobj = Train(transformer, sampler, generator, gside, discriminator, discriminator_rep, dside, classifier)

In [16]:
# Run training on the transformed data.
# NOTE(review): the saved output of this cell is a long run of "yes" lines and
# ends in KeyboardInterrupt at 0/5 on the progress bar, so the recorded run
# never completed -- re-run to completion before relying on results.
trainobj.train(transformed_data)

  0%|          | 0/5 [00:00<?, ?it/s]2023-05-08 23:58:59.707369: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2023-05-08 23:58:59.896074: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.7


yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes


  0%|          | 0/5 [00:15<?, ?it/s]


KeyboardInterrupt: 