In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ctgan import CTGANSynthesizer

import os, glob

In [3]:
def plot_corr(data, figsize=(15,15)):
    '''
    Plot correlation 
    Args:
    - data: pd dataframe
    '''
    corr = data.corr()
    sns.set(font_scale=1.2)
    mask = np.triu(np.ones_like(corr, dtype=bool))
    with sns.axes_style("white"):
        f, ax = plt.subplots(figsize=figsize)
        ax = sns.heatmap(corr, mask=mask, square=True, cmap='RdBu_r', center=0, annot=True,
                        annot_kws={'fontsize':8})

In [4]:
data = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), '../Data/Merged_data/MERGE_FT_TEP_UT_on_ID.csv'),
                    index_col=0)

In [5]:
data.index = data.index.str.rstrip('-12345')

In [6]:
mean_df = data.groupby('ID').mean()
mean_df.dropna(how='any', inplace=True)
mean_df['Type'] = mean_df.index.str.split('-').str[0].astype('category')
mean_df['CW'] = mean_df.index.str.split('-').str[1].astype('category')

# 1. Model the data

## Step 1: Prepare your data

CTGAN expects the input data to be a table given as either a numpy.ndarray or a pandas.DataFrame object with two types of columns:

- Continuous Columns: Columns that contain numerical values and which can take any value.
- Discrete columns: Columns that only contain a finite number of possible values, wether these are string values or not.


Aside from the table itself, you will need to create a **list with the names of the discrete variables.**


In [7]:
discrete_columns = mean_df.loc[:, mean_df.dtypes == 'category'].columns.to_list()

## Step 2: Fit CTGAN to your data

In [8]:
ctgan = CTGANSynthesizer()
ctgan.fit(mean_df, discrete_columns)

  return torch._C._cuda_getDeviceCount() > 0


Epoch 1, Loss G: 1.5534, Loss D: 0.0207
Epoch 2, Loss G: 1.5839, Loss D: -0.0011
Epoch 3, Loss G: 1.5904, Loss D: -0.0168
Epoch 4, Loss G: 1.5803, Loss D: -0.0783
Epoch 5, Loss G: 1.5842, Loss D: -0.0492
Epoch 6, Loss G: 1.5928, Loss D: -0.0867
Epoch 7, Loss G: 1.5432, Loss D: -0.0847
Epoch 8, Loss G: 1.5081, Loss D: -0.1326
Epoch 9, Loss G: 1.5236, Loss D: -0.1189
Epoch 10, Loss G: 1.5067, Loss D: -0.1797
Epoch 11, Loss G: 1.4856, Loss D: -0.1868
Epoch 12, Loss G: 1.4582, Loss D: -0.2246
Epoch 13, Loss G: 1.4539, Loss D: -0.2627
Epoch 14, Loss G: 1.3981, Loss D: -0.2496
Epoch 15, Loss G: 1.2603, Loss D: -0.2520
Epoch 16, Loss G: 1.2530, Loss D: -0.2342
Epoch 17, Loss G: 1.1871, Loss D: -0.2500
Epoch 18, Loss G: 1.1111, Loss D: -0.2457
Epoch 19, Loss G: 1.0691, Loss D: -0.2838
Epoch 20, Loss G: 1.0138, Loss D: -0.2429
Epoch 21, Loss G: 0.9850, Loss D: -0.1365
Epoch 22, Loss G: 0.9066, Loss D: -0.3229
Epoch 23, Loss G: 0.7858, Loss D: -0.3191
Epoch 24, Loss G: 0.6909, Loss D: -0.2266
Ep

# 2. Generate synthetic data

In [9]:
samples = ctgan.sample(1000)

In [10]:
samples

Unnamed: 0,KJIC,MS_Avg,MS_neg_error,MS_pos_error,TEP_average,TEP_error,Beta_avg,PC_IF_2.25MHz,SE_IF_2.25MHz,SE_%_IF_2.25MHz,IF_2.25MHz,PC_IF_3.5MHz,SE_IF_3.5MHz,SE_%_IF_3.5MHz,IF_3.5MHz,PC_BS,Type,CW
0,248.653,1.41651,0.740487,0.471502,-0.822544,0.0534609,0.003718,76.6039,0.0203959,3.69885,0.188587,62.8851,0.0137979,0.2667,1.01899,53.1746,A286,80
1,132.077,1.59288,1.38507,0.958847,-0.654897,0.0481919,0.00260537,-39.0379,0.0273225,4.98569,0.824427,141.586,0.0106868,1.32449,1.32806,41.0695,A286,60
2,122.157,6.34384,-0.040457,0.696882,-1.33516,0.0403887,0.00339046,300.296,0.0254365,2.03527,0.362042,200.881,0.0370509,-0.594792,2.00635,21.3188,304,20
3,107.312,2.30976,0.160731,0.559164,-1.13709,0.0408587,0.00201321,308.387,-0.000285519,2.79662,1.47924,1138.89,0.000686133,3.00747,0.706591,-11.9959,A286,0
4,154.292,0.350078,-0.260476,0.29555,-0.747025,0.0554788,0.00268505,73.972,0.0269118,2.99069,1.18778,336.59,0.0879277,3.47101,1.76338,73.0566,347,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,119.345,1.55114,-0.134797,1.13642,-1.31657,0.0429477,0.00404815,274.103,0.0167033,2.16114,1.05021,185.592,0.0774919,2.65314,0.844645,-6.97621,304,80
996,77.2396,0.89396,2.37566,1.20728,-0.918556,0.0464413,0.00072779,300.756,0.00465413,2.11046,1.34911,258.884,0.0234816,1.69129,0.51763,56.4579,304,40
997,127.201,0.275033,0.52936,0.391131,-1.36933,0.0523377,0.00427382,64.8841,0.0186746,0.849617,0.802602,412.896,0.0611745,0.358749,0.574652,-10.0258,304,80
998,106.486,0.968796,0.696149,-0.189049,-1.12861,0.0439983,0.00097279,331.699,0.00593035,0.668483,1.62606,232.836,0.0236116,-1.88744,1.58071,-3.46316,304,20


In [11]:
short_mean.loc[:, ['KJIC', 'MS_Avg', 'TEP_average',
        'Beta_avg', 'IF_2.25MHz', 'IF_3.5MHz',
       'PC_BS']].astype('float').describe()

NameError: name 'short_mean' is not defined

In [None]:
short_mean = samples.loc[:, ['KJIC', 'MS_Avg', 'TEP_average',
        'Beta_avg', 'IF_2.25MHz', 'IF_3.5MHz',
       'PC_BS', 'Type']]
sns.pairplot(short_mean, hue='Type')

# 4. Save and load the synthesizer

To save a trained ctgan synthesizer, use

`ctgan.save(path_to_a_folder)`

To restore a saved synthesizer, use

`ctgan = CTGANSynthesizer()
ctgan.fit(data, discrete_columns, epochs=0, load_path=path_to_a_folder)`