Test CTGAN for data simulation (problem with the generated distributions, e.g. negative numbers)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ctgan import CTGANSynthesizer

import os, glob

In [None]:
def plot_corr(data, figsize=(15,15)):
    """
    Plot the pairwise correlations of a dataframe as an annotated heatmap.

    Only the strict lower triangle is drawn; the diagonal and the upper
    triangle are masked out.

    Args:
    - data: pd dataframe. Only its numeric and boolean columns are
      correlated; other columns (strings, categories, ...) are ignored.
    - figsize: (width, height) of the matplotlib figure.
    """
    # Newer pandas releases raise in DataFrame.corr() when a non-numeric
    # column is present instead of silently skipping it, so restrict the
    # frame explicitly. select_dtypes works on old and new pandas alike.
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    corr = numeric_data.corr()
    sns.set(font_scale=1.2)
    # True marks cells to hide: the diagonal and everything above it.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    with sns.axes_style("white"):
        _, ax = plt.subplots(figsize=figsize)
        sns.heatmap(corr, mask=mask, square=True, cmap='RdBu_r', center=0, annot=True,
                    annot_kws={'fontsize':8}, ax=ax)

In [None]:
# Load the merged measurement table. The path is resolved from the parent of
# the current working directory, and the first CSV column becomes the index.
data = pd.read_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        '../Data/Merged_data/MERGE_FT_TEP_UT_on_ID.csv',
    ),
    index_col=0,
)

In [None]:
# Remove only the final "-<digits 1-5>" group from each index label
# (presumably the per-specimen replicate number - confirm against the data).
# str.rstrip('-12345') treats its argument as a *set of characters* and keeps
# stripping while they match, so a label such as 'X-15-2' would collapse to
# 'X'. The anchored pattern removes the last group and nothing more.
data.index = data.index.str.replace(r'-[1-5]+$', '', regex=True)

In [None]:
# Average every column over the rows sharing the same ID, then discard any
# group that still has a missing value in at least one column.
mean_df = data.groupby('ID').mean().dropna(how='any')
# Alternative features (disabled): split the group label into its parts.
#mean_df['Type'] = mean_df.index.str.split('-').str[0].astype('category')
#mean_df['CW'] = mean_df.index.str.split('-').str[1].astype('category')
# Keep the group label itself as a categorical column.
mean_df['type_cw'] = mean_df.index.astype('category')

In [None]:
# Print a summary (column dtypes, non-null counts) of the averaged table.
mean_df.info()

# 1. Model the data

## Step 1: Prepare your data

CTGAN expects the input data to be a table given as either a numpy.ndarray or a pandas.DataFrame object with two types of columns:

- Continuous Columns: Columns that contain numerical values and which can take any value.
- Discrete columns: Columns that only contain a finite number of possible values, whether these are string values or not.


Aside from the table itself, you will need to create a **list with the names of the discrete variables.**


In [None]:
# Names of all columns of mean_df whose dtype is 'category'.
discrete_columns = [
    name for name, dtype in mean_df.dtypes.items() if dtype == 'category'
]

## Step 2: Fit CTGAN to your data

In [None]:
# Create a synthesizer with its default settings and train it on mean_df for
# 500 epochs, passing the list of column names to be treated as discrete.
ctgan = CTGANSynthesizer()
ctgan.fit(mean_df, discrete_columns, epochs=500)

# 2. Generate synthetic data

In [None]:
# Draw 1000 synthetic rows from the fitted synthesizer.
samples = ctgan.sample(1000)

In [None]:
# Display the column labels of the synthetic table.
samples.columns

In [None]:
# Columns of the synthetic table that are cast to float below.
col = [
    'KJIC',
    'MS_Avg',
    'MS_neg_error',
    'MS_pos_error',
    'TEP_average',
    'TEP_error',
    'Beta_avg',
    'PC_IF_2.25MHz',
    'SE_IF_2.25MHz',
    'SE_%_IF_2.25MHz',
    'PC_IF_3.5MHz',
    'SE_IF_3.5MHz',
    'SE_%_IF_3.5MHz',
    'PC_BS',
]
# Cast the listed columns to float in place.
samples[col] = samples[col].astype('float')
# Print the resulting dtypes as a check.
samples.info()

In [None]:
# Keep a reduced set of columns (dropping the *_error and SE_* columns) plus
# the categorical group label.
short_mean = samples.loc[
    :,
    [
        'KJIC',
        'MS_Avg',
        'TEP_average',
        'Beta_avg',
        'PC_IF_2.25MHz',
        'PC_IF_3.5MHz',
        'PC_BS',
        'type_cw',
    ],
]
short_mean.info()

In [None]:
# Correlation heatmap of the reduced synthetic table.
plot_corr(short_mean)

In [None]:
# Grid of pairwise relationships between the columns of the reduced synthetic table.
sns.pairplot(short_mean)

# 3. Save and load the synthesizer

To save a trained ctgan synthesizer, use

`ctgan.save(path_to_a_folder)`

To restore a saved synthesizer, use

```python
ctgan = CTGANSynthesizer()
ctgan.fit(data, discrete_columns, epochs=0, load_path=path_to_a_folder)
```