In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sdv.tabular import CopulaGAN
from sdv.evaluation import evaluate
from sdv.constraints import UniqueCombinations, GreaterThan


import os, glob

In [8]:
# Load the merged survival table, using the first CSV column as the row index.
# The file path is built from the parent of the current working directory.
# Note: an earlier in-place drop of 'NLE_ratio_119_17' used to follow this call;
# it is disabled and is not executed.
data = pd.read_csv(
    os.path.join(os.path.dirname(os.getcwd()), '../Data/Merged_data/Survival_df.csv'),
    index_col=0,
)

In [9]:
# Store NDE_cycle as strings so the column carries object dtype instead of a
# numeric dtype.
data['NDE_cycle'] = data['NDE_cycle'].astype(str)

In [10]:
# Summarise the column dtypes and non-null counts of the loaded table.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21 entries, 41C to 49C
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Observed              21 non-null     bool   
 1   F_Time                21 non-null     float64
 2   NDE_cycle             21 non-null     object 
 3   diff_two_peaks_ratio  21 non-null     float64
 4   amp_ratio             21 non-null     float64
 5   pos_ratio             21 non-null     float64
 6   energy_ratio          21 non-null     float64
 7   NLE_ratio_51_17       21 non-null     float64
 8   NLE_ratio_85_17       21 non-null     float64
 9   NLE_ratio_51_119      21 non-null     float64
 10  NLO_avg               21 non-null     float64
 11  Avg_RP                21 non-null     float64
 12  ACEE_30_Avg_58        21 non-null     float64
 13  ACEE_500_Avg_58       21 non-null     float64
 14  ACPD_Avg_58           21 non-null     float64
dtypes: bool(1), float64(13), ob

# CopulaGAN

The sdv.tabular.CopulaGAN model is a variation of the CTGAN Model which takes advantage of the CDF based transformation that the GaussianCopulas apply to make the underlying CTGAN model task of learning the data easier.

# Model the data

## tuning distribution and fitting model

In [11]:
# Build the CopulaGAN synthesizer.
#
# The columns listed below are assigned a Gaussian KDE marginal; every other
# column is left to the model's own choice of distribution.
#
# The mapping is restricted to columns that actually exist in `data`:
# 'NLE_ratio_119_17' is kept in the candidate list for versions of the CSV that
# include it, but no stale entry is passed to the model when the column is
# absent from the loaded table.
model = CopulaGAN(
    epochs=5000,
    field_distributions={
        column: 'gaussian_kde'
        for column in (
            'diff_two_peaks_ratio',
            'energy_ratio',
            'ACEE_500_Avg_58',
            'amp_ratio',
            'NLE_ratio_51_17',
            'NLE_ratio_85_17',
            'NLE_ratio_119_17',
        )
        if column in data.columns
    },
)

In [12]:
# Fit the synthesizer on the real table, using the epochs and per-column
# distributions chosen when `model` was constructed.
model.fit(data)

  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.
  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu


In [13]:
# Show which univariate distribution is attached to each modelled column.
model.get_distributions()

{'Observed': 'copulas.univariate.gaussian.GaussianUnivariate',
 'F_Time': 'copulas.univariate.beta.BetaUnivariate',
 'diff_two_peaks_ratio': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'amp_ratio': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'pos_ratio': 'copulas.univariate.uniform.UniformUnivariate',
 'energy_ratio': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'NLE_ratio_51_17': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'NLE_ratio_85_17': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'NLE_ratio_51_119': 'copulas.univariate.student_t.StudentTUnivariate',
 'NLO_avg': 'copulas.univariate.gamma.GammaUnivariate',
 'Avg_RP': 'copulas.univariate.gamma.GammaUnivariate',
 'ACEE_30_Avg_58': 'copulas.univariate.log_laplace.LogLaplace',
 'ACEE_500_Avg_58': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'ACPD_Avg_58': 'copulas.univariate.gamma.GammaUnivariate'}

# Generate synthetic data

In [14]:
# Draw 1000 synthetic rows from the fitted model.
samples = model.sample(1000)

In [15]:
# Preview the first rows of the synthetic table.
samples.head()

Unnamed: 0,Observed,F_Time,NDE_cycle,diff_two_peaks_ratio,amp_ratio,pos_ratio,energy_ratio,NLE_ratio_51_17,NLE_ratio_85_17,NLE_ratio_51_119,NLO_avg,Avg_RP,ACEE_30_Avg_58,ACEE_500_Avg_58,ACPD_Avg_58
0,True,365049.8,450000,0.872973,1.084782,0.996407,0.868493,0.943979,1.239804,1.818179,4.403272,176.113495,-1.35737,-1.43416,3.533225
1,True,1146995.0,900000,0.409864,1.053705,0.992385,0.789081,0.900651,1.12919,1.801345,5.644146,170.309593,-1.024484,-1.262041,3.426573
2,True,649351.5,600000,0.499492,1.17163,1.009097,0.790234,0.875083,1.012485,1.893711,4.985178,173.056739,-0.95886,-0.945966,3.131511
3,True,924625.4,900000,0.317456,1.033807,0.998379,0.611282,0.888176,1.128938,1.798674,5.916575,171.322606,-1.165082,-1.250638,2.882109
4,True,552105.9,450000,0.63875,1.103302,1.001451,0.930498,0.890924,1.153246,1.802459,5.409683,170.150222,-0.894832,-1.295212,3.103951


In [16]:
# Check the dtypes and non-null counts of the synthetic table.
samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Observed              1000 non-null   bool   
 1   F_Time                1000 non-null   float64
 2   NDE_cycle             1000 non-null   object 
 3   diff_two_peaks_ratio  1000 non-null   float64
 4   amp_ratio             1000 non-null   float64
 5   pos_ratio             1000 non-null   float64
 6   energy_ratio          1000 non-null   float64
 7   NLE_ratio_51_17       1000 non-null   float64
 8   NLE_ratio_85_17       1000 non-null   float64
 9   NLE_ratio_51_119      1000 non-null   float64
 10  NLO_avg               1000 non-null   float64
 11  Avg_RP                1000 non-null   float64
 12  ACEE_30_Avg_58        1000 non-null   float64
 13  ACEE_500_Avg_58       1000 non-null   float64
 14  ACPD_Avg_58           1000 non-null   float64
dtypes: bool(1), float64(13

### Evaluate

The output of this function call is a number between 0 and 1 that indicates how similar the two tables are, with 0 being the worst and 1 the best possible score.

The evaluate function applies a collection of pre-configured metric functions and returns the average of the scores that the data obtained on each one of them. To explore the metrics in more detail, you can pass an additional argument, aggregate=False.


- cstest: This metric compares the distributions of all the categorical columns of the table by using a Chi-squared test and returns the average of the p-values obtained across all the columns. If the tables that you are evaluating do not contain any categorical columns the result will be nan.

- kstest: This metric compares the distributions of all the numerical columns of the table with a two-sample Kolmogorov–Smirnov test using the empirical CDF and returns the average of the p-values obtained across all the columns. If the tables that you are evaluating do not contain any numerical columns the result will be nan.

- logistic_detection: This metric tries to use a Logistic Regression classifier to detect whether each row is real or synthetic and then evaluates its performance using an Area under the ROC curve metric. The returned score is 1 minus the ROC AUC score obtained by the classifier.

- svc_detection: This metric tries to use a Support Vector Classifier to detect whether each row is real or synthetic and then evaluates its performance using an Area under the ROC curve metric. The returned score is 1 minus the ROC AUC score obtained by the classifier.


In [17]:
# Single aggregated similarity score: `samples` (synthetic) is passed first,
# `data` (real) second.
evaluate(samples, data)

0.6807934189707292

In [18]:
# Same comparison, but reporting each metric's score separately instead of
# the aggregated value.
evaluate(samples, data, aggregate=False)

{'cstest': 0.9790888009727357,
 'kstest': 0.3557752650436851,
 'logistic_detection': 0.6640078915528017,
 'svc_detection': 0.6147618961990219}

In [19]:
# Summary statistics of the real table's numeric columns.
data.describe()

Unnamed: 0,F_Time,diff_two_peaks_ratio,amp_ratio,pos_ratio,energy_ratio,NLE_ratio_51_17,NLE_ratio_85_17,NLE_ratio_51_119,NLO_avg,Avg_RP,ACEE_30_Avg_58,ACEE_500_Avg_58,ACPD_Avg_58
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,1350812.0,0.73777,1.031476,0.997398,0.9269,0.890681,1.114791,1.851555,5.108145,174.878014,-0.861326,-1.624466,3.468163
std,892130.9,0.539229,0.178927,0.009993,0.274695,0.038486,0.057093,0.072408,1.305638,6.346672,0.250543,0.60308,0.605883
min,200007.0,0.144147,0.656073,0.977985,0.394731,0.780283,0.999564,1.719235,3.061693,169.717689,-1.276355,-3.20222,2.375193
25%,514878.0,0.444218,0.919473,0.989698,0.798325,0.881052,1.091059,1.82414,4.296261,171.1647,-1.01434,-1.69359,3.061743
50%,1318034.0,0.589633,1.061223,1.001696,0.852229,0.892038,1.114762,1.845136,4.891617,172.727399,-0.8849,-1.521755,3.370729
75%,2103203.0,0.763296,1.151889,1.003158,1.030618,0.912962,1.140535,1.862064,6.164777,174.454983,-0.6878,-1.24822,3.820881
max,3245267.0,2.280959,1.365329,1.015363,1.559453,0.978877,1.235508,2.046896,7.862192,190.979294,-0.13764,-0.927155,4.804749


In [20]:
# Summary statistics of the synthetic table's numeric columns.
samples.describe()

Unnamed: 0,F_Time,diff_two_peaks_ratio,amp_ratio,pos_ratio,energy_ratio,NLE_ratio_51_17,NLE_ratio_85_17,NLE_ratio_51_119,NLO_avg,Avg_RP,ACEE_30_Avg_58,ACEE_500_Avg_58,ACPD_Avg_58
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1206709.0,0.716583,1.086077,0.996618,0.835553,0.876439,1.098204,1.837882,5.77628,174.955496,-0.899234,-1.627186,3.493639
std,783768.7,0.506858,0.139682,0.008866,0.249909,0.033401,0.054835,0.073164,1.221449,7.306767,0.316335,0.661203,0.514763
min,200007.0,0.112755,0.583504,0.977986,0.37963,0.761678,0.975166,1.278764,3.353651,169.717689,-1.415855,-3.543558,2.350011
25%,515867.7,0.430366,1.003161,0.991352,0.671391,0.869018,1.060763,1.810145,4.881068,170.583795,-1.06532,-1.626998,3.090777
50%,1058246.0,0.566239,1.081034,0.997024,0.784604,0.883422,1.100097,1.834802,5.572508,171.890762,-0.964512,-1.395388,3.461519
75%,1751087.0,0.729057,1.157594,1.001944,0.901251,0.896033,1.132504,1.863616,6.535208,175.510024,-0.867355,-1.236361,3.875832
max,3245261.0,2.758983,1.551715,1.015363,1.662994,0.952506,1.253202,2.717417,10.279674,207.893961,1.416574,-0.789245,4.877341


In [21]:
# Convert NDE_cycle to integers before exporting.
samples['NDE_cycle'] = samples['NDE_cycle'].astype(int)

# Write the synthetic table to CSV without the row index; the target path is
# built from the parent of the current working directory.
samples.to_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        '../Data/Merged_data/CopulaGAN_simulated_data_survival_2.csv',
    ),
    index=False,
)

In [22]:
# Display the synthetic table after the NDE_cycle conversion (the rendered
# output is truncated to the first and last rows).
samples

Unnamed: 0,Observed,F_Time,NDE_cycle,diff_two_peaks_ratio,amp_ratio,pos_ratio,energy_ratio,NLE_ratio_51_17,NLE_ratio_85_17,NLE_ratio_51_119,NLO_avg,Avg_RP,ACEE_30_Avg_58,ACEE_500_Avg_58,ACPD_Avg_58
0,True,3.650498e+05,450000,0.872973,1.084782,0.996407,0.868493,0.943979,1.239804,1.818179,4.403272,176.113495,-1.357370,-1.434160,3.533225
1,True,1.146995e+06,900000,0.409864,1.053705,0.992385,0.789081,0.900651,1.129190,1.801345,5.644146,170.309593,-1.024484,-1.262041,3.426573
2,True,6.493515e+05,600000,0.499492,1.171630,1.009097,0.790234,0.875083,1.012485,1.893711,4.985178,173.056739,-0.958860,-0.945966,3.131511
3,True,9.246254e+05,900000,0.317456,1.033807,0.998379,0.611282,0.888176,1.128938,1.798674,5.916575,171.322606,-1.165082,-1.250638,2.882109
4,True,5.521059e+05,450000,0.638750,1.103302,1.001451,0.930498,0.890924,1.153246,1.802459,5.409683,170.150222,-0.894832,-1.295212,3.103951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,True,4.542517e+05,900000,0.273958,1.063727,0.988327,0.567723,0.891147,1.116070,1.790089,5.936106,170.379233,-1.047098,-1.549214,3.105398
996,True,2.449483e+05,450000,0.736411,1.167078,0.994594,0.768238,0.869078,1.137601,1.756084,4.342214,169.717689,-0.981540,-1.089090,3.037435
997,False,1.953314e+06,600000,0.789224,1.122707,0.989113,0.963463,0.886972,1.088569,1.842581,4.514627,175.652292,-0.899437,-1.389575,3.826512
998,False,5.054954e+05,300000,0.624349,1.075128,1.003698,0.793756,0.814651,1.002721,1.808275,5.530732,172.567202,-1.149893,-1.645774,3.837561


# 4. Save and load the synthesizer

To save a trained CopulaGAN synthesizer, use

`model.save('my_model.pkl')`

To restore a saved synthesizer, use

`loaded = CopulaGAN.load('my_model.pkl')`

`new_data = loaded.sample(200)`

In [23]:
# Persist the fitted synthesizer; the relative file name resolves against the
# current working directory.
model.save('CopulaGAN_survival_2.pkl')