In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sdv.tabular import CopulaGAN
from sdv.evaluation import evaluate
from sdv.constraints import UniqueCombinations, GreaterThan


import os, glob

In [2]:
# Load the merged survival table; the first CSV column becomes the index.
# The path is built from the parent of the current working directory plus a
# leading '../', so it resolves two levels above the working directory.
data = pd.read_csv(
    os.path.join(os.path.dirname(os.getcwd()), '../Data/Merged_data/Survival_df.csv'),
    index_col=0,
)
# Disabled toggle: uncomment to drop the NLE_ratio_119_17 feature before modelling.
#data.drop(columns=['NLE_ratio_119_17'],inplace=True)

In [3]:
# Cast NDE_cycle to string (object dtype in the frame summary).
# NOTE(review): presumably done so the model treats the inspection cycle as a
# categorical field rather than a continuous one — confirm.
data['NDE_cycle'] = data['NDE_cycle'].astype('str')

In [4]:
# Summarise the loaded frame: index, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 41C to 49C
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Observed              16 non-null     bool   
 1   F_Time                16 non-null     float64
 2   NDE_cycle             16 non-null     object 
 3   diff_two_peaks_ratio  16 non-null     float64
 4   amp_ratio             16 non-null     float64
 5   pos_ratio             16 non-null     float64
 6   energy_ratio          16 non-null     float64
 7   NLE_ratio_51_17       16 non-null     float64
 8   NLE_ratio_85_17       16 non-null     float64
 9   NLE_ratio_119_17      16 non-null     float64
 10  NLO_avg               16 non-null     float64
 11  Avg_RP                16 non-null     float64
 12  ACEE_30_Avg_58        16 non-null     float64
 13  ACEE_500_Avg_58       16 non-null     float64
 14  ACPD_Avg_58           16 non-null     float64
dtypes: bool(1), float64(13), ob

# CopulaGAN

The sdv.tabular.CopulaGAN model is a variation of the CTGAN model that takes advantage of the CDF-based transformation applied by Gaussian copulas to make the underlying CTGAN model's task of learning the data easier.

# Model the data

## tuning distribution and fitting model

In [5]:
# Configure the CopulaGAN synthesizer: 5000 training epochs, with the listed
# columns pinned to a 'gaussian_kde' marginal distribution. Columns not listed
# here are left to the model's default distribution selection.
model = CopulaGAN(
    epochs=5000,
    field_distributions=dict.fromkeys(
        [
            'diff_two_peaks_ratio',
            'energy_ratio',
            'ACEE_500_Avg_58',
            'amp_ratio',
            'NLE_ratio_51_17',
            'NLE_ratio_85_17',
            'NLE_ratio_119_17',
        ],
        'gaussian_kde',
    ),
)



In [6]:
# Fit the configured CopulaGAN on the full `data` frame.
# NOTE(review): the cell output shows warning fragments that appear to come from
# the distribution-fitting code and a CUDA device-count check — presumably
# harmless, but confirm they do not indicate a failed marginal fit.
model.fit(data)

  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.
  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu
  return torch._C._cuda_getDeviceCount() > 0


In [7]:
# Show the univariate distribution the fitted model reports for each column.
# The columns pinned via `field_distributions` are expected to report GaussianKDE.
model.get_distributions()

{'Observed': 'copulas.univariate.gaussian.GaussianUnivariate',
 'F_Time': 'copulas.univariate.beta.BetaUnivariate',
 'diff_two_peaks_ratio': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'amp_ratio': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'pos_ratio': 'copulas.univariate.gamma.GammaUnivariate',
 'energy_ratio': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'NLE_ratio_51_17': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'NLE_ratio_85_17': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'NLE_ratio_119_17': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'NLO_avg': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'Avg_RP': 'copulas.univariate.gamma.GammaUnivariate',
 'ACEE_30_Avg_58': 'copulas.univariate.student_t.StudentTUnivariate',
 'ACEE_500_Avg_58': 'copulas.univariate.gaussian_kde.GaussianKDE',
 'ACPD_Avg_58': 'copulas.univariate.gamma.GammaUnivariate'}

# Generate synthetic data

In [8]:
# Draw 1000 synthetic rows from the fitted model.
samples = model.sample(1000)

In [9]:
# Preview the first rows of the synthetic table.
samples.head()

Unnamed: 0,Observed,F_Time,NDE_cycle,diff_two_peaks_ratio,amp_ratio,pos_ratio,energy_ratio,NLE_ratio_51_17,NLE_ratio_85_17,NLE_ratio_119_17,NLO_avg,Avg_RP,ACEE_30_Avg_58,ACEE_500_Avg_58,ACPD_Avg_58
0,True,469617.6,600000,0.652425,1.246098,0.997119,0.90457,0.920096,1.088206,0.467396,5.73533,175.609656,-0.770646,-1.285146,3.326268
1,True,325823.7,450000,0.720146,1.262309,0.993423,0.980101,0.85737,1.101848,0.467855,3.645883,169.717689,-0.581949,-1.358144,2.985542
2,False,1163226.0,750000,0.563658,1.160935,0.99658,0.75048,0.884058,1.077712,0.470394,7.851831,170.98232,-0.76957,-1.502093,3.510375
3,True,1845031.0,900000,0.514807,1.03714,1.000231,0.737361,0.907228,1.150152,0.488943,5.009504,171.280515,-1.024741,-1.227602,2.845183
4,True,1453082.0,900000,0.379343,1.170794,0.998513,0.826376,0.896547,1.16272,0.488752,5.828494,170.252894,-0.828375,-1.248879,3.067759


In [10]:
# Summarise the synthetic frame (columns, non-null counts, dtypes) so it can be
# compared against the summary of the real data.
samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Observed              1000 non-null   bool   
 1   F_Time                1000 non-null   float64
 2   NDE_cycle             1000 non-null   object 
 3   diff_two_peaks_ratio  1000 non-null   float64
 4   amp_ratio             1000 non-null   float64
 5   pos_ratio             1000 non-null   float64
 6   energy_ratio          1000 non-null   float64
 7   NLE_ratio_51_17       1000 non-null   float64
 8   NLE_ratio_85_17       1000 non-null   float64
 9   NLE_ratio_119_17      1000 non-null   float64
 10  NLO_avg               1000 non-null   float64
 11  Avg_RP                1000 non-null   float64
 12  ACEE_30_Avg_58        1000 non-null   float64
 13  ACEE_500_Avg_58       1000 non-null   float64
 14  ACPD_Avg_58           1000 non-null   float64
dtypes: bool(1), float64(13

### Evaluate

The output of this function call is a number between 0 and 1 that indicates how similar the two tables are, with 0 being the worst and 1 the best possible score.

The evaluate function applies a collection of pre-configured metric functions and returns the average of the scores that the data obtained on each one of them. To explore the metrics in more detail, you can pass an additional argument, aggregate=False.


- cstest: This metric compares the distributions of all the categorical columns of the table by using a Chi-squared test and returns the average of the p-values obtained across all the columns. If the tables that you are evaluating do not contain any categorical columns the result will be nan.

- kstest: This metric compares the distributions of all the numerical columns of the table with a two-sample Kolmogorov–Smirnov test using the empirical CDF and returns the average of the p-values obtained across all the columns. If the tables that you are evaluating do not contain any numerical columns the result will be nan.

- logistic_detection: This metric tries to use a Logistic Regression classifier to detect whether each row is real or synthetic and then evaluates its performance using an Area under the ROC curve metric. The returned score is 1 minus the ROC AUC score obtained by the classifier.

- svc_detection: This metric tries to use a Support Vector Classifier to detect whether each row is real or synthetic and then evaluates its performance using an Area under the ROC curve metric. The returned score is 1 minus the ROC AUC score obtained by the classifier.


In [11]:
# Single aggregated similarity score; the synthetic table is passed first and
# the real table second.
evaluate(samples, data)

0.6875553622283976

In [12]:
# Same comparison as the aggregated call, but with aggregate=False so the score
# of each individual metric is returned instead of their average.
evaluate(samples, data, aggregate=False)

{'cstest': 0.9728890374534073,
 'kstest': 0.5883046611549878,
 'logistic_detection': 0.5663601725477974,
 'svc_detection': 0.6303247359135583}

In [13]:
# Summary statistics of the real data, used as the reference when inspecting
# the synthetic table's statistics.
data.describe()

Unnamed: 0,F_Time,diff_two_peaks_ratio,amp_ratio,pos_ratio,energy_ratio,NLE_ratio_51_17,NLE_ratio_85_17,NLE_ratio_119_17,NLO_avg,Avg_RP,ACEE_30_Avg_58,ACEE_500_Avg_58,ACPD_Avg_58
count,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
mean,1417697.0,0.788127,1.031994,0.998382,0.934331,0.899475,1.121816,0.484558,5.15668,175.298605,-0.818316,-1.670322,3.403606
std,965361.5,0.61,0.203137,0.008879,0.312578,0.028879,0.042745,0.021237,1.405015,7.172232,0.267701,0.675367,0.660872
min,200007.0,0.144147,0.656073,0.982972,0.394731,0.844824,1.037133,0.432253,3.061693,169.717689,-1.276355,-3.20222,2.375193
25%,502256.5,0.432558,0.892097,0.991597,0.785142,0.887572,1.097819,0.477415,4.17188,170.846385,-0.926859,-1.696258,2.890848
50%,1381439.0,0.630974,1.09054,1.000778,0.849441,0.893242,1.117198,0.488176,4.925884,172.326261,-0.8629,-1.467427,3.255522
75%,2146388.0,0.83315,1.180048,1.00303,1.130465,0.913813,1.1391,0.494032,6.210666,174.994399,-0.661134,-1.262696,3.8768
max,3245267.0,2.280959,1.365329,1.015363,1.559453,0.978877,1.235508,0.530102,7.862192,190.979294,-0.13764,-0.927155,4.804749


In [14]:
# Summary statistics of the synthetic data, to compare column by column with
# the statistics of the real table.
samples.describe()

Unnamed: 0,F_Time,diff_two_peaks_ratio,amp_ratio,pos_ratio,energy_ratio,NLE_ratio_51_17,NLE_ratio_85_17,NLE_ratio_119_17,NLO_avg,Avg_RP,ACEE_30_Avg_58,ACEE_500_Avg_58,ACPD_Avg_58
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1493591.0,0.747963,1.126408,0.999036,0.956255,0.903087,1.126805,0.485662,5.094756,174.733792,-0.726169,-1.654611,3.333669
std,961498.8,0.506882,0.147768,0.008894,0.279098,0.029751,0.041702,0.018402,1.412501,8.201461,0.231625,0.688212,0.478492
min,200007.0,0.073161,0.642264,0.976361,0.41247,0.812627,1.017602,0.414511,3.061694,169.717689,-1.296343,-3.725753,2.214484
25%,585532.6,0.424819,1.04278,0.99241,0.746248,0.888249,1.099947,0.477159,3.926434,170.364963,-0.877894,-1.778635,3.017262
50%,1438912.0,0.596902,1.1433,0.999047,0.872029,0.899531,1.127493,0.485894,4.766037,171.241248,-0.784524,-1.409826,3.261537
75%,2248127.0,0.79786,1.230912,1.005089,1.102173,0.912262,1.149337,0.493869,6.205404,173.750108,-0.629836,-1.23025,3.62541
max,3245267.0,2.369119,1.504981,1.023522,1.717411,1.01124,1.272543,0.549706,7.862192,209.657004,0.137718,-0.62861,5.144383


In [15]:
# Convert NDE_cycle to integers, then write the synthetic table to CSV without
# the index column. The cast modifies `samples` in place.
samples['NDE_cycle'] = samples['NDE_cycle'].astype('int')
samples.to_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        '../Data/Merged_data/CopulaGAN_simulated_data_survival_v2.csv',
    ),
    index=False,
)

In [16]:
# Display the synthetic table; the rendered output elides the middle rows.
samples

Unnamed: 0,Observed,F_Time,NDE_cycle,diff_two_peaks_ratio,amp_ratio,pos_ratio,energy_ratio,NLE_ratio_51_17,NLE_ratio_85_17,NLE_ratio_119_17,NLO_avg,Avg_RP,ACEE_30_Avg_58,ACEE_500_Avg_58,ACPD_Avg_58
0,True,4.696176e+05,600000,0.652425,1.246098,0.997119,0.904570,0.920096,1.088206,0.467396,5.735330,175.609656,-0.770646,-1.285146,3.326268
1,True,3.258237e+05,450000,0.720146,1.262309,0.993423,0.980101,0.857370,1.101848,0.467855,3.645883,169.717689,-0.581949,-1.358144,2.985542
2,False,1.163226e+06,750000,0.563658,1.160935,0.996580,0.750480,0.884058,1.077712,0.470394,7.851831,170.982320,-0.769570,-1.502093,3.510375
3,True,1.845031e+06,900000,0.514807,1.037140,1.000231,0.737361,0.907228,1.150152,0.488943,5.009504,171.280515,-1.024741,-1.227602,2.845183
4,True,1.453082e+06,900000,0.379343,1.170794,0.998513,0.826376,0.896547,1.162720,0.488752,5.828494,170.252894,-0.828375,-1.248879,3.067759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,False,2.825437e+06,900000,0.552311,1.175387,0.997886,0.907612,0.882277,1.120056,0.485095,7.860860,170.004194,-0.771458,-0.896907,4.420880
996,False,2.665376e+06,0,1.976501,1.194383,0.993424,1.446834,0.895328,1.135725,0.482671,3.692038,186.416143,-0.131110,-3.008475,4.146605
997,True,5.166930e+05,750000,0.306940,1.017359,1.002774,0.624054,0.916610,1.100316,0.483606,6.820040,174.078034,-0.866947,-1.642809,3.278346
998,True,4.226425e+05,450000,0.681572,1.276485,1.002120,1.314244,0.984665,1.245225,0.529420,3.785093,170.268408,-0.876770,-1.520067,3.746101


# 4. Save and load the synthesizer

To save a trained CopulaGAN synthesizer, use

`model.save('my_model.pkl')`

To restore a saved synthesizer, use

`loaded = CopulaGAN.load('my_model.pkl')`

`new_data = loaded.sample(200)`

In [17]:
# Persist the fitted model to 'CopulaGAN_survival.pkl'. The path is relative,
# so the file is written to the current working directory.
model.save('CopulaGAN_survival.pkl')