# ABSEHRD package demo
This notebook demonstrates Automated Brewering Synthetic Electronic Health Record Data (ABSEHRD) package functionality on a toy dataset.

## Setup

### Import python and ABSEHRD modules

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from preprocessor import Preprocessor
from corgan import Corgan
from realism import Realism
from privacy import Privacy

### Set parameters for the toy dataset and demo

In [2]:
# --- toy dataset -----------------------------------------------------------
n = 10000                            # number of rows in the toy dataset
count_min = 5                        # smallest value of the 'count' column (inclusive)
count_max = 19                       # largest value of the 'count' column (inclusive)
constant_value = 'helloworld'        # single value repeated in the 'constant' column
binary_A = 'A'                       # first level of the 'binaryAB' column
binary_B = 'B'                       # second level of the 'binaryAB' column
categorical_values = ['X','Y','Z']   # levels sampled for the 'categorical' column
missing_value = -99999               # sentinel passed to the ABSEHRD objects/calls for missing entries

# --- reproducibility -------------------------------------------------------
# The toy data and the train/test split are drawn from NumPy's global random
# generator; seed it so that Restart & Run All reproduces the same dataset.
# NOTE(review): this seeds NumPy only. If Corgan training draws from another
# random source, it needs its own seed -- confirm against the Corgan module.
seed = 42
np.random.seed(seed)

# --- synthetic data generation and validation ------------------------------
n_gen = round(n/2)                   # number of synthetic samples to generate
outcome = 'binary01'                 # column used as the outcome in the realism checks

# --- ABSEHRD objects -------------------------------------------------------
pre = Preprocessor(missing_value=missing_value)
rea = Realism()
pri = Privacy()
cor = Corgan()

### Generate the toy dataset

In [3]:
# Column names, listed in the same order in which the columns are stacked below.
names = np.array(['constant','binary01', 'binaryAB', 'categorical','count','continuous'])

# One vector per variable type. The random draws are kept in this order so the
# sequence of values taken from NumPy's global generator is unchanged.
v_constant = np.full(n, constant_value)
v_binary01 = np.random.randint(0, 2, size=n)
# n-1 copies of the first level followed by a single instance of the second level
v_binaryAB = np.append(np.full(n - 1, binary_A), binary_B)
v_categorical = np.random.choice(categorical_values, n)
# randint excludes the upper bound, hence the +1 to make count_max attainable
v_count = np.random.randint(count_min, count_max + 1, size=n)
v_continuous = np.random.random(n)

# Stacking string and numeric vectors yields a single string-typed matrix
# (see the quoted values in the printed output).
columns = (v_constant, v_binary01, v_binaryAB, v_categorical, v_count, v_continuous)
x = np.column_stack(columns)
print(x)

[['helloworld' '1' 'A' 'Y' '14' '0.24115798945101719']
 ['helloworld' '1' 'A' 'X' '16' '0.7739167315409661']
 ['helloworld' '1' 'A' 'Z' '13' '0.733591719689214']
 ...
 ['helloworld' '1' 'A' 'X' '8' '0.15631542580525104']
 ['helloworld' '0' 'A' 'X' '10' '0.4571789089675996']
 ['helloworld' '1' 'B' 'Z' '12' '0.937723003397054']]


### Split into training and testing set

In [4]:
# Randomly assign half of the rows to training; every remaining row goes to testing.
n_total = len(x)
n_subset = round(n_total * 0.5)
idx_trn = np.random.choice(n_total, n_subset, replace=False)   # no row is picked twice
idx_tst = np.setdiff1d(np.arange(n_total), idx_trn)            # complement of the training indices
x_trn, x_tst = x[idx_trn, :], x[idx_tst, :]

print('Number of training samples: ', len(x_trn), sep='')
print('Number of testing samples: ', len(x_tst), sep='')

Number of training samples: 5000
Number of testing samples: 5000


## Preprocessing

### Save metadata for restoring data format after synthetic data generation

In [5]:
# Derive per-variable metadata from the training rows only; it is reused later
# to encode the data and to restore the synthetic samples to the raw format.
meta = pre.get_metadata(arr=x_trn, header=names)

# Field legend on the first line, followed by one metadata record per variable.
print('var_name, var_type, min, max, zero, one, unique, missing', meta, sep='\n')

var_name, var_type, min, max, zero, one, unique, missing
[('constant', 'constant', 0.00000000e+00,  0.        , 'helloworld', '', '', False)
 ('binary01', 'binary', 0.00000000e+00,  0.        , '0', '1', '', False)
 ('binaryAB', 'binary', 0.00000000e+00,  0.        , 'A', 'B', '', False)
 ('categorical', 'categorical', 0.00000000e+00,  0.        , '', '', 'X,Y,Z', False)
 ('count', 'count', 5.00000000e+00, 19.        , '', '', '', False)
 ('continuous', 'continuous', 5.41996356e-05,  0.99936697, '', '', '', False)]


### Encode raw data matrix in preparation for training synthetic data generator
Note that count and continuous variables have been scaled between 0 and 1 while constant, categorical, and binary have been one-hot encoded.

In [6]:
# Encode the raw training matrix using the saved metadata. The result is
# indexed below by 'x' (encoded matrix) and 'header' (encoded column names).
d_trn = pre.get_discretized_matrix(x_trn, meta, names)

print('Formatted matrix:', d_trn['x'], sep='\n')
print('\nHeader for formatted matrix:', d_trn['header'], sep='\n')

Formatted matrix:
[[0.         0.         0.         ... 0.         0.48734487 0.        ]
 [0.         0.         1.         ... 0.         0.48557903 0.        ]
 [0.         0.         0.         ... 0.         0.3206077  0.        ]
 ...
 [0.         0.         0.         ... 0.         0.46792516 0.        ]
 [0.         0.         1.         ... 0.         0.1298717  0.        ]
 [0.         0.         1.         ... 0.         0.28453527 0.        ]]

Header for formatted matrix:
['constant__constant' 'constant__-99999' 'binary01__binary01'
 'binary01__-99999' 'binaryAB__binaryAB' 'binaryAB__-99999'
 'categorical__0' 'categorical__1' 'categorical__2' 'categorical__3'
 'count__count' 'count__-99999' 'continuous__continuous'
 'continuous__-99999']


## Generation

### Train CorGAN model 

In [None]:
# Fit the generator on the encoded training matrix: 100 pre-training epochs
# followed by 100 training epochs, on a single CPU, with debug output enabled.
model = cor.train(
    x=d_trn['x'],
    n_cpu=1,
    debug=True,
    n_epochs_pretrain=100,
    n_epochs=100,
)

Pre-training: 100%|██████████| 100/100 [00:10<00:00,  9.77 epochs/s, [A loss: 1.041]]
Training:  90%|█████████ | 90/100 [00:37<00:04,  2.39 epochs/s, TRAIN: [Loss_D: -0.009] [Loss_G: 0.012] [Loss_D_real: 0.019] [Loss_D_fake 0.010] | TEST: [A loss: 1.04] [real accuracy: 95.90] [fake accuracy: 23.44]]   

### Generate synthetic samples

In [None]:
# Draw n_gen synthetic samples from the trained model.
# NOTE(review): `s` is presumably in the same encoded layout as d_trn['x'],
# since it is later restored with d_trn['header'] -- confirm against Corgan.generate.
s = cor.generate(model, n_gen)
print(s)

### Use metadata to restore original formatting

In [None]:
# Map the generated samples back to the raw format using the saved metadata
# and the header of the encoded matrix; the restored matrix is read from f['x'].
f = pre.restore_matrix(
    arr=s,
    meta=meta,
    header=d_trn['header'],
)

# Show the restored synthetic samples next to the full real dataset.
print('Synthetic samples:', f['x'], sep='\n')
print('\nReal samples:', x, sep='\n')

## Realism

### Compare univariate frequency for real and synthetic features

In [None]:
# Univariate frequency comparison between the real (train/test) and synthetic features.
res_frq = rea.feature_frequency(
    mat_f_r_trn=x_trn,
    mat_f_r_tst=x_tst,
    mat_f_s=f['x'],
    header=names,
    missing_value=missing_value,
)

summary_frq = rea.summarize(res_frq)
print(summary_frq)

# Plot the same result twice: first with labels, then without.
for labels_on in (True, False):
    res_plt = rea.plot(res_frq, labels_on=labels_on)

### Compare effect sizes
Compare effect sizes from a logistic regression model for a binary outcome, trained on real data versus trained on synthetic data.

In [None]:
# Effect-size comparison for the chosen outcome between real and synthetic data.
res_effect = rea.feature_effect(
    mat_f_r_trn=x_trn,
    mat_f_r_tst=x_tst,
    mat_f_s=f['x'],
    header=names,
    outcome=outcome,
    missing_value=missing_value,
    scaled=False,
)

summary_effect = rea.summarize(res_effect)
print(summary_effect)

# Plot the same result twice: first with labels, then without.
for labels_on in (True, False):
    res_plt = rea.plot(res_effect, labels_on=labels_on)

### Compare predictive performance
* Real: use real dataset to train predictive model and test on a separate real dataset
* GAN-train: use synthetic dataset to train predictive model and test on a real dataset
* GAN-test: use real dataset to train predictive model and test on the synthetic dataset

In [None]:
# Announce which outcome column the predictive models will be trained on.
print("Train models to predict outcome '" + str(outcome) + "' from real and synthetic datasets")

In [None]:
# Train and evaluate the predictive models ('mlp', 100 epochs) on the real
# and synthetic datasets for the chosen outcome.
res_gan = rea.gan_train_test(
    mat_f_r_trn=x_trn,
    mat_f_r_tst=x_tst,
    mat_f_s=f['x'],
    header=names,
    outcome=outcome,
    missing_value=missing_value,
    n_epoch=100,
    model_type='mlp',
)

summary_gan = rea.summarize(res_gan)
print(summary_gan)
res_plt = rea.plot(res_gan)

## Privacy

## Nearest neighbors
Ensure that synthetic dataset is not a copy of the real dataset by comparing distances between pairs of real and synthetic samples
* Real-real: distance between randomly selected pairs of real samples
* Real-synthetic: distance between pairs of real and synthetic samples
* Real-probabilistic: distance between a real sample and sampled binary vector where each column is sampled from a binomial where the frequency equals that in the real training set
* Real-random: distance between a real sample and a randomly sampled binary vector

Calculate, summarize, and plot nearest neighbor distributions:

In [None]:
# Nearest-neighbor distances (Euclidean) between the real training samples
# and the restored synthetic samples.
res_nn = pri.assess_memorization(
    x_trn,
    f['x'],
    missing_value=missing_value,
    header=names,
    metric='euclidean',
    debug=False,
)

In [None]:
# Text summary first, then the plot of the nearest-neighbor result.
summary_nn = pri.summarize(res_nn)
print(summary_nn)
res_plt = pri.plot(res_nn)

## Membership inference
Membership inference refers to the ability to determine whether a given data sample was used to train a model of interest. In the case of synthetic data, the risk of accurate membership inference given a sample of synthetic data provides a metric for the privacy risk that the synthetic dataset poses to individuals whose data was used to train the synthetic data generator.

Risk of membership inference can be assessed in multiple scenarios with differing assumptions about what data is available to the attacker.

### Distance-based thresholding
Choi et al. (2017) and Torfi et al. (2020) calculated the distance between synthetic and real samples. Real samples were drawn from the training dataset for the synthetic data generator and from a separate testing set. A pair of synthetic and real samples was predicted to be a match if the distance between them was within a specified threshold. Predictions and labels were then compared to derive performance metrics for the membership inference attack.

In [None]:
# Membership inference attack of type 'torfi', run on a single CPU, using the
# real training/testing samples and the restored synthetic samples.
res_mi_torfi = pri.membership_inference(
    mat_f_r_trn=x_trn,
    mat_f_r_tst=x_tst,
    mat_f_s=f['x'],
    header=names,
    missing_value=missing_value,
    mi_type='torfi',
    n_cpu=1,
)

In [None]:
# Text summary first, then the plot of the membership inference result.
summary_mi_torfi = pri.summarize(res_mi_torfi)
print(summary_mi_torfi)
res_plt = pri.plot(res_mi_torfi)