## Import packages

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier

from fasd.utils import set_seed
from fasd import TabularFASD

seed = 123
set_seed(seed)

## Load data

In [2]:
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
target_col = y.name
X = pd.concat((X, y), axis=1)

## Generate synthetic data

In [None]:
generator = TabularFASD(
    target_column=target_col,
    representation_dim=100,
    predictor_hidden_layers=[],
    predictor_nonlin="relu",
    representations_nonlin="tanh",
    predictor_dropout=0,
    decoder_hidden_layers=[100],
    decoder_nonlin="relu",
    decoder_dropout=0,
    vae_encoder_hidden_layers=[128],
    vae_encoder_nonlin="relu",
    vae_encoder_dropout=0,
    vae_decoder_hidden_layers=[128],
    vae_decoder_nonlin="relu",
    vae_decoder_dropout=0,
    vae_embedding_size=128,
    random_state=seed,
)
generator.fit(X)
syn = generator.generate(len(X))

100%|██████████| 100/100 [00:00<00:00, 148.81it/s]
100%|██████████| 100/100 [00:03<00:00, 30.59it/s]
100%|██████████| 100/100 [00:00<00:00, 148.90it/s]


## Evaluate Machine Learning Efficacy

In [4]:
yy = X[target_col].copy()
xx = X.drop(target_col, axis=1)
y_syn = syn[target_col].copy()
X_syn = syn.drop(target_col, axis=1)

X_tr, X_te, y_tr, y_te = train_test_split(
    xx, yy, stratify=yy, train_size=0.7, random_state=seed
)
X_syn_tr, X_syn_te, y_syn_tr, y_syn_te = train_test_split(
    X_syn, y_syn, stratify=y_syn, train_size=0.7, random_state=seed
)

model = HistGradientBoostingClassifier(max_depth=3)
model.fit(X_tr, y_tr)
preds = model.predict_proba(X_te)[:, 1]
score = roc_auc_score(y_te, preds)
print(f"Train Real Test Real ROCAUC: {score}")

model = HistGradientBoostingClassifier(max_depth=3)
model.fit(X_syn_tr, y_syn_tr)
preds = model.predict_proba(X_te)[:, 1]
score = roc_auc_score(y_te, preds)
print(f"Train Synthetic Test Real ROCAUC: {score}")

Train Real Test Real ROCAUC: 0.9981016355140186
Train Synthetic Test Real ROCAUC: 0.9668516355140186
