# ⭐️ Synthetic Data Generation

It is recommended to run this notebook in a Google Colab environment.

The notebook under Colab environment: https://colab.research.google.com/drive/1DVFFCmSvpkftDJpjBzm3cgwbuZAqhNPc

Install all the required packages.

In [None]:
pip install sdv



Import all the required packages.


In [None]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer, TVAESynthesizer
from sdv.evaluation.single_table import (
    get_column_plot,
    evaluate_quality,
    run_diagnostic,
)

# configure pandas settings for the whole notebook
# turn off pandas' chained-assignment warning
pd.options.mode.chained_assignment = None
# never truncate rows or columns when a frame is displayed
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# opt in to pandas' "future.no_silent_downcasting" behaviour
pd.set_option("future.no_silent_downcasting", True)

In [None]:
def synthesize_data(synthesizer_type, metadata, df, no_samples, synthesizer_name):
    """Fit a synthesizer on ``df``, draw a sample from it and save it as CSV.

    Args:
        synthesizer_type: Callable invoked as ``synthesizer_type(metadata)``;
            the object it returns must provide ``fit`` and ``sample``.
        metadata: Passed unchanged to ``synthesizer_type``.
        df: Training data handed to ``fit``.
        no_samples: Forwarded to ``sample`` as ``num_rows``.
        synthesizer_name: Prefix of the output file name.

    Returns:
        The object returned by ``sample``. As a side effect it is written,
        without its index, to ``"{synthesizer_name}_synthetic_data.csv"``
        (a relative path, so it lands in the current working directory).
    """
    model = synthesizer_type(metadata)
    model.fit(df)

    generated = model.sample(num_rows=no_samples)

    # Persist the sample so later cells can reload it without re-training.
    output_file = f"{synthesizer_name}_synthetic_data.csv"
    generated.to_csv(output_file, index=False)
    return generated

Upload the preprocessed dataset to the Colab environment and load it.

In [None]:
# absolute path the preprocessed dataset is read from; the file must already
# have been uploaded to this location
data_path = "/content/preds_preprocessed_endo_data.csv"

In [None]:
# load the preprocessed dataset into a DataFrame
df = pd.read_csv(data_path)

Define the subsets of variables from the dataset that the synthetic data should be generated on.

In [None]:
# features for non-causal and manual bias correction modelling
# (the last entry, "has_endometriosis", is the target column)
prediction_cols_endo = [
    "pelvic_pain_frequency_between_periods",
    "deep_vaginal_pain_during_intercourse",
    "painful_bowel_movements",
    "unable_to_cope_with_pain",
    "experienced_infertility",
    "family_history_endometriosis_prediction",
    "pelvic_pain_worst",
    "has_endometriosis",
]

# features for automatic bias correction modelling: the same predictors, then
# the two treatment columns, with the target column kept last
prediction_cols_endo_with_treatments = [
    *prediction_cols_endo[:-1],
    "takes_hormones_for_pain",
    "takes_presc_painkillers",
    prediction_cols_endo[-1],
]

In [None]:
# restrict the dataset to each feature set (target column included)
df_base = df[prediction_cols_endo]
df_with_treatments = df[prediction_cols_endo_with_treatments]

Split dataset into training and test sets, so that the synthetic data is generated only on the training dataset.

In [None]:
# separate the predictors from the target column for the base feature set
X_base = df_base.drop(columns=["has_endometriosis"])
y_base = df_base["has_endometriosis"]

In [None]:
# hold out 30% of the rows as a test set; random_state is fixed so the split
# is the same on every run
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(
    X_base, y_base, test_size=0.3, random_state=42
)

In [None]:
# re-attach the target to the training predictors; this training-only frame is
# what the base synthesizers are fitted on
df_train_base = pd.concat([X_train_base, y_train_base], axis=1)

In [None]:
# separate the predictors from the target column for the feature set that
# includes the treatment variables
X_with_treatments = df_with_treatments.drop(columns=["has_endometriosis"])
y_with_treatments = df_with_treatments["has_endometriosis"]

In [None]:
# same test_size and random_state as the split of the base feature set
(
    X_train_with_treatments,
    X_test_with_treatments,
    y_train_with_treatments,
    y_test_with_treatments,
) = train_test_split(
    X_with_treatments, y_with_treatments, test_size=0.3, random_state=42
)

In [None]:
# re-attach the target to the training predictors; this training-only frame is
# what the with-treatments synthesizer is fitted on
df_train_with_treatments = pd.concat(
    [X_train_with_treatments, y_train_with_treatments], axis=1
)

In [None]:
# define metadata; used by sdv package
# (detected from the training frames only, one Metadata object per feature set)

endo_metadata_base = Metadata.detect_from_dataframe(
    data=df_train_base, table_name="endometriosis_dataset_base"
)

endo_metadata_with_treatments = Metadata.detect_from_dataframe(
    data=df_train_with_treatments, table_name="endometriosis_dataset_with_treatments"
)

In [None]:
# number of data points to be generated by each synthetic data generator;
# the value is also embedded in the names of the CSV files the generator cells write
sample_no = 10000

## Synthetic data generated by CTGAN

In [None]:
# fit CTGAN on the base training frame and sample `sample_no` rows;
# synthesize_data also writes the sample to
# "ctgan_selected_features_exp_<sample_no>_synthetic_data.csv"
ctgan_synthetic_data = synthesize_data(
    CTGANSynthesizer,
    endo_metadata_base,
    df_train_base,
    sample_no,
    f"ctgan_selected_features_exp_{sample_no}",
)



## Synthetic data generated by TVAE


In [None]:
# fit TVAE on the base training frame and sample `sample_no` rows;
# synthesize_data also writes the sample to
# "tvae_selected_features_exp_<sample_no>_synthetic_data.csv"
tvae_synthetic_data = synthesize_data(
    TVAESynthesizer,
    endo_metadata_base,
    df_train_base,
    sample_no,
    f"tvae_selected_features_exp_{sample_no}",
)

In [None]:
# with treatment features: fit TVAE on the training frame that includes the
# treatment columns and sample `sample_no` rows; synthesize_data also writes the
# sample to "tvae_selected_features_with_treatments_exp_<sample_no>_synthetic_data.csv"
tvae_synthetic_data_with_treatments = synthesize_data(
    TVAESynthesizer,
    endo_metadata_with_treatments,
    df_train_with_treatments,
    sample_no,
    f"tvae_selected_features_with_treatments_exp_{sample_no}",
)



## Data Overview (without Treatment Variables)

### Generated by TVAE

In [None]:
# Re-load the TVAE sample saved by synthesize_data. The file name is rebuilt
# from sample_no and kept relative, exactly as it was written, so the read
# cannot drift from the write when sample_no or the working directory changes.
tvae_synthetic_data = pd.read_csv(
    f"tvae_selected_features_exp_{sample_no}_synthetic_data.csv"
)

In [None]:
# run SDV's diagnostic checks (data validity and data structure) on the TVAE sample
# NOTE(review): real_data is the full df_base (train + test rows), whereas the
# synthesizer and metadata were built from df_train_base and the with-treatments
# section compares against its training frame — confirm which reference is intended
diagnostic_report = run_diagnostic(
    real_data=df_base, synthetic_data=tvae_synthetic_data, metadata=endo_metadata_base
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 8/8 [00:00<00:00, 977.41it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 316.77it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [None]:
# score how closely the TVAE sample follows the real data
# (column shapes and column pair trends)
# NOTE(review): compares against the full df_base, not the df_train_base frame
# the synthesizer was fitted on — confirm this is the intended reference
quality_report = evaluate_quality(
    real_data=df_base, synthetic_data=tvae_synthetic_data, metadata=endo_metadata_base
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 8/8 [00:00<00:00, 228.61it/s]|
Column Shapes Score: 90.31%

(2/2) Evaluating Column Pair Trends: |██████████| 28/28 [00:00<00:00, 109.54it/s]|
Column Pair Trends Score: 79.96%

Overall Score (Average): 85.14%



In [None]:
# one real-vs-synthetic figure per modelled column (TVAE sample)
# NOTE(review): real_data is the full df_base rather than df_train_base — confirm
for col in prediction_cols_endo:
    fig = get_column_plot(
        real_data=df_base,
        synthetic_data=tvae_synthetic_data,
        metadata=endo_metadata_base,
        column_name=col,
    )

    fig.show()

### Generated by CTGAN

In [None]:
# Re-load the CTGAN sample saved by synthesize_data. The file name is rebuilt
# from sample_no and kept relative, exactly as it was written, so the read
# cannot drift from the write when sample_no or the working directory changes.
ctgan_synthetic_data = pd.read_csv(
    f"ctgan_selected_features_exp_{sample_no}_synthetic_data.csv"
)

In [None]:
# run SDV's diagnostic checks (data validity and data structure) on the CTGAN sample
# NOTE(review): real_data is the full df_base (train + test rows), whereas the
# synthesizer and metadata were built from df_train_base and the with-treatments
# section compares against its training frame — confirm which reference is intended
diagnostic_report = run_diagnostic(
    real_data=df_base, synthetic_data=ctgan_synthetic_data, metadata=endo_metadata_base
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 8/8 [00:00<00:00, 568.52it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 79.04it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [None]:
# score how closely the CTGAN sample follows the real data
# (column shapes and column pair trends)
# NOTE(review): compares against the full df_base, not the df_train_base frame
# the synthesizer was fitted on — confirm this is the intended reference
quality_report = evaluate_quality(
    real_data=df_base, synthetic_data=ctgan_synthetic_data, metadata=endo_metadata_base
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 8/8 [00:00<00:00, 67.48it/s]|
Column Shapes Score: 96.28%

(2/2) Evaluating Column Pair Trends: |██████████| 28/28 [00:00<00:00, 55.16it/s]|
Column Pair Trends Score: 84.35%

Overall Score (Average): 90.32%



In [None]:
# one real-vs-synthetic figure per modelled column (CTGAN sample)
# NOTE(review): real_data is the full df_base rather than df_train_base — confirm
for col in prediction_cols_endo:
    fig = get_column_plot(
        real_data=df_base,
        synthetic_data=ctgan_synthetic_data,
        metadata=endo_metadata_base,
        column_name=col,
    )

    fig.show()

## Data Overview (with Treatment Variables) by TVAE

In [None]:
# Re-load the with-treatments TVAE sample saved by synthesize_data. The file
# name is rebuilt from sample_no and kept relative, exactly as it was written,
# so the read cannot drift from the write when sample_no or the working
# directory changes.
tvae_synthetic_with_treatments_data = pd.read_csv(
    f"tvae_selected_features_with_treatments_exp_{sample_no}_synthetic_data.csv"
)

In [None]:
# run SDV's diagnostic checks (data validity and data structure) on the
# with-treatments TVAE sample, using the training frame as the real data
diagnostic_report = run_diagnostic(
    real_data=df_train_with_treatments,
    synthetic_data=tvae_synthetic_with_treatments_data,
    metadata=endo_metadata_with_treatments,
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 10/10 [00:00<00:00, 1060.67it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 302.40it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [None]:
# score how closely the with-treatments TVAE sample follows the training frame
# (column shapes and column pair trends)
quality_report = evaluate_quality(
    real_data=df_train_with_treatments,
    synthetic_data=tvae_synthetic_with_treatments_data,
    metadata=endo_metadata_with_treatments,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 10/10 [00:00<00:00, 456.08it/s]|
Column Shapes Score: 86.03%

(2/2) Evaluating Column Pair Trends: |██████████| 45/45 [00:00<00:00, 199.67it/s]|
Column Pair Trends Score: 75.05%

Overall Score (Average): 80.54%



In [None]:
# one real-vs-synthetic figure per modelled column, treatment columns included
for col in prediction_cols_endo_with_treatments:
    fig = get_column_plot(
        real_data=df_train_with_treatments,
        synthetic_data=tvae_synthetic_with_treatments_data,
        metadata=endo_metadata_with_treatments,
        column_name=col,
    )

    fig.show()