1. Importing Libraries

In [None]:
import pandas as pd

from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality, run_diagnostic
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer, TVAESynthesizer, CopulaGANSynthesizer
from sdv.lite import SingleTablePreset

2. Loading the Dataset

In [None]:
# Paths used throughout the notebook (kept together so they are easy to change)
DATA_PATH = "./Etongue_filtered_dataset.csv"
OUTPUT_DIR = "./"  # directory the synthesizer cells write their CSV files to

# Load Data
primary_data = pd.read_csv(DATA_PATH)
primary_data["index"] = primary_data.index  # Add an index column for primary key


In [None]:
# Preview the first five rows of the loaded table
primary_data.head(n=5)

Unnamed: 0,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,BCUT2D_MWHI,BertzCT,Chi0,Chi0n,Chi0v,Chi1,...,VSA_EState6,VSA_EState8,HeavyAtomCount,NOCount,NumHAcceptors,NumHeteroatoms,MolMR,Bitterness Concentration,Log(Bitterness Concentration),index
0,567.06,535.812,566.148965,206,35.495693,1293.336234,28.294318,21.270444,22.84287,18.060298,...,14.467631,4.415479,38,10,9,12,141.8337,9.12e-05,-9.302456,0
1,584.105,544.793,583.259138,214,35.495693,1368.238526,28.277446,23.730749,25.381105,19.20686,...,14.361112,10.217241,40,9,9,11,166.4369,0.03749328,-3.283594,1
2,390.867,367.683,390.134635,144,35.495692,730.336845,20.23384,15.248382,16.004311,12.790601,...,14.170506,1.031614,27,6,4,7,105.1146,0.005825595,-5.145494,2
3,330.749,319.661,330.00772,112,35.495694,771.195516,15.620956,10.537628,12.110053,9.786712,...,5.512018,0.220566,21,7,5,9,75.8192,3.05e-08,-17.305539,3
4,206.285,188.141,206.13068,82,16.366405,324.894119,11.422285,9.52667,9.52667,7.002908,...,7.871574,6.056901,15,2,1,2,61.0348,0.01475528,-4.216154,4


In [None]:
# Metadata Creation
# Column types are inferred from the dataframe first; the helper "index"
# column is then declared as an ID and registered as the table's primary key.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=primary_data)
metadata.update_column(column_name='index', sdtype='id')
metadata.set_primary_key("index")

In [None]:
# Display the metadata object so the detected column types can be reviewed
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "MolWt": {
            "sdtype": "numerical"
        },
        "HeavyAtomMolWt": {
            "sdtype": "numerical"
        },
        "ExactMolWt": {
            "sdtype": "numerical"
        },
        "NumValenceElectrons": {
            "sdtype": "numerical"
        },
        "BCUT2D_MWHI": {
            "sdtype": "numerical"
        },
        "BertzCT": {
            "sdtype": "numerical"
        },
        "Chi0": {
            "sdtype": "numerical"
        },
        "Chi0n": {
            "sdtype": "numerical"
        },
        "Chi0v": {
            "sdtype": "numerical"
        },
        "Chi1": {
            "sdtype": "numerical"
        },
        "Chi1n": {
            "sdtype": "numerical"
        },
        "Chi1v": {
            "sdtype": "numerical"
        },
        "Chi2n": {
            "sdtype": "numerical"
        },
        "Chi2v": {
            "sdtype": "numerical"
        },
   

In [None]:
# Validate metadata before it is handed to any synthesizer
metadata.validate()

3. Define Functions

In [None]:
# Define common synthesizer function
def run_synthesizer(synthesizer, name, primary_data, metadata, num_rows=500, output_dir="./"):
    """Fit a synthesizer, sample synthetic rows, save them to CSV and return them.

    Parameters
    ----------
    synthesizer : object
        Anything exposing ``fit(data=...)`` and ``sample(num_rows=...)``.
    name : str
        Label used as the prefix of the output file name.
    primary_data : pandas.DataFrame
        Real data the synthesizer is fitted on.
    metadata : object
        Not used inside this function; kept so existing call sites keep working.
    num_rows : int, default 500
        Number of synthetic rows to sample.
    output_dir : str, default "./"
        Directory the CSV is written to (with or without a trailing separator).

    Returns
    -------
    The object returned by ``synthesizer.sample`` (the synthetic rows), so the
    caller can pass it on to the evaluation helpers.
    """
    import os  # local import keeps this cell self-contained

    synthesizer.fit(data=primary_data)
    synthetic_data = synthesizer.sample(num_rows=num_rows)

    # The file name reflects the actual row count; with the default num_rows
    # it is still "<name>_500_v1.csv".
    output_path = os.path.join(output_dir, f"{name}_{num_rows}_v1.csv")
    synthetic_data.to_csv(output_path, index=False)

    # Callers assign the result (e.g. `synthetic_data_ctgan = run_synthesizer(...)`),
    # so the sample must be returned rather than dropped.
    return synthetic_data

In [None]:
# Function to evaluate quality
def evaluate_synthetic_data(real_data, synthetic_data, metadata, name):
    """Run the quality evaluation for one synthetic sample and return the report.

    Parameters
    ----------
    real_data : the real table.
    synthetic_data : the synthetic table to score against ``real_data``.
    metadata : metadata describing both tables.
    name : str
        Label of the synthesizer; currently not used by the function and kept
        only so call sites stay readable.

    Returns
    -------
    Whatever ``evaluate_quality`` returns (the quality report), so the caller
    can inspect it further instead of only reading the printed summary.
    """
    quality_report = evaluate_quality(real_data, synthetic_data, metadata)
    return quality_report

In [None]:
# Function to run diagnostics
def run_diagnostics(real_data, synthetic_data, metadata, name):
    """Run the diagnostic checks for one synthetic sample and return the report.

    Parameters
    ----------
    real_data : the real table.
    synthetic_data : the synthetic table to check against ``real_data``.
    metadata : metadata describing both tables.
    name : str
        Label of the synthesizer; currently not used by the function and kept
        only so call sites stay readable.

    Returns
    -------
    Whatever ``run_diagnostic`` returns (the diagnostic report), so the caller
    can inspect it further instead of only reading the printed summary.
    """
    diagnostic_report = run_diagnostic(
        real_data=real_data,
        synthetic_data=synthetic_data,
        metadata=metadata,
    )
    return diagnostic_report

4. Generate and Evaluate Synthetic Data

4.1 FAST_ML Synthesizer

In [None]:
# FAST_ML preset synthesizer
fast_ml_synthesizer = SingleTablePreset(metadata, name='FAST_ML')
# Capture the sample: the FAST_ML evaluation and diagnostic cells read `synthetic_data`,
# which would otherwise only exist through leftover kernel state.
synthetic_data = run_synthesizer(fast_ml_synthesizer, "FAST_ML", primary_data, metadata, output_dir=OUTPUT_DIR)


In [None]:
# Quality report for the FAST_ML sample
evaluate_synthetic_data(
    real_data=primary_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    name="FAST_ML",
)

Creating report: 100%|██████████| 4/4 [00:12<00:00,  3.11s/it]


Overall Quality Score: 88.07%

Properties:
Column Shapes: 77.41%
Column Pair Trends: 98.73%





In [None]:
# Diagnostic report for the FAST_ML sample
run_diagnostics(
    real_data=primary_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    name="FAST_ML",
)

Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.14it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





4.2 Gaussian Copula with different distributions

In [None]:
## Gaussian Copula Synthesizer with "norm" distribution
gc_norm_synthesizer = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    default_distribution='norm',
)
synthetic_data_gc_norm = run_synthesizer(
    synthesizer=gc_norm_synthesizer,
    name="GCS_norm",
    primary_data=primary_data,
    metadata=metadata,
    output_dir=OUTPUT_DIR,
)

In [None]:
# Evaluate quality
# Score the sample produced by the GCS_norm cell; the generic `synthetic_data`
# name is never assigned in this section, so it would score some other sample.
evaluate_synthetic_data(primary_data, synthetic_data_gc_norm, metadata, "GCS_norm")

Creating report: 100%|██████████| 4/4 [00:12<00:00,  3.18s/it]


Overall Quality Score: 86.73%

Properties:
Column Shapes: 75.37%
Column Pair Trends: 98.09%





In [None]:
# Run diagnostics
# Diagnose the sample produced by the GCS_norm cell; the generic `synthetic_data`
# name is never assigned in this section, so it would check some other sample.
run_diagnostics(primary_data, synthetic_data_gc_norm, metadata, "GCS_norm")

Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.38it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





In [None]:
## Gaussian Copula Synthesizer with "gaussian_kde" distribution
gc_kde_synthesizer = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    default_distribution='gaussian_kde',
)
synthetic_data_gc_kde = run_synthesizer(
    synthesizer=gc_kde_synthesizer,
    name="GCS_gaussian_kde",
    primary_data=primary_data,
    metadata=metadata,
    output_dir=OUTPUT_DIR,
)


In [None]:
# Evaluate quality
# Score the sample produced by the GCS_gaussian_kde cell; the generic `synthetic_data`
# name is never assigned in this section, so it would score some other sample.
evaluate_synthetic_data(primary_data, synthetic_data_gc_kde, metadata, "GCS_gaussian_kde")

Creating report: 100%|██████████| 4/4 [00:12<00:00,  3.24s/it]


Overall Quality Score: 89.64%

Properties:
Column Shapes: 82.02%
Column Pair Trends: 97.27%





In [None]:
# Run diagnostics
# Diagnose the sample produced by the GCS_gaussian_kde cell; the generic `synthetic_data`
# name is never assigned in this section, so it would check some other sample.
run_diagnostics(primary_data, synthetic_data_gc_kde, metadata, "GCS_gaussian_kde")

Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.23it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





In [None]:
## Gaussian Copula Synthesizer with "beta" distribution
gc_beta_synthesizer = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    default_distribution='beta',
)
synthetic_data_gc_beta = run_synthesizer(
    synthesizer=gc_beta_synthesizer,
    name="GCS_beta",
    primary_data=primary_data,
    metadata=metadata,
    output_dir=OUTPUT_DIR,
)


In [None]:
# Quality report for the Gaussian Copula "beta" sample
evaluate_synthetic_data(
    real_data=primary_data,
    synthetic_data=synthetic_data_gc_beta,
    metadata=metadata,
    name="GCS_beta",
)



Creating report: 100%|██████████| 4/4 [00:13<00:00,  3.39s/it]


Overall Quality Score: 78.79%

Properties:
Column Shapes: 67.85%
Column Pair Trends: 89.72%





In [None]:
# Diagnostic report for the Gaussian Copula "beta" sample
run_diagnostics(
    real_data=primary_data,
    synthetic_data=synthetic_data_gc_beta,
    metadata=metadata,
    name="GCS_beta",
)

Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.13it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





4.3 CTGAN Synthesizer

In [None]:

# CTGAN synthesizer trained for 2000 epochs with progress output enabled
ctgan_synthesizer = CTGANSynthesizer(
    metadata,
    enforce_rounding=True,
    epochs=2000,
    verbose=True,
)
synthetic_data_ctgan = run_synthesizer(
    synthesizer=ctgan_synthesizer,
    name="CTGAN",
    primary_data=primary_data,
    metadata=metadata,
    output_dir=OUTPUT_DIR,
)


In [None]:
# Quality report for the CTGAN sample
evaluate_synthetic_data(
    real_data=primary_data,
    synthetic_data=synthetic_data_ctgan,
    metadata=metadata,
    name="CTGAN",
)


Creating report: 100%|██████████| 4/4 [00:11<00:00,  2.91s/it]


Overall Quality Score: 76.3%

Properties:
Column Shapes: 66.82%
Column Pair Trends: 85.77%





In [None]:
# Diagnostic report for the CTGAN sample
run_diagnostics(
    real_data=primary_data,
    synthetic_data=synthetic_data_ctgan,
    metadata=metadata,
    name="CTGAN",
)


Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.32it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





4.4 TVAE Synthesizer

In [None]:

# TVAE synthesizer trained for 2000 epochs
tvae_synthesizer = TVAESynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    epochs=2000,
)
synthetic_data_tvae = run_synthesizer(
    synthesizer=tvae_synthesizer,
    name="TVAE",
    primary_data=primary_data,
    metadata=metadata,
    output_dir=OUTPUT_DIR,
)


In [None]:
# Quality report for the TVAE sample
evaluate_synthetic_data(
    real_data=primary_data,
    synthetic_data=synthetic_data_tvae,
    metadata=metadata,
    name="TVAE",
)

Creating report: 100%|██████████| 4/4 [00:12<00:00,  3.01s/it]


Overall Quality Score: 78.01%

Properties:
Column Shapes: 62.02%
Column Pair Trends: 94.0%





In [None]:
# Diagnostic report for the TVAE sample
run_diagnostics(
    real_data=primary_data,
    synthetic_data=synthetic_data_tvae,
    metadata=metadata,
    name="TVAE",
)

Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.20it/s]


DiagnosticResults:

SUCCESS:
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data

! The synthetic data is missing more than 10% of the numerical ranges present in the real data





4.5 Copula GAN Synthesizer with different distributions

In [None]:
## Copula GAN Synthesizer with "norm" distribution
copula_norm_synthesizer = CopulaGANSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    default_distribution='norm',
    epochs=2000,
    verbose=True,
)
synthetic_data_copula_norm = run_synthesizer(
    synthesizer=copula_norm_synthesizer,
    name="CopulaGAN_norm",
    primary_data=primary_data,
    metadata=metadata,
    output_dir=OUTPUT_DIR,
)


In [None]:
# Quality report for the Copula GAN "norm" sample
evaluate_synthetic_data(
    real_data=primary_data,
    synthetic_data=synthetic_data_copula_norm,
    metadata=metadata,
    name="CopulaGAN_norm",
)


Creating report: 100%|██████████| 4/4 [00:10<00:00,  2.67s/it]


Overall Quality Score: 76.32%

Properties:
Column Shapes: 67.0%
Column Pair Trends: 85.64%





In [None]:
# Diagnostic report for the Copula GAN "norm" sample
run_diagnostics(
    real_data=primary_data,
    synthetic_data=synthetic_data_copula_norm,
    metadata=metadata,
    name="CopulaGAN_norm",
)


Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.49it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





In [None]:
## Copula GAN Synthesizer with "beta" distribution
copula_beta_synthesizer = CopulaGANSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    default_distribution='beta',
    epochs=2000,
    verbose=True,
)
synthetic_data_copula_beta = run_synthesizer(
    synthesizer=copula_beta_synthesizer,
    name="CopulaGAN_beta",
    primary_data=primary_data,
    metadata=metadata,
    output_dir=OUTPUT_DIR,
)



In [None]:
# Quality report for the Copula GAN "beta" sample
evaluate_synthetic_data(
    real_data=primary_data,
    synthetic_data=synthetic_data_copula_beta,
    metadata=metadata,
    name="CopulaGAN_beta",
)

Creating report: 100%|██████████| 4/4 [00:10<00:00,  2.69s/it]


Overall Quality Score: 72.18%

Properties:
Column Shapes: 59.65%
Column Pair Trends: 84.71%





In [None]:
# Diagnostic report for the Copula GAN "beta" sample
run_diagnostics(
    real_data=primary_data,
    synthetic_data=synthetic_data_copula_beta,
    metadata=metadata,
    name="CopulaGAN_beta",
)


Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.38it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





In [None]:
## Copula GAN Synthesizer with "gaussian_kde" distribution
copula_kde_synthesizer = CopulaGANSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    default_distribution='gaussian_kde',
    epochs=2000,
    verbose=True,
)
synthetic_data_copula_kde = run_synthesizer(
    synthesizer=copula_kde_synthesizer,
    name="CopulaGAN_gaussian_kde",
    primary_data=primary_data,
    metadata=metadata,
    output_dir=OUTPUT_DIR,
)


In [None]:
# Quality report for the Copula GAN "gaussian_kde" sample
# (the diagnostic run lives in its own cell)
evaluate_synthetic_data(
    real_data=primary_data,
    synthetic_data=synthetic_data_copula_kde,
    metadata=metadata,
    name="CopulaGAN_gaussian_kde",
)



Creating report: 100%|██████████| 4/4 [00:10<00:00,  2.70s/it]


Overall Quality Score: 72.39%

Properties:
Column Shapes: 66.54%
Column Pair Trends: 78.23%





In [None]:
# Diagnostic report for the Copula GAN "gaussian_kde" sample
run_diagnostics(
    real_data=primary_data,
    synthetic_data=synthetic_data_copula_kde,
    metadata=metadata,
    name="CopulaGAN_gaussian_kde",
)


Creating report: 100%|██████████| 4/4 [00:01<00:00,  2.59it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data



