# SDV SYNTHETIC DATA GENERATION

## SETUP

In [1]:
import pandas as pd
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

## LOAD REAL DATA

In [None]:
from ucimlrepo import fetch_ucirepo

# fetch dataset (UCI repository id 296)
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Optional inspection helpers; they are only valid once the fetch above has run.
# print(diabetes_130_us_hospitals_for_years_1999_2008.metadata)
# print(diabetes_130_us_hospitals_for_years_1999_2008.variables)

# data (as pandas dataframes)
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets

# Create the complete real dataset on an independent copy of the features.
# `pd.DataFrame(X)` may share its underlying data with `X` (behaviour depends on
# the pandas version), so adding a column to it could also alter, or warn
# about, the original features frame.
diabetes = X.copy()
# NOTE(review): assumes `y` holds exactly one target column - confirm.
diabetes["readmitted"] = y

# visualize data
diabetes.head()

## EXPLORE REAL DATA

In [None]:
# dimensions
print(f"Dimension: {diabetes.shape}")

# data information
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own line; interpolating it into an f-string prints the
# report first and then a stray "None" after the label.
print("\nData information:")
diabetes.info()
print()

# distinct values found in every column
for col in diabetes.columns:
    print(f"\nColumn: {col} has values: {diabetes[col].unique()}")

# columns considered identity-sensitive
sensitive_column_names = ['race', 'gender', 'age', 'admission_type_id','discharge_disposition_id','admission_source_id','payer_code', 'medical_specialty']

## EXPLORE REAL DATA VISUALLY

In [None]:
# Count plots for every object-dtype (categorical) column of the real data
import matplotlib.pyplot as plt
import seaborn as sns

categorical_cols = diabetes.select_dtypes(include='object')

# Iterating a DataFrame yields its column labels; `.columns` makes that explicit.
for col in categorical_cols.columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=col, data=diabetes, ax=ax)
    ax.set_title(col)
    fig.tight_layout()
    plt.show()

In [None]:
# Distribution plots (KDE + histogram) for the integer-valued columns
import matplotlib.pyplot as plt
import seaborn as sns

# first pass: every column stored as int64
int_frame = diabetes.select_dtypes(include='int64')
print(f"First filter: {int_frame.columns}")

# second pass: drop identifier-like columns
# NOTE(review): this is a substring test, so any column whose name merely
# contains "id" is dropped, not only names ending in `_id` - confirm intent.
continuos_cols = list(filter(lambda name: 'id' not in name, int_frame.columns))
print(f"Without `_id` columns: {continuos_cols}")

# one figure per column: KDE on the left, histogram on the right
for col in continuos_cols:
    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    kde_ax, hist_ax = ax
    values = diabetes[col]
    sns.kdeplot(data=values, fill=True, ax=kde_ax)
    hist_ax.hist(values)
    kde_ax.set_title(f"KDE: {col}")
    hist_ax.set_title(f"HISTOGRAM: {col}")
    fig.tight_layout()
    plt.show()

## CREATE SYNTHESIZER & SYNTHETIC DATA WITH SDV

In [None]:
# Describe the `diabetes` DataFrame with a `SingleTableMetadata` object
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()

# Automatically detect the metadata from the actual DataFrame.
# NOTE(review): detection is automatic; check the printed result below,
# in particular how the integer `*_id` code columns were classified - TODO confirm.
metadata.detect_from_dataframe(diabetes)

# Print the detected metadata so it can be checked by eye
print(metadata)

In [4]:
from sdv.single_table import GaussianCopulaSynthesizer

# Build the synthesizer from the `metadata` object detected from `diabetes`.
synthesizer = GaussianCopulaSynthesizer(metadata)

In [5]:
# Fit the synthesizer on the real `diabetes` table
synthesizer.fit(diabetes)

In [6]:
# Sample as many synthetic rows as the real table contains
synthetic_data = synthesizer.sample(num_rows=len(diabetes))

## EXPLORE SYNTHETIC DATA AND VALIDATE

In [None]:
# dimensions
print(f"Real dimension: {diabetes.shape}")
print(f"Synth dimension: {synthetic_data.shape}")

# data information
# DataFrame.info() prints its own report and returns None, so each call gets
# its own line under a label; inside an f-string the reports would appear
# before their labels, followed by a stray "None".
print("\n\nReal data information:")
diabetes.info()
print("Synth data information:")
synthetic_data.info()
print()

# distinct values per column, real vs. synthetic
for col in diabetes.columns:
    print(f"\n\nReal column: {col} has values: {diabetes[col].unique()}")
    print(f"Synth column: {col} has values: {synthetic_data[col].unique()}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
# The table mixes numeric and string columns. Correlation is only computed on
# the numeric ones: calling .corr() on the full frame raises on pandas >= 2.0,
# where non-numeric columns are no longer dropped silently.
corr_r = diabetes.select_dtypes(include='number').corr()
corr_s = synthetic_data.select_dtypes(include='number').corr()

# Real data on the left, synthetic data on the right
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
panels = (
    (ax[0], corr_r, "Blues", "REAL"),
    (ax[1], corr_s, "Greens", "SYNTH"),
)
for panel_ax, corr, cmap, title in panels:
    sns.heatmap(corr,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values,
                cmap=cmap,
                annot=True,         # Display the correlation values in the cells
                fmt=".2f", ax=panel_ax)
    panel_ax.set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Compare real vs. synthetic values for the sensitive columns
sensitive_column_names = ['race', 'gender', 'age','payer_code', 'medical_specialty']

# distinct values per sensitive column, real first, then synthetic
for col in sensitive_column_names:
    real_values = diabetes[col].unique()
    print(f"\n\nReal column: {col} has values: {real_values}")
    synth_values = synthetic_data[col].unique()
    print(f"Synth column: {col} has values: {synth_values}")

## SAVE SYNTHESIZER & SYNTHETIC DATA

In [8]:
import os

# Output directory for the saved artifacts (created if it does not exist)
synth_folder = os.path.join("./", "synthetic_data")
os.makedirs(synth_folder, exist_ok=True)

# Persist the fitted synthesizer
synthesizer_path = os.path.join(synth_folder, "sdv_synthesizer.pkl")
synthesizer.save(synthesizer_path)

In [9]:
# Write the synthetic table to a Parquet file using the pyarrow engine
parquet_path = os.path.join(synth_folder, "sdv_synth.parquet")
synthetic_data.to_parquet(parquet_path, engine='pyarrow')