# SDV SYNTHETIC DATA GENERATION

## SETUP

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

## LOAD REAL DATA

In [2]:
from ucimlrepo import fetch_ucirepo 

'''# metadata 
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata) 
  
# variable information 
print(diabetes_130_us_hospitals_for_years_1999_2008.variables) '''
  
# fetch dataset 
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296) 
  
# data (as pandas dataframes) 
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features 
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets 

# create complete real_data
diabetes = pd.DataFrame(X)
diabetes["readmitted"] = y

# visualize data
diabetes.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,,,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),,1,1,7,3,,,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),,1,1,7,2,,,11,5,13,2,0,1,648.0,250.0,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),,1,1,7,2,,,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),,1,1,7,1,,,51,0,8,0,0,0,197.0,157.0,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


## EXPLORE REAL DATA

In [None]:
# dimensions
print(f"Dimension: {diabetes.shape}")

# data information
print(f"\nData information: {diabetes.info()}\n")

# understanding columns values
for col in diabetes.columns:
    print(f"\nColumn: {col} has values: {diabetes[col].unique()}")

# identify identity sensible data: 
sensitive_column_names = ['race', 'gender', 'age', 'admission_type_id','discharge_disposition_id','admission_source_id','payer_code', 'medical_specialty']

In [23]:
# understanding columns values
for col in diabetes.columns:
    print(f"\nColumn: {col} has values: {diabetes[col].unique()}")


Column: race has values: ['Caucasian' 'AfricanAmerican' nan 'Other' 'Asian' 'Hispanic']

Column: gender has values: ['Female' 'Male' 'Unknown/Invalid']

Column: age has values: ['[0-10)' '[10-20)' '[20-30)' '[30-40)' '[40-50)' '[50-60)' '[60-70)'
 '[70-80)' '[80-90)' '[90-100)']

Column: weight has values: [nan '[75-100)' '[50-75)' '[0-25)' '[100-125)' '[25-50)' '[125-150)'
 '[175-200)' '[150-175)' '>200']

Column: admission_type_id has values: [6 1 2 3 4 5 8 7]

Column: discharge_disposition_id has values: [25  1  3  6  2  5 11  7 10  4 14 18  8 13 12 16 17 22 23  9 20 15 24 28
 19 27]

Column: admission_source_id has values: [ 1  7  2  4  5  6 20  3 17  8  9 14 10 22 11 25 13]

Column: time_in_hospital has values: [ 1  3  2  4  5 13 12  9  7 10  6 11  8 14]

Column: payer_code has values: [nan 'MC' 'MD' 'HM' 'UN' 'BC' 'SP' 'CP' 'SI' 'DM' 'CM' 'CH' 'PO' 'WC' 'OT'
 'OG' 'MP' 'FR']

Column: medical_specialty has values: ['Pediatrics-Endocrinology' nan 'InternalMedicine'
 'Family/Genera

## EXPLORE REAL DATA VISUALLY

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# visualizing data distribution
for col in diabetes.columns:
    fig, ax = plt.subplots(figsize = (10,5))
    sns.countplot(data=diabetes, x=col, ax = ax)
    ax.set_title(col)
    plt.tight_layout()
    plt.show()

## CREATE SYNTHETIZER & SYNTHETIC DATA WITH SDV

In [22]:
# Transform `diabetes` dataframe `SingleTableMetadata` data type 
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()

# Automatically detect metadata from the actual DataFrame
metadata.detect_from_dataframe(diabetes)

# Change dtype of "_id" columns. Threat as categorical instead of numerical
for column_name in metadata.columns:
    if '_id' in column_name:
        metadata.update_column(column_name, sdtype='categorical')

# Check if metadata has been correctly generated
print(metadata)	

{
    "columns": {
        "race": {
            "sdtype": "categorical"
        },
        "gender": {
            "sdtype": "categorical"
        },
        "age": {
            "sdtype": "categorical"
        },
        "weight": {
            "sdtype": "categorical"
        },
        "admission_type_id": {
            "sdtype": "categorical"
        },
        "discharge_disposition_id": {
            "sdtype": "categorical"
        },
        "admission_source_id": {
            "sdtype": "categorical"
        },
        "time_in_hospital": {
            "sdtype": "numerical"
        },
        "payer_code": {
            "sdtype": "categorical"
        },
        "medical_specialty": {
            "sdtype": "categorical"
        },
        "num_lab_procedures": {
            "sdtype": "numerical"
        },
        "num_procedures": {
            "sdtype": "numerical"
        },
        "num_medications": {
            "sdtype": "numerical"
        },
        "number_outpatient"

In [10]:
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True) 

In [11]:
# train data to learn from real data
synthesizer.fit(
    data = diabetes
)

In [12]:
# create new data (same dimensions) based on learned model
synthetic_data = synthesizer.sample(
    num_rows=diabetes.shape[0]
)

## EXPLORE SYNTHETIC DATA AND VALIDATE

In [None]:
# dimensions
print(f"Real dimension: {diabetes.shape}")
print(f"Synth dimension: {synthetic_data.shape}")

# data information
print(f"\n\nReal data information: {diabetes.info()}")
print(f"Synth data information: {synthetic_data.info()}\n")

In [20]:
# Compare values creation
for col in diabetes.columns:
    print(f"\n\nColumn: {col}")
    
    # Get value counts for both real and synthetic data
    real_counts = diabetes[col].value_counts()
    synth_counts = synthetic_data[col].value_counts()
    # Combine the counts to ensure all categories are represented in both datasets
    combined_counts = pd.DataFrame({'Real': real_counts, 'Synthetic': synth_counts}).fillna(0)
    
    # Print the combined counts for easy comparison
    print(combined_counts)



Column: race
                  Real  Synthetic
AfricanAmerican  19210      24122
Asian              641        627
Caucasian        76099      67272
Hispanic          2037        214
Other             1506       2084


Column: gender
                  Real  Synthetic
Female           54708      57908
Male             47055      39491
Unknown/Invalid      3       4367


Column: age
           Real  Synthetic
[0-10)      161        202
[10-20)     691        573
[20-30)    1657       1736
[30-40)    3775       4407
[40-50)    9685       9657
[50-60)   17256      16637
[60-70)   22483      23339
[70-80)   26068      24391
[80-90)   17197      16274
[90-100)   2793       4550


Column: weight
           Real  Synthetic
>200          3        0.0
[0-25)       48      131.0
[100-125)   625        7.0
[125-150)   145        0.0
[150-175)    35        0.0
[175-200)    11        0.0
[25-50)      97        0.0
[50-75)     897     1380.0
[75-100)   1336    14358.0


Column: admission_type_id
  

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
fig, ax = plt.subplots(1,2,figsize = (15,5))
corr_r = diabetes.corr()
corr_s = synthetic_data.corr()
sns.heatmap(corr_r, 
            xticklabels=corr_r.columns.values,
            yticklabels=corr_r.columns.values,
            cmap="Blues",
            annot=True,         # Display the correlation values in the cells
            fmt=".2f", ax = ax[0])
sns.heatmap(corr_s, 
            xticklabels=corr_s.columns.values,
            yticklabels=corr_s.columns.values,
            cmap="Greens",
            annot=True,         # Display the correlation values in the cells
            fmt=".2f", ax = ax[1])
ax[0].set_title("REAL")
ax[1].set_title("SYNTH")
plt.tight_layout()     
plt.show()

In [None]:
# check sensitive data
sensitive_column_names = ['race', 'gender', 'age','payer_code', 'medical_specialty']

# understanding columns values
for col in sensitive_column_names:
    print(f"\n\nReal column: {col} has values: {diabetes[col].unique()}")
    print(f"Synth column: {col} has values: {synthetic_data[col].unique()}")

## SAVE SYNTHETIZER & SYNTHETIC DATA

In [14]:
import os

# create save folder
synth_folder = os.path.join("./","synthetic_data")
os.makedirs(synth_folder, exist_ok = True) 

# save synth generator 
synthesizer.save(os.path.join(synth_folder, "sdv_synthesizer.pkl"))

In [15]:
# save synthetic data
synthetic_data.to_parquet(os.path.join(synth_folder,"sdv_synth.parquet"), engine='pyarrow')