# SDV SYNTHETIC DATA GENERATION

## SETUP

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

## LOAD REAL DATA

In [2]:
from ucimlrepo import fetch_ucirepo 

'''# metadata 
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata) 
  
# variable information 
print(diabetes_130_us_hospitals_for_years_1999_2008.variables) '''
  
# fetch dataset 
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296) 
  
# data (as pandas dataframes) 
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features 
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets 

# create complete real_data
diabetes = pd.DataFrame(X)
diabetes["readmitted"] = y

# visualize data
diabetes.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,,,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),,1,1,7,3,,,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),,1,1,7,2,,,11,5,13,2,0,1,648.0,250.0,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),,1,1,7,2,,,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),,1,1,7,1,,,51,0,8,0,0,0,197.0,157.0,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


## EXPLORE REAL DATA

In [None]:
# dimensions
print(f"Dimension: {diabetes.shape}")

# data information
print(f"\nData information: {diabetes.info()}\n")

# understanding columns values
for col in diabetes.columns:
    print(f"\nColumn: {col} has values: {diabetes[col].unique()}")

# identify identity sensible data: 
sensitive_column_names = ['race', 'gender', 'age', 'admission_type_id','discharge_disposition_id','admission_source_id','payer_code', 'medical_specialty']

## EXPLORE REAL DATA VISUALLY

In [None]:
# visualizing categorical data distribution
import matplotlib.pyplot as plt
import seaborn as sns

categorical_cols = diabetes.select_dtypes(include='object')

for col in categorical_cols:
    fig, ax = plt.subplots(figsize = (10,5))
    sns.countplot(data=diabetes, x=col, ax = ax)
    ax.set_title(col)
    plt.tight_layout()
    plt.show()

In [None]:
# visualizing continuos data distribution
import matplotlib.pyplot as plt
import seaborn as sns

# continuous data
continuos_cols = diabetes.select_dtypes(include='int64')

# check data
print(f"First filter: {continuos_cols.columns}")

# remove id type values 
continuos_cols = [col for col in continuos_cols.columns if 'id' not in col]

# check data
print(f"Without `_id` columns: {continuos_cols}")

# KDE visualizations
for col in continuos_cols:
    fig, ax = plt.subplots(1,2, figsize = (10,5))
    sns.kdeplot(data=diabetes[col], ax=ax[0], fill=True)
    ax[1].hist(diabetes[col])
    ax[0].set_title(f"KDE: {col}")
    ax[1].set_title(f"HISTOGRAM: {col}")
    plt.tight_layout()
    plt.show()

## CREATE SYNTHETIZER & SYNTHETIC DATA WITH SDV

In [27]:
# Transform `diabetes` dataframe `SingleTableMetadata` data type 
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()

# Automatically detect metadata from the actual DataFrame
metadata.detect_from_dataframe(diabetes)

# Check if metadata has been correctly generated
print(metadata)


{
    "columns": {
        "race": {
            "sdtype": "categorical"
        },
        "gender": {
            "sdtype": "categorical"
        },
        "age": {
            "sdtype": "categorical"
        },
        "weight": {
            "sdtype": "categorical"
        },
        "admission_type_id": {
            "sdtype": "numerical"
        },
        "discharge_disposition_id": {
            "sdtype": "numerical"
        },
        "admission_source_id": {
            "sdtype": "numerical"
        },
        "time_in_hospital": {
            "sdtype": "numerical"
        },
        "payer_code": {
            "sdtype": "categorical"
        },
        "medical_specialty": {
            "sdtype": "categorical"
        },
        "num_lab_procedures": {
            "sdtype": "numerical"
        },
        "num_procedures": {
            "sdtype": "numerical"
        },
        "num_medications": {
            "sdtype": "numerical"
        },
        "number_outpatient": {
  

In [34]:
# TODO KONPONDU CONTRAINTS-AK GEHITU BALIO DENAK HAR DITZAN!!
from sdv.constraints import Constraint

def ensure_all_categories_present(column, real_data):
    """Creates a constraint function to ensure all categories from the real data are present in synthetic data."""
    
    class EnsureAllCategories(Constraint):
        def __init__(self):
            self.column = column
            self.required_values = real_data[self.column].unique()

        def is_valid(self, table_data):
            unique_synthetic_values = table_data[self.column].unique()
            contains_all_values = all(
                pd.isna(val) if pd.isna(val) else val in unique_synthetic_values
                for val in self.required_values
            )
            return contains_all_values
        
        def transform(self, table_data):
            return table_data
        
        def reverse_transform(self, table_data):
            return table_data

    return EnsureAllCategories()

In [None]:
# TODO KONPONDU CONTRAINTS-AK GEHITU BALIO DENAK HAR DITZAN!!
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
    default_distribution='norm')


# Select categorical columns
categorical_cols = diabetes.select_dtypes(include='object')
for cat_col in categorical_cols:
    # Create and add the constraint
    constraint = ensure_all_categories_present(cat_col, diabetes[cat_col].unique())
    synthesizer.add_constraints([constraint])

In [None]:
# train data to learn from real data
synthesizer.fit(
    data = diabetes
)

In [18]:
# create new data (same dimensions) based on learned model
synthetic_data = synthesizer.sample(
    num_rows=diabetes.shape[0]
)

In [19]:
# TODO KONPONDU ZIURTATU BALIO DENAK DAUDELA SYNTHETIKOTAN BAITA ERE
for col in diabetes.columns:
    print(f"\n\nReal column: {col} has values: {diabetes[col].unique()}")
    print(f"Synth column: {col} has values: {synthetic_data[col].unique()}")




Real column: race has values: ['Caucasian' 'AfricanAmerican' nan 'Other' 'Asian' 'Hispanic']
Synth column: race has values: ['Caucasian' nan 'AfricanAmerican' 'Other' 'Asian']


Real column: gender has values: ['Female' 'Male' 'Unknown/Invalid']
Synth column: gender has values: ['Female' 'Male' 'Unknown/Invalid']


Real column: age has values: ['[0-10)' '[10-20)' '[20-30)' '[30-40)' '[40-50)' '[50-60)' '[60-70)'
 '[70-80)' '[80-90)' '[90-100)']
Synth column: age has values: ['[60-70)' '[50-60)' '[80-90)' '[70-80)' '[40-50)' '[30-40)' '[90-100)'
 '[10-20)' '[20-30)' '[0-10)']


Real column: weight has values: [nan '[75-100)' '[50-75)' '[0-25)' '[100-125)' '[25-50)' '[125-150)'
 '[175-200)' '[150-175)' '>200']
Synth column: weight has values: [nan '[75-100)' '[50-75)' '[0-25)']


Real column: admission_type_id has values: [6 1 2 3 4 5 8 7]
Synth column: admission_type_id has values: [4 3 1 2 5 6 7 8]


Real column: discharge_disposition_id has values: [25  1  3  6  2  5 11  7 10  4 14 

## EXPLORE SYNTHETIC DATA AND VALIDATE

In [16]:
# dimensions
print(f"Real dimension: {diabetes.shape}")
print(f"Synth dimension: {synthetic_data.shape}")

# data information
print(f"\n\nReal data information: {diabetes.info()}")
print(f"Synth data information: {synthetic_data.info()}\n")

# understanding columns values
for col in diabetes.columns:
    print(f"\n\nReal column: {col} has values: {diabetes[col].unique()}")
    print(f"Synth column: {col} has values: {synthetic_data[col].unique()}")

Real dimension: (101766, 48)
Synth dimension: (101766, 48)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 48 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   race                      99493 non-null   object
 1   gender                    101766 non-null  object
 2   age                       101766 non-null  object
 3   weight                    3197 non-null    object
 4   admission_type_id         101766 non-null  int64 
 5   discharge_disposition_id  101766 non-null  int64 
 6   admission_source_id       101766 non-null  int64 
 7   time_in_hospital          101766 non-null  int64 
 8   payer_code                61510 non-null   object
 9   medical_specialty         51817 non-null   object
 10  num_lab_procedures        101766 non-null  int64 
 11  num_procedures            101766 non-null  int64 
 12  num_medications           101766 non-null  int64 
 13  

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
fig, ax = plt.subplots(1,2,figsize = (15,5))
corr_r = diabetes.corr()
corr_s = synthetic_data.corr()
sns.heatmap(corr_r, 
            xticklabels=corr_r.columns.values,
            yticklabels=corr_r.columns.values,
            cmap="Blues",
            annot=True,         # Display the correlation values in the cells
            fmt=".2f", ax = ax[0])
sns.heatmap(corr_s, 
            xticklabels=corr_s.columns.values,
            yticklabels=corr_s.columns.values,
            cmap="Greens",
            annot=True,         # Display the correlation values in the cells
            fmt=".2f", ax = ax[1])
ax[0].set_title("REAL")
ax[1].set_title("SYNTH")
plt.tight_layout()     
plt.show()

In [None]:
# check sensitive data
sensitive_column_names = ['race', 'gender', 'age','payer_code', 'medical_specialty']

# understanding columns values
for col in sensitive_column_names:
    print(f"\n\nReal column: {col} has values: {diabetes[col].unique()}")
    print(f"Synth column: {col} has values: {synthetic_data[col].unique()}")

## SAVE SYNTHETIZER & SYNTHETIC DATA

In [14]:
import os

# create save folder
synth_folder = os.path.join("./","synthetic_data")
os.makedirs(synth_folder, exist_ok = True) 

# save synth generator 
synthesizer.save(os.path.join(synth_folder, "sdv_synthesizer.pkl"))

In [15]:
# save synthetic data
synthetic_data.to_parquet(os.path.join(synth_folder,"sdv_synth.parquet"), engine='pyarrow')