# SDV SYNTHETIC DATA GENERATION

## LOAD PREPROCESSED DATA

In [None]:
import pandas as pd
import os

# Folder that is expected to already contain the preprocessed parquet file
tmp_folder = "./tmp_folder"

# Read the preprocessed dataset into a DataFrame
diabetes = pd.read_parquet(
    os.path.join(tmp_folder, "preprocessed_file.parquet"),
    engine="pyarrow",
)

## CREATE SYNTHESIZER & SYNTHETIC DATA WITH SDV

### Create Metadata

In [None]:
# Build a `SingleTableMetadata` object describing the dataframe
from sdv.metadata import SingleTableMetadata

def create_metadata(df):
    """
    Build single-table metadata by letting SDV detect it from a DataFrame.

    Parameters:
        df (pd.DataFrame): DataFrame whose columns are inspected.

    Returns:
        SingleTableMetadata: metadata object populated from ``df``.
    """
    # Start from an empty metadata object and fill it from the data itself
    detected = SingleTableMetadata()
    detected.detect_from_dataframe(df)
    return detected

# Detect the metadata of the preprocessed dataset
metadata = create_metadata(df=diabetes)

# Print it so the detected columns can be reviewed by eye
print(metadata)


### Create synthesizer and synthetic data

#### Check numeric data distribution

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Only columns stored as int64 are inspected here
num_cols = diabetes.select_dtypes(include='int64').columns.to_list()

# Draw one histogram per selected column
for col in num_cols:
    values = diabetes[col]
    fig, ax = plt.subplots(figsize=(10, 5))

    # One bin per distinct value, capped at 30 bins
    n_bins = min(values.nunique(), 30)
    ax.hist(values, bins=n_bins)

    # Tick every unit for narrow ranges, otherwise roughly ten ticks over the range
    lo, hi = values.min(), values.max()
    span = hi - lo
    tick_step = 1 if span < 30 else span // 10
    ax.set_xticks(np.arange(lo, hi + tick_step, tick_step))

    # Label, tidy up and render the figure
    ax.set(title=f'Distribution of {col}', xlabel=col, ylabel='Frequency')
    plt.tight_layout()
    plt.show()

#### Define numeric data distribution for synthesizer

In [None]:
# Disabled cell: everything below is wrapped in a triple-quoted string, so none
# of it runs. Neither the GaussianCopulaSynthesizer import nor the
# numerical_distributions mapping is defined by this cell.
# NOTE(review): the per-column distribution names look hand-picked from the
# histograms above -- confirm them before re-enabling this cell.
'''from sdv.single_table import GaussianCopulaSynthesizer

# Specify the distribution for numerical columns
numerical_distributions = {
    'time_in_hospital': 'beta',  # Right-skewed integer distribution
    'num_lab_procedures': 'norm', # Normal distribution
    'num_procedures': 'beta',  # Right-skewed
    'num_medications': 'norm', # Normal distribution
    'number_outpatient': 'beta',  # Right-skewed
    'number_emergency': 'beta',  # Right-skewed
    'number_inpatient': 'beta',  # Right-skewed
    'number_diagnoses': 'gamma' # Left-skewed
}'''

#### Create synthesizer

In [None]:
# Create the synthesizer and fit it on the real data
def create_synthetizer(md, df, numerical_distributions=None):
    """
    Build a GaussianCopulaSynthesizer and fit it on the real data.

    No synthetic rows are sampled here; call ``sample`` on the returned
    object to generate data.

    Parameters:
        md (SingleTableMetadata): Metadata of DataFrame.
        df (pd.DataFrame): The original DataFrame.
        numerical_distributions (dict | None): Optional mapping of column
            name to distribution name, forwarded to the synthesizer.
            ``None`` (the default) leaves the choice to SDV.

    Returns:
        GaussianCopulaSynthesizer: the fitted synthesizer.
    """
    # Imported locally: the only other import of this class in the notebook
    # sits inside a string-quoted (disabled) cell, so it is never executed.
    from sdv.single_table import GaussianCopulaSynthesizer

    synthesizer = GaussianCopulaSynthesizer(
        md,
        # Keep sampled values inside the min/max range seen in the real data
        enforce_min_max_values=True,
        # Round sampled values like the real data (integers stay integers)
        enforce_rounding=True,
        numerical_distributions=numerical_distributions,
    )

    # Learn the column distributions and correlations from the real data
    synthesizer.fit(data=df)

    return synthesizer

#### Create synthetic data

In [None]:
# create new data with synthetizer
def create_synth_data(df, md):
    """
    Fit a synthesizer on the real data and sample a synthetic dataset from it.

    Parameters:
        df (pd.DataFrame): The original DataFrame.
        md (SingleTableMetadata): Metadata of DataFrame.

    Returns:
        synthetic_data (pd.DataFrame): sampled data with as many rows as ``df``.
        synthesizer (GaussianCopulaSynthesizer): the fitted synthesizer.
    """
    # Fit first, then ask for exactly as many rows as the real dataset has
    fitted = create_synthetizer(md, df)
    row_count, _ = df.shape
    sampled = fitted.sample(num_rows=row_count)

    return sampled, fitted

# Fit on the real data and sample a synthetic dataset from the result
synthetic_data, synthesizer = create_synth_data(df=diabetes, md=metadata)

# Display the generated rows as the cell output
synthetic_data

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Score how closely the synthetic data follows the real data
quality_report = evaluate_quality(
    diabetes,
    synthetic_data,
    metadata
)

# evaluate_quality returns a report object, so formatting it directly would
# only print the object's repr; get_score() yields the overall score.
print(f"Quality of synthetic data is: {quality_report.get_score():.2%}")

## EXPLORE SYNTHETIC DATA AND VALIDATE

### Compare dimensions

In [None]:
# Show both (rows, columns) shapes, one per line, for a quick side-by-side check
print(
    f"Real dimension: {diabetes.shape}",
    f"Synth dimension: {synthetic_data.shape}",
    sep="\n",
)

### Compare general data

In [None]:
# Non-null count per column of the real dataset
real_data_info = pd.DataFrame(
    {
        'Column': diabetes.columns,
        'Real Non-Null Count': diabetes.notna().sum(),
    }
)

# Same summary for the synthetic dataset
synthetic_data_info = pd.DataFrame(
    {
        'Column': synthetic_data.columns,
        'Synthetic Non-Null Count': synthetic_data.notna().sum(),
    }
)

# Outer join on the column name so a column missing on either side still shows up
comparison = real_data_info.merge(synthetic_data_info, on='Column', how='outer')

# Header line followed by the comparison table
print("Comparison of Real and Synthetic Data:", comparison, sep="\n")


### Check data anonymization

Check the first N rows and the values of their sensitive columns.

#### Detect sensitive columns

In [None]:
# Columns treated as identity-sensitive for the anonymization check
sensitive_columns = [
    'race',
    'gender',
    'age',
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'payer_code',
]
print(f"\nSensitive columns: {sensitive_columns}\n")

#### Check sensitive data anonymization

In [None]:
# First rows of the sensitive columns, real data first and synthetic data after
print(
    f"Real: {diabetes[sensitive_columns].head()}",
    f"\nSynthetic: {synthetic_data[sensitive_columns].head()}",
    sep="\n",
)

### Check for numeric data correlation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
# Correlation is only defined for numeric columns. Select them explicitly:
# from pandas 2.0 on, DataFrame.corr() raises on non-numeric columns instead
# of silently dropping them.
corr_r = diabetes.select_dtypes(include="number").corr()
corr_s = synthetic_data.select_dtypes(include="number").corr()

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Same heatmap settings for both panels; only data, colour map and title differ
panels = (
    (ax[0], corr_r, "Blues", "REAL"),
    (ax[1], corr_s, "Greens", "SYNTH"),
)
for axis, corr_matrix, cmap, title in panels:
    sns.heatmap(
        corr_matrix,
        xticklabels=corr_matrix.columns.values,
        yticklabels=corr_matrix.columns.values,
        cmap=cmap,
        annot=True,  # Display the correlation values in the cells
        fmt=".2f",
        ax=axis,
    )
    axis.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Write the synthetic dataset to the temporary folder, without the index
synthetic_data.to_parquet(
    os.path.join(tmp_folder, "synthetic_data.parquet"),
    engine="pyarrow",
    index=False,
)

In [None]:
# save synthesizer:
import pickle

# `synthesizer` is the fitted object returned by create_synth_data above.
# The name previously used here was never defined in this notebook.
with open("synthesizer.pkl", "wb") as file:
    pickle.dump(synthesizer, file)