# SDV SYNTHETIC DATA GENERATION

## LOAD PREPROCESSED DATA

In [None]:
import pandas as pd
import os

# Folder that holds the pipeline's parquet resources (input and outputs)
tmp_folder = "../resources"
# Load the preprocessed dataset that the synthesizer will learn from
diabetes = pd.read_parquet(
    os.path.join(tmp_folder, "preprocessed_file.parquet"),
    engine="pyarrow",
)

## CREATE METADATA

In [None]:
# Transform the dataframe into the `SingleTableMetadata` data type
from sdv.metadata import SingleTableMetadata

def create_metadata(df):
    """
    Build SDV single-table metadata by inspecting a DataFrame.

    Parameters:
        df (pd.DataFrame): The original DataFrame to describe.

    Returns:
        SingleTableMetadata: metadata detected from ``df``, used later to
        configure the synthesizer and the quality evaluation.
    """
    detected = SingleTableMetadata()
    # detect_from_dataframe populates the metadata object in place
    detected.detect_from_dataframe(df)
    return detected

# Detect the metadata of the real dataset
metadata = create_metadata(diabetes)

# Print the detected metadata so it can be reviewed manually before training
print(metadata)	


## GET SYNTHESIZER

In [None]:
from sdv.single_table import CTGANSynthesizer

def create_synthesizer(df, md, *, epochs=100, enforce_rounding=True, verbose=True):
    """
    Create a CTGAN synthesizer and train it on the real data.

    This function only builds and fits the model; sampling synthetic rows is
    done separately from the returned synthesizer.

    Parameters:
        df (pd.DataFrame): The original DataFrame used for training.
        md (SingleTableMetadata): Metadata of the DataFrame.
        epochs (int): Number of training epochs. Defaults to 100.
        enforce_rounding (bool): Forwarded to ``CTGANSynthesizer``.
            Defaults to True.
        verbose (bool): Forwarded to ``CTGANSynthesizer``. Defaults to True.

    Returns:
        CTGANSynthesizer: the trained synthesizer.
    """
    # create synthesizer
    synthesizer = CTGANSynthesizer(
        md,  # required
        enforce_rounding=enforce_rounding,
        epochs=epochs,
        verbose=verbose,
    )

    # fit the model so it learns from the real data
    synthesizer.fit(data=df)

    return synthesizer

# Build and train the CTGAN synthesizer on the real dataset
synthesizer = create_synthesizer (diabetes, metadata)

## CREATE SYNTHETIC DATA

In [None]:
# create new data with synthesizer
def create_synth_data (df, synth):
    """
    Sample as many synthetic rows as the original DataFrame has.

    Parameters:
        df (pd.DataFrame): The original DataFrame; only its row count is used.
        synth: trained synthesizer exposing ``sample(num_rows=...)``.

    Returns:
        synthetic_data (pd.DataFrame): new synthetic data.
    """
    # match the size of the real dataset
    row_count = df.shape[0]
    return synth.sample(num_rows=row_count)

# Sample a synthetic dataset with the same number of rows as the real one
synthetic_data = create_synth_data(diabetes, synthesizer)

# Preview the first rows (shown as the cell output in a notebook)
synthetic_data.head()

## EVALUATE THE RESULTING SYNTHETIC DATA

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Compare the real and synthetic datasets; evaluate_quality returns a report object
quality_report = evaluate_quality(
    diabetes,
    synthetic_data,
    metadata
)

# Print the overall score: interpolating the report object itself appears to show
# only its object repr, not the quality value.
print(f"Quality of synthetic data is: {quality_report.get_score()}")

## COMPARE REAL DATA WITH SYNTHETIC DATA

### Compare dimensions

In [None]:
# Print (rows, columns) of both datasets for a side-by-side check
print(f"Real dimension: {diabetes.shape}")
print(f"Synth dimension: {synthetic_data.shape}")

### Compare general data

In [None]:
# Per-column non-null counts of the real dataset
real_data_info = pd.DataFrame({
    'Column': diabetes.columns,
    'Real Non-Null Count': diabetes.notna().sum(),
})

# Same summary for the synthetic dataset
synthetic_data_info = pd.DataFrame({
    'Column': synthetic_data.columns,
    'Synthetic Non-Null Count': synthetic_data.notna().sum(),
})

# Outer-join on the column name so a column missing from either side still shows up
comparison = real_data_info.merge(synthetic_data_info, how='outer', on='Column')

# Print comparison table
print("Comparison of Real and Synthetic Data:")
print(comparison)


### Check data anonymization

Check the first N rows and the values of their sensitive columns.

#### Detect sensitive columns

In [None]:
# Columns considered identity-sensitive in this dataset
sensitive_columns = [
    'race',
    'gender',
    'age',
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'payer_code',
]
print(f"\nSensitive columns: {sensitive_columns}\n")

#### Check sensitive data anonymization

In [None]:
# Print the first rows of the sensitive columns from each dataset for a manual,
# visual comparison. NOTE(review): this is only an eyeball check, not a privacy metric.
print(f"Real: {diabetes[sensitive_columns].head()}")
print(f"\nSynthetic: {synthetic_data[sensitive_columns].head()}")

### Check for numeric data correlation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
# Keep only numeric/boolean columns before correlating: on pandas >= 2.0
# DataFrame.corr() raises on non-numeric columns instead of dropping them.
numeric_kinds = ["number", "bool"]
corr_r = diabetes.select_dtypes(include=numeric_kinds).corr()
corr_s = synthetic_data.select_dtypes(include=numeric_kinds).corr()

# Real data on the left, synthetic data on the right
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
sns.heatmap(corr_r,
            xticklabels=corr_r.columns.values,
            yticklabels=corr_r.columns.values,
            cmap="Blues",
            annot=True,         # Display the correlation values in the cells
            fmt=".2f", ax=ax[0])
sns.heatmap(corr_s,
            xticklabels=corr_s.columns.values,
            yticklabels=corr_s.columns.values,
            cmap="Greens",
            annot=True,         # Display the correlation values in the cells
            fmt=".2f", ax=ax[1])
ax[0].set_title("REAL")
ax[1].set_title("SYNTH")
plt.tight_layout()
plt.show()

In [None]:
# Persist the synthetic dataset next to the input data (row index is not written)
synthetic_data.to_parquet(
    os.path.join(tmp_folder, "synthetic_data.parquet"),
    engine="pyarrow",
    index=False,
)

In [None]:
# save synthesizer:
import pickle

# Write into the same resources folder as the other artifacts (tmp_folder)
# instead of repeating the hard-coded path.
# NOTE: pickle files must only be loaded back from trusted sources.
with open(os.path.join(tmp_folder, "synthesizer.pkl"), "wb") as file:
    pickle.dump(synthesizer, file)