# SDV SYNTHETIC DATA GENERATION

## LOAD PREPROCESSED DATA

In [None]:
import pandas as pd
import os

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option("display.max_columns", None)

# Folder that holds the preprocessed input file read below; the same folder is
# reused further down as the destination for saved synthesizers and datasets.
tmp_folder = "../results"
# Load the preprocessed dataset that every synthesizer in this notebook is trained on
diabetes = pd.read_parquet(os.path.join(tmp_folder,"preprocessed_file.parquet"),engine="pyarrow")

## CREATE SYNTHESIZERS

In [None]:
from sdv.single_table import CTGANSynthesizer, GaussianCopulaSynthesizer
from scipy.stats import gamma, skew
import numpy as np
from sdv.metadata import SingleTableMetadata

def create_metadata(df):
    """
    Build a SingleTableMetadata object by auto-detecting it from a DataFrame.

    Parameters:
        df (pd.DataFrame): DataFrame whose columns are inspected.

    Returns:
        SingleTableMetadata: metadata populated via `detect_from_dataframe(df)`.
    """
    detected = SingleTableMetadata()
    # Detection fills the metadata object in place
    detected.detect_from_dataframe(df)
    return detected

def create_CTGANSynthesizer(df, md):
    """
    Build a CTGANSynthesizer from the given metadata and fit it on the real data.

    Parameters:
        df (pd.DataFrame): The original DataFrame used for training.
        md (SingleTableMetadata): Metadata describing `df`.

    Returns:
        CTGANSynthesizer: the fitted synthesizer (no synthetic rows are sampled here).
    """
    # Training configuration kept in one place
    ctgan_options = {
        "enforce_rounding": True,
        "epochs": 100,
        "verbose": True,
    }
    model = CTGANSynthesizer(md, **ctgan_options)

    # Learn from the real data
    model.fit(data=df)
    return model

def create_numerical_colums_distribution(df):
    """
    Choose a marginal distribution name for every int64 column of `df`.

    A column with strictly positive skewness is mapped to "gamma"; every other
    int64 column (zero, negative or NaN skewness) is mapped to "gaussian_kde".
    Skewness and, for gamma columns, the fitted gamma parameters are printed
    for inspection only; the fitted parameters are not returned.

    Parameters:
        df (pd.DataFrame): The original DataFrame.

    Returns:
        dict: {column name: "gamma" | "gaussian_kde"} for the int64 columns of `df`.
    """
    num_distribution = {}
    # Inspect the DataFrame that was passed in, not a notebook-level global,
    # so the function works for any input frame.
    num_cols = df.select_dtypes(include='int64').columns
    for col in num_cols:
        # Calculate the skewness
        skewness = skew(df[col])
        print(f"\nColumn: {col} skewness: {skewness}")

        if skewness > 0:  # right-skewed data -> gamma
            # Fit only to report the parameters
            shape, loc, scale = gamma.fit(df[col])
            print(f"Gamma distribution parameters:\nShape: {shape}, Loc: {loc}, Scale: {scale}")

            num_distribution[col] = "gamma"
        else:
            num_distribution[col] = "gaussian_kde"

    return num_distribution

def create_full_synthesizer(df, num_distr):
    """
    Fit a GaussianCopulaSynthesizer on the object-dtype columns plus the
    columns tagged "gamma" in `num_distr`. All other columns are left out.

    Parameters:
        df (pd.DataFrame): The original DataFrame.
        num_distr: Mapping of column name to distribution name.

    Returns:
        GaussianCopulaSynthesizer: synthesizer fitted on the selected columns.
    """
    # Object-dtype columns are treated as the categorical part
    object_columns = list(df.select_dtypes(include='object').columns)

    # Columns whose distribution was classified as gamma, in DataFrame order
    gamma_columns = []
    for name in df.columns:
        if name in num_distr.keys() and num_distr[name] == 'gamma':
            gamma_columns.append(name)

    selected = object_columns + gamma_columns
    training_frame = df[selected]

    # Metadata detected on the same column subset
    subset_metadata = create_metadata(df[selected])

    model = GaussianCopulaSynthesizer(
        subset_metadata,
        enforce_min_max_values=True,
        enforce_rounding=True,
        numerical_distributions=dict.fromkeys(gamma_columns, 'gamma')
    )
    model.fit(data=training_frame)
    return model

def create_kde_synthesizer(df, num_distr, batch_size=10000):
    """
    Creates a synthesizer for the columns that require gaussian_kde, trained on
    at most `batch_size` rows to keep memory usage bounded.

    The synthesizer's `fit()` is not incremental: each call retrains from
    scratch, so fitting batch after batch would keep only the last (possibly
    very small) batch. The model is therefore fitted exactly once, on a
    reproducible random sample of the rows.

    NOTE(review): with `enforce_min_max_values=True` the value bounds are
    learned from the rows used for fitting, so a sample may yield slightly
    narrower bounds than the full data. Raise `batch_size` if that matters.

    Parameters:
        df (pd.DataFrame): The original DataFrame.
        num_distr: Mapping of column name to distribution name.
        batch_size (int): Maximum number of rows used for fitting. Must be >= 1.
            If it is >= the number of rows, all rows are used.

    Returns:
        synthesizer_kde: Synthesizer fitted on the gaussian_kde columns.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Identify columns that need gaussian_kde
    kde_cols = [col for col in df.columns if (col in num_distr.keys()) and (num_distr[col] == "gaussian_kde")]
    kde_df = df[kde_cols]

    # Metadata is detected on the full column subset (cheap compared to fitting)
    filtered_md = create_metadata(kde_df)

    # Create synthesizer for gaussian_kde columns only
    synthesizer_kde = GaussianCopulaSynthesizer(
        filtered_md,
        enforce_min_max_values=True,
        enforce_rounding=True,
        numerical_distributions={col: 'gaussian_kde' for col in kde_cols}
    )

    # Down-sample once instead of calling fit() per batch; a fixed seed keeps
    # the training subset reproducible between runs.
    if batch_size < len(kde_df):
        training_data = kde_df.sample(n=batch_size, random_state=0)
    else:
        training_data = kde_df

    synthesizer_kde.fit(data=training_data)

    return synthesizer_kde

def generate_synthetic_data_by_2_synths(synth_full, synth_kde, num_rows):
    """
    Sample `num_rows` rows from each synthesizer and place the two results
    side by side in one DataFrame.

    Parameters:
        synth_full: synthesizer for categorical and gamma distributed columns
        synth_kde: synthesizer for gaussian_kde columns
        num_rows: Number of rows to sample from each synthesizer

    Returns:
        pd.DataFrame: columns of `synth_full` followed by columns of `synth_kde`,
        joined positionally on a fresh 0..n-1 index.
    """
    # Sample in a fixed order (full first, then kde) and drop each sampled
    # index so the column-wise concat pairs rows by position.
    sampled_frames = [
        synthesizer.sample(num_rows).reset_index(drop=True)
        for synthesizer in (synth_full, synth_kde)
    ]
    return pd.concat(sampled_frames, axis=1)

def create_synth_data(df, synth):
    """
    Sample as many synthetic rows from `synth` as `df` has rows.

    Parameters:
        df (pd.DataFrame): Reference DataFrame; only its row count is used.
        synth: fitted synthesizer exposing `sample(num_rows=...)`

    Returns:
        The result of `synth.sample(num_rows=df.shape[0])`.
    """
    # Match the size of the reference data
    row_count = df.shape[0]
    return synth.sample(num_rows=row_count)

### GaussianCopulaSynthesizer

#### Get numerical data distribution

In [None]:
# Map each int64 column of the dataset to "gamma" or "gaussian_kde"
# (skewness details are printed by the helper while it runs)
num_distribution = create_numerical_colums_distribution(diabetes)
# Bare expression: displays the resulting dict as the cell output
num_distribution

#### Create synthesizer

Create the two synthesizers separately:

In [None]:
# Synthesizer that covers only the columns tagged "gaussian_kde" in num_distribution
synth_gcopula_kde = create_kde_synthesizer(diabetes, num_distribution)

In [None]:
# Synthesizer for the rest of the data: object-dtype columns plus the columns tagged "gamma"
synth_gcopula_rest = create_full_synthesizer(diabetes, num_distribution)

#### Save synthesizers

In [None]:
from joblib import dump

# Persist both Gaussian-copula synthesizers as joblib files in the results folder
dump(synth_gcopula_kde, os.path.join(tmp_folder,'synth_kde.joblib'))
dump(synth_gcopula_rest, os.path.join(tmp_folder,'synth_rest.joblib'))

### CTGANSynthesizer

#### Create synthesizer

In [None]:
# Detect metadata for the whole dataset, then build and train the CTGAN synthesizer with it.
# `metadata` is reused by the quality evaluation cell further down.
metadata = create_metadata(diabetes)
ctgan_synthesizer = create_CTGANSynthesizer (diabetes, metadata)

#### Save synthesizer

In [None]:
# Persist the trained CTGAN synthesizer as a joblib file in the results folder
dump(ctgan_synthesizer, os.path.join(tmp_folder,'ctgan_synthesizer.joblib'))

## CREATE SYNTHETIC DATA

### With GaussianCopulaSynthesizer

In [None]:
# Sample as many rows as the real dataset from each of the two Gaussian-copula
# synthesizers and join the two samples column-wise into one DataFrame.
g_copula_synth_data = generate_synthetic_data_by_2_synths(synth_gcopula_rest, synth_gcopula_kde, diabetes.shape[0])

# Preview the first rows as the cell output
g_copula_synth_data.head()

#### Save GaussianCopula synthetic data

In [None]:
# Write the Gaussian-copula synthetic dataset to parquet (index not stored)
g_copula_synth_data.to_parquet(os.path.join(tmp_folder,"g_copula_synth_data.parquet"),engine="pyarrow",index=False)

### With CTGANSynthesizer

In [None]:
# Sample as many rows from the CTGAN synthesizer as the real dataset has
ctgan_synth_data = create_synth_data(diabetes, ctgan_synthesizer)

# Preview the first rows as the cell output
ctgan_synth_data.head()

#### Save CTGAN synthetic data

In [None]:
# Write the CTGAN synthetic dataset to parquet (index not stored)
ctgan_synth_data.to_parquet(os.path.join(tmp_folder,"ctgan_synth_data.parquet"),engine="pyarrow",index=False)

## EVALUATE THE RESULTING SYNTHETIC DATA

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Build one quality report per synthetic dataset, both measured against the real data
quality_gcopula = evaluate_quality(diabetes, g_copula_synth_data, metadata)  # GaussianCopula
quality_ctgan = evaluate_quality(diabetes, ctgan_synth_data, metadata)  # CTGAN

# evaluate_quality returns a report object; print its overall score rather than
# the object itself, whose string form is not informative.
print(f"Quality of GaussianCopula synthetic data: {quality_gcopula.get_score()}")
print(f"Quality of CTGAN synthetic data: {quality_ctgan.get_score()}")

## COMPARE REAL DATA WITH SYNTHETIC DATA

### Compare dimensions

In [None]:
# NOTE: this cell used to reference `synthetic_data`, which is never defined in
# the notebook (NameError). Both generated datasets are reported instead.
print(f"Real dimension: {diabetes.shape}")
print(f"Synth dimension (GaussianCopula): {g_copula_synth_data.shape}")
print(f"Synth dimension (CTGAN): {ctgan_synth_data.shape}")

### Compare general data

In [None]:
# Non-null counts per column for the real data and for each synthetic dataset.
# NOTE: this cell used to reference `synthetic_data`, which is never defined in
# the notebook (NameError). Both generated datasets are compared instead.
real_data_info = pd.DataFrame({
    'Column': diabetes.columns,
    'Real Non-Null Count': diabetes.notnull().sum().to_numpy()
})

synthetic_datasets = {
    'GaussianCopula': g_copula_synth_data,
    'CTGAN': ctgan_synth_data,
}

comparison = real_data_info
for label, synth_df in synthetic_datasets.items():
    synth_info = pd.DataFrame({
        'Column': synth_df.columns,
        f'Synthetic Non-Null Count ({label})': synth_df.notnull().sum().to_numpy()
    })
    # Outer merge keeps columns that are present in only one of the datasets
    comparison = pd.merge(comparison, synth_info, on='Column', how='outer')

# Print comparison table
print("Comparison of Real and Synthetic Data:")
print(comparison)


### Check data anonymization

Check the first N rows and the values of their sensitive columns.

#### Detect sensitive columns

In [None]:
# Hand-picked list of columns treated as identity-sensitive in the checks below
sensitive_columns = ['race', 'gender', 'age', 'admission_type_id','discharge_disposition_id','admission_source_id','payer_code']
print(f"\nSensitive columns: {sensitive_columns}\n")

#### Check sensitive data anonymization

In [None]:
# Show the first rows of the sensitive columns for the real data and for each
# synthetic dataset, so they can be compared by eye.
# NOTE: this cell used to reference `synthetic_data`, which is never defined in
# the notebook (NameError). Both generated datasets are shown instead.
print(f"Real: {diabetes[sensitive_columns].head()}")
print(f"\nSynthetic (GaussianCopula): {g_copula_synth_data[sensitive_columns].head()}")
print(f"\nSynthetic (CTGAN): {ctgan_synth_data[sensitive_columns].head()}")

### Check for numeric data correlation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
# One heatmap per dataset: the real data and each synthetic dataset.
# NOTE: this cell used to reference `synthetic_data`, which is never defined in
# the notebook (NameError). Both generated datasets are plotted instead.
corr_inputs = [
    ("REAL", diabetes, "Blues"),
    ("SYNTH (GaussianCopula)", g_copula_synth_data, "Greens"),
    ("SYNTH (CTGAN)", ctgan_synth_data, "Oranges"),
]

fig, ax = plt.subplots(1, len(corr_inputs), figsize=(7.5 * len(corr_inputs), 5))
for axis, (title, frame, cmap) in zip(ax, corr_inputs):
    # The datasets also hold non-numeric columns; restrict the correlation to
    # numeric ones, otherwise recent pandas versions raise on string columns.
    corr = frame.corr(numeric_only=True)
    sns.heatmap(corr,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values,
                cmap=cmap,
                annot=True,         # Display the correlation values in the cells
                fmt=".2f", ax=axis)
    axis.set_title(title)
plt.tight_layout()
plt.show()