## EVALUATE THE RESULTING SYNTHETIC DATA

In [None]:
from sdv.evaluation.single_table import evaluate_quality

# Build one SDV quality report per synthesizer, each comparing that synthesizer's
# output against the real `diabetes` table under the same metadata.
quality_gcopula = evaluate_quality(diabetes, g_copula_synth_data, metadata)  # GaussianCopula
quality_ctgan = evaluate_quality(diabetes, ctgan_synth_data, metadata)  # CTGAN

# evaluate_quality returns a report object rather than a number, so formatting the
# object itself would only print its repr. Ask the report for its overall score
# (a float between 0 and 1) and show it as a percentage.
print(f"Quality of GaussianCopula synthetic data: {quality_gcopula.get_score():.2%}")
print(f"Quality of CTGAN synthetic data: {quality_ctgan.get_score():.2%}")

## COMPARE REAL DATA WITH SYNTHETIC DATA

### Compare dimensions

In [None]:
# Report the (rows, columns) shape of the real and the synthetic table, one per line.
# NOTE(review): `synthetic_data` is defined outside this section — confirm which
# synthesizer's output (GaussianCopula or CTGAN) it holds.
print(
    f"Real dimension: {diabetes.shape}",
    f"Synth dimension: {synthetic_data.shape}",
    sep="\n",
)

### Compare general data

In [None]:
# Per-column count of non-null values in the real dataset.
# DataFrame.count() returns a Series indexed by column name, so it lines up
# row-for-row with diabetes.columns.
real_data_info = pd.DataFrame(
    {
        "Column": diabetes.columns,
        "Real Non-Null Count": diabetes.count(),
    }
)

# Same summary for the synthetic dataset.
synthetic_data_info = pd.DataFrame(
    {
        "Column": synthetic_data.columns,
        "Synthetic Non-Null Count": synthetic_data.count(),
    }
)

# Join the two summaries on the column name. An outer join keeps columns that
# exist in only one of the datasets; their count on the other side is NaN.
comparison = real_data_info.merge(synthetic_data_info, on="Column", how="outer")

# Show the side-by-side table.
print("Comparison of Real and Synthetic Data:")
print(comparison)


### Check data anonymization

Check the first N rows and the values of their sensitive columns.

#### Detect sensitive columns

In [None]:
# Columns treated as identity-sensitive; later cells use this list to compare
# real and synthetic values for these columns.
sensitive_columns = [
    "race",
    "gender",
    "age",
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "payer_code",
]
print(f"\nSensitive columns: {sensitive_columns}\n")

#### Check sensitive data anonymization

In [None]:
# Show the first rows of the sensitive columns for both datasets so they can be
# compared by eye.
# The label and the frame are printed separately: embedding a DataFrame in an
# f-string after a label shifts only the header row to the right, which leaves the
# column names misaligned with the values beneath them.
print("Real:")
print(diabetes[sensitive_columns].head())
print("\nSynthetic:")
print(synthetic_data[sensitive_columns].head())

### Check for numeric data correlation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
# Draw the pairwise correlations of the real and the synthetic data as two
# side-by-side annotated heatmaps.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Correlate numeric (and boolean) columns only. The sensitive columns listed in this
# notebook (e.g. race, gender) suggest the tables also hold text columns, and
# DataFrame.corr() raises on those under pandas >= 2.0 instead of silently dropping
# them. Selecting by dtype first behaves the same on every pandas version.
corr_r = diabetes.select_dtypes(include=["number", "bool"]).corr()
corr_s = synthetic_data.select_dtypes(include=["number", "bool"]).corr()

# One panel per dataset: (correlation matrix, colour map, panel title).
heatmap_panels = (
    (corr_r, "Blues", "REAL"),
    (corr_s, "Greens", "SYNTH"),
)
for panel_axis, (panel_corr, panel_cmap, panel_title) in zip(ax, heatmap_panels):
    sns.heatmap(
        panel_corr,
        xticklabels=panel_corr.columns.values,
        yticklabels=panel_corr.columns.values,
        cmap=panel_cmap,
        annot=True,  # Display the correlation values in the cells
        fmt=".2f",
        ax=panel_axis,
    )
    panel_axis.set_title(panel_title)

plt.tight_layout()
plt.show()