# SYNTHETIC DATA EVALUATION

## LOAD REAL & BOTH SYNTH DATA

In [None]:
# import packages
import pandas as pd
import os

# load real data from uci repository
from ucimlrepo import fetch_ucirepo 

# fetch dataset (UCI repository id 296)
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Optional inspection of the fetched dataset (uncomment when needed):
# print(diabetes_130_us_hospitals_for_years_1999_2008.metadata)   # metadata
# print(diabetes_130_us_hospitals_for_years_1999_2008.variables)  # variable information

# data (as pandas dataframes)
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets

# create complete real_data
# copy=True: the DataFrame constructor does not copy a DataFrame input by
# default, so adding the target column below could otherwise alias/mutate
# the fetched features frame `X` (or raise a SettingWithCopyWarning).
real = pd.DataFrame(X, copy=True)
real["readmitted"] = y

# visualize data
real.head()

In [None]:
# Read the synthetic dataset generated with mostly.ai from its parquet file
mostly = pd.read_parquet(
    "./synthetic_data/mostly_ai_synth.parquet",
    engine="pyarrow",
)

# preview the first rows
mostly.head()

In [None]:
# Read the synthetic dataset generated with SDV from its parquet file
sdv = pd.read_parquet(
    "./synthetic_data/sdv_synth.parquet",
    engine="pyarrow",
)

# preview the first rows
sdv.head()

## EXPLORE SYNTHETIC DATA VS REAL DATA

### Categorical data 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compare, column by column, the category frequencies of the real data (left)
# with those of the SDV synthetic data (right).

# Select categorical columns (object dtype) of the real data
categorical_cols = real.select_dtypes(include='object')

for col in categorical_cols.columns:
    # Per-category frequencies; value_counts() drops NaN and sorts by frequency
    real_counts = real[col].value_counts()
    synth_counts = sdv[col].value_counts()

    # Shared category order for both panels: categories of the real data
    # (most frequent first), followed by any category that only occurs in the
    # synthetic data so that it is not silently dropped from the plot.
    x_ticks = list(real_counts.index)
    x_ticks += [cat for cat in synth_counts.index if cat not in real_counts.index]

    # Nothing to draw when the column holds no non-null value in either dataset
    if not x_ticks:
        continue

    fig, ax = plt.subplots(1, 2, figsize=(10, 5))

    # Plot real data; `order` pins the bar positions so that the tick labels
    # set below really describe the bars they sit under.
    sns.countplot(data=real, x=col, order=x_ticks, ax=ax[0])
    ax[0].set_title(col + " _REAL")

    # Plot synthetic data with the same bar order
    sns.countplot(data=sdv, x=col, order=x_ticks, ax=ax[1])
    ax[1].set_title(col + " _SDV(synth)")

    # Tallest bar across BOTH plotted datasets (real and SDV); an empty
    # count series is skipped because its max() would be NaN.
    max_y = max(
        (counts.max() for counts in (real_counts, synth_counts) if not counts.empty),
        default=0,
    )

    for axis in ax:
        # Same y-axis limits on both panels so bar heights are comparable
        if max_y > 0:
            axis.set_ylim(0, max_y)

        # Same x-axis ticks/labels on both panels
        axis.set_xticks(range(len(x_ticks)))
        axis.set_xticklabels(x_ticks, rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compare, column by column, the category frequencies of the real data (left)
# with those of the MOSTLY AI synthetic data (right).

# Select categorical columns (object dtype) of the real data
categorical_cols = real.select_dtypes(include='object')

for col in categorical_cols.columns:
    # Per-category frequencies; value_counts() drops NaN and sorts by frequency
    real_counts = real[col].value_counts()
    synth_counts = mostly[col].value_counts()

    # Shared category order for both panels: categories of the real data
    # (most frequent first), followed by any category that only occurs in the
    # synthetic data so that it is not silently dropped from the plot.
    x_ticks = list(real_counts.index)
    x_ticks += [cat for cat in synth_counts.index if cat not in real_counts.index]

    # Nothing to draw when the column holds no non-null value in either dataset
    if not x_ticks:
        continue

    fig, ax = plt.subplots(1, 2, figsize=(10, 5))

    # Plot real data; `order` pins the bar positions so that the tick labels
    # set below really describe the bars they sit under.
    sns.countplot(data=real, x=col, order=x_ticks, ax=ax[0])
    ax[0].set_title(col + " _REAL")

    # Plot synthetic data with the same bar order
    sns.countplot(data=mostly, x=col, order=x_ticks, ax=ax[1])
    ax[1].set_title(col + " _MOSTLY(synth)")

    # Tallest bar across both plotted datasets; an empty count series is
    # skipped because its max() would be NaN.
    max_y = max(
        (counts.max() for counts in (real_counts, synth_counts) if not counts.empty),
        default=0,
    )

    for axis in ax:
        # Same y-axis limits on both panels so bar heights are comparable
        if max_y > 0:
            axis.set_ylim(0, max_y)

        # Same x-axis ticks/labels on both panels
        axis.set_xticks(range(len(x_ticks)))
        axis.set_xticklabels(x_ticks, rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

### Numerical data 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Compare, column by column, the density of the real data (left) with the
# density of the SDV synthetic data (right) on a shared x-axis.

# continuous columns (int64 dtype) of the real data
continuos_columns = real.select_dtypes(include='int64')

for col in continuos_columns.columns:
    fig, ax = plt.subplots(1, 2, figsize=(10, 5))

    # Determine the x-axis range from the two datasets that are actually
    # plotted (real and SDV). Series.min()/max() skip missing values, so a
    # NaN in the synthetic column cannot turn the axis limits into NaN.
    all_data = pd.concat([real[col], sdv[col]], ignore_index=True)
    x_min, x_max = all_data.min(), all_data.max()

    # Add a 10% margin on each side; fall back to a fixed margin when the
    # column is constant so the axis limits never collapse to a single point.
    margin = 0.1 * (x_max - x_min) or 0.5

    # Plot KDE for real data
    sns.kdeplot(data=real, x=col, ax=ax[0], fill=True)
    ax[0].set_title(col + " _REAL")

    # Plot KDE for synthetic data
    sns.kdeplot(data=sdv, x=col, ax=ax[1], fill=True)
    ax[1].set_title(col + " _SDV (synth)")

    # Same x-axis limits on both panels so the densities are comparable
    for axis in ax:
        axis.set_xlim(x_min - margin, x_max + margin)

    plt.tight_layout()
    plt.show()