# SDV SYNTHETIC DATA GENERATION

## SETUP

In [263]:
import pandas as pd

# Never truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

## LOAD REAL DATA

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch the UCI dataset with id=296 (per the variable name: diabetes, 130 US
# hospitals, years 1999-2008). NOTE: this call needs network access.
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Optional inspection helpers. They can only run AFTER the fetch above, which
# is why they live here as comments rather than in a string literal placed
# before the variable exists:
# print(diabetes_130_us_hospitals_for_years_1999_2008.metadata)   # metadata
# print(diabetes_130_us_hospitals_for_years_1999_2008.variables)  # variable information

# data (as pandas dataframes)
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets

# Create the complete real dataset on an explicit copy, so that adding the
# target column can never write through to (or warn about) the features frame
# owned by the fetched dataset object.
diabetes = pd.DataFrame(X).copy()
diabetes["readmitted"] = y

# visualize data
diabetes.head()

## EXPLORE REAL DATA

In [None]:
# dimensions
print(f"Dimension: {diabetes.shape}")

# data information
# DataFrame.info() writes its report straight to stdout and returns None, so
# it has to be called as a statement; interpolating it into an f-string prints
# the report first and then the literal text "None".
print("\nData information:")
diabetes.info()
print()

In [None]:
# Detect sensitive columns by intuition, going by their names.
# ("\n" here: the original "\c" is an invalid escape sequence that printed a
# literal backslash and triggers a SyntaxWarning on recent Python versions.)
print(f"\ncolumns: {diabetes.columns}\n")

# Columns judged to carry identity-sensitive information
sensitive_column_names = ['race', 'gender', 'weight', 'admission_type_id','discharge_disposition_id','admission_source_id','payer_code', 'medical_specialty']
print(f"\nSensitive columns: {sensitive_column_names}\n")

In [None]:
# SINGLE COLUMN: list every distinct value of each column with its frequency
for column in diabetes.columns:
    print(f"\n\nColumn: {column}")

    # dropna=False keeps missing values as their own row in the tally
    frequencies = diabetes[column].value_counts(dropna=False)

    # Wrap the counts in a one-column frame so they print under an 'Items' header
    print(pd.DataFrame({'Items': frequencies}))

It seems that some sensitive data may allow the re-identification of patients. Let's check whether that is really the case.

In [None]:
# Inspect the records in the '>200' weight bracket, ordered by age
is_over_200 = diabetes["weight"] == '>200'
big_weights = diabetes[is_over_200].sort_values("age")
big_weights

It seems that the `k-anonymity` rule is not met. Could we gain anonymity by reclassifying the weight values into broader ranges? Let's check.

In [None]:
# NEW WEIGHT CLASSIFICATION
#   "[0-100)" <- [0-25), [25-50), [50-75), [75-100)
#   "> 100"   <- [100-125), [125-150), [150-175), [175-200), '>200'
#   NaN is left untouched
under_100_bins = ['[0-25)', '[25-50)', '[50-75)', '[75-100)']
over_100_bins = ['[100-125)', '[125-150)', '[150-175)', '[175-200)', '>200']

# Coarsen the weight ranges on a copy, so the result can be reviewed first
diabetes1 = diabetes.copy()
is_under_100 = diabetes1["weight"].isin(under_100_bins)
diabetes1.loc[is_under_100, "weight"] = "[0-100)"
is_over_100 = diabetes1["weight"].isin(over_100_bins)
diabetes1.loc[is_over_100, "weight"] = "> 100"

# Show which distinct weight values remain after the regrouping
print(f"New weight values:{diabetes1.weight.unique()}")

# SINGLE COLUMN: re-list value frequencies after readjusting the weight
for column in diabetes1.columns:
    print(f"\n\nColumn: {column}")

    # dropna=False keeps missing values as their own row in the tally
    frequencies = diabetes1[column].value_counts(dropna=False)

    # Wrap the counts in a one-column frame so they print under an 'Items' header
    print(pd.DataFrame({'Items': frequencies}))

# The regrouped weights look better distributed, so keep the change:
# from here on `diabetes` refers to the coarsened frame.
diabetes = diabetes1

Let's check the data pairs to see if re-identification is possible

In [None]:
import itertools
import os

# COLUMN PAIRS: every unordered pair of sensitive columns
column_pairs = list(itertools.combinations(sensitive_column_names, 2))

# Maps each conflicting pair (col_a, col_b) to a LIST of findings, one
# {col_a: value, col_b: value} dict per flagged value combination.
# (Previously a single dict was reused and overwritten on every iteration, so
# only the last flagged row of each pair was ever reported.)
special_pairs = {}
count = 0
for first_col, second_col in column_pairs:
    # Contingency table: rows are values of first_col, columns are values of
    # second_col, cells are the number of records with that combination.
    data_crosstab = pd.crosstab(diabetes[first_col],
                                diabetes[second_col],
                                margins=False)

    findings = []
    for row_value, row in data_crosstab.iterrows():
        # Values of second_col that occur exactly once together with row_value
        singleton_columns = row[row == 1].index.tolist()

        # Selection rule kept from the original: a row is flagged only when it
        # has exactly ONE such singleton cell.
        # NOTE(review): rows with several singleton cells are skipped although
        # they look at least as re-identifiable - confirm this is intended.
        if len(singleton_columns) == 1:
            findings.append({first_col: row_value,
                             second_col: singleton_columns[0]})

    if findings:
        count += 1
        special_pairs[(first_col, second_col)] = findings

# visualize the conflicting combinations
print(f"From: {len(column_pairs)} pairs conflictive are: {count}")
for findings in special_pairs.values():
    for finding in findings:
        print(f"Check: {finding}")

In [None]:
# Check 'Unknown/Invalid' gender data
unknown_gender = diabetes[diabetes["gender"] == 'Unknown/Invalid']
unknown_gender = unknown_gender.sort_values("age")
unknown_gender

# Since the 'Unknown/Invalid' gender records do not meet the k-anonymity rule, at first glance removing them seems reasonable, as there are only 3 such records.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

# Distinct weight groups (a missing weight shows up as NaN in this array)
weights = diabetes["weight"].unique()

# Race distribution within each weight group
for w in weights:
    # NaN never compares equal to anything, so the missing-weight group has to
    # be selected with isna() instead of ==.
    in_group = diabetes["weight"].isna() if pd.isna(w) else diabetes["weight"] == w
    filt1 = diabetes[in_group]

    fig, ax = plt.subplots(figsize = (10,5))
    # Plot the FILTERED rows; plotting the full frame made every figure identical
    sns.countplot(data=filt1, x="race", ax = ax)
    ax.set_title(f"Race distribution by Weight value:{w}")
    plt.tight_layout()
    plt.show()
    # Release the figure so that figures do not pile up in memory over the loop
    plt.close(fig)

At a quick glance, it seems that only some records have weight values for each age group. Let's dig into these relationships before blindly creating synthetic data.

## EXPLORE REAL DATA VISUALLY

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Age distribution within each weight group
for w in weights:
    # NaN never compares equal to anything, so the missing-weight group has to
    # be selected with isna() instead of ==.
    in_group = diabetes1["weight"].isna() if pd.isna(w) else diabetes1["weight"] == w
    filt1 = diabetes1[in_group]

    fig, ax = plt.subplots(figsize = (10,5))
    # Plot the FILTERED rows; plotting the full frame made every figure identical
    sns.countplot(data=filt1, x="age", ax = ax)
    ax.set_title(f"Age distribution by Weight value:{w}")
    plt.tight_layout()
    plt.show()
    # Release the figure so that figures do not pile up in memory over the loop
    plt.close(fig)

In [None]:
# Age distribution within each weight group
for w in weights:
    # NaN never compares equal to anything, so the missing-weight group has to
    # be selected with isna() instead of ==.
    in_group = diabetes1["weight"].isna() if pd.isna(w) else diabetes1["weight"] == w
    filt1 = diabetes1[in_group]

    fig, ax = plt.subplots(figsize = (10,5))
    # Plot the FILTERED rows; plotting the full frame made every figure identical
    sns.countplot(data=filt1, x="age", ax = ax)
    ax.set_title(f"Age distribution by Weight value:{w}")
    plt.tight_layout()
    plt.show()
    # Release the figure so that figures do not pile up in memory over the loop
    plt.close(fig)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One count plot per column to eyeball how its values are distributed
for column in diabetes.columns:
    figure, axis = plt.subplots(figsize=(10, 5))
    sns.countplot(data=diabetes, x=column, ax=axis)
    axis.set_title(column)
    plt.tight_layout()
    plt.show()

## CREATE SYNTHESIZER & SYNTHETIC DATA WITH SDV

In [None]:
# Describe the `diabetes` dataframe as an SDV `SingleTableMetadata` object
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()

# Let SDV infer the column types from the actual DataFrame
metadata.detect_from_dataframe(diabetes)

# Columns with "_id" in their name hold codes, not quantities:
# treat them as categorical instead of numerical.
id_like_columns = [name for name in metadata.columns if '_id' in name]
for name in id_like_columns:
    metadata.update_column(name, sdtype='categorical')

# Check that the metadata has been generated correctly
print(metadata)

In [10]:
from sdv.single_table import GaussianCopulaSynthesizer

# Gaussian-copula synthesizer built from the table metadata, with the
# min/max-enforcement and rounding-enforcement options switched on.
synthesizer = GaussianCopulaSynthesizer(
    metadata=metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
)

In [11]:
# train data to learn from real data
synthesizer.fit(
    data = diabetes
)

In [12]:
# create new data (same dimensions) based on learned model
synthetic_data = synthesizer.sample(
    num_rows=diabetes.shape[0]
)

## EXPLORE SYNTHETIC DATA AND VALIDATE

In [None]:
# dimensions
print(f"Real dimension: {diabetes.shape}")
print(f"Synth dimension: {synthetic_data.shape}")

# data information
# DataFrame.info() writes its report straight to stdout and returns None, so
# each call is made as a statement under its own header; interpolating it into
# an f-string printed both reports first and then "...information: None".
print("\n\nReal data information:")
diabetes.info()
print("Synth data information:")
synthetic_data.info()
print()

In [None]:
# Compare values creation
for col in diabetes.columns:
    print(f"\n\nColumn: {col}")
    
    # Get value counts for both real and synthetic data
    real_counts = diabetes[col].value_counts()
    synth_counts = synthetic_data[col].value_counts()
    # Combine the counts to ensure all categories are represented in both datasets
    combined_counts = pd.DataFrame({'Real': real_counts, 'Synthetic': synth_counts}).fillna(0)
    
    # Print the combined counts for easy comparison
    print(combined_counts)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
# Correlation is only computed over numeric columns. Selecting them explicitly
# is required because DataFrame.corr() raises on string columns in
# pandas >= 2.0 (older versions silently dropped them).
# NOTE(review): integer-coded "_id" columns are numeric in the frame and so are
# included here, although they are treated as categorical for the synthesizer.
corr_r = diabetes.select_dtypes(include="number").corr()
corr_s = synthetic_data.select_dtypes(include="number").corr()

fig, ax = plt.subplots(1,2,figsize = (15,5))
panels = [
    (ax[0], corr_r, "Blues", "REAL"),
    (ax[1], corr_s, "Greens", "SYNTH"),
]
for axis, corr, cmap, title in panels:
    sns.heatmap(corr,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values,
                cmap=cmap,
                annot=True,         # Display the correlation values in the cells
                fmt=".2f", ax = axis)
    axis.set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Sensitive columns whose value sets are compared between real and synthetic data
sensitive_column_names = ['race', 'gender', 'age','payer_code', 'medical_specialty']

# Print the distinct values of each sensitive column in both datasets
for column in sensitive_column_names:
    real_values = diabetes[column].unique()
    synth_values = synthetic_data[column].unique()
    print(f"\n\nReal column: {column} has values: {real_values}")
    print(f"Synth column: {column} has values: {synth_values}")

## SAVE SYNTHESIZER & SYNTHETIC DATA

In [14]:
import os

# Output folder, created next to the notebook if it does not exist yet
synth_folder = os.path.join("./", "synthetic_data")
os.makedirs(synth_folder, exist_ok=True)

# Persist the fitted synthesizer to a file inside that folder
synthesizer_path = os.path.join(synth_folder, "sdv_synthesizer.pkl")
synthesizer.save(synthesizer_path)

In [15]:
# save synthetic data
synthetic_data.to_parquet(os.path.join(synth_folder,"sdv_synth.parquet"), engine='pyarrow')