# SDV SYNTHETIC DATA GENERATION

## SETUP

In [60]:
import pandas as pd
pd.set_option('display.max_columns', None)

## LOAD REAL DATA

In [None]:
from ucimlrepo import fetch_ucirepo 

'''# metadata 
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata) 
  
# variable information 
print(diabetes_130_us_hospitals_for_years_1999_2008.variables) '''
  
# Fetch the dataset registered under id=296 through ucimlrepo
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296) 
  
# Feature and target tables exposed by the fetched object
# (presumably pandas DataFrames, as the original comment states — confirm in the ucimlrepo docs)
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features 
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets 

# Build the complete real dataset: the features plus the target stored as "readmitted".
# NOTE(review): pd.DataFrame(X) presumably shares data with X instead of copying it, so
# adding the column below may also affect X — confirm if X must stay untouched.
diabetes = pd.DataFrame(X)
diabetes["readmitted"] = y

# Preview the first rows (rendered as the cell output)
diabetes.head()

## EXPLORE REAL DATA

In [None]:
# Dimensions of the real dataset
print(f"Dimension: {diabetes.shape}")

# DataFrame.info() prints its report itself and returns None, so it must not be
# interpolated into an f-string (that printed "Data information: None").
print("\nData information:")
diabetes.info()
print()

In [None]:
# Detect sensitive columns by inspecting the column names.
# "\c" is not a valid escape sequence (SyntaxWarning, literal backslash in the output);
# a newline was intended, consistent with the other messages of this cell.
print(f"\nColumns: {diabetes.columns}\n")

# Columns regarded as identity-sensitive; they drive the pairwise risk analysis below
sensitive_column_names = ['race', 'gender', 'weight', 'admission_type_id','discharge_disposition_id','admission_source_id','payer_code', 'medical_specialty']
print(f"\nSensitive columns: {sensitive_column_names}\n")

In [None]:
# SINGLE COLUMN: show the values of every column and how often each one occurs
def visualize_columns_distributions(df):
    """Print a value-frequency table for every column of ``df``.

    Missing values are counted as well (``dropna=False``). The function only
    prints; it returns nothing.
    """
    for column_name in df.columns:
        print(f"\n\nColumn: {column_name}")

        # Frequencies wrapped in a one-column frame titled 'Items' for a tidier print
        frequencies = df[column_name].value_counts(dropna=False)
        print(pd.DataFrame({'Items': frequencies}))
    
# Print the value distribution of every column of the real dataset
visualize_columns_distributions(diabetes)    

It seems that some sensitive data may allow the re-identification of patients. Let's check whether that is really the case.

In [None]:
# Records in the '>200' weight class, ordered by age (rendered as the cell output)
big_weights = diabetes.loc[diabetes["weight"].eq('>200')].sort_values("age")
big_weights

It seems that the `k-anonymity` rule is not met. Could we gain anonymity by reclassifying the weight values? Let's check.

In [None]:
''' NEW WEIGTH CLASSIFICATION
[0-100) == [0-25) & [25-50) & [50-75) & [75-100) 
> 100 ==  [100-125) & [125-150) & [150-175) & [175-200) & '>200'
nan
'''

# Collapse the fine-grained weight classes into two coarse ones to gain anonymity
diabetes1 = diabetes.copy()
weight_groups = {
    "[0-100)": ['[0-25)', '[25-50)', '[50-75)', '[75-100)'],
    "> 100": ['[100-125)', '[125-150)', '[150-175)', '[175-200)', '>200'],
}
for coarse_label, fine_labels in weight_groups.items():
    in_group = diabetes1["weight"].isin(fine_labels)
    diabetes1.loc[in_group, "weight"] = coarse_label

# Only the two coarse classes plus the missing value should remain
print(f"New weight values:{diabetes1.weight.unique()}")

# Print the distribution of every column of the generalized frame
visualize_columns_distributions(diabetes1)

# The distribution looks better balanced, so keep the generalized frame as `diabetes`
diabetes = diabetes1

Let's check the data pairs to see if re-identification is possible

In [None]:
import itertools

# function that determines singularization risk columns pairs
def determine_singularization_risk(df, column_pairs, all_conflicts=False):
    """Find column pairs whose value combinations single out one record.

    For every pair, the two columns are cross-tabulated. A row of the crosstab
    (a value of the first column) is a conflict when exactly one of its cells
    equals 1, i.e. exactly one value of the second column co-occurs with it in
    a single record. Rows with two or more such cells are not reported (same
    rule as before).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    column_pairs : iterable of 2-item hashable sequences
        Pairs of column names, e.g. from ``itertools.combinations``.
    all_conflicts : bool, default False
        When False, each conflictive pair maps to a single dict
        ``{first_col: row_value, second_col: [column_value]}`` describing the
        LAST conflicting row found; earlier ones are dropped (historical
        behavior). When True, each pair maps to the list of all such dicts.

    Returns
    -------
    tuple
        ``(special_pairs, count)``: the dict described above, keyed by pair,
        and the number of conflictive pairs.
    """
    special_pairs = {}
    count = 0
    for pair in column_pairs:
        first_col, second_col = pair[0], pair[1]
        data_crosstab = pd.crosstab(df[first_col], df[second_col], margins=False)

        # Walk the rows explicitly: this also copes with an empty crosstab,
        # where a row-wise apply() would not yield a Series of lists.
        conflicts = []
        for row_value, row in data_crosstab.iterrows():
            singular_columns = row[row == 1].index.tolist()
            if len(singular_columns) == 1:
                conflicts.append({first_col: row_value, second_col: singular_columns})

        if not conflicts:
            continue

        count += 1
        special_pairs[pair] = conflicts if all_conflicts else conflicts[-1]

    return (special_pairs, count)
      
                
# COLUMN PAIRS: every 2-combination of the sensitive columns
column_pairs = list(itertools.combinations(sensitive_column_names, 2))
# Scan all the pairs for singularization risk
special_pairs, count = determine_singularization_risk(diabetes, column_pairs)

# Report how many pairs are conflictive and the finding retained for each of them
print(f"From: {len(column_pairs)} pairs conflictive are: {count}")
for finding in special_pairs.values():
    print(f"Check: {finding}")

Let's check for singularization risks to determine the appropriate actions.

In [None]:
# Records with 'Unknown/Invalid' gender: only 3 of them and the gender cannot be
# determined, so they are removed
unknown_gender_index = diabetes.index[diabetes["gender"] == 'Unknown/Invalid']

print(f"Shape before drop: {diabetes.shape}")
diabetes1 = diabetes.drop(unknown_gender_index)

# The frame should have exactly 3 rows less
print(f"Shape after drop: {diabetes1.shape}")

# Keep working under the `diabetes` name
diabetes = diabetes1

Let's try to generalize the `age` column even further to see whether the singularization risk is reduced.

In [67]:
''' New Columns: age
[0-20) == [0-10) & [10-20)
[20-40) == [20-30) & [30-40)
[40-60) == [40-50) & [50-60)
[60-80) == [60-70) & [70-80)
[80-100) == [80-90) & [90-100)
'''

# Merge adjacent 10-year age bands into 20-year bands to gain anonymity
diabetes1 = diabetes.copy()
age_groups = {
    "[0-20)": ['[0-10)', '[10-20)'],
    "[20-40)": ['[20-30)', '[30-40)'],
    "[40-60)": ['[40-50)', '[50-60)'],
    "[60-80)": ['[60-70)', '[70-80)'],
    "[80-100)": ['[80-90)', '[90-100)'],
}
for merged_band, original_bands in age_groups.items():
    diabetes1.loc[diabetes1["age"].isin(original_bands), "age"] = merged_band

# Only the 5 merged bands should remain. The label used to say "weight"
# (copy-paste from the weight cell) although the values printed are ages.
print(f"New age values:{diabetes1.age.unique()}")

# Distribution of the generalized age column (rendered as the cell output)
diabetes1.age.value_counts()

New weight values:['[0-20)' '[20-40)' '[40-60)' '[60-80)' '[80-100)']


[60-80)     48548
[40-60)     26941
[80-100)    19990
[20-40)      5432
[0-20)        852
Name: age, dtype: int64

Check whether this generalization helps to minimize the singularization risk.

In [None]:
# Quasi-identifier values of the record with race 'Other' and discharge_disposition_id 15
is_target = diabetes1["race"].eq('Other') & diabetes1["discharge_disposition_id"].eq(15)
gender_age_weight = diabetes1.loc[is_target, ["gender", "age", "weight", "admission_type_id", "discharge_disposition_id", "admission_source_id"]]
filter_data = gender_age_weight.iloc[0]

# Values found: Male	[40-60)	NaN	1	15	1
#
# Narrow the data step by step to the records sharing those values, to see
# whether the individual blends in with others after the age generalization

# same discharge_disposition_id (15)
diabetes2 = diabetes1.loc[diabetes1["discharge_disposition_id"].eq(15)]

# same gender
diabetes3 = diabetes2.loc[diabetes2["gender"].eq(filter_data.gender)]

# same age band
diabetes4 = diabetes3.loc[diabetes3["age"].eq(filter_data.age)]

# same admission_type_id
diabetes5 = diabetes4.loc[diabetes4["admission_type_id"].eq(filter_data.admission_type_id)]

# same discharge_disposition_id again (rendered as the cell output)
diabetes5.loc[diabetes5["discharge_disposition_id"].eq(filter_data.discharge_disposition_id)]

# we will drop

In [82]:
# Drop the individual with race 'Other' and discharge_disposition_id 15
singular_rows = diabetes1["race"].eq('Other') & diabetes1["discharge_disposition_id"].eq(15)
diabetes2 = diabetes1.drop(diabetes1.index[singular_rows])

# Keep working under the `diabetes1` name
diabetes1 = diabetes2

# Persist the intermediate result so the remaining actions can be checked from the file
diabetes1.to_csv("./age_generalization1.csv")

In [None]:
# race is Other and its payer_code = CH?
#diabetes[(diabetes["race"] == 'Other') & (diabetes["payer_code"] == "CH")] # exist posibility to generalize?

# race is Asian and its admission_type_id = 4?
#diabetes[(diabetes["race"] == 'Asian') & (diabetes["admission_type_id"] == 4)] # exist posibility to generalize?

# gender is Male and its payer_code = "FR"?
#diabetes[(diabetes["gender"] == 'Male') & (diabetes["payer_code"] == "FR")] # exist posibility to generalize?

# gender is Male and its discharge_disposition_id = 20?
#diabetes[(diabetes["gender"] == 'Male') & (diabetes["discharge_disposition_id"] == 20)] # exist posibility to generalize?

# gender is Female and its admission_source_id = 14?
#diabetes[(diabetes["gender"] == 'Female') & (diabetes["admission_source_id"] == 14)] # exist posibility to generalize?

# weight is > 100' and its admission_type_id = 5?
#diabetes[(diabetes["weight"] == '> 100') & (diabetes["admission_type_id"] == 5)] # exist posibility to generalize?

# weight is [0-100)' and its payer_code = CH?
#diabetes[(diabetes["weight"] == '[0-100)') & (diabetes["payer_code"] == "CH")] # exist posibility to generalize?

# weight is [0-100) and its medical_specialty = Dentistry?
#diabetes[(diabetes["weight"] == '[0-100)') & (diabetes["medical_specialty"] == "Dentistry")] # exist posibility to generalize?

# weight is > 100' and its admission_type_id = 5?
#diabetes[(diabetes["weight"] == '> 100') & (diabetes["admission_type_id"] == 5)] # exist posibility to generalize?

# admission_type_id is 7 and its discharge_disposition_id = 2?
#diabetes[(diabetes["admission_type_id"] == 7) & (diabetes["discharge_disposition_id"] == 2)] # exist posibility to generalize?

# admission_type_id is 6 and its admission_source_id = 8?
#diabetes[(diabetes["admission_type_id"] == 6) & (diabetes["admission_source_id"] == 8)] # exist posibility to generalize?

# admission_type_id is 2 and its payer_code = 'FR'?
#diabetes[(diabetes["admission_type_id"] == 2) & (diabetes["payer_code"] == 'FR')] # exist posibility to generalize?

# discharge_disposition_id is 28 and its admission_source_id = 4?
#diabetes[(diabetes["discharge_disposition_id"] == 28) & (diabetes["admission_source_id"] == 4)] # exist posibility to generalize?

# discharge_disposition_id is 25 and its payer_code = 'MD'?
#diabetes[(diabetes["discharge_disposition_id"] == 25) & (diabetes["payer_code"] == 'MD')] # exist posibility to generalize?

# discharge_disposition_id is 27 and its medical_specialty = 'Family/GeneralPractice'?
#diabetes[(diabetes["discharge_disposition_id"] == 27) & (diabetes["medical_specialty"] == 'Family/GeneralPractice')] # exist posibility to generalize?

# admission_source_id is 17 and its payer_code = 'WC'?
#diabetes[(diabetes["admission_source_id"] == 17) & (diabetes["payer_code"] == 'WC')] # exist posibility to generalize?

# admission_source_id is 22 and its medical_specialty = 'Orthopedics-Reconstructive'?
#diabetes[(diabetes["admission_source_id"] == 22) & (diabetes["medical_specialty"] == 'Orthopedics-Reconstructive')] # exist posibility to generalize

Since the 'Unknown/Invalid' gender records do not meet the k-anonymity rule, removing them seems reasonable at first glance, as there are only 3 of them.

In [20]:
# Remove the records with 'Unknown/Invalid' gender
unknown_gender_index = diabetes.index[diabetes["gender"] == 'Unknown/Invalid']
print(f"Shape before drop: {diabetes.shape}")
diabetes1 = diabetes.drop(unknown_gender_index)

# The frame should have exactly 3 rows less
print(f"Shape after drop: {diabetes1.shape}")

# Keep working under the `diabetes` name and confirm its shape
diabetes = diabetes1
print(f"Diabetes shape: {diabetes.shape}")

# Scan the sensitive pairs again now that those records are gone
special_pairs, count = determine_singularization_risk(diabetes, column_pairs)

# Report how many pairs are conflictive and the finding retained for each of them
print(f"\nFrom: {len(column_pairs)} pairs conflictive are: {count}")
for finding in special_pairs.values():
    print(f"Check: {finding}")

Shape before drop: (101766, 48)
Shape after drop: (101763, 48)
Diabetes shape: (101763, 48)

From: 28 pairs conflictive are: 17
Check: {'race': 'Asian', 'admission_type_id': [4]}
Check: {'race': 'Other', 'discharge_disposition_id': [15]}
Check: {'race': 'Other', 'payer_code': ['CH']}
Check: {'gender': 'Male', 'discharge_disposition_id': [20]}
Check: {'gender': 'Female', 'admission_source_id': [14]}
Check: {'gender': 'Male', 'payer_code': ['FR']}
Check: {'weight': '> 100', 'admission_type_id': [5]}
Check: {'weight': '[0-100)', 'payer_code': ['CH']}
Check: {'weight': '[0-100)', 'medical_specialty': ['Dentistry']}
Check: {'admission_type_id': 7, 'discharge_disposition_id': [2]}
Check: {'admission_type_id': 6, 'admission_source_id': [8]}
Check: {'admission_type_id': 2, 'payer_code': ['FR']}
Check: {'discharge_disposition_id': 28, 'admission_source_id': [4]}
Check: {'discharge_disposition_id': 25, 'payer_code': ['MD']}
Check: {'discharge_disposition_id': 27, 'medical_specialty': ['Family/Ge

Let's check for singularization risks to determine the appropriate actions.

In [49]:
# Export the current data (without the index) so it can be filtered outside the notebook
diabetes.to_csv(path_or_buf="./singularization_risk.csv", index=False)

The two records with race `Other` do not refer to the same person. The person with `"discharge_disposition_id" = 15` poses a direct singularization risk. A single record with this characteristic is not sufficient for training any model, so removing this individual is the best action.

In [None]:
# Disabled cell: the statements below are wrapped in a string literal, so none of them
# is executed (no shape is printed and no row is dropped from `diabetes` here).
'''# current dimesion
print(f"current dimension: {diabetes.shape}")

# remove directly "discharge_disposition_id" = 15 individual
diabetes = diabetes.drop(diabetes[(diabetes["race"] == "Other") & (diabetes["discharge_disposition_id"] == 15)].index)

# validate dimension (1 less)
print(f"new_dimension: {diabetes.shape}")'''

Regarding the other person, it is advisable to check whether they would meet the k-anonymity standard if `payer_code` were generalized to a `NaN` value.

In [13]:
# Records with race "Other".
# NOTE(review): `other` was never assigned anywhere in the notebook (it only existed as
# leftover kernel state, so a fresh run raised NameError). It is assumed to be built the
# same way as the `asians` subset of this notebook — confirm.
other = diabetes[diabetes["race"] == "Other"]

# Quasi-identifier values of the record with race "Other" and payer_code "CH"
gender_age_weight = other.loc[
    (other["race"] == "Other") & (other["payer_code"] == "CH"),
    ["gender", "age", "weight", "admission_type_id", "discharge_disposition_id", "admission_source_id"],
]
filter_data = gender_age_weight.iloc[0]  # Female	[60-70)	NaN	1	1	7

# Keep the records sharing the same admission_type_id, ordered by gender and age so that
# look-alike records end up next to each other. sort_values() returns a new frame; sorting
# a filtered slice in place triggers SettingWithCopyWarning. The bare gender/age filter
# expressions that used to sit here had no effect and were dropped.
other1 = other[other["admission_type_id"] == filter_data.admission_type_id].sort_values(["gender", "age"])

# Export (without the index) to review the candidates outside the notebook
other1.to_csv("./other_singularization1.csv", index = False)

Yes! By generalizing `payer_code` to a `NaN` value we avoid the singularization risk for this person.

In [None]:
import numpy as np

# payer_code of the race-'Other' / payer_code-'CH' record before the change
other_ch_rows = (diabetes['race'] == 'Other') & (diabetes['payer_code'] == 'CH')
print(f"actual value: {diabetes[other_ch_rows]['payer_code']}")

# Generalize that payer_code to a missing value
diabetes.loc[other_ch_rows, "payer_code"] = np.nan

# The same selection, evaluated again, should now be empty
print(f"new value: {diabetes[(diabetes['race'] == 'Other') & (diabetes['payer_code'] == 'CH')]['payer_code']}")

# Scan the sensitive pairs again after the generalization
special_pairs, count = determine_singularization_risk(diabetes, column_pairs)

# Report how many pairs are conflictive and the finding retained for each of them
print(f"\nFrom: {len(column_pairs)} pairs conflictive are: {count}")
for finding in special_pairs.values():
    print(f"Check: {finding}")


In [110]:
# Race 'Asian' cases: can the singularization problem be avoided, and do the
# conflicts refer to the same person?
asians = diabetes.loc[diabetes["race"].eq("Asian")]

# Quasi-identifier values of the Asian record with admission_type_id 4
# (values found: Male	[80-90)	[0-100)	4	6	7)
is_asian_type_4 = asians["race"].eq("Asian") & asians["admission_type_id"].eq(4)
gender_age_weight = asians.loc[is_asian_type_4, ["gender", "age", "weight", "admission_type_id", "discharge_disposition_id", "admission_source_id"]]

# Export every Asian record, index included, for review outside the notebook
asians.to_csv("./asian_singularities.csv", index=True)

Unnamed: 0,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id
100721,Male,[80-90),[0-100),4,6,7


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

weights = diabetes["weight"].unique()

# weight and race relation: one count plot per weight class
for w in weights:
    # NaN never compares equal to itself, so the missing-weight class needs isna()
    weight_rows = diabetes["weight"].isna() if pd.isna(w) else diabetes["weight"] == w
    filt1 = diabetes[weight_rows]
    fig, ax = plt.subplots(figsize = (10,5))
    # Plot the filtered rows; plotting the whole frame made every figure identical
    sns.countplot(data=filt1, x="race", ax = ax)
    ax.set_title(f"Race distribution by Weight value:{w}")
    plt.tight_layout()
    plt.show()
    # Release the figure so figures do not pile up in memory across iterations
    plt.close(fig)

At a quick glance, it seems that only some records have weight values for each age group. Let's dive into these relations before blindly creating synthetic data.

EXPLORE REAL DATA VISUALLY

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# weight and age relation: one count plot per weight class
for w in weights:
    # NaN never compares equal to itself, so the missing-weight class needs isna()
    weight_rows = diabetes1["weight"].isna() if pd.isna(w) else diabetes1["weight"] == w
    filt1 = diabetes1[weight_rows]
    fig, ax = plt.subplots(figsize = (10,5))
    # Plot the filtered rows; plotting the whole frame made every figure identical
    sns.countplot(data=filt1, x="age", ax = ax)
    ax.set_title(f"Age distribution by Weight value:{w}")
    plt.tight_layout()
    plt.show()
    # Release the figure so figures do not pile up in memory across iterations
    plt.close(fig)

In [None]:
# weight and age relation: one count plot per weight class
# NOTE(review): this cell repeats the previous one; consider removing one of them.
for w in weights:
    # NaN never compares equal to itself, so the missing-weight class needs isna()
    weight_rows = diabetes1["weight"].isna() if pd.isna(w) else diabetes1["weight"] == w
    filt1 = diabetes1[weight_rows]
    fig, ax = plt.subplots(figsize = (10,5))
    # Plot the filtered rows; plotting the whole frame made every figure identical
    sns.countplot(data=filt1, x="age", ax = ax)
    ax.set_title(f"Age distribution by Weight value:{w}")
    plt.tight_layout()
    plt.show()
    # Release the figure so figures do not pile up in memory across iterations
    plt.close(fig)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One count plot per column to visualize how the real data is distributed
for column_name in diabetes.columns:
    figure, axis = plt.subplots(figsize=(10, 5))
    sns.countplot(data=diabetes, x=column_name, ax=axis)
    axis.set_title(column_name)
    plt.tight_layout()
    plt.show()

## CREATE SYNTHETIZER & SYNTHETIC DATA WITH SDV

In [None]:
# Transform `diabetes` dataframe `SingleTableMetadata` data type 
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()

# Let SDV infer the column metadata from the actual DataFrame
metadata.detect_from_dataframe(diabetes)

# Columns whose name contains "_id" hold codes, not quantities:
# treat them as categorical instead of numerical
id_like_columns = [name for name in metadata.columns if '_id' in name]
for id_column in id_like_columns:
    metadata.update_column(id_column, sdtype='categorical')

# Inspect the resulting metadata
print(metadata)

In [10]:
from sdv.single_table import GaussianCopulaSynthesizer

# Gaussian-copula synthesizer built from the metadata, with both enforcement options on
synthesizer_options = dict(enforce_min_max_values=True, enforce_rounding=True)
synthesizer = GaussianCopulaSynthesizer(metadata, **synthesizer_options)

In [11]:
# Fit the synthesizer on the real data
synthesizer.fit(data=diabetes)

In [12]:
# Sample as many synthetic rows as the real dataset has
real_row_count = diabetes.shape[0]
synthetic_data = synthesizer.sample(num_rows=real_row_count)

## EXPLORE SYNTHETIC DATA AND VALIDATE

In [None]:
# Dimensions of both datasets
print(f"Real dimension: {diabetes.shape}")
print(f"Synth dimension: {synthetic_data.shape}")

# DataFrame.info() prints its report itself and returns None, so each report is
# emitted right after its own heading instead of being interpolated into an
# f-string (which printed "...information: None" after both reports).
print("\n\nReal data information:")
diabetes.info()
print("Synth data information:")
synthetic_data.info()
print()

In [None]:
# Compare, column by column, how often each value occurs in the real and synthetic data
for column_name in diabetes.columns:
    print(f"\n\nColumn: {column_name}")

    # Aligning both count series in one frame keeps every category from either
    # dataset; a category absent from one side shows 0
    counts_by_source = {
        'Real': diabetes[column_name].value_counts(),
        'Synthetic': synthetic_data[column_name].value_counts(),
    }
    combined_counts = pd.DataFrame(counts_by_source).fillna(0)

    print(combined_counts)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
# Correlation is only computed on numeric/boolean columns. Calling .corr() on the whole
# frame raises on the string columns in pandas >= 2.0 (and warns in 1.5), so the
# numeric subset is selected explicitly; this works on every pandas version.
numeric_kinds = ["number", "bool"]
corr_r = diabetes.select_dtypes(include=numeric_kinds).corr()
corr_s = synthetic_data.select_dtypes(include=numeric_kinds).corr()

# Real data on the left (blue), synthetic data on the right (green)
fig, ax = plt.subplots(1,2,figsize = (15,5))
panels = zip(ax, (corr_r, corr_s), ("Blues", "Greens"), ("REAL", "SYNTH"))
for axis, corr, color_map, title in panels:
    sns.heatmap(corr,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values,
                cmap=color_map,
                annot=True,         # Display the correlation values in the cells
                fmt=".2f", ax=axis)
    axis.set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Sensitive columns to compare between the real and the synthetic data
sensitive_column_names = ['race', 'gender', 'age','payer_code', 'medical_specialty']

# Show the distinct values each of those columns takes in both datasets
for sensitive_column in sensitive_column_names:
    real_values = diabetes[sensitive_column].unique()
    synth_values = synthetic_data[sensitive_column].unique()
    print(f"\n\nReal column: {sensitive_column} has values: {real_values}")
    print(f"Synth column: {sensitive_column} has values: {synth_values}")

## SAVE SYNTHETIZER & SYNTHETIC DATA

In [14]:
import os

# Output folder for the synthesizer and the synthetic data (created if missing)
synth_folder = os.path.join("./","synthetic_data")
os.makedirs(synth_folder, exist_ok=True)

# Persist the fitted synthesizer
synthesizer_path = os.path.join(synth_folder, "sdv_synthesizer.pkl")
synthesizer.save(synthesizer_path)

In [15]:
# Persist the synthetic data as a parquet file written with pyarrow
synthetic_data_path = os.path.join(synth_folder,"sdv_synth.parquet")
synthetic_data.to_parquet(synthetic_data_path, engine='pyarrow')