# DATA PREPARATION

## SETUP

In [45]:
import pandas as pd
pd.set_option('display.max_columns', None)

## LOAD REAL DATA

In [None]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset (UCI ML repository, id=296)
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Optional inspection of the fetched dataset; uncomment when needed.
# (These calls only make sense after the fetch above.)
# print(diabetes_130_us_hospitals_for_years_1999_2008.metadata)
# print(diabetes_130_us_hospitals_for_years_1999_2008.variables)

# data (as pandas dataframes)
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets

# create complete real_data; work on an explicit copy so that adding the
# target column cannot write into (or warn about) the frame held by the
# fetched dataset object
diabetes = X.copy()
diabetes["readmitted"] = y

# visualize data
diabetes.head()

## EXPLORE & PREPROCESS DATA

### Dimensions

In [None]:
# dimensions: print the (rows, columns) shape of the loaded dataset
print(f"Dimension: {diabetes.shape}")

In [None]:
# detect sensitive columns by intuition, based on their names
# ("\n" is used here; the previous "\c" was an invalid escape sequence)
print(f"\ncolumns: {diabetes.columns}\n")

# identify sensitive (potentially identifying) data:
sensitive_columns = ['race', 'gender', 'weight', 'admission_type_id','discharge_disposition_id','admission_source_id','payer_code', 'medical_specialty']
print(f"\nSensitive columns: {sensitive_columns}\n")

###  Check columns values & distribution

In [None]:
# SINGLE COLUMN: check columns values & distribution
def visualize_columns_distributions(df):
    """
    Print, for every column of ``df``, a table with the absolute count and the
    relative proportion of each distinct value (missing values included).

    Parameters:
        df (pd.DataFrame): DataFrame whose columns are summarised.

    Returns:
        None: the tables are written to standard output.
    """
    for column_name in df.columns:
        print(f"\n\nColumn: {column_name}")

        series = df[column_name]
        absolute = series.value_counts(dropna=False)
        relative = series.value_counts(normalize=True, dropna=False)

        # Side-by-side table so counts and proportions can be compared per value
        summary = pd.DataFrame({'Values': absolute, 'Proportions': relative})
        summary = summary.fillna(0)

        print(summary)
    
# print the value counts and proportions of every column of the dataset
visualize_columns_distributions(diabetes)    

The data is not balanced: there are more Caucasian patients (74%) than patients of any other race.

###  Generalize 'Nan' race values to 'Other'

In [None]:
# Fold missing race values into the existing 'Other' category
import numpy as np
race_is_missing_or_other = diabetes["race"].isin([np.nan, 'Other'])
diabetes.loc[race_is_missing_or_other, "race"] = "Other"

# validate change: there should be no missing race left
diabetes["race"].value_counts()

Hispanic and Asian could also be generalized into the 'Other' race category, but even then the data would still be considerably imbalanced.

###  Check 'Unknown/Invalid' gender values

In [None]:
# show the rows whose gender is recorded as 'Unknown/Invalid'
diabetes[diabetes['gender'] == 'Unknown/Invalid'] 

There are only 3 such records; since their gender cannot be determined, the best option is to remove them.

In [None]:
# drop the records whose gender is 'Unknown/Invalid'
print(f"Shape before drop: {diabetes.shape}")
unknown_gender_rows = diabetes.loc[diabetes["gender"] == 'Unknown/Invalid']
diabetes = diabetes.drop(index=unknown_gender_rows.index)

# validate the result (expected: only 3 rows fewer)
print(f"Shape after drop: {diabetes.shape}")

###  Check for 'null values' per column

In [None]:
# percentage of null values per column (null count * 100 / number of rows)
diabetes.isna().sum() * 100 / len(diabetes)

The `weight` column is almost entirely null (96.86% of its values are missing), so the best option is to remove it.

###  Drop "weight" column

In [None]:
# drop the 'weight' column, reporting the column count before and after
print(f"Columns before remove {diabetes.shape[1]}")
diabetes = diabetes.drop(columns='weight')
print(f"Columns after remove {diabetes.shape[1]}")

###  Check for 'variability'

Columns that have only one value

In [None]:
def columns_without_variability(df):
    """
    Find the columns of ``df`` that carry no variability, i.e. whose set of
    distinct values (as returned by ``Series.unique``) has fewer than two
    elements. Each such column is also printed together with its values.

    Parameters:
        df (pd.DataFrame): The original DataFrame.

    Returns:
        list: names of the columns without variability, in column order.
    """
    constant_columns = []

    for name in df.columns:
        distinct_values = df[name].unique()
        if len(distinct_values) >= 2:
            # at least two different values: the column varies, keep it
            continue
        print(f"Column: `{name}` unique values: {distinct_values}")
        constant_columns.append(name)

    return constant_columns

# obtain the list of columns without variability (also prints each one found)
cols_without_variability = columns_without_variability(diabetes)

# print result
print(f"Invariant columns: {cols_without_variability}")

Remove columns without variability.

In [None]:
# remove the columns collected in cols_without_variability,
# reporting the column count before and after the drop
print(f"Columns without variability: {cols_without_variability}")
print(f"Columns before remove {len(diabetes.columns)}")
diabetes = diabetes.drop(columns = cols_without_variability)
print(f"Columns after remove {len(diabetes.columns)}")

### Detect 'single value entries'

Columns that present a singularization risk: columns that have more than one distinct value, but where one of those values appears in only a single record.

In [None]:
def determine_single_value_entries(df):
    """
    Detect columns in which exactly one (non-null) value occurs exactly once,
    and collect the rows holding those values.

    Parameters:
        df (pd.DataFrame): DataFrame to inspect.

    Returns:
        tuple: ``(result_df, cols_to_check)`` where ``result_df`` is the
        concatenation (with a fresh index) of the rows holding each
        single-occurrence value, or an empty DataFrame when nothing was found,
        and ``cols_to_check`` is the list of flagged column names.
    """
    flagged_columns = []
    singled_out_rows = []

    for name in df.columns:
        counts = df[name].value_counts()
        singletons = counts[counts == 1].index

        # Only columns with exactly one single-occurrence value are of interest
        if len(singletons) != 1:
            continue

        flagged_columns.append(name)
        singled_out_rows.append(df[df[name] == singletons[0]])

    if singled_out_rows:
        combined = pd.concat(singled_out_rows, ignore_index=True)
    else:
        combined = pd.DataFrame()

    return combined, flagged_columns

# determine single value entries
matching_rows_df, cols_to_check = determine_single_value_entries(diabetes)

# check data relevancy: report, per flagged column, the value that occurs once.
# Nulls are excluded exactly as in determine_single_value_entries; counting
# them (dropna=False) could report a lone NaN instead of the flagged value.
for col in cols_to_check:
    value_counts = diabetes[col].value_counts()
    print(f"Single value entry in column {col} :  {value_counts[value_counts == 1].index[0]}")
    #print(f"Distribution \n{value_counts}")


With synthetic data anonymization, this singularization risk can be diminished, but since there are few records, they can be analyzed to determine if they can be removed

#### Check 'single value entries'

The columns `metformin-pioglitazone`, `glimepiride-pioglitazone`, and `acetohexamide` show a steady value. Do they correspond to the same individual?

In [None]:
# Filter from matching_rows_df those only they have 1 Steady instances
steadys = ["metformin-pioglitazone","glimepiride-pioglitazone","acetohexamide"] 
val ="Steady"

# Filter rows where exactly one of the specified columns has the value 'Steady'
filtered_df = diabetes.loc[
    (diabetes[steadys[0]] == val).astype(int) +
    (diabetes[steadys[1]] == val).astype(int) +
    (diabetes[steadys[2]] == val).astype(int) == 1
]

# show results
print(filtered_df)

#### Detect 'sensitive columns'

In [None]:
# detect sensitive columns by intuition, based on their names
# ("\n" is used here; the previous "\c" was an invalid escape sequence)
print(f"\ncolumns: {diabetes.columns}\n")

# identify sensitive (potentially identifying) data:
sensitive_columns = ['race', 'gender', 'age', 'admission_type_id','discharge_disposition_id','admission_source_id','payer_code', 'medical_specialty']
print(f"\nSensitive columns: {sensitive_columns}\n")

Check sensitive_columns for any reidentification risk.

In [None]:
# show only the sensitive columns of the rows selected in filtered_df
print(filtered_df[sensitive_columns])

Also check the variability of the values in these columns:

In [None]:
# print the value counts of each column listed in steadys
for col in steadys:
    print(f"Column: {col} \nunique values: {diabetes[col].value_counts()}")

These three steady cases clearly pose a reidentification risk due to their singularization. If they are removed, the metformin-pioglitazone, glimepiride-pioglitazone, and acetohexamide columns will lose their variability and should also be removed.

Check the other `single value entries` to evaluate if they should be removed. 

Check 'admission_source_id' = 13 case:

In [None]:
# show the rows with admission_source_id == 13;
# risk of reidentification, better to remove them.
diabetes[diabetes['admission_source_id'] == 13] 

Check 'payer_code' : 'FR'

In [None]:
# show the rows with payer_code == 'FR';
# risk of reidentification, better to remove them.
diabetes[diabetes['payer_code'] == 'FR'] 

Check for 'chlorpropamide' : 'Down'

In [None]:
# check the variability of the column
diabetes['chlorpropamide'].value_counts() # if the record is removed, the chlorpropamide column does not need to be removed.

Check for 'tolazamide' : 'Up'

In [None]:
# check the variability of the column
diabetes['tolazamide'].value_counts() # if the record is removed, the tolazamide column does not need to be removed.

Evaluate columns shape to determine if `steady` columns need to be removed or not.

In [None]:
# current (rows, columns) shape of the dataset
diabetes.shape

Remove steady rows and columns.

In [None]:
# Remove the rows selected in filtered_df, then remove every column that is
# left without variability once those rows are gone. The shape is printed
# before the row drop, after it, and after the column drop.
print(f"Actual dimension: {diabetes.shape}")
print(f"Removing rows: {filtered_df.index}")
diabetes = diabetes.drop(filtered_df.index)
print(f"After dimension: {diabetes.shape}")
print("Checking column variability")
# re-evaluate variability on the reduced dataset (overwrites the earlier list)
cols_without_variability = columns_without_variability(diabetes)
print(f"Removing columns: {cols_without_variability}")
diabetes = diabetes.drop(columns = cols_without_variability)
print(f"Later dimension: {diabetes.shape}")

Remove the other single-value records.

In [None]:
# Remove the remaining single-value records (4 expected)
print(f"Current dimensions: {diabetes.shape}")

# A row must be removed when it matches ANY of the singled-out values, so the
# conditions are combined with OR (`|`). With XOR (`^`) a row matching two of
# the conditions would evaluate to False and silently stay in the dataset.
single_value_mask = (
    (diabetes['chlorpropamide'] == 'Down')
    | (diabetes['tolazamide'] == 'Up')
    | (diabetes['admission_source_id'] == 13)
    | (diabetes['payer_code'] == 'FR')
)

# Calculate how many rows will be removed
rows_to_remove = int(single_value_mask.sum())
print(f"Removing {rows_to_remove} rows.")

# Drop the rows based on the (single, shared) mask
diabetes = diabetes.drop(diabetes[single_value_mask].index)

print(f"Updated dimensions: {diabetes.shape}")


###  Check dtype uniformity: 

In [None]:
# data information: print the dtype of every column
print(f"\nData information: {diabetes.dtypes}\n")

In [None]:
# Numerical data checks: select the int64 columns
num_cols = diabetes.select_dtypes(include='int64')

# Print the distinct values of each one to verify they really are numerical
for name, values in num_cols.items():
    print(f"\nColumn: {name} values: {values.unique()}")

### Change admission_type_id, discharge_disposition_id  &  admission_source_id  to categoricals

In [None]:
# admission_type_id, discharge_disposition_id and admission_source_id are
# category codes, not quantities: cast them from integers to strings
cols_to_change = ["admission_type_id","discharge_disposition_id", "admission_source_id"]
diabetes = diabetes.astype({name: str for name in cols_to_change})
diabetes.info()

### Check numerical data

In [None]:
# get the updated selection of int64 columns (after the id columns were cast)
num_cols = diabetes.select_dtypes(include='int64')

# Print the distinct values of each one to verify they really are numerical
for name, values in num_cols.items():
    print(f"\nColumn: {name} values: {values.unique()}")

### Check categorical data 

In [None]:
# collect the names of the object-dtype (categorical) columns;
# note that no dtype conversion is performed here
categorical_cols = list(diabetes.select_dtypes('object').columns)

# Print the distinct values of each categorical column
for name in categorical_cols:
    distinct_values = diabetes[name].unique()
    print(f"\nColumn: {name} values: {distinct_values}")

### Check categorical column pair relations

In [None]:
import itertools

# every unordered pair of categorical columns
categorical_col_pair = list(itertools.combinations(categorical_cols, 2))       

# for each pair, print how the second column is distributed within each value
# of the first one (missing values counted, absent combinations shown as 0)
for first, second in categorical_col_pair:
    print(f"\n{first} distribution per {second}")
    per_group_counts = diabetes.groupby(first)[second].value_counts(dropna= False)
    contingency = per_group_counts.unstack().fillna(0)
    print(f"{contingency}")

### Save preprocessed file

In [64]:
import os

# make sure the output folder exists
tmp_folder = "./tmp_folder"
os.makedirs(tmp_folder, exist_ok=True)

# write the preprocessed dataset as parquet, without the index
refined_path = os.path.join(tmp_folder,"refined_file.parquet")
diabetes.to_parquet(refined_path, engine="pyarrow", index=False)

# SDV SYNTHETIC DATA GENERATION

## LOAD PREPROCESSED DATA

In [None]:
import pandas as pd
import os

# load the preprocessed dataset written by the "Save preprocessed file" step.
# The notebook only writes "refined_file.parquet"; the previously referenced
# "generalized_file.parquet" is never produced here, so a clean run failed.
tmp_folder = "./tmp_folder"
diabetes = pd.read_parquet(os.path.join(tmp_folder,"refined_file.parquet"),engine="pyarrow")

## CREATE SYNTHETIZER & SYNTHETIC DATA WITH SDV

In [None]:
# Transform dataframe into`SingleTableMetadata` data type 
from sdv.metadata import SingleTableMetadata

def create_and_modify_metadata(df):
    """
    Create the ``SingleTableMetadata`` describing ``df``.

    The metadata is auto-detected from the DataFrame and returned as detected;
    no column is overridden afterwards (the former "_id" override is disabled,
    see the comment in the body).

    Parameters:
        df (pd.DataFrame): The original DataFrame.

    Returns:
        SingleTableMetadata: metadata to create synthetic data
    """
    # Automatically detect metadata from the actual DataFrame
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df)

    # Disabled override, kept for reference: treat "_id" columns as
    # categorical instead of numerical.
    # for column_name in metadata.columns:
    #     if '_id' in column_name:
    #         metadata.update_column(column_name, sdtype='categorical')
    return metadata

# create the metadata from the loaded dataset
metadata = create_and_modify_metadata(diabetes)

# Print the metadata to check that it has been generated correctly
print(metadata)	


In [None]:
from sdv.single_table import GaussianCopulaSynthesizer

# Create synthetizer and synthetic data 
def synthetic_data_creation(md, df):
    """
    Build a ``GaussianCopulaSynthesizer`` from the metadata, fit it on the real
    data and sample a synthetic dataset with as many rows as the real one.

    Parameters:
        md (SingleTableMetadata): Metadata of DataFrame.
        df (pd.DataFrame): The original DataFrame.

    Returns:
        The object returned by ``synthesizer.sample``: the synthetic data,
        with ``df.shape[0]`` rows requested.
    """
    n_rows = df.shape[0]

    # synthesizer configured to enforce min/max values and rounding
    synthesizer = GaussianCopulaSynthesizer(md, enforce_min_max_values=True, enforce_rounding=True)

    # learn from the real data
    synthesizer.fit(data=df)

    # generate new data (same number of rows) from the learned model
    return synthesizer.sample(num_rows=n_rows)

# obtain synthetic data: fit the synthesizer on the real data and sample from it
synthetic_data = synthetic_data_creation(metadata, diabetes)

## EXPLORE SYNTHETIC DATA AND VALIDATE

In [None]:
# dimensions: compare the (rows, columns) shape of real and synthetic data
print(f"Real dimension: {diabetes.shape}")
print(f"Synth dimension: {synthetic_data.shape}")

In [None]:
# Non-null count per column of the real data
real_non_null = diabetes.notnull().sum()
real_data_info = pd.DataFrame({
    'Column': diabetes.columns,
    'Real Non-Null Count': real_non_null,
})

# Non-null count per column of the synthetic data
synthetic_non_null = synthetic_data.notnull().sum()
synthetic_data_info = pd.DataFrame({
    'Column': synthetic_data.columns,
    'Synthetic Non-Null Count': synthetic_non_null,
})

# Line both tables up by column name (outer join keeps columns present in only one)
comparison = real_data_info.merge(synthetic_data_info, on='Column', how='outer')

# Print comparison table
print("Comparison of Real and Synthetic Data:")
print(comparison)


# Check for data correlation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
# Correlation is only defined for numeric columns. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# string columns, so numeric_only=True is requested explicitly.
corr_r = diabetes.corr(numeric_only=True)
corr_s = synthetic_data.corr(numeric_only=True)

fig, ax = plt.subplots(1,2,figsize = (15,5))

# One heatmap per dataset, real on the left and synthetic on the right
for axis, corr, cmap, title in (
    (ax[0], corr_r, "Blues", "REAL"),
    (ax[1], corr_s, "Greens", "SYNTH"),
):
    sns.heatmap(corr,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values,
                cmap=cmap,
                annot=True,         # Display the correlation values in the cells
                fmt=".2f", ax = axis)
    axis.set_title(title)

plt.tight_layout()     
plt.show()