# SDV SYNTHETIC DATA GENERATION

## SETUP

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

## LOAD REAL DATA

In [None]:
from ucimlrepo import fetch_ucirepo 

'''# metadata 
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata) 
  
# variable information 
print(diabetes_130_us_hospitals_for_years_1999_2008.variables) '''
  
# fetch dataset with id 296 through ucimlrepo 
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296) 
  
# data (as pandas dataframes): features and targets are exposed separately 
X = diabetes_130_us_hospitals_for_years_1999_2008.data.features 
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets 

# create complete real_data: the features plus the target stored as "readmitted"
# NOTE(review): pd.DataFrame(X) may share data with X instead of copying it — confirm X does not need to stay unmodified
diabetes = pd.DataFrame(X)
diabetes["readmitted"] = y

# visualize data (first rows)
diabetes.head()

## EXPLORE & PREPROCESS REAL DATA

In [None]:
# dimensions
print(f"Dimension: {diabetes.shape}")

In [None]:
# Detect sensitive columns by intuition, based on their names.
# "\n" (not "\c", which is an invalid escape sequence) starts the listing on a fresh line.
print(f"\ncolumns: {diabetes.columns}\n")

# Columns judged to carry identity-sensitive information.
sensitive_column_names = [
    'race',
    'gender',
    'weight',
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'payer_code',
    'medical_specialty',
]
print(f"\nSensitive columns: {sensitive_column_names}\n")

###  Check columns values & distribution

In [None]:
# SINGLE COLUMN: check columns values & distribution
def visualize_columns_distributions(df):
    """Print, for every column of ``df``, how often each value occurs.

    Missing values are counted as their own entry (``dropna=False``).
    Nothing is returned; the tables are written to standard output.
    """
    for column_name in df.columns:
        print(f"\n\nColumn: {column_name}")

        # Wrap the frequencies in a one-column frame so they print as a table
        frequencies = df[column_name].value_counts(dropna=False)
        print(pd.DataFrame({'Items': frequencies}))
    
# call to columns_distibution function
visualize_columns_distributions(diabetes)    

###  Generalize NaN race values to 'Other'

In [None]:
# Generalize missing (NaN) race values as 'Other'
import numpy as np
# Rows selected by isin([np.nan, 'Other']) are all relabelled 'Other'
# (relies on isin() matching NaN entries — TODO confirm for the pandas version in use)
diabetes.loc[diabetes["race"].isin([np.nan ,'Other']), "race"] = "Other"

# validate change (the 'Other' count should now include the former NaN rows)
diabetes.race.value_counts() 

###  Remove 'Unknown/Invalid' gender values

In [None]:
# 'Unknown/Invalid': only 3 records; the gender cannot be determined, so the best option is to remove them
# NOTE(review): this bare expression is not the last statement of the cell, so its result is not displayed;
# wrap it in print()/display() if the preview is wanted
diabetes[diabetes['gender'] == 'Unknown/Invalid'] 

# remove the 'Unknown/Invalid' gender rows (dropped by index label)
print(f"Shape before drop: {diabetes.shape}")
diabetes = diabetes.drop(diabetes[diabetes["gender"] == 'Unknown/Invalid'].index)

# validate the result (expected: only 3 rows fewer)
print(f"Shape after drop: {diabetes.shape}")

###  Check for null values per column

In [None]:
# nulls per columns (percentage)
diabetes.isna().sum() * 100 / len(diabetes)

###  Drop "weight" column

In [None]:
# remove the weight column from the dataframe: 96.858387% of its values are null
print(f"Columns before remove {len(diabetes.columns)}")
print(f"Columns:{diabetes.columns}")
diabetes = diabetes.drop('weight', axis=1)
print(f"Columns after remove {len(diabetes.columns)}")

###  Check for variability

In [None]:
# Drops columns without variability
def columns_without_variability(data_frame):
    """Return the names of the columns that show no variability.

    A column is reported when it holds fewer than two distinct values
    (a missing value counts as a distinct value). Names are returned in
    the frame's column order.
    """
    return [
        name
        for name in data_frame.columns
        if len(data_frame[name].unique()) < 2
    ]

# get the names of the columns without variability as a list
cols_without_variability = columns_without_variability(diabetes)

# TODO: improve the text. Remove the columns without variability
print(f"Columns before remove {len(diabetes.columns)}")
print(f"Columns:{diabetes.columns}")
diabetes = diabetes.drop(columns = cols_without_variability)
print(f"Columns after remove {len(diabetes.columns)}")

###  Check dtype uniformity: 

In [None]:
# data information
print(f"\nData information: {diabetes.dtypes}\n")

In [None]:
# Numerical data checks: select the int64 columns
num_cols = diabetes.select_dtypes(include='int64')

# Check that each column's values correspond to its dtype
# (the loop variable is called `cat`, but it iterates over the int64 columns)
for cat in num_cols.columns:
    print(f"\nColumn: {cat} values: {diabetes[cat].unique()}")

### Change admission_type_id, discharge_disposition_id  &  admission_source_id  to categoricals

In [None]:
# admission_type_id, discharge_disposition_id & admission_source_id are categorical, not numerical: cast them to object
cols_to_change = ["admission_type_id","discharge_disposition_id", "admission_source_id"]
diabetes[cols_to_change] =  diabetes[cols_to_change].astype(object)

# refresh the int64 selection now that the three id columns have been cast
num_cols = diabetes.select_dtypes(include='int64')

# Check that each remaining int64 column's values correspond to its dtype
for cat in num_cols.columns:
    print(f"\nColumn: {cat} values: {diabetes[cat].unique()}")

### Numerical data visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# visualize the data distribution: one count plot (one figure) per column in num_cols
for col in num_cols.columns:
    fig, ax = plt.subplots(figsize = (10,5))
    sns.countplot(data=diabetes, x=col, ax = ax)
    ax.set_title(col)
    plt.tight_layout()
    plt.show()

### Categorical data 

In [None]:
# collect the names of the object-dtype (categorical) columns
# NOTE(review): no dtype conversion (e.g. object -> string) happens here; only the column names are gathered
categorical_cols = diabetes.select_dtypes('object').columns.tolist()

# Check that each column's values correspond to its dtype
for cat in categorical_cols:
    print(f"\nColumn: {cat} values: {diabetes[cat].unique()}")

In [None]:
import itertools

# get every unordered pair of categorical columns
categorical_col_pair = list(itertools.combinations(categorical_cols, 2))       

# visualize data relations: counts of the second column's values per value of the first,
# laid out as a table (combinations that never occur are shown as 0)
for pair in categorical_col_pair:
    print(f"\n{pair[0]} distribution per {pair[1]}")
    print(f"{diabetes.groupby(pair[0])[pair[1]].value_counts(dropna= False).unstack().fillna(0)}")


### Save preprocess data

In [None]:
# save data
diabetes.to_parquet("./refined_file.parquet",engine="pyarrow")

## CREATE SYNTHESIZER & SYNTHETIC DATA WITH SDV

In [None]:
# TODO: move this into a function
# Build the `SingleTableMetadata` description of the `diabetes` dataframe
from sdv.metadata import SingleTableMetadata

# recover the refined (preprocessed) file
diabetes = pd.read_parquet("./refined_file.parquet",engine="pyarrow")

# Automatically detect metadata from the actual DataFrame
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(diabetes)

# Change the sdtype of the "_id" columns: treat them as categorical instead of numerical
# (the test matches every column whose name contains "_id")
for column_name in metadata.columns:
    if '_id' in column_name:
        metadata.update_column(column_name, sdtype='categorical')
# Check if metadata has been correctly generated
print(metadata)	

In [21]:
from sdv.single_table import GaussianCopulaSynthesizer

# TODO: move all of this into a function
# Configure the synthesizer with the detected metadata; min/max enforcement and rounding are switched on
synthesizer = GaussianCopulaSynthesizer(
    metadata,
    enforce_min_max_values=True,
    enforce_rounding=True) 

# fit the synthesizer on the real data
synthesizer.fit(
    data = diabetes
)

# create new data with as many rows as the real data, based on the learned model
synthetic_data = synthesizer.sample(
    num_rows=diabetes.shape[0]
)

## EXPLORE SYNTHETIC DATA AND VALIDATE

In [None]:
# dimensions
print(f"Real dimension: {diabetes.shape}")
print(f"Synth dimension: {synthetic_data.shape}")

In [None]:
# Per-column non-null counts for the real data
real_data_info = pd.DataFrame({
    'Column': diabetes.columns,
    'Real Non-Null Count':diabetes.notnull().sum()
    
})

# Per-column non-null counts for the synthetic data
synthetic_data_info = pd.DataFrame({
    'Column': synthetic_data.columns,
    'Synthetic Non-Null Count':synthetic_data.notnull().sum()
})

# Merge the two DataFrames on the 'Column' name; the outer join keeps columns present in only one of them
comparison = pd.merge(real_data_info, synthetic_data_info, on='Column', how='outer')

# Print comparison table
print("Comparison of Real and Synthetic Data:")
print(comparison)


###  Check major differences and see how to improve the model

In [None]:
# Compare real vs. synthetic value frequencies column by column and flag considerable differences
# Threshold applied to the absolute value of the relative difference
threshold = 0.10

# Names of the columns where at least one value exceeds the threshold
columns_with_differences = []

for col in diabetes.columns:
    print(f"\n\nColumn: {col}")
    
    # Get value counts for both real and synthetic data (value_counts() leaves NaN out by default)
    real_counts = diabetes[col].value_counts()
    synth_counts = synthetic_data[col].value_counts()
    
    # Combine the counts so every value seen in either dataset has a row; a value missing on one side counts as 0
    combined_counts = pd.DataFrame({'Real': real_counts, 'Synthetic': synth_counts}).fillna(0)
    
    # Compute proportions within each dataset
    total_real = combined_counts['Real'].sum()
    total_synthetic = combined_counts['Synthetic'].sum()
    combined_counts['Real_Proportion'] = combined_counts['Real'] / total_real
    combined_counts['Synthetic_Proportion'] = combined_counts['Synthetic'] / total_synthetic
    
    # Compute the signed difference (real minus synthetic) and the same difference relative to the real proportion
    combined_counts['Absolute_Difference'] = combined_counts['Real_Proportion'] - combined_counts['Synthetic_Proportion']
    # A real proportion of 0 is replaced by 1, so the relative difference then equals the signed difference
    combined_counts['Relative_Difference'] = combined_counts['Absolute_Difference'] / combined_counts['Real_Proportion'].replace(0, 1)  # Avoid division by zero
    
    # Print the combined counts for easy comparison
    print(combined_counts)
    
    # Flag the column if any value's relative difference exceeds the threshold in magnitude
    if (combined_counts['Relative_Difference'].abs() > threshold).any():
        columns_with_differences.append(col)
        print(f"** Significant differences detected in column: {col} **")

# Report columns with considerable differences
print(f"\nFrom: {len(diabetes.columns)} has considerable differences: {len(columns_with_differences)}")
print("\nColumns with considerable differences between real and synthetic data:")
print(columns_with_differences)

### Check differences visually

In [None]:
import matplotlib.pyplot as plt

# Loop over the flagged columns and draw real and synthetic counts in the same figure
for col in columns_with_differences:
    # value -> count mappings (value_counts() leaves NaN out by default)
    real_counts = diabetes[col].value_counts().to_dict()
    synthetic_counts = synthetic_data[col].value_counts().to_dict()

    # Plot for Real Data
    plt.figure(figsize=(12, 6))
    plt.bar(real_counts.keys(), real_counts.values(), alpha=0.7, label='Real Data')

    # Plot for Synthetic Data, drawn on the same axes; alpha keeps both series visible where they overlap
    plt.bar(synthetic_counts.keys(), synthetic_counts.values(), alpha=0.7, label='Synthetic Data')

    plt.xlabel(col)
    plt.ylabel('Counts')
    plt.title(f'Comparison of {col} distribution')
    plt.legend()
    plt.xticks(rotation=90)
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
# Correlations are only computed over the numeric columns: calling
# DataFrame.corr() on a frame that still holds object (string) columns
# raises in recent pandas releases, so both frames are restricted first.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
corr_r = diabetes.select_dtypes(include="number").corr()
corr_s = synthetic_data.select_dtypes(include="number").corr()

# One annotated heatmap per dataset, side by side (real on the left, synthetic on the right)
for axis, corr, cmap, title in (
    (ax[0], corr_r, "Blues", "REAL"),
    (ax[1], corr_s, "Greens", "SYNTH"),
):
    sns.heatmap(corr,
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values,
                cmap=cmap,
                annot=True,         # Display the correlation values in the cells
                fmt=".2f", ax=axis)
    axis.set_title(title)

plt.tight_layout()
plt.show()

## ADJUST REAL DATA

Adjust the frequencies of categories in the real data to match the desired proportions. This may involve oversampling underrepresented categories or undersampling overrepresented ones.

In [None]:
import pandas as pd
from sklearn.utils import resample

def adjust_data_distribution(df, col, random_state=None):
    """
    Adjust the distribution of a specific column in the DataFrame to match its real proportions.

    The proportions are taken from ``df[col]`` itself (missing values excluded),
    and each category is resampled to ``round(proportion * len(df))`` rows.
    Rows whose ``col`` value is missing are never selected, so they are not
    part of the result.

    Parameters:
        df (pd.DataFrame): The original DataFrame.
        col (str): The column name to adjust.
        random_state (int | None): Seed forwarded to ``resample`` so that the
            over/undersampling can be reproduced. ``None`` keeps it random.

    Returns:
        pd.DataFrame: The DataFrame with adjusted distribution for the specified column.
        Rows are grouped by category; an input without any category yields an
        empty frame that keeps the input columns.
    """
    real_proportions = df[col].value_counts(normalize=True)
    total_rows = len(df)
    pieces = []

    for value, proportion in real_proportions.items():
        # Select the category once and reuse it in every branch
        group = df[df[col] == value]
        current_count = len(group)
        # round() instead of int(): proportion * total_rows can land just below
        # the true count (1/49 * 49 == 0.9999999999999999), and truncating it
        # would silently remove a row from the category.
        target_count = int(round(proportion * total_rows))

        if target_count > current_count:
            # Oversample: keep every original row and add the missing ones with replacement
            extra_rows = resample(
                group,
                replace=True,
                n_samples=target_count - current_count,
                random_state=random_state,
            )
            pieces.extend([group, extra_rows])
        elif target_count < current_count:
            # Undersample: draw the target number of rows without replacement
            pieces.append(
                resample(
                    group,
                    replace=False,
                    n_samples=target_count,
                    random_state=random_state,
                )
            )
        else:
            # No adjustment
            pieces.append(group)

    if not pieces:
        # pd.concat() rejects an empty list; return an empty frame with the same columns
        return df.iloc[0:0].copy()

    # Concatenate once instead of growing a frame inside the loop
    return pd.concat(pieces)

def compare_distributions(real_df, adjusted_df, col, threshold=0.1):
    """
    Compare the distributions of a specific column in the real and adjusted DataFrames with visualizations.

    Shows a grouped bar chart of the per-category proportions, then prints the
    comparison table and the categories whose relative difference exceeds
    ``threshold`` in magnitude.

    Parameters:
        real_df (pd.DataFrame): The original DataFrame.
        adjusted_df (pd.DataFrame): The DataFrame with adjusted distribution.
        col (str): The column name to compare.
        threshold (float): Relative difference above which a category is
            reported as significant. Defaults to 0.1.

    Returns:
        None
    """
    # Compute proportions for real and adjusted data
    real_proportions = real_df[col].value_counts(normalize=True).sort_index()
    adjusted_proportions = adjusted_df[col].value_counts(normalize=True).sort_index()

    # Combine proportions into a DataFrame; a category missing on one side counts as 0
    comparison_df = pd.DataFrame({
        'Real': real_proportions,
        'Adjusted': adjusted_proportions
    }).fillna(0)

    # Plot on an explicitly created axes: DataFrame.plot() opens a figure of its
    # own unless ``ax`` is passed, which would leave the sized figure empty.
    fig, ax = plt.subplots(figsize=(12, 6))
    comparison_df.plot(kind='bar', width=0.8, rot=45, ax=ax)
    ax.set_title(f"Comparison of Proportions for '{col}'")
    ax.set_xlabel('Categories')
    ax.set_ylabel('Proportion')
    ax.legend(['Real', 'Adjusted'])
    fig.tight_layout()
    plt.show()

    # Calculate differences (real minus adjusted); a real proportion of 0 is
    # replaced by 1 so the division is always defined
    comparison_df['Absolute_Difference'] = comparison_df['Real'] - comparison_df['Adjusted']
    comparison_df['Relative_Difference'] = comparison_df['Absolute_Difference'] / comparison_df['Real'].replace(0, 1)  # Avoid division by zero

    # Print comparison
    print(f"\nComparison of Proportions for column '{col}':")
    print(comparison_df)

    # Highlight the categories whose relative difference exceeds the threshold
    significant_diff = comparison_df[comparison_df['Relative_Difference'].abs() > threshold]
    if not significant_diff.empty:
        print("\nSignificant Differences:")
        print(significant_diff)

# Example usage for each column
# Run the adjustment and the comparison once per column
# NOTE(review): adjusted_real_df is overwritten on every iteration and each adjustment starts again
# from `diabetes`, so after the loop it only holds the adjustment for the last column — confirm this is intended
for col in diabetes.columns:
    print(f"\nProcessing column: {col}")
    
    # Adjust the real data for this column only
    adjusted_real_df = adjust_data_distribution(diabetes, col)
    
    # Compare distributions (plots a chart and prints the comparison table)
    compare_distributions(diabetes, adjusted_real_df, col)

### Normalize adjusted real data

In [None]:
# TODO AJUST!!
from sklearn.preprocessing import MinMaxScaler

def min_max_normalize(df):
    """Return a copy of ``df`` with its numeric columns min-max scaled.

    Only numeric columns are passed to ``MinMaxScaler``; every other column
    (e.g. string categories) is copied through unchanged, because the scaler
    cannot convert such values to floats. Column order and index are kept.

    Parameters:
        df (pd.DataFrame): The frame to normalize. It is not modified.

    Returns:
        pd.DataFrame: A new frame with the scaled numeric columns.
    """
    normalized_df = df.copy()
    numeric_cols = df.select_dtypes(include="number").columns

    # Nothing to scale: the scaler cannot be fitted on zero columns or zero rows
    if len(numeric_cols) == 0 or df.empty:
        return normalized_df

    scaler = MinMaxScaler()
    normalized_df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    return normalized_df

# Example usage
normalized_df = min_max_normalize(adjusted_real_df)

# Save normalized data

In [None]:
# save data
normalized_df.to_parquet("./adjusted_and_normalized.parquet",engine="pyarrow")

## RE-CREATE SYNTH DATA AND VALIDATE PERFORMANCE

In [None]:
# TODO: call the METADATA function with the fixed dataset

In [None]:
# TODO: call the SYNTH CREATION function with the fixed dataset