# SDV SYNTHETIC DATA GENERATION

## LOAD PREPROCESSED DATA

## CREATE SYNTHESIZER & SYNTHETIC DATA WITH SDV

In [None]:
# Build the `SingleTableMetadata` object that describes the dataframe
from sdv.metadata import SingleTableMetadata

def create_and_modify_metadata(df):
    """
    Build the SDV single-table metadata for a DataFrame.

    Column types are first auto-detected from the DataFrame itself; afterwards
    every column whose name contains "_id" is switched to the 'categorical'
    sdtype.

    Parameters:
        df (pd.DataFrame): The original DataFrame.

    Returns:
        SingleTableMetadata: metadata describing df, used to create synthetic data
    """
    table_metadata = SingleTableMetadata()
    table_metadata.detect_from_dataframe(df)

    # "_id" columns are treated as categorical instead of numerical
    id_columns = [name for name in table_metadata.columns if '_id' in name]
    for name in id_columns:
        table_metadata.update_column(name, sdtype='categorical')

    return table_metadata

# Load the refined/preprocessed dataset and build its SDV metadata.
# NOTE(review): `pd` must already be imported by an earlier cell; no pandas
# import is visible above this point - confirm.
diabetes = pd.read_parquet("./refined_file.parquet",engine="pyarrow")
metadata = create_and_modify_metadata(diabetes)

# Print the metadata so the detected column types can be checked by eye
print(metadata)


In [4]:
from sdv.single_table import GaussianCopulaSynthesizer

# Create the synthesizer and generate synthetic data
def synthetic_data_creation(md, df):
    """
    Fit a Gaussian copula synthesizer on the real data and sample new rows.

    The synthesizer is built with enforce_min_max_values=True and
    enforce_rounding=True, trained on df, and then asked for as many rows
    as df contains.

    Parameters:
        md (SingleTableMetadata): Metadata of DataFrame.
        df (pd.DataFrame): The original DataFrame.

    Returns:
        The synthetic table produced by synthesizer.sample, with df.shape[0] rows.
    """
    model = GaussianCopulaSynthesizer(md, enforce_min_max_values=True, enforce_rounding=True)

    # learn the distributions from the real data
    model.fit(data=df)

    # draw a synthetic table with the same number of rows as the real one
    return model.sample(num_rows=df.shape[0])

# Fit on the real table and sample a synthetic table with the same number of rows
synthetic_data = synthetic_data_creation(metadata, diabetes)

## EXPLORE SYNTHETIC DATA AND VALIDATE

In [None]:
# Print the (rows, columns) shape of the real and the synthetic table
print(f"Real dimension: {diabetes.shape}")
print(f"Synth dimension: {synthetic_data.shape}")

In [None]:
# Count the non-null values of every column in each table
real_non_null = diabetes.notnull().sum()
synthetic_non_null = synthetic_data.notnull().sum()

# One row per column: its name and its non-null count
real_data_info = pd.DataFrame({'Column': diabetes.columns, 'Real Non-Null Count': real_non_null})
synthetic_data_info = pd.DataFrame({'Column': synthetic_data.columns, 'Synthetic Non-Null Count': synthetic_non_null})

# Outer merge on the column name, so a column present in only one table still shows up
comparison = pd.merge(left=real_data_info, right=synthetic_data_info, how='outer', on='Column')

# Show both counts side by side
print("Comparison of Real and Synthetic Data:")
print(comparison)


### Identify Major Differences and Improve the Model

Analyze proportional differences and visualize them graphically for better insights.

In [None]:
def check_proportion_diffs(real, other, col, text):
    """
    Build a table comparing the category proportions of one column in two DataFrames.

    Parameters:
        real (pd.DataFrame): The original DataFrame.
        other (pd.DataFrame): DataFrame to compare against the original one.
        col (str): The column whose category proportions are compared.
        text (str): Name given to the column holding the proportions of `other`.

    Returns:
        pd.DataFrame: one row per category with the columns 'Real', `text`,
        'Absolute_Difference' (Real minus other, signed) and
        'Relative_Difference' (that difference divided by the Real proportion).
    """
    # Share of each category in both tables, ordered by category value
    shares = {
        label: frame[col].value_counts(normalize=True).sort_index()
        for label, frame in (('Real', real), (text, other))
    }

    # A category missing from one of the tables gets a proportion of 0
    table = pd.DataFrame(shares).fillna(0)

    gap = table['Real'] - table[text]
    # Categories absent from the real data are divided by 1 instead of 0
    denominator = table['Real'].replace(0, 1)

    table['Absolute_Difference'] = gap
    table['Relative_Difference'] = gap / denominator
    return table

# Print, for every column of the real table, the real-vs-synthetic proportion table
for col in diabetes.columns:
    print(f"\n Proportion differences for column '{col}':")
    # "Synthetic" becomes the name of the column holding the synthetic proportions
    comparison_df= check_proportion_diffs(diabetes, synthetic_data, col, "Synthetic")
    print(comparison_df)

In [None]:
import matplotlib.pyplot as plt

def compare_distributions(comparison_df, col, text):
    """
    Draw a grouped bar chart of the proportions held in comparison_df.

    Parameters:
        comparison_df (pd.DataFrame): The DataFrame with the proportion comparison;
            every column is drawn as one bar series and the index supplies the x ticks.
        col (str): The column name being compared (used in the title only).
        text (str): Legend label for the second bar series (the first is 'Real').

    Returns:
        None
    """
    # Create the axes explicitly and hand them to pandas. DataFrame.plot opens
    # its own figure when no `ax` is given, which used to leave the 12x6 figure
    # empty and draw the bars on a separate default-sized figure.
    _, ax = plt.subplots(figsize=(12, 6))
    comparison_df.plot(kind='bar', width=0.8, ax=ax)

    ax.set_title(f"Comparison of Proportions for '{col}'")
    ax.set_xlabel('Categories')
    ax.set_ylabel('Proportion')
    ax.tick_params(axis='x', labelrotation=45)
    # Relabel the series in plotting order
    ax.legend(['Real', text])

    plt.tight_layout()
    plt.show()
    
    
# Plot, for every column of the real table, the real vs synthetic proportions
for col in diabetes.columns:
    print(f"\nColumn: {col}")
    comparison_df= check_proportion_diffs(diabetes, synthetic_data, col, "Synthetic")
    # Only the two proportion columns are plotted; the difference columns are left out
    compare_distributions(comparison_df[["Real","Synthetic"]], col, "Synthetic")
        

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# CORRELATION MATRIX
# Real (left) and synthetic (right) correlation heatmaps, side by side.
fig, ax = plt.subplots(1,2,figsize = (15,5))

# Restrict to numeric columns before correlating: the table also holds string
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 (older
# versions dropped them silently, so the plotted result is the same there).
corr_r = diabetes.select_dtypes(include="number").corr()
corr_s = synthetic_data.select_dtypes(include="number").corr()

sns.heatmap(corr_r,
            xticklabels=corr_r.columns.values,
            yticklabels=corr_r.columns.values,
            cmap="Blues",
            annot=True,         # Display the correlation values in the cells
            fmt=".2f", ax = ax[0])
sns.heatmap(corr_s,
            xticklabels=corr_s.columns.values,
            yticklabels=corr_s.columns.values,
            cmap="Greens",
            annot=True,         # Display the correlation values in the cells
            fmt=".2f", ax = ax[1])
ax[0].set_title("REAL")
ax[1].set_title("SYNTH")
plt.tight_layout()
plt.show()

## ADJUST REAL DATA

Adjust the frequencies of categories in the real data to match the desired proportions. This may involve oversampling underrepresented categories or undersampling overrepresented ones.

In [None]:
import pandas as pd
from sklearn.utils import resample

def adjust_data_distribution(df, col, random_state=None):
    """
    Rebuild df so that each category of `col` has its proportional share of len(df) rows.

    The share of a category is its count divided by the number of non-missing
    values in `col`; its target size is that share of len(df), rounded down.
    Because the target is derived from the column's own counts, a category is
    never shrunk: when `col` has no missing values every group is returned
    unchanged, and when it has missing values the groups are oversampled
    (with replacement) towards len(df).

    Parameters:
        df (pd.DataFrame): The original DataFrame.
        col (str): The column name to adjust.
        random_state (int, RandomState instance or None): Forwarded to
            sklearn.utils.resample so oversampling can be made reproducible.
            The default (None) keeps the previous, unseeded behaviour.

    Returns:
        pd.DataFrame: The rows of df grouped category by category (most frequent
        first), plus any oversampled rows. Rows whose `col` is missing are not
        included. An empty frame with df's columns is returned when `col` has
        no non-missing values.
    """
    n_rows = len(df)
    # value_counts() skips missing values, so those rows never reach the result
    counts = df[col].value_counts()
    n_valid = int(counts.sum())

    pieces = []
    for value, current_count in counts.items():
        current_count = int(current_count)
        group = df[df[col] == value]

        # Exact integer form of int((current_count / n_valid) * n_rows). The
        # float version can land just below a whole number (29 / 100 * 100 ->
        # 28.999...), which used to drop a random row from an untouched group.
        target_count = current_count * n_rows // n_valid

        pieces.append(group)
        if target_count > current_count:
            # Oversample: keep the original rows and add the missing amount
            pieces.append(
                resample(
                    group,
                    replace=True,
                    n_samples=target_count - current_count,
                    random_state=random_state,
                )
            )

    if not pieces:
        return df.iloc[0:0].copy()

    # Single concat instead of growing a DataFrame inside the loop
    return pd.concat(pieces)


# Adjust the real data column by column and plot real vs adjusted proportions.
# NOTE(review): adjusted_real_df is overwritten on every iteration, so after the
# loop it only holds the adjustment made for the last column - confirm this is intended.
for col in diabetes.columns:
    # Adjust the real data
    adjusted_real_df = adjust_data_distribution(diabetes, col)

    # Compare the real proportions with the adjusted ones and plot them
    comparison_df= check_proportion_diffs(diabetes, adjusted_real_df, col, "Adjusted_real")
    compare_distributions(comparison_df[["Real","Adjusted_real"]], col, "Adjusted_real")
    
    

In [58]:
# Print the frequency of every value per column, counting missing values too
for col in diabetes.columns:
    # f-string: without the prefix the literal text "{col}" was printed instead of the column name
    print(f"\n Column:{col}")
    print(diabetes[col].value_counts(dropna=False))


 Column:{col}
Caucasian          76099
AfricanAmerican    19210
Other               3776
Hispanic            2037
Asian                641
Name: race, dtype: int64

 Column:{col}
Female    54708
Male      47055
Name: gender, dtype: int64

 Column:{col}
[70-80)     26066
[60-70)     22482
[50-60)     17256
[80-90)     17197
[40-50)      9685
[30-40)      3775
[90-100)     2793
[20-30)      1657
[10-20)       691
[0-10)        161
Name: age, dtype: int64

 Column:{col}
1    53988
3    18868
2    18480
6     5291
5     4785
8      320
7       21
4       10
Name: admission_type_id, dtype: int64

 Column:{col}
1     60232
3     13954
6     12902
18     3691
2      2128
22     1992
11     1642
5      1184
25      989
4       815
7       623
23      412
13      399
14      372
28      139
8       108
15       63
24       48
9        21
17       14
16       11
19        8
10        6
27        5
12        3
20        2
Name: discharge_disposition_id, dtype: int64

 Column:{col}
7     57492
1 

# Save adjusted data

In [25]:
# Persist the adjusted table as a Parquet file.
# NOTE(review): adjusted_real_df is whatever the last executed cell assigned to it -
# presumably the adjustment for the final column of the loop above; confirm that
# this is the table meant to be saved.
adjusted_real_df.to_parquet("./adjusted.parquet",engine="pyarrow")

## RE-CREATE SYNTH DATA AND VALIDATE PERFORMANCE

In [None]:
# TODO: call the metadata function (create_and_modify_metadata) with the adjusted dataset

In [None]:
# TODO: call the synthetic data creation function (synthetic_data_creation) with the adjusted dataset