Code to create synthetic data based on different distributions.

Synthetic data is generated separately for each of the three record-selection options (Options 1–3 below).

In [39]:
# Import libraries
import pandas as pd
import numpy as np

In [40]:
def load_data(filename):
    """Read a CSV and return it without its leading and identifier columns.

    Args:
        filename: Path (or file-like object) handed straight to
            ``pd.read_csv``.

    Returns:
        DataFrame containing every column of the file except the first one
        and ``subject_id``, ``hadm_id`` and ``charttime``.

    Raises:
        KeyError: If any of the three identifier columns is absent once the
            first column has been removed.
    """
    frame = pd.read_csv(filename)

    # The first column is discarded by label (presumably a saved row index --
    # TODO confirm against the step that writes these CSVs).
    leading_label = frame.columns[0]
    frame = frame.drop(columns=leading_label)

    # What is left after removing the identifiers/timestamp is treated as the
    # numerical measurements by the rest of the pipeline.
    identifier_columns = ['subject_id', 'hadm_id', 'charttime']
    return frame.drop(columns=identifier_columns)

def calculate_stats(nums):
    """Summarise every column: missing-value count, mean and variance.

    Args:
        nums: DataFrame whose columns are summarised.

    Returns:
        Tuple ``(nan_counts, smean, svar)``; each element is a Series indexed
        by column name.
    """
    # Number of null entries per column.
    nan_counts = nums.isnull().sum()

    # pandas skips NaNs by default here, and ``var`` is the sample variance
    # (ddof=1), so a column with a single non-null value has NaN variance.
    smean = nums.mean()
    svar = nums.var()

    return nan_counts, smean, svar

def generate_synthetic_data(nums, nan_counts, smean, svar, random_state=None):
    """Draw a synthetic frame that mimics ``nums`` column by column.

    Every column is sampled independently from a normal distribution with
    the supplied mean and variance, clipped to be non-negative (the
    ``'GCS Total'`` column is additionally capped at 15), and then given as
    many randomly placed NaNs as ``nan_counts`` reports for that column.

    Args:
        nums: Real data; supplies the column names and the number of rows.
        nan_counts: Per-column number of NaNs to inject, indexed by column.
        smean: Per-column mean, indexed by column.
        svar: Per-column variance, indexed by column.
        random_state: Optional seed for a private ``np.random.RandomState``.
            With the default ``None`` the global NumPy random state is used,
            so existing callers (and ``np.random.seed``) behave as before.

    Returns:
        DataFrame with the same columns, in the same order, and the same
        number of rows as ``nums``.
    """
    # ``np.random`` and ``RandomState`` expose the same ``normal``/``choice``
    # API, so the sampling code below is identical for both cases.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    samples = len(nums)
    generated = {}

    for column in nums.columns:
        std_dev = np.sqrt(svar[column])

        # Sample from a normal distribution
        values = rng.normal(loc=smean[column], scale=std_dev, size=samples)

        # All values are forced non-negative; only 'GCS Total' has an upper
        # bound (15).
        upper_bound = 15 if column == 'GCS Total' else None
        values = np.clip(values, 0, upper_bound)

        # Introduce NaNs based on the original NaN distribution; positions
        # are drawn without replacement so the count is matched exactly.
        num_nans = int(nan_counts[column])
        nan_indices = rng.choice(samples, num_nans, replace=False)
        values[nan_indices] = np.nan

        generated[column] = values

    # Build the frame in one go rather than inserting column by column,
    # which fragments the frame when there are many columns.
    return pd.DataFrame(generated)

# Run the whole pipeline for one CSV file: load, summarise, synthesise
def process_file(filename):
    """Load ``filename`` and build a synthetic counterpart of its data.

    Returns:
        Tuple ``(nums, synth_data)``: the frame returned by ``load_data`` and
        the frame generated from its per-column statistics.
    """
    real_frame = load_data(filename)

    # calculate_stats yields (nan_counts, smean, svar), which is exactly the
    # positional order generate_synthetic_data expects after the frame.
    column_stats = calculate_stats(real_frame)
    synthetic_frame = generate_synthetic_data(real_frame, *column_stats)

    return real_frame, synthetic_frame

# Print a per-column comparison of real vs. synthetic summary statistics
def compare_stats(real_data, synth_data):
    """Print mean, variance and NaN count of both frames side by side.

    For every column of ``real_data`` the real value, the synthetic value
    and their difference (real minus synthetic) are printed for each
    statistic, followed by a separator line. Nothing is returned.
    """
    real_nans, real_means, real_vars = calculate_stats(real_data)
    synth_nans, synth_means, synth_vars = calculate_stats(synth_data)

    # (label, real series, synthetic series) in the order they are reported.
    reports = (
        ("Mean", real_means, synth_means),
        ("Variance", real_vars, synth_vars),
        ("NaN Count", real_nans, synth_nans),
    )

    print("Comparison of Real and Synthetic Data:\n")

    for column in real_data.columns:
        print(f"Column: {column}")

        for label, real_series, synth_series in reports:
            real_value = real_series[column]
            synth_value = synth_series[column]
            print(f"Real {label}: {real_value}")
            print(f"Synthetic {label}: {synth_value}")
            print(f"{label} Difference: {real_value - synth_value}\n")

        print("-" * 50)


Option 1

In [41]:
# Option 1: Using the earliest chart time for each hadm_id
# (NumOp1.csv is presumably the extract prepared with this rule -- confirm
# against the preprocessing step.) process_file returns the loaded real data
# and the synthetic frame generated from it.
real_data_op1, synth_data_op1 = process_file('NumOp1.csv')

In [42]:
print("Option 1:")

# Disabled: uncomment to print the per-column real vs. synthetic comparison.
#compare_stats(real_data_op1, synth_data_op1)

Option 1:


Option 2

In [43]:
# Option 2: Picking the charttime with the fewest NaNs
# (NumOp2.csv is presumably the extract prepared with this rule -- confirm
# against the preprocessing step.) process_file returns the loaded real data
# and the synthetic frame generated from it.
real_data_op2, synth_data_op2 = process_file('NumOp2.csv')

In [44]:
print("Option 2:")
# Disabled: uncomment to print the per-column real vs. synthetic comparison.
#compare_stats(real_data_op2, synth_data_op2)

Option 2:


Option 3

In [45]:
# Option 3: Picking the first reading within the hour (from the start of the first recorded time)
# (NumOp3.csv is presumably the extract prepared with this rule -- confirm
# against the preprocessing step.) process_file returns the loaded real data
# and the synthetic frame generated from it.
real_data_op3, synth_data_op3 = process_file('NumOp3.csv')

In [46]:
print("Option 3:")
# Disabled: uncomment to print the per-column real vs. synthetic comparison.
#compare_stats(real_data_op3, synth_data_op3)

Option 3:
