Code to create synthetic data based on different distributions.

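The same pipeline is run on three alternative extracts of the data (Options 1–3), which differ in which charted reading is selected: the earliest chart time, the chart time with the fewest NaNs, or the first reading within the hour.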

In [10]:
# Import libraries
import pandas as pd
import numpy as np

In [32]:
def load_data(filename):
    """Read a CSV file and return only its measurement columns.

    The first column of the file is discarded (whatever its label), followed
    by the identifier/timestamp columns 'subject_id', 'hadm_id' and
    'charttime'.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of the file.

    Raises
    ------
    KeyError
        If any of the three identifier columns is absent after the first
        column has been removed.
    """
    raw = pd.read_csv(filename)

    # The leading column is removed by label, so it goes regardless of its name.
    leading_label = raw.columns[0]
    id_labels = ['subject_id', 'hadm_id', 'charttime']

    return raw.drop(columns=leading_label).drop(columns=id_labels)

def calculate_stats(nums):
    """Compute per-column missing-value counts, means and variances.

    Parameters
    ----------
    nums : pandas.DataFrame
        Frame whose columns are to be summarised.

    Returns
    -------
    tuple of pandas.Series
        ``(nan_counts, smean, svar)``, each indexed by column label:
        the number of null entries, the mean and the variance.
    """
    # Number of missing entries in each column.
    nan_counts = nums.isnull().sum()

    # pandas skips NaNs by default in both reductions, and var() defaults to
    # the sample variance (ddof=1), so a column with fewer than two non-null
    # values yields NaN variance.
    smean = nums.mean()
    svar = nums.var()

    return nan_counts, smean, svar

def generate_synthetic_data(nums, nan_counts, smean, svar, random_state=None):
    """Draw a synthetic frame column by column from normal distributions.

    Each column is sampled from a normal distribution with the supplied mean
    and variance, clipped to be non-negative (and to at most 15 for the
    'GCS Total' column), and then has NaNs written into randomly chosen
    positions. Clipping happens after sampling, so the moments of a clipped
    column can deviate from the requested mean and variance.

    Parameters
    ----------
    nums : pandas.DataFrame
        Reference frame; only its row count and column labels are used.
    nan_counts : pandas.Series
        Number of NaNs to place in each column, indexed by column label.
    smean : pandas.Series
        Target mean for each column, indexed by column label.
    svar : pandas.Series
        Target variance for each column, indexed by column label.
    random_state : int or None, optional
        Seed for a private ``numpy.random.RandomState``. When ``None``
        (the default) the global ``np.random`` generator is used, so
        existing callers and any prior ``np.random.seed`` call behave
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        Synthetic data with the same number of rows and the same column
        labels (in the same order) as ``nums``.

    Raises
    ------
    ValueError
        If a column's NaN count exceeds the number of rows (raised by the
        sampling-without-replacement call).
    """
    # The np.random module and a RandomState instance expose the same
    # normal()/choice() calls, so both paths share the code below.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    samples = len(nums)
    generated = {}

    for column in nums.columns:
        std_dev = np.sqrt(svar[column])

        # Sample from a normal distribution
        values = rng.normal(loc=smean[column], scale=std_dev, size=samples)

        # Cap 'GCS Total' values at 15 and ensure all values are non-negative
        upper_bound = 15 if column == 'GCS Total' else None
        values = np.clip(values, a_min=0, a_max=upper_bound)

        # Blank out as many randomly chosen positions as this column has NaNs.
        # The draw is made even when the count is zero so the random stream
        # stays identical to the column-by-column original.
        num_nans = int(nan_counts[column])
        nan_indices = rng.choice(samples, num_nans, replace=False)
        values[nan_indices] = np.nan

        generated[column] = values

    # Build the frame once instead of inserting columns one at a time.
    return pd.DataFrame(generated)

# Process a file and generate synthetic data
def process_file(filename):
    """Load a CSV file and build a synthetic counterpart of its data.

    Parameters
    ----------
    filename : str or path-like
        CSV file handed to ``load_data``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(nums, synth_data)``: the frame returned by ``load_data`` and the
        synthetic frame generated from its per-column statistics.
    """
    real_frame = load_data(filename)

    # calculate_stats yields (nan_counts, smean, svar), which is exactly the
    # argument order generate_synthetic_data expects after the frame itself.
    column_stats = calculate_stats(real_frame)

    return real_frame, generate_synthetic_data(real_frame, *column_stats)

# Compare statistics of real and synthetic data
def compare_stats(real_data, synth_data):
    """Print a per-column comparison of real and synthetic statistics.

    For every column of ``real_data`` the mean, variance and NaN count of
    both frames are printed together with the real-minus-synthetic
    difference, followed by a 50-character dashed separator.

    Parameters
    ----------
    real_data : pandas.DataFrame
        The original frame; its columns drive the report.
    synth_data : pandas.DataFrame
        The synthetic frame; it is looked up by the same column labels.

    Returns
    -------
    None
        The report is written to standard output.
    """
    real_nans, real_means, real_vars = calculate_stats(real_data)
    fake_nans, fake_means, fake_vars = calculate_stats(synth_data)

    print("Comparison of Real and Synthetic Data:\n")

    # (label, real statistic, synthetic statistic) in the order they are printed.
    metrics = (
        ("Mean", real_means, fake_means),
        ("Variance", real_vars, fake_vars),
        ("NaN Count", real_nans, fake_nans),
    )
    separator = "-" * 50

    for column in real_data.columns:
        print(f"Column: {column}")

        for label, real_stat, synth_stat in metrics:
            real_value = real_stat[column]
            synth_value = synth_stat[column]

            print(f"Real {label}: {real_value}")
            print(f"Synthetic {label}: {synth_value}")
            # The trailing newline leaves a blank line after each metric group.
            print(f"{label} Difference: {real_value - synth_value}\n")

        print(separator)


Option 1

In [33]:
# Option 1: Using the earliest chart time for each hadm_id
# process_file returns the loaded measurement columns and a synthetic frame
# generated from their per-column statistics.
real_data_op1, synth_data_op1 = process_file('NumOp1.csv')

Unnamed: 0,anchor_age,ART BP Systolic,Heart Rate,O2 saturation pulseoxymetry,Respiratory Rate,Temperature Celsius,GCS Total
0,58.972071,,,96.638236,19.556537,,15.0
1,75.605297,,106.160389,95.447182,,,
2,43.381731,,,,,,14.649457
3,77.356232,,84.224165,,28.01642,,
4,97.099906,,82.881905,,,,


In [34]:
# Print the mean, variance and NaN-count comparison for the Option 1 frames.
print("Option 1:")

compare_stats(real_data_op1, synth_data_op1)

Option 1:
Comparison of Real and Synthetic Data:

Column: anchor_age
Real Mean: 63.06205470006895
Synthetic Mean: 63.535244641406265
Mean Difference: -0.47318994133731707

Real Variance: 282.7179874094211
Synthetic Variance: 287.27670136639745
Variance Difference: -4.558713956976362

Real NaN Count: 0
Synthetic NaN Count: 0
NaN Count Difference: 0

--------------------------------------------------
Column: ART BP Systolic
Real Mean: 131.06896551724137
Synthetic Mean: 134.31385430242506
Mean Difference: -3.24488878518369

Real Variance: 614.3522167487685
Synthetic Variance: 722.3470534053371
Variance Difference: -107.99483665656862

Real NaN Count: 4322
Synthetic NaN Count: 4322
NaN Count Difference: 0

--------------------------------------------------
Column: Heart Rate
Real Mean: 88.37096774193549
Synthetic Mean: 88.88824552153561
Mean Difference: -0.5172777796001213

Real Variance: 409.77225264299267
Synthetic Variance: 431.4885777326407
Variance Difference: -21.716325089648024

Rea

Option 2

In [35]:
# Option 2: Picking the charttime with the fewest NaNs
# process_file returns the loaded measurement columns and a synthetic frame
# generated from their per-column statistics.
real_data_op2, synth_data_op2 = process_file('NumOp2.csv')

Unnamed: 0,anchor_age,ART BP Systolic,Heart Rate,O2 saturation pulseoxymetry,Respiratory Rate,Temperature Celsius,GCS Total
0,67.444132,,93.077745,95.681217,16.765494,,9.73921
1,39.190032,,76.490812,101.113562,24.156657,,14.548287
2,89.87689,,109.433867,103.803874,28.268985,,15.0
3,96.004163,,79.155965,99.266199,17.117569,,15.0
4,72.704668,,73.581516,99.632851,14.521096,,9.026758


In [36]:
# Print the mean, variance and NaN-count comparison for the Option 2 frames.
print("Option 2:")
compare_stats(real_data_op2, synth_data_op2)

Option 2:
Comparison of Real and Synthetic Data:

Column: anchor_age
Real Mean: 63.06205470006895
Synthetic Mean: 63.36320750307107
Mean Difference: -0.3011528030021182

Real Variance: 282.7179874094211
Synthetic Variance: 283.41411732922927
Variance Difference: -0.6961299198081861

Real NaN Count: 0
Synthetic NaN Count: 0
NaN Count Difference: 0

--------------------------------------------------
Column: ART BP Systolic
Real Mean: 117.12075471698114
Synthetic Mean: 118.2196197534024
Mean Difference: -1.0988650364212589

Real Variance: 564.6747570040023
Synthetic Variance: 603.7446973588594
Variance Difference: -39.06994035485707

Real NaN Count: 4086
Synthetic NaN Count: 4086
NaN Count Difference: 0

--------------------------------------------------
Column: Heart Rate
Real Mean: 84.96989197885543
Synthetic Mean: 84.97175019293192
Mean Difference: -0.001858214076492004

Real Variance: 349.06690939062764
Synthetic Variance: 347.39246118890924
Variance Difference: 1.6744482017184055

Re

Option 3

In [37]:
# Option 3: Picking the first reading within the hour (from the start of the first recorded time)
# process_file returns the loaded measurement columns and a synthetic frame
# generated from their per-column statistics.
real_data_op3, synth_data_op3 = process_file('NumOp3.csv')

Unnamed: 0,anchor_age,ART BP Systolic,Heart Rate,O2 saturation pulseoxymetry,Respiratory Rate,Temperature Celsius,GCS Total
0,71.19203,,103.942937,99.345021,22.345017,,11.891725
1,72.846562,,88.491543,101.904281,27.725356,,4.970713
2,61.597098,,121.556065,96.748639,21.991005,,5.070357
3,47.732669,,100.903567,91.036421,16.973034,,15.0
4,79.553435,,70.860413,93.806021,8.332421,,15.0


In [38]:
# Print the mean, variance and NaN-count comparison for the Option 3 frames.
print("Option 3:")
compare_stats(real_data_op3, synth_data_op3)

Option 3:
Comparison of Real and Synthetic Data:

Column: anchor_age
Real Mean: 63.06205470006895
Synthetic Mean: 63.00295754694188
Mean Difference: 0.059097153127069646

Real Variance: 282.7179874094211
Synthetic Variance: 282.2852095171313
Variance Difference: 0.4327778922897778

Real NaN Count: 0
Synthetic NaN Count: 0
NaN Count Difference: 0

--------------------------------------------------
Column: ART BP Systolic
Real Mean: 123.37795275590551
Synthetic Mean: 119.59773484690463
Mean Difference: 3.780217909000882

Real Variance: 728.3322084739408
Synthetic Variance: 639.7871409866092
Variance Difference: 88.54506748733161

Real NaN Count: 4224
Synthetic NaN Count: 4224
NaN Count Difference: 0

--------------------------------------------------
Column: Heart Rate
Real Mean: 88.29614935822637
Synthetic Mean: 87.93995490758567
Mean Difference: 0.3561944506407002

Real Variance: 400.74304072465236
Synthetic Variance: 388.8249448751927
Variance Difference: 11.918095849459633

Real NaN 