# Synthea
Infarction Emergency Patients - First Encounter, 10k Patients

## Libraries and Constants

In [63]:
import subprocess

from pathlib import Path
import shutil
import pandas as pd

# Project layout, resolved from the parent of the current working directory
BASE_DIRECTORY = Path.cwd().parent
SYNTHEA_PATH = BASE_DIRECTORY / 'synthea-with-dependencies.jar'
RAW_DATA_PATH = BASE_DIRECTORY / 'data/raw/infarction_v1'
CSV_RAW_DATA_PATH = RAW_DATA_PATH / 'csv'
PROCESSED_DATA_PATH = BASE_DIRECTORY / 'data/processed/infarction_v1'

# Base shell command that launches the Synthea JAR
RUN_SYNTHEA = f'java -jar {SYNTHEA_PATH}'

# CSV exports that Synthea is asked to skip when generating data
EXCLUDED_CSV_FILES = [
    f'{stem}.csv'
    for stem in (
        'careplans',
        'claims',
        'claims_transactions',
        'organizations',
        'patient_expenses',
        'payer_transitions',
        'payers',
        'providers',
        'supplies',
    )
]

# Size of the synthetic population requested from Synthea
NUM_PATIENTS = 10_000

# Set to True to delete the existing raw data and regenerate it with Synthea
GENERATE_DATA = False

## Data Generation

In [64]:
if GENERATE_DATA:
    # Start from a clean slate: remove any previously generated raw data
    if RAW_DATA_PATH.exists():
        shutil.rmtree(RAW_DATA_PATH)

    # Comma-separated list of the CSV files that Synthea should not export
    excluded_files = ','.join(EXCLUDED_CSV_FILES)

    # Build the command as an argument list (no shell involved) so that paths
    # containing spaces or shell metacharacters reach Java unchanged
    command = [
        'java', '-jar', str(SYNTHEA_PATH),
        '-p', str(NUM_PATIENTS),
        f'--exporter.baseDirectory={RAW_DATA_PATH}',
        '--exporter.csv.export=true',
        f'--exporter.csv.excluded_files={excluded_files}',
        '--exporter.metadata.export=false',
        '--exporter.fhir.export=false',
        '--exporter.fhir.transaction_bundle=false',
        '--exporter.hospital.fhir.export=false',
        '--exporter.practitioner.fhir.export=false',
    ]

    # Run Synthea; check=True raises CalledProcessError on a non-zero exit
    # status, so a failed generation is reported here rather than surfacing
    # later as a missing-data error (the old data was already deleted above)
    subprocess.run(command, check=True)

## Data Processing

### Opening the Raw Data Files

In [65]:
# Collect the base names (without extension) of the raw CSV files only, so
# that stray non-CSV entries in the directory are ignored; sorted so the
# load order does not depend on the file system's listing order
raw_files = sorted(path.stem for path in CSV_RAW_DATA_PATH.glob('*.csv'))

# Load each raw CSV file into a DataFrame keyed by its base name
dfs = {
    file: pd.read_csv(CSV_RAW_DATA_PATH / f'{file}.csv')
    for file in raw_files
}

### Finding the Infarction Emergency Patients

In [66]:
# Drop unnecessary encounter columns; errors='ignore' keeps this cell
# re-runnable once the columns have already been removed from dfs['encounters']
encounters = dfs['encounters'].drop(
    columns=[
        'ORGANIZATION', 'PROVIDER', 'PAYER', 'BASE_ENCOUNTER_COST',
        'TOTAL_CLAIM_COST', 'PAYER_COVERAGE'
    ],
    errors='ignore'
)

# Keep only emergency encounters whose reason mentions "infarction".
# na=False treats a missing reason as "no match"; regex=False makes the
# search a plain (case-sensitive) substring match.
is_infarction_emergency = (
    encounters['ENCOUNTERCLASS'].eq('emergency')
    & encounters['REASONDESCRIPTION'].str.contains(
        'infarction', na=False, regex=False
    )
)
dfs['encounters'] = encounters[is_infarction_emergency].reset_index(drop=True)

# Get the identifiers (encounters and patients)
# NOTE(review): the notebook title says "First Encounter", but no per-patient
# first-encounter selection happens here, so patients_ids may contain repeated
# patients - confirm whether that is intended.
encounters_ids = dfs['encounters']['Id'].tolist()
patients_ids = dfs['encounters']['PATIENT'].tolist()

In [67]:
# Display the filtered infarction emergency encounters
dfs['encounters']

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTERCLASS,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,549abada-1beb-bead-119a-8946c8c90ad4,2011-06-22T15:07:30Z,2011-06-22T16:07:30Z,3da17df9-877f-6d12-a58e-4975bfb18483,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
1,7c20f665-2b14-cf98-292d-53bf4af069c4,2015-10-08T18:57:43Z,2015-10-08T19:57:43Z,234a2a89-50d4-7dc3-8df5-7ee769371011,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
2,2f3154d4-1566-a919-409e-341476d8b374,2009-08-01T18:05:47Z,2009-08-01T19:05:47Z,b48a710b-76af-08ab-e01d-e6a193657496,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
3,81ff928d-4d02-4a7a-75b7-eea1e3b1f87f,2015-10-24T05:37:39Z,2015-10-24T06:37:39Z,cb026290-20ed-b6e0-059f-498dadc369ec,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
4,39475780-52d1-8156-1a2e-a89b7fab31d8,2024-02-13T09:42:25Z,2024-02-13T10:42:25Z,e85a1bbc-f0a0-dec8-3f1c-9dacadffd21d,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
...,...,...,...,...,...,...,...,...,...
491,13ba4bcf-b6fa-7b15-dbbb-9acd8257b51f,2007-05-06T22:25:54Z,2007-05-06T23:25:54Z,96c7a5ba-1dd0-2059-9d51-53ccd97d39bb,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
492,77f9a1bf-dd49-3a4f-407b-484c935883c9,2015-05-01T20:59:11Z,2015-05-01T21:59:11Z,eb5479dd-b94e-0e3e-0bd5-01b74f91dd1f,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
493,8e4e7b13-3db9-0e58-275d-7f1ae6e29a14,2017-03-31T06:34:37Z,2017-03-31T07:34:37Z,917dacc3-fe4b-1bda-fdd5-98f4d19241a7,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
494,fa18f15b-4a24-ae61-1587-e6d4f15b5be6,2007-10-12T08:45:40Z,2007-10-12T09:45:40Z,7c2c7475-dceb-5477-9b51-6ac9372bec3a,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)


### Filtering the Data Associated with the Infarction Emergency Encounters

In [68]:
# In every DataFrame other than encounters and patients, keep only the rows
# whose ENCOUNTER foreign key refers to one of the selected encounters
for file in raw_files:
    if file not in ('encounters', 'patients'):
        dfs[file] = dfs[file] \
            .query('ENCOUNTER in @encounters_ids') \
            .reset_index(drop=True)

# Drop unnecessary cost columns from the immunizations, medications and
# procedures DataFrames; errors='ignore' keeps this cell re-runnable once the
# columns have already been removed
dfs['immunizations'] = dfs['immunizations'] \
    .drop(columns=['BASE_COST'], errors='ignore')
dfs['medications'] = dfs['medications'] \
    .drop(columns=['BASE_COST', 'PAYER_COVERAGE', 'TOTALCOST'], errors='ignore')
dfs['procedures'] = dfs['procedures'] \
    .drop(columns=['BASE_COST'], errors='ignore')

# Drop unnecessary columns from the patients DataFrame and keep only the
# patients that appear in the selected encounters
dfs['patients'] = dfs['patients'] \
    .drop(
        columns=['HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE', 'INCOME'],
        errors='ignore'
    ) \
    .query('Id in @patients_ids') \
    .reset_index(drop=True)

### Writing CSV Files with the Processed Data

In [69]:
# Remove any previous processed output so stale files never mix with new ones
if PROCESSED_DATA_PATH.exists():
    shutil.rmtree(PROCESSED_DATA_PATH)

# Recreate the processed data directory, including any missing parents
PROCESSED_DATA_PATH.mkdir(parents=True)

# Export every DataFrame to <name>.csv, without the index column
for file in raw_files:
    output_path = PROCESSED_DATA_PATH / f'{file}.csv'
    dfs[file].to_csv(output_path, index=False)

## Data Analysis

### Verifying the Number of Conditions per Encounter

In [70]:
# Read the processed conditions back from disk
conditions_csv = PROCESSED_DATA_PATH / 'conditions.csv'
df_conditions = pd.read_csv(conditions_csv)

# Preview the first rows of the conditions data
df_conditions.head()

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,2011-06-22,2011-06-30,3da17df9-877f-6d12-a58e-4975bfb18483,549abada-1beb-bead-119a-8946c8c90ad4,22298006,Myocardial infarction (disorder)
1,2011-06-22,,3da17df9-877f-6d12-a58e-4975bfb18483,549abada-1beb-bead-119a-8946c8c90ad4,401314000,Acute non-ST segment elevation myocardial infa...
2,2015-10-08,2015-10-11,234a2a89-50d4-7dc3-8df5-7ee769371011,7c20f665-2b14-cf98-292d-53bf4af069c4,22298006,Myocardial infarction (disorder)
3,2015-10-08,,234a2a89-50d4-7dc3-8df5-7ee769371011,7c20f665-2b14-cf98-292d-53bf4af069c4,401314000,Acute non-ST segment elevation myocardial infa...
4,2009-08-01,,b48a710b-76af-08ab-e01d-e6a193657496,2f3154d4-1566-a919-409e-341476d8b374,401303003,Acute ST segment elevation myocardial infarcti...


In [71]:
# One row per encounter: the number of distinct condition codes and the
# distinct condition descriptions, ordered by descending condition count
df_conditions_agg = (
    df_conditions
    .groupby(by=['ENCOUNTER'], as_index=False)
    .agg(
        NUM_CONDITIONS=('CODE', 'nunique'),
        CONDITIONS=('DESCRIPTION', 'unique'),
    )
    .sort_values(by=['NUM_CONDITIONS'], ascending=False)
    .reset_index(drop=True)
)

# Persist the aggregation next to the processed data
agg_conditions_csv = PROCESSED_DATA_PATH / 'agg_conditions.csv'
df_conditions_agg.to_csv(agg_conditions_csv, index=False)

# Display the aggregation result
df_conditions_agg

Unnamed: 0,ENCOUNTER,NUM_CONDITIONS,CONDITIONS
0,0038d474-b81c-efb0-71fd-776c6b7e5798,2,"[Myocardial infarction (disorder), Acute ST se..."
1,94ac9eea-1d6f-20f5-e9a0-fcdd7cd01bec,2,"[Myocardial infarction (disorder), Acute non-S..."
2,9962bd41-f567-ce20-73ad-dd00e98b4553,2,"[Myocardial infarction (disorder), Acute non-S..."
3,990de367-c67f-3879-bcfe-e7ff5749fdcd,2,"[Myocardial infarction (disorder), Acute ST se..."
4,986f30d2-7417-c86e-19dc-e7e0b6100a6c,2,"[Myocardial infarction (disorder), Acute non-S..."
...,...,...,...
491,38e48f44-abbb-d510-2376-e815a3043ddc,1,[Acute non-ST segment elevation myocardial inf...
492,980131a9-8664-2d12-8440-73a84950dcf8,1,[Acute non-ST segment elevation myocardial inf...
493,3b98eb06-6fda-eee9-ade0-f4f30cf68458,1,[Acute ST segment elevation myocardial infarct...
494,974a2e2a-16e2-3d0c-eb14-8d14990a1874,1,[Acute non-ST segment elevation myocardial inf...


### Verifying the Number of Observations per Encounter

In [72]:
# Read the processed observations back from disk
observations_csv = PROCESSED_DATA_PATH / 'observations.csv'
df_observations = pd.read_csv(observations_csv)

# Preview the first rows of the observations data
df_observations.head()

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CATEGORY,CODE,DESCRIPTION,VALUE,UNITS,TYPE
0,2011-06-22T15:24:01Z,3da17df9-877f-6d12-a58e-4975bfb18483,549abada-1beb-bead-119a-8946c8c90ad4,laboratory,6690-2,Leukocytes [#/volume] in Blood by Automated count,3.4,10*3/uL,numeric
1,2011-06-22T15:24:01Z,3da17df9-877f-6d12-a58e-4975bfb18483,549abada-1beb-bead-119a-8946c8c90ad4,laboratory,789-8,Erythrocytes [#/volume] in Blood by Automated ...,3.8,10*6/uL,numeric
2,2011-06-22T15:24:01Z,3da17df9-877f-6d12-a58e-4975bfb18483,549abada-1beb-bead-119a-8946c8c90ad4,laboratory,718-7,Hemoglobin [Mass/volume] in Blood,13.1,g/dL,numeric
3,2011-06-22T15:24:01Z,3da17df9-877f-6d12-a58e-4975bfb18483,549abada-1beb-bead-119a-8946c8c90ad4,laboratory,4544-3,Hematocrit [Volume Fraction] of Blood by Autom...,44.1,%,numeric
4,2011-06-22T15:24:01Z,3da17df9-877f-6d12-a58e-4975bfb18483,549abada-1beb-bead-119a-8946c8c90ad4,laboratory,787-2,MCV [Entitic volume] by Automated count,90.1,fL,numeric


In [73]:
# One row per encounter: distinct observation categories and distinct
# observation codes/descriptions, ordered by descending category count and
# then by descending observation count
df_observations_agg = (
    df_observations
    .groupby(by=['ENCOUNTER'], as_index=False)
    .agg(
        NUM_CATEGORIES=('CATEGORY', 'nunique'),
        CATEGORIES=('CATEGORY', 'unique'),
        NUM_OBSERVATIONS=('CODE', 'nunique'),
        OBSERVATIONS=('DESCRIPTION', 'unique'),
    )
    .sort_values(by=['NUM_CATEGORIES', 'NUM_OBSERVATIONS'], ascending=False)
    .reset_index(drop=True)
)

# Persist the aggregation next to the processed data
agg_observations_csv = PROCESSED_DATA_PATH / 'agg_observations.csv'
df_observations_agg.to_csv(agg_observations_csv, index=False)

# Display the aggregation result
df_observations_agg

Unnamed: 0,ENCOUNTER,NUM_CATEGORIES,CATEGORIES,NUM_OBSERVATIONS,OBSERVATIONS
0,0038d474-b81c-efb0-71fd-776c6b7e5798,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
1,02c85b2b-39c1-1c90-41e8-d939d99adbf8,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
2,0306eea5-c7d1-f613-cefa-04e6072cb5c6,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
3,03240b50-46af-e470-1ebe-97a3d7fbad82,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
4,047c76d2-baa1-d449-3d26-a028bd714e2f,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
...,...,...,...,...,...
234,f7b880c1-9647-dea5-c9b9-4cb30c34b582,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
235,fb1b48b8-380d-730a-59a6-206cddca7f01,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
236,fb30ecf6-02cc-6de6-6feb-cb8c907d8392,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
237,fbf7d3d2-6c5a-cd97-03ce-ed34fefa2099,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...


### Verifying the Number of Conditions and Observations per Encounter

In [74]:
# Merge the per-encounter aggregations of conditions and observations.
# A left join keeps every encounter that has conditions, even when it has no
# observations; validate='one_to_one' asserts that ENCOUNTER is unique on both
# sides, so the merge cannot silently duplicate rows.
df_merged_agg = df_conditions_agg \
    .merge(
        right=df_observations_agg,
        how='left',
        on='ENCOUNTER',
        validate='one_to_one'
    ) \
    .sort_values(
        by=['NUM_CONDITIONS', 'NUM_CATEGORIES', 'NUM_OBSERVATIONS'],
        ascending=False
    ) \
    .fillna({
        # Encounters without observations get zero counts and empty lists
        'NUM_CATEGORIES': 0,
        'CATEGORIES': '',
        'NUM_OBSERVATIONS': 0,
        'OBSERVATIONS': ''
    }) \
    .astype({'NUM_CATEGORIES': int, 'NUM_OBSERVATIONS': int}) \
    .reset_index(drop=True)  # sequential index, as in the other aggregations

# Save a CSV file with the merge result
df_merged_agg.to_csv(f'{PROCESSED_DATA_PATH}/agg_conditions_observations.csv', index=False)

# Display the merge result
df_merged_agg

Unnamed: 0,ENCOUNTER,NUM_CONDITIONS,CONDITIONS,NUM_CATEGORIES,CATEGORIES,NUM_OBSERVATIONS,OBSERVATIONS
0,0038d474-b81c-efb0-71fd-776c6b7e5798,2,"[Myocardial infarction (disorder), Acute ST se...",2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
1,94ac9eea-1d6f-20f5-e9a0-fcdd7cd01bec,2,"[Myocardial infarction (disorder), Acute non-S...",2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
2,9962bd41-f567-ce20-73ad-dd00e98b4553,2,"[Myocardial infarction (disorder), Acute non-S...",2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
3,990de367-c67f-3879-bcfe-e7ff5749fdcd,2,"[Myocardial infarction (disorder), Acute ST se...",2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
4,986f30d2-7417-c86e-19dc-e7e0b6100a6c,2,"[Myocardial infarction (disorder), Acute non-S...",2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
...,...,...,...,...,...,...,...
491,38e48f44-abbb-d510-2376-e815a3043ddc,1,[Acute non-ST segment elevation myocardial inf...,0,,0,
492,980131a9-8664-2d12-8440-73a84950dcf8,1,[Acute non-ST segment elevation myocardial inf...,0,,0,
493,3b98eb06-6fda-eee9-ade0-f4f30cf68458,1,[Acute ST segment elevation myocardial infarct...,0,,0,
494,974a2e2a-16e2-3d0c-eb14-8d14990a1874,1,[Acute non-ST segment elevation myocardial inf...,0,,0,


In [75]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# count columns of the merged result
df_merged_agg.describe()

Unnamed: 0,NUM_CONDITIONS,NUM_CATEGORIES,NUM_OBSERVATIONS
count,496.0,496.0,496.0
mean,1.481855,0.96371,16.383065
std,0.500175,1.00035,17.005954
min,1.0,0.0,0.0
25%,1.0,0.0,0.0
50%,1.0,0.0,0.0
75%,2.0,2.0,34.0
max,2.0,2.0,34.0
