# Synthea - Infarction Emergency Patients Analysis
Only the first encounter is of interest

## Libraries and Paths

In [11]:
from pathlib import Path
import shutil
import pandas as pd

# Absolute locations of the data directories, all anchored at the parent of
# the current working directory
BASE_DIRECTORY = Path.cwd().parent
RAW_DATA_PATH = BASE_DIRECTORY / 'data' / 'raw' / 'csv'
PROCESSED_DATA_PATH = BASE_DIRECTORY / 'data' / 'processed' / 'infarction'
ANALYSIS_DATA_PATH = BASE_DIRECTORY / 'data' / 'analysis'

## Data Processing

### Opening the Raw Data Files

In [12]:
# Collect only the CSV files of the raw data directory. Other entries (for
# example '.DS_Store' or a '.ipynb_checkpoints' directory) must not be turned
# into table names, otherwise the load below fails on a file that does not
# exist. Sorting makes the load order independent of the file system.
raw_paths = sorted(
    path for path in RAW_DATA_PATH.glob('*.csv') if path.is_file()
)

# Table names are the file names without the '.csv' extension
raw_files = [path.stem for path in raw_paths]

# Load one DataFrame per raw table, keyed by table name
dfs = {path.stem: pd.read_csv(path) for path in raw_paths}

### Finding the Infarction Emergency Patients

In [13]:
# Drop the encounter columns that this analysis does not use.
# errors='ignore' keeps the cell re-runnable: the result is stored back into
# dfs['encounters'], so on a second execution the columns are already gone and
# a plain drop would raise a KeyError.
encounters = dfs['encounters'].drop(
    columns=[
        'ORGANIZATION', 'PROVIDER', 'PAYER', 'BASE_ENCOUNTER_COST',
        'TOTAL_CLAIM_COST', 'PAYER_COVERAGE'
        ],
    errors='ignore'
    )

# Keep only the emergency encounters whose reason mentions an infarction.
# na=False counts a missing reason as "no match", and regex=False is enough
# because the search term is a plain, case-sensitive substring.
is_emergency = encounters['ENCOUNTERCLASS'] == 'emergency'
is_infarction = encounters['REASONDESCRIPTION'] \
    .str.contains('infarction', regex=False, na=False)
dfs['encounters'] = encounters[is_emergency & is_infarction] \
    .reset_index(drop=True)

# NOTE(review): the notebook introduction says only the first encounter is of
# interest, but nothing here restricts the data to one encounter per patient.
# Confirm whether that filter is still required.

# Get the identifiers (encounters and patients); a patient with several
# matching encounters appears several times in patients_ids
encounters_ids = dfs['encounters']['Id'].tolist()
patients_ids = dfs['encounters']['PATIENT'].tolist()

# Show the filtered encounters data
dfs['encounters']

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTERCLASS,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,f92d3c2e-d29a-6859-556a-de5b62f2d405,2019-05-16T07:29:34Z,2019-05-16T08:29:34Z,92ad977e-d82c-e187-f671-0505b402c337,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
1,e72a40cc-5242-64e3-775d-229d901b0c16,2013-06-19T20:49:32Z,2013-06-19T21:49:32Z,16313d4d-e097-4dc8-66c0-1e185f8b201d,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
2,adb82c2f-f58a-583b-0474-51c83eca5299,2007-04-05T18:34:48Z,2007-04-05T19:34:48Z,a8b96d79-17a8-fbd5-3d20-8be355fbf0c0,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
3,f221d19b-db1f-6ae5-e5c2-71b6046757c6,2022-04-12T09:49:09Z,2022-04-12T10:49:09Z,7f48a3d6-971c-f431-d297-a96224ed0066,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
4,67dda16a-068b-d7e0-b4ec-27788d9bb39c,1995-07-04T15:24:01Z,1995-07-04T16:24:01Z,fc8e0104-552d-8ea0-dc63-4f61b47380dd,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
...,...,...,...,...,...,...,...,...,...
478,e5e08675-1b7f-cdf4-6081-0cb4e58f827d,2015-09-15T00:19:05Z,2015-09-15T01:19:05Z,ae5d46c2-17f3-2452-4eba-9c8f78676bd5,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
479,465dceea-fc96-b28c-1d30-74ecfd234c07,2020-05-31T07:04:36Z,2020-05-31T08:04:36Z,ee788c26-624d-c903-dcdf-a800aba825c7,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
480,3603363d-bb8d-755e-1eff-cf9e88e6ac5e,2018-01-14T02:14:47Z,2018-01-14T03:14:47Z,d891f720-12cc-179f-51ea-e81eb4a2e0d0,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
481,2d504b1e-43e7-dab7-57bc-ac94df7921c8,1994-05-27T01:52:44Z,1994-05-27T02:52:44Z,8aed6b40-9fb6-9e50-b534-fdd166046b04,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)


### Filtering the Data Associated with the Infarction Emergency Encounters

In [14]:
# Restrict every table that references encounters to the selected encounters.
# The encounters and patients tables are handled separately, and a table
# without an ENCOUNTER column cannot be filtered this way: it is left
# untouched instead of aborting the whole cell.
for file in raw_files:
    if file in ('encounters', 'patients'):
        continue
    if 'ENCOUNTER' not in dfs[file].columns:
        continue
    linked_to_selection = dfs[file]['ENCOUNTER'].isin(encounters_ids)
    dfs[file] = dfs[file][linked_to_selection].reset_index(drop=True)

# Drop unnecessary columns from immunizations, medications and procedures
# DataFrames. errors='ignore' keeps the cell re-runnable, since the results
# overwrite the entries of dfs and the columns are gone on a second execution.
dfs['immunizations'] = dfs['immunizations'] \
    .drop(columns=['BASE_COST'], errors='ignore')
dfs['medications'] = dfs['medications'] \
    .drop(
        columns=['BASE_COST', 'PAYER_COVERAGE', 'TOTALCOST'],
        errors='ignore'
    )
dfs['procedures'] = dfs['procedures'] \
    .drop(columns=['BASE_COST'], errors='ignore')

# Drop unnecessary columns from the patients DataFrame and keep only the
# patients that own at least one of the selected encounters
patients = dfs['patients'].drop(
    columns=['HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE', 'INCOME'],
    errors='ignore'
)
dfs['patients'] = patients[patients['Id'].isin(patients_ids)] \
    .reset_index(drop=True)

### Writing CSV Files with the Processed Data

In [15]:
# Start from a clean output directory: remove a previous export, if any,
# and create the directory (with its parents) again
if PROCESSED_DATA_PATH.exists():
    shutil.rmtree(PROCESSED_DATA_PATH)
PROCESSED_DATA_PATH.mkdir(parents=True)

# Export every processed table to '<table name>.csv' without the row index
for table_name in raw_files:
    target = f'{PROCESSED_DATA_PATH}/{table_name}.csv'
    dfs[table_name].to_csv(target, index=False)

## Data Analysis

### Verifying the Number of Conditions per Encounter

In [16]:
# Open the conditions data file
df_conditions = pd.read_csv(f'{PROCESSED_DATA_PATH}/conditions.csv')

# Per encounter: the number of distinct condition codes and the distinct
# condition descriptions, ordered from most to fewest conditions
df_conditions_agg = (
    df_conditions
    .groupby(by=['ENCOUNTER'], as_index=False)
    .agg(
        NUM_CONDITIONS=('CODE', pd.Series.nunique),
        CONDITIONS=('DESCRIPTION', 'unique')
    )
    .sort_values(by=['NUM_CONDITIONS'], ascending=False)
    .reset_index(drop=True)
)

# Make sure the analysis directory exists before writing: to_csv does not
# create missing parent directories and would fail on a fresh checkout
ANALYSIS_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Write a CSV file with the resulting DataFrame
df_conditions_agg.to_csv(f'{ANALYSIS_DATA_PATH}/infarction_conditions.csv', index=False)

# Show the aggregation result
df_conditions_agg

Unnamed: 0,ENCOUNTER,NUM_CONDITIONS,CONDITIONS
0,8798d7f0-4902-581f-2995-e3a022e62e6d,2,"[Myocardial infarction (disorder), Acute ST se..."
1,a028c6e9-dd5f-ea66-7e05-872ee839ecbc,2,"[Myocardial infarction (disorder), Acute ST se..."
2,9b687a55-12f2-bff6-d36d-94caef37c6f3,2,"[Myocardial infarction (disorder), Acute non-S..."
3,9b2e1576-df94-cc0a-c40f-497da59a3d61,2,"[Myocardial infarction (disorder), Preinfarcti..."
4,9aa70fb7-2b48-480f-9f1f-3d9041b15778,2,"[Myocardial infarction (disorder), Acute ST se..."
...,...,...,...
478,3d26dc12-af0d-3b94-0eb6-2c60c932cf20,1,[Acute ST segment elevation myocardial infarct...
479,3c7f0b2e-6189-da1b-fbae-ac2cd1ececb8,1,[Acute non-ST segment elevation myocardial inf...
480,3ae05fe2-aef6-f978-27cd-96028759b6ed,1,[Acute ST segment elevation myocardial infarct...
481,ad7269e6-9e30-7ac1-bf8e-421e31c09ee2,1,[Acute non-ST segment elevation myocardial inf...


### Verifying the Number of Observations per Encounter

In [17]:
# Read the processed observations back from disk
df_observations = pd.read_csv(f'{PROCESSED_DATA_PATH}/observations.csv')

# Summarise the observations of each encounter: distinct categories and
# distinct observation codes, with their counts
observations_by_encounter = df_observations.groupby(
    by=['ENCOUNTER'], as_index=False
)
observations_summary = observations_by_encounter.agg(
    NUM_CATEGORIES=('CATEGORY', pd.Series.nunique),
    CATEGORIES=('CATEGORY', 'unique'),
    NUM_OBSERVATIONS=('CODE', pd.Series.nunique),
    OBSERVATIONS=('DESCRIPTION', 'unique')
)

# Encounters with the most categories, then the most observations, come first
df_observations_agg = (
    observations_summary
    .sort_values(by=['NUM_CATEGORIES', 'NUM_OBSERVATIONS'], ascending=False)
    .reset_index(drop=True)
)

# Persist the summary without the row index
df_observations_agg.to_csv(f'{ANALYSIS_DATA_PATH}/infarction_observations.csv', index=False)

# Show the aggregation result
df_observations_agg

Unnamed: 0,ENCOUNTER,NUM_CATEGORIES,CATEGORIES,NUM_OBSERVATIONS,OBSERVATIONS
0,02aa36cf-0f8d-7da9-e3f3-a1d06ca2a14b,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
1,033dc170-f80e-b265-4382-1a0191e583c1,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
2,03fe0412-6dfa-2073-d16c-c2605f1396e7,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
3,053dff82-8641-83ab-5baa-ebcff4ebd555,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
4,092d9871-d219-f008-e9da-cbb6345791d0,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
...,...,...,...,...,...
255,fb1b3578-1b00-663e-28b7-e1d4750995ee,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
256,fca343f6-27f4-0500-d924-5296bf364435,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
257,fe218a9f-e4aa-9883-9674-8e11a59d29b8,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
258,ff014742-94fe-d3d9-5023-58af84b3e787,2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...


### Joining Conditions and Observations Data

In [18]:
# Values used for encounters that have conditions but no observations
missing_observation_defaults = {
    'NUM_CATEGORIES': 0,
    'CATEGORIES': '',
    'NUM_OBSERVATIONS': 0,
    'OBSERVATIONS': ''
}

# Left join: every encounter of the conditions summary is kept, the gaps left
# by the join are filled with the defaults and the counters are cast to int
df_merged_agg = (
    df_conditions_agg
    .merge(right=df_observations_agg, how='left', on='ENCOUNTER')
    .fillna(missing_observation_defaults)
    .astype({'NUM_CATEGORIES': int, 'NUM_OBSERVATIONS': int})
)

# Persist the merged summary without the row index
df_merged_agg.to_csv(f'{ANALYSIS_DATA_PATH}/infarction_merged_data.csv', index=False)

# Show the merged data
df_merged_agg

Unnamed: 0,ENCOUNTER,NUM_CONDITIONS,CONDITIONS,NUM_CATEGORIES,CATEGORIES,NUM_OBSERVATIONS,OBSERVATIONS
0,8798d7f0-4902-581f-2995-e3a022e62e6d,2,"[Myocardial infarction (disorder), Acute ST se...",2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
1,a028c6e9-dd5f-ea66-7e05-872ee839ecbc,2,"[Myocardial infarction (disorder), Acute ST se...",2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
2,9b687a55-12f2-bff6-d36d-94caef37c6f3,2,"[Myocardial infarction (disorder), Acute non-S...",2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
3,9b2e1576-df94-cc0a-c40f-497da59a3d61,2,"[Myocardial infarction (disorder), Preinfarcti...",2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
4,9aa70fb7-2b48-480f-9f1f-3d9041b15778,2,"[Myocardial infarction (disorder), Acute ST se...",2,"[laboratory, vital-signs]",34,[Leukocytes [#/volume] in Blood by Automated c...
...,...,...,...,...,...,...,...
478,3d26dc12-af0d-3b94-0eb6-2c60c932cf20,1,[Acute ST segment elevation myocardial infarct...,0,,0,
479,3c7f0b2e-6189-da1b-fbae-ac2cd1ececb8,1,[Acute non-ST segment elevation myocardial inf...,0,,0,
480,3ae05fe2-aef6-f978-27cd-96028759b6ed,1,[Acute ST segment elevation myocardial infarct...,0,,0,
481,ad7269e6-9e30-7ac1-bf8e-421e31c09ee2,1,[Acute non-ST segment elevation myocardial inf...,0,,0,


### Metrics of Conditions and Observations per Encounter

In [19]:
# Calculate the descriptive statistics of the numeric columns (number of
# conditions, categories and observations per encounter)
df_metrics = df_merged_agg.describe()

# Write a CSV file with the resulting DataFrame. The index is written too,
# under the 'METRIC' header: it holds the statistic names (count, mean, std,
# ...), and without it the rows of the file cannot be told apart.
df_metrics.to_csv(
    f'{ANALYSIS_DATA_PATH}/infarction_metrics.csv',
    index_label='METRIC'
)

# Show the metrics data
df_metrics

Unnamed: 0,NUM_CONDITIONS,NUM_CATEGORIES,NUM_OBSERVATIONS
count,483.0,483.0,483.0
mean,1.540373,1.076605,18.302277
std,0.498884,0.998095,16.96762
min,1.0,0.0,0.0
25%,1.0,0.0,0.0
50%,2.0,2.0,34.0
75%,2.0,2.0,34.0
max,2.0,2.0,34.0
