# Synthea - Emergency Patients Analysis
Only the first encounter is of interest

## Libraries and Paths

In [34]:
from pathlib import Path
import shutil
import pandas as pd

# Absolute paths to files and directories.
# The project root is the parent of the current working directory, and
# every data location is resolved relative to it
BASE_DIRECTORY = Path.cwd().parent
RAW_DATA_PATH = BASE_DIRECTORY / 'data/raw/csv'
PROCESSED_DATA_PATH = BASE_DIRECTORY / 'data/processed/emergency'
ANALYSIS_DATA_PATH = BASE_DIRECTORY / 'data/analysis'

## Data Processing

### Opening the Raw Data Files

In [35]:
# Collect the raw CSV files only. Other directory entries (hidden files,
# sub-directories, files with another extension) are ignored instead of
# being mistaken for tables. Sorting keeps the load order stable between
# runs, since the directory listing order is not guaranteed
raw_paths = sorted(
    path for path in RAW_DATA_PATH.glob('*.csv') if path.is_file()
)

# Get the raw data file names (file name without the '.csv' extension)
raw_files = [path.stem for path in raw_paths]

# Load the DataFrames with raw data, keyed by file name
dfs = {path.stem: pd.read_csv(path) for path in raw_paths}

### Finding the Emergency Patients

In [36]:
# Build the emergency encounters table in one pass:
#   1. drop the encounter columns that are not needed for the analysis
#   2. keep only rows whose class is "emergency" and whose reason
#      description is present
#   3. renumber the remaining rows from zero
dfs['encounters'] = (
    dfs['encounters']
    .drop(columns=[
        'ORGANIZATION',
        'PROVIDER',
        'PAYER',
        'BASE_ENCOUNTER_COST',
        'TOTAL_CLAIM_COST',
        'PAYER_COVERAGE',
    ])
    .loc[
        lambda df: df['ENCOUNTERCLASS'].eq('emergency')
        & df['REASONDESCRIPTION'].notnull()
    ]
    .reset_index(drop=True)
)

# Identifiers of the selected encounters and of their patients, used to
# filter the other tables (a patient appears once per encounter)
encounters_ids = dfs['encounters']['Id'].tolist()
patients_ids = dfs['encounters']['PATIENT'].tolist()

# Show the filtered encounters data
dfs['encounters']

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTERCLASS,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,f39d9b9e-b1bf-d308-0126-ffffb8c60009,2019-08-25T03:20:29Z,2019-08-25T05:19:29Z,54323f01-0951-8e76-f4ec-2c72f8f92673,emergency,50849002,Emergency room admission (procedure),125605004.0,Fracture of bone (disorder)
1,b2cd38c3-4ee0-69f5-2b9c-7bd5c0c28b81,2016-03-01T06:08:40Z,2016-03-01T07:08:40Z,78d6b763-4338-9ff2-999c-b0fc12cb3a76,emergency,50849002,Emergency room admission (procedure),312608009.0,Laceration - injury (disorder)
2,5ee24d42-ca32-7982-b18b-898608503984,2017-07-19T21:53:33Z,2017-07-19T22:53:33Z,38ec7e85-d9c7-cfbb-3292-9221b528f77a,emergency,50849002,Emergency room admission (procedure),384709000.0,Sprain (morphologic abnormality)
3,fbea6096-6757-4022-52c4-5f0049d44631,2017-05-06T06:32:01Z,2017-05-06T07:32:01Z,78d6b763-4338-9ff2-999c-b0fc12cb3a76,emergency,50849002,Emergency room admission (procedure),312608009.0,Laceration - injury (disorder)
4,a426cc2d-afd5-00ce-61c6-913d010735d8,2019-04-13T06:56:30Z,2019-04-13T07:56:30Z,78d6b763-4338-9ff2-999c-b0fc12cb3a76,emergency,50849002,Emergency room admission (procedure),384709000.0,Sprain (morphologic abnormality)
...,...,...,...,...,...,...,...,...,...
19203,2a74325f-1197-22a8-ec11-36de49c4d740,1969-11-14T22:12:47Z,1969-11-14T23:12:47Z,39864d79-3b3f-d2f1-839f-e2042708dbfd,emergency,50849002,Emergency room admission (procedure),110030002.0,Concussion injury of brain (disorder)
19204,576bbd19-94f9-0e42-ebc0-4e8879b0f631,1984-09-26T09:51:20Z,1984-09-26T10:51:20Z,eb4b400c-ef63-8913-c637-b0099ef6e6df,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
19205,4090648e-a2d7-5519-b8e5-18730d7db58f,2016-04-04T10:39:59Z,2016-04-04T11:39:59Z,eb4b400c-ef63-8913-c637-b0099ef6e6df,emergency,50849002,Emergency room admission (procedure),125605004.0,Fracture of bone (disorder)
19206,1f9310e0-21d2-b0f5-bad6-fde280a6bfd9,2020-06-24T16:26:09Z,2020-06-24T17:26:09Z,b7b3a619-833e-035d-1668-edc7cea0378a,emergency,50849002,Emergency room admission (procedure),125605004.0,Fracture of bone (disorder)


### Filtering the Data Associated with the Emergency Encounters

In [37]:
# Filter data from DataFrames with encounter as foreign key.
# Tables that have no ENCOUNTER column are skipped and therefore left
# unfiltered: running the query on them would raise because the column
# name cannot be resolved
for file in raw_files:
    if file in ('encounters', 'patients'):
        # Encounters were filtered earlier; patients are filtered below
        continue
    if 'ENCOUNTER' not in dfs[file].columns:
        continue
    dfs[file] = dfs[file] \
        .query('ENCOUNTER in @encounters_ids') \
        .reset_index(drop=True)

# Drop unnecessary columns from immunizations, medications and procedures DataFrames
dfs['immunizations'] = dfs['immunizations'].drop(columns=['BASE_COST'])
dfs['medications'] = dfs['medications'] \
    .drop(columns=['BASE_COST', 'PAYER_COVERAGE', 'TOTALCOST'])
dfs['procedures'] = dfs['procedures'].drop(columns=['BASE_COST'])

# Filter data and drop unnecessary columns from the patients DataFrame:
# only patients with at least one selected emergency encounter are kept
dfs['patients'] = dfs['patients'] \
    .drop(columns=['HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE', 'INCOME']) \
    .query('Id in @patients_ids') \
    .reset_index(drop=True)

### Writing CSV Files with the Processed Data

In [38]:
# Start from an empty processed data directory: remove the output of any
# previous run, then create the directory again (with missing parents)
if PROCESSED_DATA_PATH.exists():
    shutil.rmtree(PROCESSED_DATA_PATH)
PROCESSED_DATA_PATH.mkdir(parents=True)

# Write one CSV file per processed DataFrame, without the index column
for file in raw_files:
    processed_frame = dfs[file]
    processed_frame.to_csv(f'{PROCESSED_DATA_PATH}/{file}.csv', index=False)

## Data Analysis

### Treating the Encounters DataFrame

In [39]:
# Load the processed emergency encounters
df_encounters = pd.read_csv(f'{PROCESSED_DATA_PATH}/encounters.csv')

# Keep only the encounter identifier and its reason, renamed to the
# column names used when joining with the other tables
df_encounters_reason = (
    df_encounters
    .loc[:, ['Id', 'REASONDESCRIPTION']]
    .rename(columns={'Id': 'ENCOUNTER', 'REASONDESCRIPTION': 'REASON'})
)

# Show the encounters data
df_encounters_reason

Unnamed: 0,ENCOUNTER,REASON
0,f39d9b9e-b1bf-d308-0126-ffffb8c60009,Fracture of bone (disorder)
1,b2cd38c3-4ee0-69f5-2b9c-7bd5c0c28b81,Laceration - injury (disorder)
2,5ee24d42-ca32-7982-b18b-898608503984,Sprain (morphologic abnormality)
3,fbea6096-6757-4022-52c4-5f0049d44631,Laceration - injury (disorder)
4,a426cc2d-afd5-00ce-61c6-913d010735d8,Sprain (morphologic abnormality)
...,...,...
19203,2a74325f-1197-22a8-ec11-36de49c4d740,Concussion injury of brain (disorder)
19204,576bbd19-94f9-0e42-ebc0-4e8879b0f631,Myocardial infarction (disorder)
19205,4090648e-a2d7-5519-b8e5-18730d7db58f,Fracture of bone (disorder)
19206,1f9310e0-21d2-b0f5-bad6-fde280a6bfd9,Fracture of bone (disorder)


### Verifying the Number of Conditions per Encounter

In [40]:
# Open the conditions data file
df_conditions = pd.read_csv(f'{PROCESSED_DATA_PATH}/conditions.csv')

# Aggregate conditions data using encounter codes: for each encounter,
# count the distinct condition codes and collect the distinct condition
# descriptions, then order by the number of conditions (highest first)
df_conditions_agg = df_conditions \
    .groupby(by=['ENCOUNTER'], as_index=False) \
    .agg(
        NUM_CONDITIONS=('CODE', pd.Series.nunique),
        CONDITIONS=('DESCRIPTION', 'unique')
    ) \
    .sort_values(by=['NUM_CONDITIONS'], ascending=False) \
    .reset_index(drop=True)

# Make sure the analysis directory exists before the first write to it:
# `to_csv` does not create missing parent directories
ANALYSIS_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Write a CSV file with the resulting DataFrame
df_conditions_agg.to_csv(f'{ANALYSIS_DATA_PATH}/emergency_conditions.csv', index=False)

# Print some of the aggregation result
df_conditions_agg

Unnamed: 0,ENCOUNTER,NUM_CONDITIONS,CONDITIONS
0,b68c1c6d-5417-c742-2d62-987c8abb4445,4,"[Sprain (morphologic abnormality), Sprain of a..."
1,a79db2af-0c4d-cbca-e209-33bd3ef01481,4,"[Sprain (morphologic abnormality), Sprain of a..."
2,1510c614-b3ab-f63a-dfb0-3d476f38dc89,4,"[Seizure disorder, History of single seizure (..."
3,78784c38-97ce-c006-842c-4a46b48f1ab5,4,"[Fracture of bone (disorder), Fracture of fore..."
4,d4bf9363-ac02-fb34-6c42-73ad52e3de79,4,"[Seizure disorder, History of single seizure (..."
...,...,...,...
12258,a5b33e32-0b4c-38cb-663a-98054ea52c52,1,[Drug overdose]
12259,a5b0c0bd-c4f7-31f2-a44f-d15a5bff71cc,1,[Chronic pain]
12260,3a2993ab-f91c-f8de-5b3f-57d6b4085f2c,1,[Chronic pain]
12261,a5a17fe7-10c1-788e-0ca0-45664106e9fe,1,[Acute allergic reaction]


### Verifying the Number of Observations per Encounter

In [41]:
# Load the processed observations
df_observations = pd.read_csv(f'{PROCESSED_DATA_PATH}/observations.csv')

# One row per encounter with the number and the list of distinct
# observation categories, and the number of distinct observation codes
# with the list of distinct descriptions. Rows are ordered by number of
# categories, then number of observations, both highest first
df_observations_agg = (
    df_observations
    .groupby(by=['ENCOUNTER'], as_index=False)
    .agg(
        NUM_CATEGORIES=('CATEGORY', pd.Series.nunique),
        CATEGORIES=('CATEGORY', 'unique'),
        NUM_OBSERVATIONS=('CODE', pd.Series.nunique),
        OBSERVATIONS=('DESCRIPTION', 'unique'),
    )
    .sort_values(by=['NUM_CATEGORIES', 'NUM_OBSERVATIONS'], ascending=False)
    .reset_index(drop=True)
)

# Save the aggregation as a CSV file, without the index column
df_observations_agg.to_csv(f'{ANALYSIS_DATA_PATH}/emergency_observations.csv', index=False)

# Show the aggregation result
df_observations_agg

Unnamed: 0,ENCOUNTER,NUM_CATEGORIES,CATEGORIES,NUM_OBSERVATIONS,OBSERVATIONS
0,104c87d4-2e74-a626-21ba-8b190c426146,3,"[exam, vital-signs, laboratory]",48,"[Emergency severity index, Diastolic Blood Pre..."
1,124291dd-fe3e-d637-8ebc-0db97c27c187,3,"[exam, vital-signs, laboratory]",48,"[Emergency severity index, Diastolic Blood Pre..."
2,217662bd-14ac-d138-a574-89170a9bed72,3,"[exam, vital-signs, laboratory]",48,"[Emergency severity index, Diastolic Blood Pre..."
3,2531b492-5fe2-43bd-5384-e03c7cbf5fd6,3,"[exam, vital-signs, laboratory]",48,"[Emergency severity index, Diastolic Blood Pre..."
4,28c81a40-edf4-56f1-fe7b-3e97a9119298,3,"[exam, vital-signs, laboratory]",48,"[Emergency severity index, Diastolic Blood Pre..."
...,...,...,...,...,...
4052,ff868ef4-36d6-303a-b7cf-922f5f61c520,1,[vital-signs],1,[Pain severity - 0-10 verbal numeric rating [S...
4053,ffa2821a-c60e-6e60-07fe-94e8990427e7,1,[vital-signs],1,[Pain severity - 0-10 verbal numeric rating [S...
4054,ffad5a41-8631-cb64-7a80-b7c90260496c,1,[vital-signs],1,[Pain severity - 0-10 verbal numeric rating [S...
4055,fff965c5-279b-de6e-b24c-8a75490eca3c,1,[vital-signs],1,[Pain severity - 0-10 verbal numeric rating [S...


### Joining Encounters, Conditions and Observations Data

In [42]:
# Attach the conditions and observations aggregations to every encounter.
# Left joins keep encounters that have no conditions or no observations;
# for those, the counters are set to 0 and the lists to an empty string,
# and the counters are then converted back to integers
df_merged_data = (
    df_encounters_reason
    .merge(right=df_conditions_agg, how='left', on='ENCOUNTER')
    .merge(right=df_observations_agg, how='left', on='ENCOUNTER')
    .fillna({
        'NUM_CONDITIONS': 0,
        'CONDITIONS': '',
        'NUM_CATEGORIES': 0,
        'CATEGORIES': '',
        'NUM_OBSERVATIONS': 0,
        'OBSERVATIONS': '',
    })
    .astype({
        'NUM_CONDITIONS': int,
        'NUM_CATEGORIES': int,
        'NUM_OBSERVATIONS': int,
    })
)

# Save the merged data as a CSV file, without the index column
df_merged_data.to_csv(f'{ANALYSIS_DATA_PATH}/emergency_merged_data.csv', index=False)

# Show the merged data
df_merged_data

Unnamed: 0,ENCOUNTER,REASON,NUM_CONDITIONS,CONDITIONS,NUM_CATEGORIES,CATEGORIES,NUM_OBSERVATIONS,OBSERVATIONS
0,f39d9b9e-b1bf-d308-0126-ffffb8c60009,Fracture of bone (disorder),2,"[Fracture of bone (disorder), Fracture of clav...",0,,0,
1,b2cd38c3-4ee0-69f5-2b9c-7bd5c0c28b81,Laceration - injury (disorder),2,"[Laceration - injury (disorder), Laceration of...",0,,0,
2,5ee24d42-ca32-7982-b18b-898608503984,Sprain (morphologic abnormality),2,"[Sprain (morphologic abnormality), Sprain of a...",0,,0,
3,fbea6096-6757-4022-52c4-5f0049d44631,Laceration - injury (disorder),2,"[Laceration - injury (disorder), Facial lacera...",0,,0,
4,a426cc2d-afd5-00ce-61c6-913d010735d8,Sprain (morphologic abnormality),2,"[Sprain (morphologic abnormality), Sprain of a...",0,,0,
...,...,...,...,...,...,...,...,...
19203,2a74325f-1197-22a8-ec11-36de49c4d740,Concussion injury of brain (disorder),2,"[Concussion injury of brain (disorder), Concus...",0,,0,
19204,576bbd19-94f9-0e42-ebc0-4e8879b0f631,Myocardial infarction (disorder),1,[Acute non-ST segment elevation myocardial inf...,0,,0,
19205,4090648e-a2d7-5519-b8e5-18730d7db58f,Fracture of bone (disorder),2,"[Fracture of bone (disorder), Fracture of ankle]",1,[procedure],1,[DXA Radius and Ulna [T-score] Bone density]
19206,1f9310e0-21d2-b0f5-bad6-fde280a6bfd9,Fracture of bone (disorder),2,"[Fracture of bone (disorder), Fracture of ankle]",1,[procedure],1,[DXA Radius and Ulna [T-score] Bone density]


### Metrics of Conditions and Observations per Encounter Reason

In [43]:
# Per encounter reason: number of cases plus min / max / median / mean /
# standard deviation of the number of conditions and of observations
df_metrics = (
    df_merged_data
    .groupby(by=['REASON'], as_index=False)
    .agg(
        CASES=('ENCOUNTER', 'count'),
        MIN_CONDS=('NUM_CONDITIONS', 'min'),
        MAX_CONDS=('NUM_CONDITIONS', 'max'),
        MEDIAN_CONDS=('NUM_CONDITIONS', 'median'),
        MEAN_CONDS=('NUM_CONDITIONS', 'mean'),
        STD_CONDS=('NUM_CONDITIONS', 'std'),
        MIN_OBS=('NUM_OBSERVATIONS', 'min'),
        MAX_OBS=('NUM_OBSERVATIONS', 'max'),
        MEDIAN_OBS=('NUM_OBSERVATIONS', 'median'),
        MEAN_OBS=('NUM_OBSERVATIONS', 'mean'),
        STD_OBS=('NUM_OBSERVATIONS', 'std'),
    )
    # A missing standard deviation is reported as 0
    .fillna({'STD_CONDS': 0, 'STD_OBS': 0})
    # NOTE(review): the integer cast truncates a half-integer median,
    # which can occur when a reason has an even number of cases
    .astype({'MEDIAN_CONDS': int, 'MEDIAN_OBS': int})
    # Reasons with the highest medians come first
    .sort_values(by=['MEDIAN_CONDS', 'MEDIAN_OBS'], ascending=False)
    .reset_index(drop=True)
)

# Save the metrics as a CSV file, without the index column
df_metrics.to_csv(f'{ANALYSIS_DATA_PATH}/emergency_metrics.csv', index=False)

# Show the metrics data
df_metrics

Unnamed: 0,REASON,CASES,MIN_CONDS,MAX_CONDS,MEDIAN_CONDS,MEAN_CONDS,STD_CONDS,MIN_OBS,MAX_OBS,MEDIAN_OBS,MEAN_OBS,STD_OBS
0,Myocardial infarction (disorder),483,1,2,2,1.540373,0.498884,0,34,34,18.302277,16.96762
1,Acute Cholecystitis,16,0,2,2,1.875,0.5,0,24,24,22.5,6.0
2,Burn injury (morphologic abnormality),440,0,2,2,1.918182,0.378988,0,0,0,0.0,0.0
3,Concussion injury of brain (disorder),930,1,4,2,1.915054,0.301216,0,23,0,0.025806,0.754877
4,Fracture of bone (disorder),1693,0,4,2,2.024808,0.528702,0,26,0,0.366804,1.107468
5,Gunshot wound (disorder),68,0,2,2,1.970588,0.242536,0,0,0,0.0,0.0
6,Injury of knee (disorder),249,0,3,2,2.0,0.155543,0,0,0,0.0,0.0
7,Injury of neck (disorder),533,2,2,2,2.0,0.0,0,0,0,0.0,0.0
8,Laceration - injury (disorder),1634,0,3,2,1.998776,0.121224,0,26,0,0.033048,0.861582
9,Seizure disorder,502,0,4,2,1.858566,1.029464,0,0,0,0.0,0.0
