# Synthea
Infarction Emergency Patients - First Encounter

## Libraries and Constants

In [29]:
import subprocess

from pathlib import Path
import shutil
import pandas as pd

# Project layout: every location is derived from the parent of the
# directory the notebook is executed from.
BASE_DIRECTORY = Path.cwd().parent
SYNTHEA_PATH = BASE_DIRECTORY / 'synthea-with-dependencies.jar'
RAW_DATA_PATH = BASE_DIRECTORY / 'data/raw/infarction_v0'
CSV_RAW_DATA_PATH = RAW_DATA_PATH / 'csv'
PROCESSED_DATA_PATH = BASE_DIRECTORY / 'data/processed/infarction_v0'

# Base shell command used to launch the Synthea jar
RUN_SYNTHEA = f'java -jar {SYNTHEA_PATH}'

# CSV exports that Synthea is asked to skip
EXCLUDED_CSV_FILES = [
    'careplans.csv',
    'claims.csv',
    'claims_transactions.csv',
    'organizations.csv',
    'patient_expenses.csv',
    'payer_transitions.csv',
    'payers.csv',
    'providers.csv',
    'supplies.csv',
]

# Population size passed to Synthea through the -p option
NUM_PATIENTS = 99

## Data Generation

In [30]:
# Start from a clean slate: remove any previously generated v0 raw data
if RAW_DATA_PATH.exists():
    shutil.rmtree(RAW_DATA_PATH)

# Comma-separated list of the CSV files that Synthea should not export
excluded_files = ','.join(EXCLUDED_CSV_FILES)

# Build the Synthea invocation as an argument list instead of a single
# shell string, so paths containing spaces or shell metacharacters are
# passed to Java verbatim and no shell is involved.
command = [
    'java', '-jar', str(SYNTHEA_PATH),
    '-p', str(NUM_PATIENTS),
    f'--exporter.baseDirectory={RAW_DATA_PATH}',
    '--exporter.csv.export=true',
    f'--exporter.csv.excluded_files={excluded_files}',
    '--exporter.metadata.export=false',
    '--exporter.fhir.export=false',
    '--exporter.fhir.transaction_bundle=false',
    '--exporter.hospital.fhir.export=false',
    '--exporter.practitioner.fhir.export=false',
]

# Run Synthea; check=True raises CalledProcessError on a non-zero exit
# code so a failed generation cannot go unnoticed by the cells below.
subprocess.run(command, check=True)

SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#noProviders for further details.


Scanned 84 modules and 151 submodules.
Loading submodule modules/allergies/allergy_panel.json
Loading submodule modules/allergies/drug_allergy_incidence.json
Loading submodule modules/allergies/environmental_allergy_incidence.json
Loading submodule modules/allergies/food_allergy_incidence.json
Loading submodule modules/allergies/immunotherapy.json
Loading submodule modules/allergies/outgrow_env_allergies.json
Loading submodule modules/allergies/outgrow_food_allergies.json
Loading submodule modules/allergies/severe_allergic_reaction.json
Loading submodule modules/anemia/anemia_sub.json
Loading submodule modules/breast_cancer/chemotherapy_breast.json
Loading submodule modules/breast_cancer/hormone_diagnosis.json
Loading submodule modules/breast_cancer/hormonetherapy_breast.json
Loading submodule modules/breast_cancer/surgery_therapy_breast.json
Loading submodule modules/breast_cancer/tnm_diagnosis.json
Loading submodule modules/contraceptives/clear_contraceptive.json
Loading submodule mo

CompletedProcess(args='java -jar /home/my-roberta/Unicamp/synthea/synthea-with-dependencies.jar -p 99 --exporter.baseDirectory=/home/my-roberta/Unicamp/synthea/data/raw/v0 --exporter.csv.export=true --exporter.csv.excluded_files=careplans.csv,claims.csv,claims_transactions.csv,organizations.csv,patient_expenses.csv,payer_transitions.csv,payers.csv,providers.csv,supplies.csv --exporter.metadata.export=false --exporter.fhir.export=false --exporter.fhir.transaction_bundle=false --exporter.hospital.fhir.export=false --exporter.practitioner.fhir.export=false', returncode=0)

## Data Processing

### Opening the Raw Data Files

In [39]:
# Base names (without extension) of the raw CSV files. Only '*.csv'
# entries are considered, so stray files or directories in the folder are
# skipped; the names are sorted to make the processing order deterministic.
raw_files = sorted(path.stem for path in CSV_RAW_DATA_PATH.glob('*.csv'))

# Fail early with a clear message when there is nothing to load
if not raw_files:
    raise FileNotFoundError(f'No CSV files found in {CSV_RAW_DATA_PATH}')

# Load every raw CSV into a DataFrame, keyed by the file's base name
dfs = {
    name: pd.read_csv(CSV_RAW_DATA_PATH / f'{name}.csv')
    for name in raw_files
}

### Finding the Infarction Emergency Patients

In [40]:
# Encounter columns that are not needed downstream
encounter_columns_to_drop = [
    'ORGANIZATION', 'PROVIDER', 'PAYER', 'BASE_ENCOUNTER_COST',
    'TOTAL_CLAIM_COST', 'PAYER_COVERAGE',
]

# errors='ignore' keeps this cell re-runnable: dfs['encounters'] is
# overwritten below, so on a second run the columns are already gone.
encounters_trimmed = dfs['encounters'].drop(
    columns=encounter_columns_to_drop, errors='ignore'
)

# Keep only emergency encounters whose reason mentions "infarction";
# na=False treats a missing REASONDESCRIPTION as "no match".
# NOTE(review): the notebook title mentions "First Encounter", but no
# per-patient first-encounter selection is applied here - confirm intent.
is_emergency = encounters_trimmed['ENCOUNTERCLASS'] == 'emergency'
mentions_infarction = encounters_trimmed['REASONDESCRIPTION'] \
    .str.contains('infarction', na=False)
dfs['encounters'] = encounters_trimmed[is_emergency & mentions_infarction] \
    .reset_index(drop=True)

# Identifiers of the selected encounters and of their patients
encounters_ids = dfs['encounters']['Id'].tolist()
patients_ids = dfs['encounters']['PATIENT'].tolist()

In [41]:
# Display the filtered infarction emergency encounters
dfs['encounters']

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTERCLASS,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,04e9670b-16ed-6298-fdd4-7578e961383d,2004-10-03T21:28:17Z,2004-10-03T22:28:17Z,79666e1b-425b-2dbb-0307-fb33a249d6fe,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
1,20c8d9b1-4e65-ea9b-e86e-9fb8a817464d,2021-09-09T18:34:16Z,2021-09-09T19:34:16Z,7096e0ff-c20c-1fcc-d99a-dcddaf0b8e8e,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
2,b971e644-bc66-c229-7364-59225516fc9c,2011-01-14T01:35:05Z,2011-01-14T02:35:05Z,591cb472-5624-06c9-2181-3ea9aedd7705,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
3,3147f999-8f08-8375-4ac1-bedff7e66c13,2008-12-02T22:21:10Z,2008-12-02T23:21:10Z,539d45b1-c5b3-c868-5125-5a440249b3e9,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
4,e3bf4a26-38d3-aedf-ff71-c4cd94bfabc2,2007-03-22T21:28:17Z,2007-03-22T22:28:17Z,88f0fdfb-339e-917d-49a9-0dd25a881352,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
5,35cea79e-0504-95c2-2866-2b4e6b625966,1994-07-12T11:18:02Z,1994-07-12T12:18:02Z,91e20e45-2171-9c64-6155-b1b86cf08573,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)
6,3acafbcb-9337-a3c8-92b8-3f986dec7d4f,2006-03-11T11:18:02Z,2006-03-11T12:18:02Z,7b2683ff-ad72-a3f3-786c-a6dcdfd2fedb,emergency,50849002,Emergency room admission (procedure),22298006.0,Myocardial infarction (disorder)


### Filtering the Data Associated with the Infarction Emergency Encounters

In [42]:
# For every table other than encounters and patients, keep only the rows
# whose ENCOUNTER foreign key points at one of the selected encounters
for file in raw_files:
    if file not in ('encounters', 'patients'):
        linked_rows = dfs[file]['ENCOUNTER'].isin(encounters_ids)
        dfs[file] = dfs[file][linked_rows].reset_index(drop=True)

# Drop unnecessary cost columns from the immunizations, medications and
# procedures DataFrames. errors='ignore' keeps the cell re-runnable, since
# the frames are overwritten in place and the columns are gone on a re-run.
dfs['immunizations'] = dfs['immunizations'] \
    .drop(columns=['BASE_COST'], errors='ignore')
dfs['medications'] = dfs['medications'] \
    .drop(columns=['BASE_COST', 'PAYER_COVERAGE', 'TOTALCOST'],
          errors='ignore')
dfs['procedures'] = dfs['procedures'] \
    .drop(columns=['BASE_COST'], errors='ignore')

# Drop unnecessary columns from the patients DataFrame and keep only the
# patients that appear in the selected encounters
patients_trimmed = dfs['patients'].drop(
    columns=['HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE', 'INCOME'],
    errors='ignore'
)
dfs['patients'] = patients_trimmed[patients_trimmed['Id'].isin(patients_ids)] \
    .reset_index(drop=True)

### Writing CSV Files with the Processed Data

In [43]:
# Remove the output of any previous run so no stale files are left behind
if PROCESSED_DATA_PATH.exists():
    shutil.rmtree(PROCESSED_DATA_PATH)

# Recreate the (now empty) v0 processed data directory
PROCESSED_DATA_PATH.mkdir(parents=True)

# Export each processed DataFrame to a CSV file named after its raw file,
# leaving out the DataFrame index
for file in raw_files:
    processed_frame = dfs[file]
    destination = f'{PROCESSED_DATA_PATH}/{file}.csv'
    processed_frame.to_csv(destination, index=False)