# Synthea - Data Generator
Synthetic patients data generation

## Libraries, Paths and Constants

In [1]:
import subprocess

from pathlib import Path
import shutil

# Absolute paths to files and directories
BASE_DIRECTORY = (Path.cwd()).parent
SYNTHEA_PATH = Path(BASE_DIRECTORY / 'synthea-with-dependencies.jar')
RAW_DATA_PATH = Path(BASE_DIRECTORY / 'data/raw')

# Shell command to run Synthea
RUN_SYNTHEA = f'java -jar {SYNTHEA_PATH}'

# CSV files that will be excluded from the generated data
EXCLUDED_CSV_FILES = [
    'careplans.csv',
    'claims.csv',
    'claims_transactions.csv',
    'organizations.csv',
    'patient_expenses.csv',
    'payer_transitions.csv',
    'payers.csv',
    'providers.csv',
    'supplies.csv'
]

# Number of patients to be generated
NUM_PATIENTS = 10_000

## Synthea Data Generation

In [2]:
# Verify if the raw data directory exists
if RAW_DATA_PATH.exists():
    # Delete the directory
    shutil.rmtree(RAW_DATA_PATH)

# String containing the CSV files that will not be generated
excluded_files = ','.join(EXCLUDED_CSV_FILES)

# Define the shell command to generate the Synthea data
command = RUN_SYNTHEA \
            + f' -p {NUM_PATIENTS}' \
            + f' --exporter.baseDirectory={RAW_DATA_PATH}' \
            + ' --exporter.csv.export=true' \
            + f' --exporter.csv.excluded_files={excluded_files}' \
            + ' --exporter.metadata.export=false' \
            + ' --exporter.fhir.export=false' \
            + ' --exporter.fhir.transaction_bundle=false' \
            + ' --exporter.hospital.fhir.export=false' \
            + ' --exporter.practitioner.fhir.export=false'

# Run Synthea
subprocess.run(command, shell=True)

SLF4J: No SLF4J providers were found.
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#noProviders for further details.


Scanned 84 modules and 151 submodules.
Loading submodule modules/allergies/allergy_panel.json
Loading submodule modules/allergies/drug_allergy_incidence.json
Loading submodule modules/allergies/environmental_allergy_incidence.json
Loading submodule modules/allergies/food_allergy_incidence.json
Loading submodule modules/allergies/immunotherapy.json
Loading submodule modules/allergies/outgrow_env_allergies.json
Loading submodule modules/allergies/outgrow_food_allergies.json
Loading submodule modules/allergies/severe_allergic_reaction.json
Loading submodule modules/anemia/anemia_sub.json
Loading submodule modules/breast_cancer/chemotherapy_breast.json
Loading submodule modules/breast_cancer/hormone_diagnosis.json
Loading submodule modules/breast_cancer/hormonetherapy_breast.json
Loading submodule modules/breast_cancer/surgery_therapy_breast.json
Loading submodule modules/breast_cancer/tnm_diagnosis.json
Loading submodule modules/contraceptives/clear_contraceptive.json
Loading submodule mo

CompletedProcess(args='java -jar /home/my-roberta/Unicamp/synthea/synthea-with-dependencies.jar -p 10000 --exporter.baseDirectory=/home/my-roberta/Unicamp/synthea/data/raw --exporter.csv.export=true --exporter.csv.excluded_files=careplans.csv,claims.csv,claims_transactions.csv,organizations.csv,patient_expenses.csv,payer_transitions.csv,payers.csv,providers.csv,supplies.csv --exporter.metadata.export=false --exporter.fhir.export=false --exporter.fhir.transaction_bundle=false --exporter.hospital.fhir.export=false --exporter.practitioner.fhir.export=false', returncode=0)