In [73]:
import sys
sys.path.insert(1, '..')
import tools.read_data as rd
import pandas as pd
import unittest
import time
import json
import importlib
# Reload every already-imported module whose name starts with 'tools' so that
# edits made to those modules are picked up without restarting the kernel.
tools_modules = [mod for name, mod in list(sys.modules.items()) if name.startswith('tools')]
for mod in tools_modules:
    importlib.reload(mod)

## Read in the data

In [2]:
# Load the json data by draining the generator-based reader into a list,
# timing the whole pass with wall-clock timestamps.
start = time.time()
patient_json_list = list(rd.read_json_files('data'))
end = time.time()
print(end - start)

2.418402910232544


In [3]:
# Load the json data the standard way: fetch the patient file list first,
# then read each listed file individually, timing the whole pass.
start = time.time()
pfl = rd.get_patient_file_list('data')
patient_json_list_alt = [rd.read_patient_file('data', patient_file) for patient_file in pfl]
end = time.time()
print(end - start)

3.676032781600952


In [None]:
# we have also implemented functions to read in the data from a database or an API - these are not used in this example
# patient_json_list = rd.get_json_objects_from_API('https://www.example-patient-api.com/get-patient-FHIR-data')

In [4]:
# check the two different loading methods produced the same list (same objects, same order);
# the message reports both lengths so a mismatch is easier to diagnose
assert patient_json_list == patient_json_list_alt, (
    "loading methods disagree "
    f"(generator: {len(patient_json_list)} objects, "
    f"file-list: {len(patient_json_list_alt)} objects)"
)

## Run tests for data quality / correctness

In [40]:
# Run the TestFHIRData test case against every loaded patient json object and
# keep track of which objects failed, instead of retaining only the last result.
from tools.data_tests import TestFHIRData

test_runner = unittest.TextTestRunner()
test_loader = unittest.TestLoader()
failed_bundle_indices = []
for bundle_index, json_obj in enumerate(patient_json_list):
    # The object under test is handed over via this class attribute
    # (presumably read by the test methods - confirm in tools/data_tests.py).
    TestFHIRData.JSON_OBJ = json_obj
    # A fresh suite is loaded for each object: a unittest suite releases its
    # tests once it has been run, so it cannot simply be executed again.
    test_suite = test_loader.loadTestsFromTestCase(TestFHIRData)
    test_results = test_runner.run(test_suite)
    if not test_results.wasSuccessful():
        failed_bundle_indices.append(bundle_index)

# One-line summary so failures are not buried in the per-object runner output.
print(
    f"{len(failed_bundle_indices)} of {len(patient_json_list)} objects failed "
    f"the data tests; failing indices: {failed_bundle_indices}"
)

....
----------------------------------------------------------------------
Ran 4 tests in 0.346s

OK
....
----------------------------------------------------------------------
Ran 4 tests in 0.262s

OK
....
----------------------------------------------------------------------
Ran 4 tests in 0.150s

OK
....
----------------------------------------------------------------------
Ran 4 tests in 0.485s

OK
....
----------------------------------------------------------------------
Ran 4 tests in 11.012s

OK
....
----------------------------------------------------------------------
Ran 4 tests in 0.420s

OK
....
----------------------------------------------------------------------
Ran 4 tests in 0.327s

OK
....
----------------------------------------------------------------------
Ran 4 tests in 0.216s

OK
....
----------------------------------------------------------------------
Ran 4 tests in 0.494s

OK
....
----------------------------------------------------------------------
Ran 4

In [37]:
# example test failure: run the same test case against the known-bad example file.
# The encoding is given explicitly so the file is decoded as UTF-8 on every
# platform rather than with the locale-dependent default of open().
with open("resources/bad_example.json", encoding="utf-8") as f:
    bad_json_obj = json.load(f)

# test_runner and TestFHIRData come from the cell that runs the tests on the incoming data
TestFHIRData.JSON_OBJ = bad_json_obj
test_suite = unittest.TestLoader().loadTestsFromTestCase(TestFHIRData)
test_results = test_runner.run(test_suite)

FFFF
FAIL: test_all_fields_in_patient (tools.data_tests.TestFHIRData)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/Users/joshuastapleton/Desktop/EMIS_interview/exa-data-eng-assessment/tools/data_tests.py", line 43, in test_all_fields_in_patient
    self.assertIn(field, expected_fields, msg=f"{field} field not found in expected fields list")
AssertionError: 'subject' not found in ['resourceType', 'fhir_comments', 'id', 'implicitRules', 'implicitRules__ext', 'language', 'language__ext', 'meta', 'contained', 'extension', 'modifierExtension', 'text', 'active', 'active__ext', 'address', 'birthDate', 'birthDate__ext', 'communication', 'contact', 'deceasedBoolean', 'deceasedBoolean__ext', 'deceasedDateTime', 'deceasedDateTime__ext', 'gender', 'gender__ext', 'generalPractitioner', 'identifier', 'link', 'managingOrganization', 'maritalStatus', 'multipleBirthBoolean', 'multipleBirthBoolean__ext', 'multipleBirthInteger', 'multi

## Basic / local implementation of pipeline using filesystem

In [9]:
from fhir.resources.patient import Patient
from fhir.resources.bundle import Bundle

# Parse each raw json object as a Bundle and turn the resource of its first
# entry into a Patient object.
FHIR_patient_object_list = [
    Patient.parse_obj(Bundle.parse_obj(raw_bundle).entry[0].resource)
    for raw_bundle in patient_json_list
]

# Tabulate the patients; resource_type is dropped because it is constant by definition.
patient_df = (
    rd.patients_to_dataframe(FHIR_patient_object_list)
    .drop(columns=['resource_type'])
)

In [14]:
# Persist the raw tabular data as csv (without the index column).
# This table still has to be normalized and cleaned before it can be used for analysis.
patient_df.to_csv(path_or_buf='data_output/patient_data_tabular_raw.csv', index=False)

In [69]:
# 1NF normalization - each table cell should have a single value
# the columns in the dataframe in need of normalization are the list-valued ones (extension, address, name, telecom, etc.).
# a naive solution is to explode those columns. This, however, tends to become monolithic,
# because every exploded column multiplies the number of table rows.
print("exploding column: extension")
patient_exploded_df = patient_df.explode('extension')  # start by exploding extension - the first column of type list
for column in patient_df.columns.drop('extension'):
    # .iloc[0] reads the first row by position, so the check does not depend on the index labels;
    # isinstance is the idiomatic type check and also accepts list subclasses
    if isinstance(patient_df[column].iloc[0], list):
        print("exploding column: " + column)
        patient_exploded_df = patient_exploded_df.explode(column)

patient_exploded_df.to_csv('data_output/1NF_data/patient_data_tabular.csv', index=False)

exploding column: address
exploding column: communication
exploding column: identifier
exploding column: name
exploding column: telecom


In [70]:
# 2NF normalization - create additional tables for initial table cells with multiple/list entries
# this is a more complex solution, but it is more scalable, easier to maintain, and there is less data redundancy
patient_df_2NF = patient_df.copy()

for column in patient_df.columns:
    # a column is treated as multi-valued when its first row (by position) holds a list
    if not isinstance(patient_df[column].iloc[0], list):
        continue
    print("exploding column: " + column)

    # the child table keeps only the patient id and the exploded column, in their original column order;
    # selecting them before exploding avoids exploding the whole frame just to drop most of it again
    child_columns = [name for name in patient_df.columns if name in ('id', column)]
    patient_exploded_df = patient_df[child_columns].explode(column)
    patient_exploded_df.to_csv('data_output/2NF_data/patient_data_tabular_' + column + '.csv', index=False)

    # remove the multi-value column from the main table (no in-place mutation)
    patient_df_2NF = patient_df_2NF.drop(columns=[column])

# finally, write the original table with all multi-value columns removed to a csv file
patient_df_2NF.to_csv('data_output/2NF_data/patient_data_tabular.csv', index=False)

exploding column: extension
exploding column: address
exploding column: communication
exploding column: identifier
exploding column: name
exploding column: telecom


In [76]:
# #  we can further expand the table by identifying values which are FHIR objects and splitting them up by field, however this can get tricky with string parsing
# patient_1NF_df = pd.read_csv('data_output/1NF_data/patient_data_tabular.csv')
# for column in patient_1NF_df.columns:
#     # if the column starts with 'resource'
#     first_column_value = patient_1NF_df[column].values[0]
#     if type(first_column_value) == str and first_column_value.startswith('resource_type'):
#         print("Fields of column:", column)
#         for field in first_column_value.split(' '):
#             print("--",field)

## SQL implementation of pipeline with database

In [10]:
import tools.create_database as cd
import tools.update_database as ud

# Reload every already-imported module whose name starts with 'tools' so that
# edits made to those modules are picked up without restarting the kernel.
for module_name, module in list(sys.modules.items()):
    if not module_name.startswith('tools'):
        continue
    importlib.reload(module)

# get the connection to the patient database, creating it if it does not yet exist
patient_database = cd.create_patient_database()

# # create the tables in the patient database - these correspond to the csv files in the data_output folder generated in the previous steps
# ud.create_tables(patient_database)
