# Notebook for exploratory data analysis of FHIR data

The .json data is formulated in the [FHIR](https://www.hl7.org/fhir/overview.html) standard.
- The basic building block in FHIR is a Resource
- Each resource consists of data elements that describe the healthcare concept.
- See the [FHIR Resource List](https://www.hl7.org/fhir/resourcelist.html) for a list of resources.
- For our use case, we will primarily be using the [Patient](https://www.hl7.org/fhir/patient.html) resource.
- The outcome of this EDA is a number of actionable testing/cleaning steps which will be implemented in tools/data_tests

In [2]:
import sys
sys.path.insert(1, '..')
import json
import importlib
for k,v in list(sys.modules.items()):
    if k.startswith('tools'):
        importlib.reload(v)

## Bundle EDA

In [3]:
# read in an example FHIR patient data file.
with open('../data/Aaron697_Jerde200_6fa23508-960e-ff22-c3d0-0519a036543b.json') as f:
    bundle_json = json.load(f)

In [4]:
# Data records are stored as FHIR bundles. Each bundle contains a list of entries.
from fhir.resources.bundle import Bundle

bundle = Bundle.parse_obj(bundle_json)
# # view all different resource types in the bundle - patient should be the only relevant one for this project.
print(set([e.resource.resource_type for e in bundle.entry]))

# TODO: we need to check there is only one patient per bundle, and that the patient is the first entry for all .json files in the data folder since the entries field is a list.
#  This can be implemented as a test.

{'Condition', 'MedicationRequest', 'Claim', 'Procedure', 'AllergyIntolerance', 'Encounter', 'Patient', 'Immunization', 'CarePlan', 'CareTeam', 'DiagnosticReport', 'ExplanationOfBenefit', 'DocumentReference', 'Observation', 'Provenance'}


In [48]:
patient_json = bundle_json['entry'][0]['resource']
# write patient json to .json file
with open('../data/patient.json', 'w') as f:
    json.dump(patient_json, f, indent=4)

## Patient EDA

In [40]:
# extract the patient data for the single example
from fhir.resources.patient import Patient

patient = Patient.parse_obj(bundle.entry[0].resource)

# print all fields in the patient data object
field_list = []
type_list = []
for field,value in patient:
    field_list.append(field)
    type_list.append(type(value))
    print(field, value, type(value))

print(field_list)
print(set(type_list))
# TODO: we need to check that all fields in the patient data object are valid/expected based on the FHIR model. This can be implemented as a test.
# TODO: we also need to check that all field values in the patient data object are valid/expected based on the FHIR model. This can also be implemented as a test.

resource_type='Patient' fhir_comments=None id='b0bccf43-3bf5-217c-7315-9e44d106bb6b' implicitRules=None implicitRules__ext=None language=None language__ext=None meta=Meta(resource_type='Meta', fhir_comments=None, extension=None, id=None, lastUpdated=None, lastUpdated__ext=None, profile=['http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient'], profile__ext=None, security=None, source=None, source__ext=None, tag=None, versionId=None, versionId__ext=None) contained=None extension=[Extension(resource_type='Extension', fhir_comments=None, extension=[Extension(resource_type='Extension', fhir_comments=None, extension=None, id=None, url='ombCategory', valueAddress=None, valueAge=None, valueAnnotation=None, valueAttachment=None, valueBase64Binary=None, valueBoolean=None, valueCanonical=None, valueCode=None, valueCodeableConcept=None, valueCodeableReference=None, valueCoding=Coding(resource_type='Coding', fhir_comments=None, extension=None, id=None, code='2106-3', code__ext=None, disp

In [44]:
print(bundle.entry[0].resource)

resource_type='Patient' fhir_comments=None id='b0bccf43-3bf5-217c-7315-9e44d106bb6b' implicitRules=None implicitRules__ext=None language=None language__ext=None meta=Meta(resource_type='Meta', fhir_comments=None, extension=None, id=None, lastUpdated=None, lastUpdated__ext=None, profile=['http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient'], profile__ext=None, security=None, source=None, source__ext=None, tag=None, versionId=None, versionId__ext=None) contained=None extension=[Extension(resource_type='Extension', fhir_comments=None, extension=[Extension(resource_type='Extension', fhir_comments=None, extension=None, id=None, url='ombCategory', valueAddress=None, valueAge=None, valueAnnotation=None, valueAttachment=None, valueBase64Binary=None, valueBoolean=None, valueCanonical=None, valueCode=None, valueCodeableConcept=None, valueCodeableReference=None, valueCoding=Coding(resource_type='Coding', fhir_comments=None, extension=None, id=None, code='2106-3', code__ext=None, disp

In [30]:
# check if all fields in all patient objects are the same, and in the same order
import tools.read_data as rd

pfl = rd.get_patient_file_list('../data')
all_patient_field_lists = []
patient_object_list = []
for data_file_name in pfl:
    with open('../data/'+data_file_name) as f:
        bundle_json = json.load(f)
    bundle = Bundle.parse_obj(bundle_json)
    patient_object = Patient.parse_obj(bundle.entry[0].resource)
    patient_object_list.append(patient_object)
    
    patient_field_list = [field for field,value in patient_object]
    all_patient_field_lists.append(patient_field_list)

# check if all fields in all patient objects are the same, and in the same order
assert all([all_patient_field_lists[0] == x for x in all_patient_field_lists])

# TODO: now that we know the fields are constant across all patient objects, we can begin to formulate the structure of the database tables

# Tabulate

In [33]:
# create a dataframe from all the patient data
import pandas as pd

patient_data = []
for patient_object in patient_object_list:
    patient_data.append(patient_object.dict())

patient_df_take_one = pd.DataFrame(patient_data)
patient_df_take_one.head()

# This method leaves out a number of fields including those which have None as entries, and those which have non-python types such as fhir.resources.narrative.Narrative or fhir.resources.codeableconcept.CodeableConcept.
# We can do better

Unnamed: 0,resourceType,id,meta,text,extension,identifier,name,telecom,gender,birthDate,address,maritalStatus,multipleBirthBoolean,communication,deceasedDateTime,multipleBirthInteger
0,Patient,8a3247d3-a54c-43f2-2c5d-a8f5e28ff588,{'profile': ['http://hl7.org/fhir/us/core/Stru...,"{'status': 'generated', 'div': '<div xmlns=""ht...","[{'extension': [OrderedDict([('url', 'ombCateg...",[{'system': 'https://github.com/synthetichealt...,"[{'use': 'official', 'family': 'Littel644', 'g...","[{'system': 'phone', 'value': '555-136-4712', ...",male,1974-10-12,"[{'extension': [OrderedDict([('extension', [Or...",{'coding': [{'system': 'http://terminology.hl7...,False,[{'language': {'coding': [OrderedDict([('syste...,,
1,Patient,b0f40536-9dc8-2ea0-0bbf-467a69f5e3ad,{'profile': ['http://hl7.org/fhir/us/core/Stru...,"{'status': 'generated', 'div': '<div xmlns=""ht...","[{'extension': [OrderedDict([('url', 'ombCateg...",[{'system': 'https://github.com/synthetichealt...,"[{'use': 'official', 'family': 'Ratke343', 'gi...","[{'system': 'phone', 'value': '555-884-9015', ...",female,1956-11-05,"[{'extension': [OrderedDict([('extension', [Or...",{'coding': [{'system': 'http://terminology.hl7...,False,[{'language': {'coding': [OrderedDict([('syste...,,
2,Patient,09e292d4-f186-331c-ed95-c503acabc54e,{'profile': ['http://hl7.org/fhir/us/core/Stru...,"{'status': 'generated', 'div': '<div xmlns=""ht...","[{'extension': [OrderedDict([('url', 'ombCateg...",[{'system': 'https://github.com/synthetichealt...,"[{'use': 'official', 'family': 'Windler79', 'g...","[{'system': 'phone', 'value': '555-425-8155', ...",male,1913-02-23,"[{'extension': [OrderedDict([('extension', [Or...",{'coding': [{'system': 'http://terminology.hl7...,False,[{'language': {'coding': [OrderedDict([('syste...,1934-01-15 13:15:01+00:00,
3,Patient,10bf6da8-ffa1-6913-a119-726634be754c,{'profile': ['http://hl7.org/fhir/us/core/Stru...,"{'status': 'generated', 'div': '<div xmlns=""ht...","[{'extension': [OrderedDict([('url', 'ombCateg...",[{'system': 'https://github.com/synthetichealt...,"[{'use': 'official', 'family': 'Schuppe920', '...","[{'system': 'phone', 'value': '555-294-9369', ...",male,1946-12-13,"[{'extension': [OrderedDict([('extension', [Or...",{'coding': [{'system': 'http://terminology.hl7...,,[{'language': {'coding': [OrderedDict([('syste...,,3.0
4,Patient,0d55a582-07fe-a897-776c-3ab5e48cd457,{'profile': ['http://hl7.org/fhir/us/core/Stru...,"{'status': 'generated', 'div': '<div xmlns=""ht...","[{'extension': [OrderedDict([('url', 'ombCateg...",[{'system': 'https://github.com/synthetichealt...,"[{'use': 'official', 'family': 'Hahn503', 'giv...","[{'system': 'phone', 'value': '555-571-7714', ...",female,1960-07-10,"[{'extension': [OrderedDict([('extension', [Or...",{'coding': [{'system': 'http://terminology.hl7...,False,[{'language': {'coding': [OrderedDict([('syste...,,


In [38]:
def patients_to_dataframe(patients):
    """
    Converts a list of FHIR Patient objects to a pandas dataframe.
    :param patients: a list of FHIR Patient objects
    :return: a pandas dataframe
    """
    columns = field_list
    data = []

    for patient in patients:
        row = []
        for attr in columns:
            if hasattr(patient, attr):
                value = getattr(patient, attr)
                if isinstance(value, list):
                    value = [str(item) if not isinstance(item, (int, float, bool)) else item for item in value]
                elif not isinstance(value, (int, float, bool)):
                    value = str(value)
                row.append(value)
            else:
                row.append(None)
        data.append(row)

    df = pd.DataFrame(data, columns=columns)
    return df

patient_df = patients_to_dataframe(patient_object_list).drop(columns=['resource_type']) # we can drop this column because it is constant by definition

In [39]:
patient_df # look at the patient df

Unnamed: 0,fhir_comments,id,implicitRules,implicitRules__ext,language,language__ext,meta,contained,extension,modifierExtension,...,link,managingOrganization,maritalStatus,multipleBirthBoolean,multipleBirthBoolean__ext,multipleBirthInteger,multipleBirthInteger__ext,name,photo,telecom
0,,8a3247d3-a54c-43f2-2c5d-a8f5e28ff588,,,,,resource_type='Meta' fhir_comments=None extens...,,[resource_type='Extension' fhir_comments=None ...,,...,,,resource_type='CodeableConcept' fhir_comments=...,False,,,,[resource_type='HumanName' fhir_comments=None ...,,[resource_type='ContactPoint' fhir_comments=No...
1,,b0f40536-9dc8-2ea0-0bbf-467a69f5e3ad,,,,,resource_type='Meta' fhir_comments=None extens...,,[resource_type='Extension' fhir_comments=None ...,,...,,,resource_type='CodeableConcept' fhir_comments=...,False,,,,[resource_type='HumanName' fhir_comments=None ...,,[resource_type='ContactPoint' fhir_comments=No...
2,,09e292d4-f186-331c-ed95-c503acabc54e,,,,,resource_type='Meta' fhir_comments=None extens...,,[resource_type='Extension' fhir_comments=None ...,,...,,,resource_type='CodeableConcept' fhir_comments=...,False,,,,[resource_type='HumanName' fhir_comments=None ...,,[resource_type='ContactPoint' fhir_comments=No...
3,,10bf6da8-ffa1-6913-a119-726634be754c,,,,,resource_type='Meta' fhir_comments=None extens...,,[resource_type='Extension' fhir_comments=None ...,,...,,,resource_type='CodeableConcept' fhir_comments=...,,,3,,[resource_type='HumanName' fhir_comments=None ...,,[resource_type='ContactPoint' fhir_comments=No...
4,,0d55a582-07fe-a897-776c-3ab5e48cd457,,,,,resource_type='Meta' fhir_comments=None extens...,,[resource_type='Extension' fhir_comments=None ...,,...,,,resource_type='CodeableConcept' fhir_comments=...,False,,,,[resource_type='HumanName' fhir_comments=None ...,,[resource_type='ContactPoint' fhir_comments=No...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,,f406a4e8-821b-0c9a-c8ec-09ad0f1fe9c6,,,,,resource_type='Meta' fhir_comments=None extens...,,[resource_type='Extension' fhir_comments=None ...,,...,,,resource_type='CodeableConcept' fhir_comments=...,False,,,,[resource_type='HumanName' fhir_comments=None ...,,[resource_type='ContactPoint' fhir_comments=No...
75,,aad9d04b-bb30-2f47-d5dd-888b3b7bd831,,,,,resource_type='Meta' fhir_comments=None extens...,,[resource_type='Extension' fhir_comments=None ...,,...,,,resource_type='CodeableConcept' fhir_comments=...,False,,,,[resource_type='HumanName' fhir_comments=None ...,,[resource_type='ContactPoint' fhir_comments=No...
76,,aa4eae2c-733a-35f9-8869-d33a6015db23,,,,,resource_type='Meta' fhir_comments=None extens...,,[resource_type='Extension' fhir_comments=None ...,,...,,,resource_type='CodeableConcept' fhir_comments=...,False,,,,[resource_type='HumanName' fhir_comments=None ...,,[resource_type='ContactPoint' fhir_comments=No...
77,,cf3ce382-cceb-1557-89ac-b751a9e0e65d,,,,,resource_type='Meta' fhir_comments=None extens...,,[resource_type='Extension' fhir_comments=None ...,,...,,,resource_type='CodeableConcept' fhir_comments=...,False,,,,[resource_type='HumanName' fhir_comments=None ...,,[resource_type='ContactPoint' fhir_comments=No...


In [37]:
patient_df.describe() # look at some summary statistics

Unnamed: 0,resource_type,fhir_comments,id,implicitRules,implicitRules__ext,language,language__ext,meta,contained,extension,...,link,managingOrganization,maritalStatus,multipleBirthBoolean,multipleBirthBoolean__ext,multipleBirthInteger,multipleBirthInteger__ext,name,photo,telecom
count,79,79.0,79,79.0,79.0,79.0,79.0,79,79.0,79,...,79.0,79.0,79,79,79.0,79.0,79.0,79,79.0,79
unique,1,1.0,79,1.0,1.0,1.0,1.0,1,1.0,79,...,1.0,1.0,3,2,1.0,3.0,1.0,79,1.0,79
top,Patient,,8a3247d3-a54c-43f2-2c5d-a8f5e28ff588,,,,,resource_type='Meta' fhir_comments=None extens...,,[resource_type='Extension' fhir_comments=None ...,...,,,resource_type='CodeableConcept' fhir_comments=...,False,,,,[resource_type='HumanName' fhir_comments=None ...,,[resource_type='ContactPoint' fhir_comments=No...
freq,79,79.0,1,79.0,79.0,79.0,79.0,79,79.0,1,...,79.0,79.0,45,77,79.0,77.0,79.0,1,79.0,1


In [36]:
# check that the dataframe preserves all fields
assert list(patient_df.columns) == field_list