<a href="https://colab.research.google.com/github/Jahan08/Electronic-Health-Data-analysis/blob/main/Electronic_Health_Records_(EHRs)_Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Analysis preparation

In [None]:
import numpy as np
import pandas as pd
import json
import os
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 50)
from tqdm import tqdm

# Read the data

## Explore the file structure

In [None]:
# One (folder, file name) pair for every file found below the dataset root.
file_path_list = [
    (folder, name)
    for folder, _subdirs, names in os.walk('/content/drive/MyDrive/Electronic-Health-Record-SyntheticData')
    for name in names
]

In [None]:
metadata_df = pd.DataFrame(file_path_list, columns=["folder", "file"])

In [None]:
print(f"Files: {metadata_df.shape[0]}")

Files: 129228


In [None]:
metadata_df[5:11]

Unnamed: 0,folder,file
5,/content/drive/MyDrive/Electronic-Health-Recor...,7f4357a0-2d94-48a0-a5d0-2943c27c2948.json
6,/content/drive/MyDrive/Electronic-Health-Recor...,7f460434-134e-41ea-9c15-43b9fdf64c41.json
7,/content/drive/MyDrive/Electronic-Health-Recor...,7f4039f8-e17d-46f6-9314-351ffd2455cd.json
8,/content/drive/MyDrive/Electronic-Health-Recor...,7f47d17f-7e00-4bc5-9d33-936ff317ae65.json
9,/content/drive/MyDrive/Electronic-Health-Recor...,7f4b2ea0-48a8-457a-a959-69af3b27d964.json
10,/content/drive/MyDrive/Electronic-Health-Recor...,7f4fbc5f-9849-4ee9-a773-9f0644c66286.json


## Add group and subgroup information

In [None]:
def extract_subgroup(path):
    """Return the last "/"-separated component of ``path``.

    A path without any "/" is returned unchanged; a path ending in "/"
    yields an empty string.
    """
    _head, _sep, last_component = path.rpartition("/")
    return last_component

def extract_group(path):
    """Return the second-to-last "/"-separated component of ``path``.

    Raises
    ------
    IndexError
        If ``path`` contains no "/" at all.
    """
    # Only the two right-most separators matter for the component we want.
    components = path.rsplit("/", 2)
    return components[-2]


In [None]:
# Derive the group / subgroup labels from each file's folder path.
metadata_df["group"] = metadata_df["folder"].map(extract_group)
metadata_df["subgroup"] = metadata_df["folder"].map(extract_subgroup)

In [None]:
metadata_df = metadata_df[["folder", "group", "subgroup", "file"]]

In [None]:
metadata_df.folder[1]

'/content/drive/MyDrive/Electronic-Health-Record-SyntheticData/7f/7f4'

In [None]:
print(f"Folders: {metadata_df.folder.nunique()}")
print(f"Groups: {metadata_df.group.nunique()}")
print(f"Subgroups: {metadata_df.subgroup.nunique()}")
print(f"Files: {metadata_df.file.nunique()}")

Folders: 4080
Groups: 255
Subgroups: 4080
Files: 129228


## Read one file

We read one sample data file and explore its structure.

In [None]:
sample_df= pd.read_json('/content/drive/MyDrive/Electronic-Health-Record-SyntheticData/7f/7f4/7f4039f8-e17d-46f6-9314-351ffd2455cd.json')

In [None]:
sample_df.head()

Unnamed: 0,type,entry,resourceType
0,collection,{'fullUrl': 'urn:uuid:2e2f285e-e91c-4158-af24-...,Bundle
1,collection,{'fullUrl': 'urn:uuid:7ed51b0f-8be7-40ef-aba9-...,Bundle
2,collection,{'fullUrl': 'urn:uuid:8ef06d73-7ba7-4d8f-b6b0-...,Bundle
3,collection,{'fullUrl': 'urn:uuid:b76b875d-340c-4db9-9840-...,Bundle
4,collection,{'fullUrl': 'urn:uuid:9a3a98c4-4d00-49ca-b91e-...,Bundle


# Data exploration

## Import and process one sample file

In [None]:
# One empty accumulator frame per resource type; each is a distinct object.
(patient_df, careplan_df, condition_df, diagnostic_report_df,
 encounter_df, immunization_df, observation_df, procedure_df) = (
    pd.DataFrame() for _ in range(8)
)

In [None]:
def process_one_file(sample_df,
                    patient_df,
                    careplan_df,
                    condition_df,
                    diagnostic_report_df,
                    encounter_df,
                    immunization_df,
                    observation_df,
                    procedure_df):
    """Append the resources of one bundle file to the per-resource-type frames.

    Every value in ``sample_df["entry"]`` is flattened with
    ``pd.json_normalize`` and routed, by its ``resource.resourceType``, to the
    matching accumulator. Entries of any other resource type, or without a
    resource type, are ignored.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        One file as loaded by ``pd.read_json``; must have an ``entry`` column.
    patient_df, careplan_df, condition_df, diagnostic_report_df, encounter_df, \
immunization_df, observation_df, procedure_df : pandas.DataFrame
        Frames accumulated so far (may be empty). They are not modified.

    Returns
    -------
    tuple of 8 pandas.DataFrame
        The updated frames, in the same order as the parameters. Row indexes
        are not reset: every row keeps the index given by ``json_normalize``.
    """
    # Resource type -> frame accumulated so far. Insertion order is the
    # order of the returned tuple.
    accumulated = {
        "Patient": patient_df,
        "CarePlan": careplan_df,
        "Condition": condition_df,
        "DiagnosticReport": diagnostic_report_df,
        "Encounter": encounter_df,
        "Immunization": immunization_df,
        "Observation": observation_df,
        "Procedure": procedure_df,
    }
    new_rows = {resource_type: [] for resource_type in accumulated}

    for entry in sample_df["entry"]:
        entry_df = pd.json_normalize(entry)
        if "resource.resourceType" not in entry_df.columns:
            # Nothing to route on; skip instead of failing the whole file.
            continue
        resource_type = str(entry_df["resource.resourceType"].iloc[0])
        if resource_type in new_rows:
            new_rows[resource_type].append(entry_df)

    def _merged(resource_type):
        # Concatenate once per resource type: calling pd.concat for every
        # single entry re-copies the accumulated frame each time.
        additions = new_rows[resource_type]
        if not additions:
            return accumulated[resource_type]
        return pd.concat([accumulated[resource_type], *additions])

    return tuple(_merged(resource_type) for resource_type in accumulated)

In [None]:
def clean_and_rename(patient_df,
                    careplan_df,
                    condition_df,
                    diagnostic_report_df,
                    encounter_df,
                    immunization_df,
                    observation_df,
                    procedure_df):
    """Shorten column names and strip the ``urn:uuid:`` prefix from id columns.

    For every frame, "." in column names becomes "_" and every occurrence of
    "resource_" is then removed (``resource.subject.reference`` becomes
    ``subject_reference``). The ``urn:uuid:`` prefix is removed from:

    * ``fullUrl`` in the patient, observation and encounter frames;
    * ``patient_reference`` in the encounter and immunization frames;
    * ``encounter_reference`` in the immunization frame;
    * ``subject_reference`` and ``encounter_reference`` in the observation
      and procedure frames.

    A listed column that is absent from a frame (for example because the
    frame is empty) is skipped.

    The frames are modified in place and the same objects are returned, in
    the same order as the parameters.
    """
    def _short_name(column):
        # Plain str.replace is always literal, so "." can never be read as a
        # regex wildcard whatever the pandas version, and an empty frame's
        # non-string column index causes no failure.
        if not isinstance(column, str):
            return column
        return column.replace(".", "_").replace("resource_", "")

    def _strip_urn_prefix(df, columns):
        for column in columns:
            if column in df.columns:
                df[column] = df[column].str.replace('urn:uuid:', '', regex=False)

    for df in (patient_df, careplan_df, condition_df, diagnostic_report_df,
               encounter_df, immunization_df, observation_df, procedure_df):
        df.columns = [_short_name(column) for column in df.columns]

    for df in (patient_df, observation_df, encounter_df):
        _strip_urn_prefix(df, ['fullUrl'])

    # NOTE(review): the reference columns of the careplan, condition and
    # diagnostic report frames keep their 'urn:uuid:' prefix, as before.

    for df in (encounter_df, immunization_df):
        _strip_urn_prefix(df, ['patient_reference'])

    _strip_urn_prefix(immunization_df, ['encounter_reference'])

    for df in (observation_df, procedure_df):
        _strip_urn_prefix(df, ['subject_reference', 'encounter_reference'])

    return patient_df,\
                    careplan_df,\
                    condition_df,\
                    diagnostic_report_df,\
                    encounter_df,\
                    immunization_df,\
                    observation_df,\
                    procedure_df

In [None]:
# Split the sample file into one frame per resource type.
(
    patient_df,
    careplan_df,
    condition_df,
    diagnostic_report_df,
    encounter_df,
    immunization_df,
    observation_df,
    procedure_df,
) = process_one_file(
    sample_df,
    patient_df,
    careplan_df,
    condition_df,
    diagnostic_report_df,
    encounter_df,
    immunization_df,
    observation_df,
    procedure_df,
)

In [None]:
patient_df.head()

Unnamed: 0,fullUrl,resource.id,resource.text.status,resource.text.div,resource.extension,resource.identifier,resource.name,resource.telecom,resource.gender,resource.birthDate,resource.address,resource.maritalStatus.coding,resource.multipleBirthBoolean,resource.photo,resource.resourceType
0,urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,generated,"<div>Generated by <a href=""https://github.com/...",[{'url': 'http://hl7.org/fhir/StructureDefinit...,[{'system': 'https://github.com/synthetichealt...,"[{'use': 'official', 'family': 'Bergstrom566',...",[{'extension': [{'url': 'http://standardhealth...,male,1974-08-30,[{'extension': [{'extension': [{'url': 'latitu...,[{'system': 'http://hl7.org/fhir/v3/MaritalSta...,False,"[{'contentType': 'image/png', 'data': 'iVBORw0...",Patient


In [None]:
careplan_df.head()

Unnamed: 0,resource.status,resource.category,resource.subject.reference,resource.context.reference,resource.period.start,resource.addresses,resource.activity,resource.resourceType
0,active,[{'coding': [{'system': 'http://snomed.info/sc...,urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:7ed51b0f-8be7-40ef-aba9-351e1f1fb1d2,1998-01-12,[{'reference': 'urn:uuid:9a3a98c4-4d00-49ca-b9...,[{'detail': {'code': {'coding': [{'system': 'h...,CarePlan


In [None]:
condition_df.head()

Unnamed: 0,fullUrl,resource.id,resource.clinicalStatus,resource.verificationStatus,resource.code.coding,resource.subject.reference,resource.context.reference,resource.onsetDateTime,resource.resourceType,resource.abatementDateTime
0,urn:uuid:8ef06d73-7ba7-4d8f-b6b0-b750c1b0a754,8ef06d73-7ba7-4d8f-b6b0-b750c1b0a754,active,confirmed,"[{'system': 'http://snomed.info/sct', 'code': ...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:7ed51b0f-8be7-40ef-aba9-351e1f1fb1d2,1992-09-18T05:02:42-04:00,Condition,
0,urn:uuid:b76b875d-340c-4db9-9840-c1bfc26aad7c,b76b875d-340c-4db9-9840-c1bfc26aad7c,active,confirmed,"[{'system': 'http://snomed.info/sct', 'code': ...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:7ed51b0f-8be7-40ef-aba9-351e1f1fb1d2,1993-12-10T01:00:27-05:00,Condition,
0,urn:uuid:9a3a98c4-4d00-49ca-b91e-8b18eeb19cc1,9a3a98c4-4d00-49ca-b91e-8b18eeb19cc1,active,confirmed,"[{'system': 'http://snomed.info/sct', 'code': ...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:7ed51b0f-8be7-40ef-aba9-351e1f1fb1d2,1998-01-12T00:29:06-05:00,Condition,
0,urn:uuid:2cf1b2f3-efc7-48c7-a368-aec0df61cc8b,2cf1b2f3-efc7-48c7-a368-aec0df61cc8b,active,confirmed,"[{'system': 'http://snomed.info/sct', 'code': ...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:7ed51b0f-8be7-40ef-aba9-351e1f1fb1d2,2011-05-16T00:16:01-04:00,Condition,2011-05-28T03:02:50-04:00
0,urn:uuid:364c4729-5681-41ac-9fe9-ee277de2a6f0,364c4729-5681-41ac-9fe9-ee277de2a6f0,active,confirmed,"[{'system': 'http://snomed.info/sct', 'code': ...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:b981574b-1607-49d2-ae5a-fe0c4157123f,2012-02-28T12:36:17-05:00,Condition,2012-03-24T05:28:29-04:00


In [None]:
diagnostic_report_df.head()

Unnamed: 0,fullUrl,resource.id,resource.status,resource.code.coding,resource.subject.reference,resource.encounter.reference,resource.effectiveDateTime,resource.issued,resource.performer,resource.result,resource.resourceType
0,urn:uuid:16fd238e-8c27-4ee2-84d9-6f4ee778e6d1,16fd238e-8c27-4ee2-84d9-6f4ee778e6d1,final,"[{'system': 'http://loinc.org', 'code': '51990...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:ba95cf69-17e8-46fc-a79f-2e4a1309abc3,2013-01-16T22:16:28-05:00,2013-01-16T22:16:28-05:00,[{'display': 'Hospital Lab'}],[{'reference': 'urn:uuid:e6860820-d5d5-4890-a0...,DiagnosticReport
0,urn:uuid:e50ea769-1820-462a-aa67-1e5128cccd45,e50ea769-1820-462a-aa67-1e5128cccd45,final,"[{'system': 'http://loinc.org', 'code': '57698...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:ba95cf69-17e8-46fc-a79f-2e4a1309abc3,2013-01-16T22:16:28-05:00,2013-01-16T22:16:28-05:00,[{'display': 'Hospital Lab'}],[{'reference': 'urn:uuid:eb69b3d4-aded-4b98-aa...,DiagnosticReport
0,urn:uuid:1b9a6141-a8b7-405e-a28d-bdc9c69c21fe,1b9a6141-a8b7-405e-a28d-bdc9c69c21fe,final,"[{'system': 'http://loinc.org', 'code': '51990...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:3a346214-681c-46c2-8e47-4574831f7aa1,2015-12-12T10:46:57-05:00,2015-12-12T10:46:57-05:00,[{'display': 'Hospital Lab'}],[{'reference': 'urn:uuid:51357d4e-d3c8-4c85-ab...,DiagnosticReport


In [None]:
encounter_df.head()

Unnamed: 0,fullUrl,resource.id,resource.status,resource.class.code,resource.type,resource.patient.reference,resource.period.start,resource.period.end,resource.reason.coding,resource.resourceType
0,urn:uuid:7ed51b0f-8be7-40ef-aba9-351e1f1fb1d2,7ed51b0f-8be7-40ef-aba9-351e1f1fb1d2,finished,ambulatory,[{'coding': [{'system': 'http://snomed.info/sc...,urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,2011-05-17T13:30:52-04:00,2011-05-17T13:30:52-04:00,"[{'system': 'http://snomed.info/sct', 'code': ...",Encounter
0,urn:uuid:b981574b-1607-49d2-ae5a-fe0c4157123f,b981574b-1607-49d2-ae5a-fe0c4157123f,finished,ambulatory,[{'coding': [{'system': 'http://snomed.info/sc...,urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,2012-03-04T22:37:52-05:00,2012-03-04T22:37:52-05:00,"[{'system': 'http://snomed.info/sct', 'code': ...",Encounter
0,urn:uuid:021783b7-70d8-4252-900b-7e03754cca48,021783b7-70d8-4252-900b-7e03754cca48,finished,ambulatory,[{'coding': [{'system': 'http://snomed.info/sc...,urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,2012-09-01T17:08:21-04:00,2012-09-01T17:08:21-04:00,"[{'system': 'http://snomed.info/sct', 'code': ...",Encounter
0,urn:uuid:ba95cf69-17e8-46fc-a79f-2e4a1309abc3,ba95cf69-17e8-46fc-a79f-2e4a1309abc3,finished,outpatient,[{'coding': [{'system': 'http://snomed.info/sc...,urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,2013-01-16T22:16:28-05:00,2013-01-16T23:16:28-05:00,,Encounter
0,urn:uuid:be443768-90c6-4611-92ad-26c5b2e12414,be443768-90c6-4611-92ad-26c5b2e12414,finished,ambulatory,[{'coding': [{'system': 'http://snomed.info/sc...,urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,2013-04-14T15:18:08-04:00,2013-04-14T15:18:08-04:00,"[{'system': 'http://snomed.info/sct', 'code': ...",Encounter


In [None]:
immunization_df.head()

Unnamed: 0,resource.status,resource.date,resource.vaccineCode.coding,resource.patient.reference,resource.wasNotGiven,resource.primarySource,resource.encounter.reference,resource.resourceType
0,completed,2013-01-16T22:16:28-05:00,"[{'system': 'http://hl7.org/fhir/sid/cvx', 'co...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,False,True,urn:uuid:ba95cf69-17e8-46fc-a79f-2e4a1309abc3,Immunization
0,completed,2015-12-12T10:46:57-05:00,"[{'system': 'http://hl7.org/fhir/sid/cvx', 'co...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,False,True,urn:uuid:3a346214-681c-46c2-8e47-4574831f7aa1,Immunization
0,completed,2015-12-12T10:46:57-05:00,"[{'system': 'http://hl7.org/fhir/sid/cvx', 'co...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,False,True,urn:uuid:3a346214-681c-46c2-8e47-4574831f7aa1,Immunization


In [None]:
observation_df.head()

Unnamed: 0,fullUrl,resource.id,resource.status,resource.code.coding,resource.subject.reference,resource.encounter.reference,resource.effectiveDateTime,resource.valueQuantity.value,resource.valueQuantity.unit,resource.valueQuantity.system,resource.valueQuantity.code,resource.resourceType,resource.component
0,urn:uuid:39bc992f-2aa3-48e4-ba1b-dd31185b561c,39bc992f-2aa3-48e4-ba1b-dd31185b561c,final,"[{'system': 'http://loinc.org', 'code': '8331-...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:7ed51b0f-8be7-40ef-aba9-351e1f1fb1d2,2011-05-17T13:30:52-04:00,38.0,Cel,http://unitsofmeasure.org/,Cel,Observation,
0,urn:uuid:4e9a2866-f300-49c1-881a-93e8140ec4c1,4e9a2866-f300-49c1-881a-93e8140ec4c1,final,"[{'system': 'http://loinc.org', 'code': '8331-...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:021783b7-70d8-4252-900b-7e03754cca48,2012-09-01T17:08:21-04:00,37.0,Cel,http://unitsofmeasure.org/,Cel,Observation,
0,urn:uuid:109232d9-90b2-4f89-9ead-db22b93f5be6,109232d9-90b2-4f89-9ead-db22b93f5be6,final,"[{'system': 'http://loinc.org', 'code': '4548-...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:ba95cf69-17e8-46fc-a79f-2e4a1309abc3,2013-01-16T22:16:28-05:00,6.2,%,http://unitsofmeasure.org/,%,Observation,
0,urn:uuid:4a3bab73-feb6-4621-a123-b043a8dd8530,4a3bab73-feb6-4621-a123-b043a8dd8530,final,"[{'system': 'http://loinc.org', 'code': '8302-...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:ba95cf69-17e8-46fc-a79f-2e4a1309abc3,2013-01-16T22:16:28-05:00,180.200796,cm,http://unitsofmeasure.org/,cm,Observation,
0,urn:uuid:56c96cea-9ff9-4cf4-af06-15007cbcec13,56c96cea-9ff9-4cf4-af06-15007cbcec13,final,"[{'system': 'http://loinc.org', 'code': '29463...",urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:ba95cf69-17e8-46fc-a79f-2e4a1309abc3,2013-01-16T22:16:28-05:00,83.927793,kg,http://unitsofmeasure.org/,kg,Observation,


In [None]:
procedure_df

Unnamed: 0,resource.status,resource.code.coding,resource.code.text,resource.subject.reference,resource.encounter.reference,resource.performedDateTime,resource.resourceType
0,completed,"[{'system': 'http://snomed.info/sct', 'code': ...",Documentation of current medications,urn:uuid:2e2f285e-e91c-4158-af24-fd1e7fb3ad6c,urn:uuid:3a346214-681c-46c2-8e47-4574831f7aa1,2015-12-12T10:46:57-05:00,Procedure


## Import and process a group of files


Let's import entire groups of files. We will pick the two groups that contain the most files.

In [None]:
sel_index = list(metadata_df.group.value_counts()[0:2].index)
sel_index

['d6', 'e1']

In [None]:
group_df = metadata_df.loc[metadata_df.group.isin(sel_index)]

In [None]:
group_df.shape[0], group_df.shape[0] / metadata_df.shape[0]

(1119, 0.008659114123874084)

We will select only these ~1.1K files, which is less than 1% of the data.

In [None]:
# Reset every accumulator to a fresh empty frame before loading the groups.
(patient_df, careplan_df, condition_df, diagnostic_report_df,
 encounter_df, immunization_df, observation_df, procedure_df) = (
    pd.DataFrame() for _ in range(8)
)

In [None]:
# Parse every file of the selected groups and accumulate its resources.
# iterrows() is a generator without a length, so `total` is passed explicitly;
# otherwise tqdm can only show a running count, not progress or an ETA.
for index, row in tqdm(group_df.iterrows(), total=len(group_df)):
    folder = row["folder"]
    file = row["file"]
    sample_df = pd.read_json(os.path.join(folder, file))
    (
        patient_df,
        careplan_df,
        condition_df,
        diagnostic_report_df,
        encounter_df,
        immunization_df,
        observation_df,
        procedure_df,
    ) = process_one_file(
        sample_df,
        patient_df,
        careplan_df,
        condition_df,
        diagnostic_report_df,
        encounter_df,
        immunization_df,
        observation_df,
        procedure_df,
    )

85it [00:46,  2.12it/s]

Let's check the data size for each dimension.

In [None]:
patient_df.shape[0], careplan_df.shape[0], condition_df.shape[0], diagnostic_report_df.shape[0], encounter_df.shape[0], immunization_df.shape[0],\
observation_df.shape[0], procedure_df.shape[0]

In [None]:
for df in [patient_df, careplan_df, condition_df, diagnostic_report_df, encounter_df, immunization_df, observation_df,procedure_df]:
    print(df.columns)

In [None]:
patient_df.head()

In [None]:
careplan_df.head()

In [None]:
condition_df.head()

In [None]:
diagnostic_report_df.head()

In [None]:
encounter_df.head()

In [None]:
immunization_df.head()

In [None]:
observation_df.head()

In [None]:
procedure_df.head()

In [None]:
def plot_count(feature, title, df, size=1, ordered=True):
    """Draw a count plot of ``df[feature]`` and label each bar with its share of rows.

    Parameters
    ----------
    feature : str
        Column of ``df`` to count.
    title : str
        Text inserted into the plot title.
    df : pandas.DataFrame
        Data to plot.
    size : int or float, default 1
        Width multiplier: the figure is ``4 * size`` inches wide and 4 inches
        high. For ``size > 2`` the x tick labels are rotated by 90 degrees.
    ordered : bool, default True
        If True, show only the 20 most frequent values, most frequent first;
        otherwise show all values in seaborn's default order.

    Notes
    -----
    Percentages are relative to ``len(df)``, i.e. to all rows, including rows
    whose value is missing or not among the displayed categories.
    """
    sns.set_theme(style="whitegrid")
    _, ax = plt.subplots(1, 1, figsize=(4 * size, 4))
    total = len(df)
    order = df[feature].value_counts().index[:20] if ordered else None
    # Draw on the axes created above rather than on whatever is "current".
    sns.countplot(x=feature, data=df, order=order, palette="Set3", ax=ax)
    ax.set_title("Number and percentage of {}".format(title))
    if size > 2:
        plt.xticks(rotation=90, size=8)
    for patch in ax.patches:
        height = patch.get_height()
        if total == 0 or np.isnan(height):
            # No meaningful percentage for this bar.
            continue
        # Offset the label in points, not data units, so the gap above the
        # bar looks the same whether the counts are in the tens or thousands.
        ax.annotate('{:1.2f}%'.format(100 * height / total),
                    xy=(patch.get_x() + patch.get_width() / 2., height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha="center",
                    va="bottom")
    plt.show()

In [None]:
plot_count("resource.gender", "Patient gender", df=patient_df, size=1)

In [None]:
from datetime import datetime as dt  # left in place: other cells may reference `dt`
# Vectorised parsing; a missing birth date becomes NaT instead of raising
# inside a per-row strptime call.
patient_df["birth_date"] = pd.to_datetime(patient_df["resource.birthDate"], format='%Y-%m-%d')
patient_df["birth_year"] = patient_df["birth_date"].dt.year

In [None]:
# Number of patients per birth year (rows with a non-null gender are counted).
agg_year = patient_df.groupby(["birth_year"])["resource.gender"].count().reset_index()
agg_year.columns = ["birth_year", "count"]
fig, ax = plt.subplots()
ax.plot(agg_year["birth_year"], agg_year["count"])
# Set the title through the axes. Assigning a string to `plt.title` would
# replace the pyplot function, leave the figure untitled and break every
# later `plt.title(...)` call in the session.
ax.set_title("Patients births / year")
ax.set_xlabel("Year")
ax.set_ylabel("Patients births")
plt.show()

In [None]:
# Number of patients per birth year and gender (rows with a non-null
# telecom value are counted).
agg_year = patient_df.groupby(["birth_year", "resource.gender"])["resource.telecom"].count().reset_index()
agg_year.columns = ["birth_year", "gender", "count"]
fig, ax = plt.subplots()
for gender in ("male", "female"):
    gender_rows = agg_year.loc[agg_year["gender"] == gender]
    ax.plot(gender_rows["birth_year"], gender_rows["count"], label=gender)
# Set the title through the axes. Assigning a string to `plt.title` would
# replace the pyplot function, leave the figure untitled and break every
# later `plt.title(...)` call in the session.
ax.set_title("Patients births / year")
ax.set_xlabel("Year")
ax.set_ylabel("Patients births")
ax.legend()
plt.show()

In [None]:
plot_count("resource.status", "Status of careplan", df=careplan_df, size=1)

In [None]:
plot_count("resource.clinicalStatus", "clinical status (condition)", df=condition_df, size=1)

In [None]:
plot_count("resource.verificationStatus", "verification status (condition)", df=condition_df, size=1)

In [None]:
plot_count("resource.class.code", "class code (encounter)", df=encounter_df, size=3)

In [None]:
plot_count("resource.status", "status (encounter)", df=encounter_df, size=1)

In [None]:
plot_count("resource.code.text", "code text (procedure)", df=procedure_df, size=4)

## Further process data points

Let's further process some of the data points to extract more details about the observations.

In [None]:
observation_df.head()

In [None]:
observation_df.iloc[0]["resource.code.coding"]

In [None]:
def extract_subitems_by_name(item_list, name):
    """Return the first truthy ``item[name]`` found in ``item_list``.

    Parameters
    ----------
    item_list : iterable
        Items to search, e.g. a list of dicts. A non-iterable value such as
        NaN or None is treated as "nothing to search".
    name : str
        Key to look up in each item.

    Returns
    -------
    object or None
        The first truthy value stored under ``name``. None if no item has a
        truthy value for that key; items that lack the key or cannot be
        indexed are skipped.
    """
    try:
        items = iter(item_list)
    except TypeError:
        return None
    for item in items:
        try:
            value = item[name]
        except (KeyError, IndexError, TypeError):
            # This item has no such key; keep looking in the remaining ones.
            continue
        if value:
            return value
    return None

def extract_one_subitem_by_name(item_list, name):
    """Return ``item_list[0][name]``, or None when that value is unavailable.

    None is returned when ``item_list`` is empty, is not indexable (e.g. NaN
    or None), or when its first item has no ``name`` key. This lets the
    function be applied to a whole column that contains missing or incomplete
    entries without raising.
    """
    try:
        return item_list[0][name]
    except (IndexError, KeyError, TypeError):
        return None



In [None]:
# Keep only the leading "YYYY-MM-DD" of each timestamp (time of day and UTC
# offset are dropped) and parse the whole column at once. Missing timestamps
# become NaT, and the cell no longer needs the `dt` alias from an earlier cell.
observation_df["date"] = pd.to_datetime(
    observation_df["resource.effectiveDateTime"].str[:10], format='%Y-%m-%d'
)

In [None]:
observation_df["resource.code.code"] = observation_df["resource.code.coding"].apply(lambda x: extract_one_subitem_by_name(x, 'code'))
observation_df["resource.code.display"] = observation_df["resource.code.coding"].apply(lambda x: extract_one_subitem_by_name(x, 'display'))

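We would like to extract all data from the `resource.code.coding` field; for now we use the `code` and `display` values of its first entry.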

In [None]:
plot_count("resource.code.code", "code (observation)", df=observation_df, size=4)

In [None]:
plot_count("resource.code.display", "display (observation)", df=observation_df, size=4)

In [None]:
plot_count("resource.valueQuantity.code", "value quantity code (observation)", df=observation_df, size=4)

## A few observation data distributions

We now show the distribution of the observation values, grouped by `code.display`.

In [None]:
# Value distribution of the first 20 observation types, one panel each.
resource_code_display = observation_df["resource.code.display"].unique()
f, ax = plt.subplots(4, 5, figsize=(20, 24))
panels = ax.flatten()
colors = ["red", "green", "magenta", "darkblue"]
for i, (panel, display) in enumerate(zip(panels, resource_code_display[0:20]), start=1):
    sel_df = observation_df.loc[observation_df["resource.code.display"]==display]
    values = sel_df["resource.valueQuantity.value"].dropna()
    panel.set_xlabel(display)
    if values.empty:
        # This observation type has no numeric value to plot.
        continue
    # histplot(kde=True, stat="density") is the maintained replacement for
    # the deprecated sns.distplot and keeps the density-scaled y axis.
    g = sns.histplot(values, kde=True, stat="density", color=colors[i%4], ax=panel)
    g.set(ylabel=None)
    g.set(xlabel=display)
# Hide the panels left over when there are fewer than 20 observation types.
for panel in panels[len(resource_code_display):]:
    panel.set_visible(False)
plt.show()

## Selection of observation data for one patient

Let's select one type of observation for one patient.  
We will select the patient with the largest number of observations available.

In [None]:
sel_patient = observation_df["resource.subject.reference"].value_counts().index[0]
print(sel_patient)
obs_subset_df = observation_df.loc[observation_df["resource.subject.reference"]==sel_patient]

In [None]:
obs_subset_df.head()

In [None]:
plot_count("resource.code.display", "display (observation)", df=obs_subset_df, size=4)

In [None]:
plot_count("resource.valueQuantity.code", "value quantity code (observation)", df=obs_subset_df, size=4)

Let's now plot a few of these observations over time for this patient.

In [None]:
sel_obs_columns = ["resource.code.display", "resource.effectiveDateTime", "resource.valueQuantity.value",
               "resource.valueQuantity.unit","resource.valueQuantity.system","resource.valueQuantity.code", "date"]

In [None]:
# One time-series panel per observation type of the selected patient.
resource_code_display = obs_subset_df["resource.code.display"].unique()

f, ax = plt.subplots(4, 5, figsize=(20, 24))
# NOTE(review): the last two display values are skipped, as before; the
# reason is not documented here. zip() also stops at the 20 available panels.
for panel, display in zip(ax.flatten(), resource_code_display[:-2]):
    # Select the rows first, so that the unit shown in the label belongs to
    # this observation type and not to the one selected previously.
    sel_df = obs_subset_df.loc[obs_subset_df["resource.code.display"]==display]
    label_text = f"{display}\n[{sel_df['resource.valueQuantity.code'].values[0]}]"
    panel.plot(sel_df["date"], sel_df["resource.valueQuantity.value"], label=label_text)
    panel.tick_params(axis='x', rotation=45, size=6)
    panel.set_xlabel("date")
    # Create the legend only after the line (and its label) exists.
    panel.legend()
plt.show()



## More patients

Let's repeat this process for a few more patients.

In [None]:
# Same time-series grid for the patients ranked 2nd to 5th by number of
# observations. The ranking is computed once, and slicing it cannot run past
# the end when fewer than five patients are available.
patient_obs_counts = observation_df["resource.subject.reference"].value_counts()
colors = ["red", "blue", "green", "magenta", "orange"]
for idx, sel_patient in enumerate(patient_obs_counts.index[1:5], start=1):
    print(f"Selected patient: {sel_patient}")
    obs_subset_df = observation_df.loc[observation_df["resource.subject.reference"]==sel_patient]
    resource_code_display = obs_subset_df["resource.code.display"].unique()
    f, ax = plt.subplots(4, 5, figsize=(20, 24))
    for panel, display in zip(ax.flatten(), resource_code_display[0:20]):
        # Select the rows first, so that the unit shown in the label belongs
        # to this observation type and not to the one selected previously.
        sel_df = obs_subset_df.loc[obs_subset_df["resource.code.display"]==display]
        label_text = f"{display}\n[{sel_df['resource.valueQuantity.code'].values[0]}]"
        panel.plot(sel_df["date"], sel_df["resource.valueQuantity.value"], label=label_text, color=colors[idx])
        panel.tick_params(axis='x', rotation=45, size=6)
        panel.set_xlabel("date")
        # Create the legend only after the line (and its label) exists.
        panel.legend()
    plt.show()