In [None]:
import os
from uuid import uuid4
import pandas as pd
import json 
import time

from tqdm import tqdm

# Directory that holds the input JSON files (relative path).
DATA_PATH = "data"
# Value stored in a row when a resource field is absent.
NA_PLACEHOLDER = "NA"

# Route `.plot(...)` calls on pandas objects through plotly.
pd.set_option("plotting.backend", "plotly")

### Load JSON Files

In [None]:
# Collect the input file names found directly inside DATA_PATH.
# Only *.json entries are kept: the directory can also hold other files
# (this notebook itself writes a CSV there), and those must not be fed to
# the JSON loader. Sorting makes the processing order reproducible, because
# os.listdir returns entries in arbitrary order.
json_files = sorted(
    file_name
    for file_name in os.listdir(DATA_PATH)
    if file_name.lower().endswith(".json")
)
# Quick look: first two names, last two names and the total number of files.
json_files[:2], json_files[-2:], len(json_files)

### Combine All The JSON Files

In [None]:
# Parse every file listed in `json_files` and keep one record per file that
# loads successfully. A record holds a generated id, the file path and the
# parsed JSON payload.
patient_records = []

for json_file in tqdm(json_files):
    json_path = os.path.join(DATA_PATH, json_file)
    try:
        # open() is inside the try so that an entry that cannot be opened or
        # decoded (directory, permission problem, bad encoding) is reported
        # and skipped like a malformed JSON file, instead of aborting the loop.
        with open(json_path, encoding="utf-8") as fh:
            payload = json.load(fh)
    except Exception as e:
        print("Exception {e} occured while trying to load {fpath}".format(
            e=e,
            fpath=json_path
        ))
        continue

    patient_records.append({
        # Random identifier: it is regenerated on every run of this cell.
        "id": str(uuid4()),
        "json_path": json_path,
        "payload": payload,
    })
print("Lenght of final list: {}".format(len(patient_records)))

In [None]:
# Peek at the first record: keys of the record itself, then keys of its
# parsed payload. Displayed as a 2-tuple.
(
    patient_records[0].keys(),
    patient_records[0]['payload'].keys(),
)

### Prepare DataFrame

In [None]:
# Flatten the loaded payloads into one row per item of each payload's
# "entry" list, keeping the resource type, id and meta of every item.
rows = []
for patient_record in tqdm(patient_records):
    patient_id = patient_record['id']
    patient_json_path = patient_record['json_path']
    patient_payload = patient_record['payload']

    try:
        # Looked up inside the try and with .get: a payload without an
        # "entry" list contributes no rows instead of raising KeyError and
        # stopping the whole loop.
        entries = patient_payload.get('entry') or []
        for entry in entries:
            # Fall back to an empty dict rather than the "NA" string (a str
            # has no .get): an entry without a resource then still yields a
            # row filled with placeholders, and the remaining entries of the
            # same patient are not lost.
            resource_details = entry.get('resource') or {}
            rows.append({
                "patient_id": patient_id,
                "patient_json_path": patient_json_path,
                "resource_type": resource_details.get("resourceType", NA_PLACEHOLDER),
                "resource_id": resource_details.get("id", NA_PLACEHOLDER),
                "resource_meta": resource_details.get("meta", NA_PLACEHOLDER),
            })
    except Exception as e:
        # Best effort: a payload with an unexpected shape is reported and the
        # loop moves on to the next patient.
        print("Exception {e} for {fpath}".format(
            e = e,
            fpath = patient_json_path
        ))


# Explicit columns keep the schema stable even when no rows were collected,
# so the inspection cells below do not fail with KeyError on an empty frame.
final_df = pd.DataFrame(
    rows,
    columns=[
        "patient_id",
        "patient_json_path",
        "resource_type",
        "resource_id",
        "resource_meta",
    ],
)

### Inspect Final DataFrame

In [None]:
# Number of distinct patient ids that ended up in the frame
# (one id is generated per loaded file).
final_df.loc[:, "patient_id"].nunique()

In [None]:
# Preview the first five rows of the flattened frame.
final_df.head(n=5)

#### Number Of Unique Resource Types

In [None]:
# Distinct resource type values, in order of first appearance.
final_df.loc[:, "resource_type"].unique()

In [None]:
# How many distinct resource type values the frame contains.
final_df.loc[:, "resource_type"].nunique()

### Count For Each Resource Type

In [None]:
# Number of rows per resource type, most frequent first.
final_df.loc[:, "resource_type"].value_counts()

In [None]:
# Bar chart of the number of rows per resource type, drawn by the plotting
# backend configured for pandas.
resource_type_counts = final_df['resource_type'].value_counts()
resource_type_counts.plot(
    kind="bar",
    title="Resource Type Counts",
    template="plotly_dark",
)

#### Count Of Each Resource Type Per Patient

In [None]:
# Count how many rows of each resource type every input file contributes
# (one output row per file / resource type pair).
patient_resources = (
    final_df.groupby('patient_json_path')['resource_type']
    .value_counts()
    # Name the counts explicitly: the default name of this Series differs
    # between pandas versions ("resource_type" before 2.0, "count" since),
    # so renaming a "resource_type" column afterwards is not portable.
    .rename("count")
    .reset_index()
)
# Derive a display name from the file name alone. basename() strips the
# directory whatever DATA_PATH is set to and whatever path separator the
# platform uses, unlike removing a hard-coded 'data/' substring.
patient_resources['patient_name'] = patient_resources['patient_json_path'].apply(
    lambda field: os.path.splitext(os.path.basename(field))[0].title()
)
patient_resources.head()

In [None]:
# One row per patient name and one column per resource type.
# A type that a patient does not have shows up as NaN in the pivot, which
# upcasts the integer counts to floats; fill the gaps with 0 and cast back
# so the table (and the CSV exported from it) holds whole numbers.
patient_resources_pivot = (
    patient_resources
    .pivot(index='patient_name', columns="resource_type", values="count")
    .fillna(0)
    .astype(int)
)
patient_resources_pivot

In [None]:
# Persist the per-patient resource count table as CSV.
# NOTE(review): the file is written into the same directory the notebook
# reads its inputs from.
resource_count_csv = os.path.join(DATA_PATH, 'patient_resource_count.csv')
patient_resources_pivot.to_csv(resource_count_csv)