In [1]:
import os
import requests
import json
import pandas as pd
import plotly.express as px
from tqdm.notebook import tqdm_notebook


# Specify the root directory containing FHIR bundles (JSON files)
root_directory = r'C:\Users\Imtech\Downloads\EMIS\BUNDLES'

# Specify the base URL for the FHIR server
base_url = "http://localhost:52775/csp/healthshare/fhir/fhir/r3"

# Set headers for the request
headers = {
    'Accept': 'application/fhir+json',
    'Content-Type': 'application/fhir+json'
}

# Function to get individual resources from a FHIR bundle
def get_resources(bundle):
    resources = []

    for entry in bundle.get('entry', []):
        # Assuming each entry has a 'resource'
        resource = entry.get('resource')
        resource['id'] = resource.get('id')

        if resource:       
            resources.append(resource)

    return resources

# Function to sort resources based on dependencies (references)
def sort_resources(resources):
    sorted_resources = []
    processed_ids = set()

    def process_resource(resource):
        if resource['id'] not in processed_ids:
            for reference in resource.get('contained', []):
                process_resource(reference)

            sorted_resources.append(resource)
            processed_ids.add(resource['id'])

    for resource in resources:
        process_resource(resource)

    return sorted_resources

def handle_response_error(response, resource, json_file_path):
    if response.status_code not in [200, 201]:
        error_info = {
            'resourceType': resource.get('resourceType', ''),
            'resourceId': resource.get('id', ''),
            'diagnostics': '',
            'text': '',
            'expression': ''
        }

        try:
            error_json = response.json()
            if 'issue' in error_json:
                issue = error_json['issue'][0]
                error_info['diagnostics'] = issue.get('diagnostics', '')
                error_info['expression'] = issue.get('expression', [])
                error_info['text'] = issue.get('details', {}).get('text', '')
        except json.JSONDecodeError:
            error_info['diagnostics'] = response.text

        return error_info
    else:
        error_info = {
            'resourceType': resource.get('resourceType', ''),
            'resourceId': resource.get('id', ''),
            'diagnostics': 'OK',
            'text': 'OK',
            'expression': 'OK'
        }
        return error_info

# Function to send a single FHIR resource to the server
def send_resource(resource, json_file_path, error_info_list):
    resource_id = resource['id']
    endpoint_url = f"{base_url}/{resource['resourceType']}/{resource_id}"

    response = requests.put(endpoint_url, headers=headers, json=resource)

    error_info = handle_response_error(response, resource, json_file_path)
    if error_info:
        error_info_list.append(error_info)


def create_layout():
    error_info_list = []
    for foldername, _, filenames in tqdm_notebook(os.walk(root_directory)):
        for filename in filenames:
            if filename.endswith('.json'):
                json_file_path = os.path.join(foldername, filename)
                with open(json_file_path, encoding="utf8") as file:
                    fhir_bundle = json.load(file)
                    #print(fhir_bundle)
                fhir_resources = get_resources(fhir_bundle)
                sorted_resources = sort_resources(fhir_resources)
                for resource in sorted_resources:
                    send_resource(resource, json_file_path, error_info_list)
    # Create a Pandas DataFrame from the error_info_list
    df = pd.DataFrame(error_info_list)
    df_filtered = df

    #Remove resource Types having 0 issue
    for resource_type in tqdm_notebook(df_filtered['resourceType'].unique().tolist()):
        if df_filtered[df_filtered['resourceType'] == resource_type]['diagnostics'].nunique() == 1 and df_filtered[df_filtered['resourceType'] == resource_type]['diagnostics'].unique()[0] == 'OK':
            df_filtered = df_filtered[df_filtered['resourceType'] != resource_type]

    # Cleansing
    df['diagnostics'] = df['diagnostics'].str.replace('<HSFHIRErr>', '', regex=True)
    df_filtered['diagnostics'] = df_filtered['diagnostics'].str.replace('<HSFHIRErr>', '', regex=True)

    return df, df_filtered

df = create_layout()[0]
df_filtered = create_layout()[1]



0it [00:00, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

In [2]:
df.diagnostics.unique(), df_filtered.diagnostics.unique()

(array(['EmptyValue', 'OK', 'InvalidReferenceTarget',
        'MissingRequiredProperty'], dtype=object),
 array(['EmptyValue', 'OK', 'InvalidReferenceTarget',
        'MissingRequiredProperty'], dtype=object))

In [3]:
df.describe()

Unnamed: 0,resourceType,resourceId,diagnostics,text,expression
count,90824,90350,90824,90824,90824
unique,22,11641,4,117,13
top,Observation,5E496953-065B-41F2-9577-BE8F2FBD0757,OK,OK,OK
freq,24831,475,87783,87783,87783


In [4]:
df_filtered.describe()

Unnamed: 0,resourceType,resourceId,diagnostics,text,expression
count,67147,66673,67147,67147,67147
unique,8,6395,4,117,13
top,Observation,EB3994A6-5A87-4B53-A414-913137072F57,OK,OK,OK
freq,24831,475,64106,64106,64106


In [14]:
li1 = df['resourceId'].unique().tolist()
s = df_filtered['resourceId'].unique().tolist()
temp3 = [x for x in li1 if x not in s]
print(len(temp3))
print(temp3[0])

5246
2FE2854D-CF58-4AEE-8F71-257ADF74FE21


In [5]:
df_filtered.sample(15)

Unnamed: 0,resourceType,resourceId,diagnostics,text,expression
27333,Observation,FE4B1ADC-1E71-4577-952D-99DC7E1C918B,InvalidReferenceTarget,The referenced resource type 'HealthcareServic...,[Observation.performer[4]]
31324,Observation,71CB96DB-F5FB-4384-BF9D-0612432A91DC,OK,OK,OK
72323,Observation,3823BE08-0EA3-418C-AB4B-5CDC254CDDD7,OK,OK,OK
81286,List,6244FA36-A8BC-43BE-A845-A6E7368EA578-PG0,OK,OK,OK
18021,MedicationRequest,E502C0F6-F0A9-4D1F-AE52-9E6161CB4D2A,OK,OK,OK
43665,Encounter,C98E0E00-4D1C-11E3-A2DD-010000000161,OK,OK,OK
47612,MedicationRequest,9C07E604-E7D8-4D7C-A130-138565FF4BE3,OK,OK,OK
72624,List,FE642D6E-A729-47E9-9512-B0DEA7AC36A1-LST,OK,OK,OK
78688,Observation,E85D812C-0A81-4CC6-AFA0-095D93B0E295,OK,OK,OK
62180,Observation,F5844A0C-9CDB-411B-B56C-BDE7A3584C03,OK,OK,OK


In [16]:
for resource_type in df_filtered['resourceType'].unique().tolist():
    mask = df_filtered['resourceType'] == resource_type
    for diagnostic in df_filtered['diagnostics'].unique().tolist():
        mask2 = df_filtered.loc[mask, 'diagnostics'] == diagnostic
        df_filtered.loc[mask & mask2, 'Count'] = mask2.sum()
        df_filtered.loc[mask & mask2, 'ResourceTypeCount'] = mask.sum()
        df_filtered.loc[mask & mask2, 'Percentage'] = (mask2.sum() / len(mask2)) * 100

# Convert Percentage to XX.X% format
df_filtered['Percentage'] = df_filtered['Percentage'].apply(lambda x: f"{x:.1f}")
# Convert ResourceTypeCount to integers
df_filtered['ResourceTypeCount'] = df_filtered['ResourceTypeCount'].astype(int)
df_filtered['Count'] = df_filtered['Count'].astype(int)

In [57]:
df_filtered[df_filtered.diagnostics=='EmptyValue']['resourceType'].unique()


array(['List', 'Location', 'MedicationRequest', 'Encounter', 'Specimen',
       'DiagnosticReport'], dtype=object)

In [61]:
df_filtered[(df_filtered.diagnostics=='EmptyValue') & (df_filtered.resourceType=='DiagnosticReport')]['text'].unique()

array(["Property 'value' of Type 'Identifier' cannot be null or an empty string."],
      dtype=object)

In [62]:
# Plot as percentage
df_plot = df_filtered[df_filtered['resourceType'].isin(['Location', 'List', 'DiagnosticReport', 'Observation', 'QuestionnaireResponse'])][['resourceType','Percentage', 'Count', 'ResourceTypeCount', 'diagnostics']].drop_duplicates().sort_values(by=['resourceType'])
fig = px.histogram(df_plot, x="resourceType", y="Percentage", color="diagnostics", title="Errored Resources", text_auto=True, height=1000, width = 700, color_discrete_sequence=px.colors.qualitative.G10)
fig.update_layout(xaxis_title="Resource Type", yaxis_title="Percentage %")
fig.update_traces(marker_line_width=.1, marker_line_color='rgb(8,48,107)', opacity=0.8)

fig.show()
#['#46039f', '#9c179e', '#fb9f3a', '#f0f921']


In [44]:
df_plotreport = df_plot.sort_values(by=['Percentage'], ascending=[False])
#df_plotreport['Percentage'] = df_filtered['Percentage'].apply(lambda x: f"{x:.4}")

df_plotreport.drop(df_plotreport[df_plotreport['diagnostics'] == 'OK'].index, inplace=True)
df_plotreport = df_plotreport.sort_values(by=['Percentage'])
df_plotreport['Percentage'] = df_filtered['Percentage'].apply(lambda x: f"{x:.4}%")
df_plotreport

Unnamed: 0,resourceType,Percentage,Count,ResourceTypeCount,diagnostics
35259,DiagnosticReport,0.2%,3,1449,EmptyValue
0,List,2.6%,478,18515,EmptyValue
25941,DiagnosticReport,21.0%,305,1449,InvalidReferenceTarget
356,Location,32.5%,327,1005,EmptyValue
38273,QuestionnaireResponse,44.7%,98,219,MissingRequiredProperty
26046,Observation,7.3%,1817,24831,InvalidReferenceTarget


In [None]:
# Plot as percentage
df_plot = df_filtered[df_filtered['resourceType'].isin(['MedicationRequest', 'Encounter', 'Specimen'])][['resourceType','Percentage', 'diagnostics']].drop_duplicates()
fig = px.histogram(df_plot, x="resourceType", y="Percentage", color="diagnostics", title="Errored Resources", height=800, width = 700)
fig.update_traces(marker_line_width=1.5, opacity=0.6)
fig.show()

In [25]:
df_table = df_filtered[['resourceType', 'Count', 'ResourceTypeCount', 'Percentage', 'diagnostics', 'text']].drop_duplicates()


df_table.drop(df_table[df_table['diagnostics'] == 'OK'].index, inplace=True)

df_table

Unnamed: 0,resourceType,Count,ResourceTypeCount,Percentage,diagnostics,text
0,List,478,18515,2.6,EmptyValue,Property 'id' of Type 'List' cannot be null or...
356,Location,327,1005,32.5,EmptyValue,Property 'description' of Type 'Location' cann...
544,MedicationRequest,7,14276,0.0,EmptyValue,Property 'text' of Type 'CodeableConcept' cann...
24711,List,478,18515,2.6,EmptyValue,Property 'title' of Type 'List' cannot be null...
24762,Encounter,4,5331,0.1,EmptyValue,Property 'text' of Type 'CodeableConcept' cann...
...,...,...,...,...,...,...
81010,Observation,1817,24831,7.3,InvalidReferenceTarget,The referenced resource type 'HealthcareServic...
81020,Observation,1817,24831,7.3,InvalidReferenceTarget,The referenced resource type 'HealthcareServic...
81038,Observation,1817,24831,7.3,InvalidReferenceTarget,The referenced resource type 'HealthcareServic...
89528,DiagnosticReport,305,1449,21.0,InvalidReferenceTarget,The referenced resource type 'HealthcareServic...


In [None]:
total_resource_type_count = df_table['ResourceTypeCount'].sum()
total_count = df_table['Count'].sum()

print(total_count, total_resource_type_count, total_count/total_resource_type_count*100)

In [None]:
df_filtered