In [134]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [135]:
import glob
import json
import pickle
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import xml.etree.ElementTree as ET
from pprint import pprint
from tqdm.notebook import tqdm
import ast

import sys
sys.path.append('../../')

In [136]:
DATA_DIR = Path("../data/dev")
data_file_name = "archehr-qa.xml"
key_file_name = "archehr-qa_key.json"
mapping_file_name = "archehr-qa_mapping.json"

# Extract & Organize

**Key-File:**
- Provides sentence-level answer annotations.
- Labels answers as "essential" or "not-relevant".

Load from json file

In [137]:
import json

json_file = DATA_DIR / key_file_name
with open(json_file, "r") as f:
    key_data = json.load(f)

# Print the first few key-value pairs
print(json.dumps(key_data, indent=2))

[
  {
    "case_id": "1",
    "answers": [
      {
        "sentence_id": "0",
        "relevance": "not-relevant"
      },
      {
        "sentence_id": "1",
        "relevance": "essential"
      },
      {
        "sentence_id": "2",
        "relevance": "not-relevant"
      },
      {
        "sentence_id": "3",
        "relevance": "not-relevant"
      },
      {
        "sentence_id": "4",
        "relevance": "not-relevant"
      },
      {
        "sentence_id": "5",
        "relevance": "essential"
      },
      {
        "sentence_id": "6",
        "relevance": "essential"
      },
      {
        "sentence_id": "7",
        "relevance": "essential"
      },
      {
        "sentence_id": "8",
        "relevance": "not-relevant"
      }
    ]
  },
  {
    "case_id": "2",
    "answers": [
      {
        "sentence_id": "0",
        "relevance": "not-relevant"
      },
      {
        "sentence_id": "1",
        "relevance": "essential"
      },
      {
        "sentence_id":

Convert to pandas dataframe

In [118]:
# Convert JSON into a structured DataFrame
structured_key_data = []

for case in key_data:
    case_id = case["case_id"]
    for answer in case["answers"]:
        structured_key_data.append({
            "case_id": case_id,
            "sentence_id": answer["sentence_id"],
            "relevance": answer["relevance"]
        })

# Create Pandas DataFrame
df_key = pd.DataFrame(structured_key_data)

In [119]:
df_key.head()

Unnamed: 0,case_id,sentence_id,relevance
0,1,0,not-relevant
1,1,1,essential
2,1,2,not-relevant
3,1,3,not-relevant
4,1,4,not-relevant


**Mapping-File:**
- Maps case IDs to specific documents from MIMIC-III.
- Shows where the context for each question is located.

Load from json file

In [120]:
json_mapping_file = DATA_DIR / mapping_file_name
with open(json_mapping_file, "r") as f:
    mapping_data = json.load(f)

# Print first few mappings
print(json.dumps(mapping_data, indent=2))

[
  {
    "case_id": "1",
    "document_id": "179164_41762",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "2",
    "document_id": "191708_35669",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "3",
    "document_id": "132786_42206",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "4",
    "document_id": "180932_37135",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "5",
    "document_id": "190179_39908",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "6",
    "document_id": "104041_30389",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "7",
    "document_id": "137866_53960",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "8",
    "document_id": "169976_23897",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "9",
    "document_id": "139801_38724",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "10",
    "document_id": "100035_41331",
    "document_source": "mimic-iii"
  },
  {
   

Convert to pandas dataframe

In [121]:
# Convert JSON into a structured DataFrame
structured_mapping_data = []

for case in mapping_data:
    structured_mapping_data.append({
        "case_id": case["case_id"],
        "document_id": case["document_id"],
        "document_source": case["document_source"]
    })

# Create Pandas DataFrame
df_mapping = pd.DataFrame(structured_mapping_data)

In [122]:
df_mapping.head()

Unnamed: 0,case_id,document_id,document_source
0,1,179164_41762,mimic-iii
1,2,191708_35669,mimic-iii
2,3,132786_42206,mimic-iii
3,4,180932_37135,mimic-iii
4,5,190179_39908,mimic-iii


**Train-set:**
Contains cases with unique ids.
Stores questions, associated clinical context

Load from XML file

In [123]:
def parse_xml_to_dataframe(path_to_xml):
    tree = ET.parse(path_to_xml)
    root = tree.getroot()

    structured_data = []

    for case in root.findall("case"):
        case_id = case.attrib["id"]

        # Extract patient details
        patient_narrative = case.find("patient_narrative").text if case.find("patient_narrative") is not None else "No patient narrative"
        patient_question = case.find("patient_question/phrase").text if case.find("patient_question/phrase") is not None else "No patient question"
        clinician_question = case.find("clinician_question").text if case.find("clinician_question") is not None else "No clinician question"

        # Extract clinical note excerpts
        note_excerpt = case.find("note_excerpt").text if case.find("note_excerpt") is not None else "No note excerpt"

        # Extract sentence-level details from note excerpts
        for sentence in case.findall("note_excerpt_sentences/sentence"):
            sentence_id = sentence.attrib["id"]
            paragraph_id = sentence.attrib["paragraph_id"]
            start_char_index = sentence.attrib["start_char_index"]
            length = sentence.attrib["length"]
            sentence_text = sentence.text if sentence.text is not None else "No sentence text"

            structured_data.append({
                "case_id": case_id,
                "patient_narrative": patient_narrative,
                "patient_question": patient_question,
                "clinician_question": clinician_question,
                "note_excerpt": note_excerpt,
                "sentence_id": sentence_id,
                "sentence_text": sentence_text,
                "paragraph_id": paragraph_id,
                "start_char_index": start_char_index,
                "length": length
            })

    return pd.DataFrame(structured_data)

In [124]:
df_data = parse_xml_to_dataframe(DATA_DIR / data_file_name)

In [125]:
df_data.head()

Unnamed: 0,case_id,patient_narrative,patient_question,clinician_question,note_excerpt,sentence_id,sentence_text,paragraph_id,start_char_index,length
0,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,0,\nBrief Hospital Course:\n,0,0,22
1,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,1,\nDuring the ERCP a pancreatic stent was requi...,1,0,243
2,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,2,"\nHowever, due to the patient's elevated INR, ...",1,244,93
3,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,3,\nFrank pus was noted to be draining\nfrom the...,1,338,151
4,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,4,\nThe\nVancomycin was discontinued.\n,1,490,32


Cast types to int for merging

In [126]:
df_data["case_id"] = df_data["case_id"].astype(int)
df_data["sentence_id"] = df_data["sentence_id"].astype(int)
df_key["case_id"] = df_key["case_id"].astype(int)
df_key["sentence_id"] = df_key["sentence_id"].astype(int)
df_mapping["case_id"] = df_mapping["case_id"].astype(int)

Merge the dataframes into one

In [127]:
# Merge XML data with answer relevance labels
df_temp = df_data.merge(df_key, on=["case_id", "sentence_id"], how="left")

# Merge with document mapping
df_all = df_temp.merge(df_mapping, on="case_id", how="left")

In [128]:
df_all.head()

Unnamed: 0,case_id,patient_narrative,patient_question,clinician_question,note_excerpt,sentence_id,sentence_text,paragraph_id,start_char_index,length,relevance,document_id,document_source
0,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,0,\nBrief Hospital Course:\n,0,0,22,not-relevant,179164_41762,mimic-iii
1,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,1,\nDuring the ERCP a pancreatic stent was requi...,1,0,243,essential,179164_41762,mimic-iii
2,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,2,"\nHowever, due to the patient's elevated INR, ...",1,244,93,not-relevant,179164_41762,mimic-iii
3,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,3,\nFrank pus was noted to be draining\nfrom the...,1,338,151,not-relevant,179164_41762,mimic-iii
4,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,4,\nThe\nVancomycin was discontinued.\n,1,490,32,not-relevant,179164_41762,mimic-iii


# Data Cleaning

Routine check for missing values

In [132]:
df_all.isna().sum()

case_id               0
patient_narrative     0
patient_question      0
clinician_question    0
note_excerpt          0
sentence_id           0
sentence_text         0
paragraph_id          0
start_char_index      0
length                0
relevance             0
document_id           0
document_source       0
dtype: int64

List categories of relevance

In [133]:
df_all["relevance"].value_counts()

relevance
not-relevant     239
essential        138
supplementary     51
Name: count, dtype: int64

List distribution of different documents

In [131]:
df_all["document_id"].value_counts()

document_id
169976_23897    54
26520572        38
100035_41331    32
137866_53960    30
22086761        30
22805349        27
104041_30389    25
190179_39908    23
23831520        23
180932_37135    21
139801_38724    21
20746225        18
25926743        14
21866822        12
27422858        12
191708_35669    11
132786_42206    10
179164_41762     9
20361094         9
22494097         9
Name: count, dtype: int64