In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import re
import json
import unicodedata
import pandas as pd
from pathlib import Path
import xml.etree.ElementTree as ET

import sys
sys.path.append("../../src")
import util.preprocessing_util as util

In [4]:
# Root data folder, with the raw inputs and the processed outputs beneath it.
DATA_DIR = Path("../../data/dev")
input_dir, output_dir = DATA_DIR / "raw", DATA_DIR / "processed"

# Names of the three source files that are read from input_dir.
data_file_name, key_file_name, mapping_file_name = (
    "archehr-qa.xml",
    "archehr-qa_key.json",
    "archehr-qa_mapping.json",
)

# Extract & Organize

**Key-File:**
- Provides sentence-level answer annotations.
- Labels each sentence as "essential", "supplementary", or "not-relevant".

Load from json file

In [5]:
# Read the answer-key annotations. The file is opened explicitly as UTF-8 so
# that decoding does not depend on the platform's default text encoding.
key_file_path = input_dir / key_file_name
with open(key_file_path, "r", encoding="utf-8") as f:
    key_data = json.load(f)

Convert to dataframe

In [6]:
# Flatten the nested key file into one record per (case, answer) pair.
structured_key_data = [
    {
        "case_id": entry["case_id"],
        "sentence_id": annotation["sentence_id"],
        "relevance": annotation["relevance"],
    }
    for entry in key_data
    for annotation in entry["answers"]
]

# Tabular view of the answer key
key_df = pd.DataFrame(structured_key_data)

In [7]:
key_df.head()

Unnamed: 0,case_id,sentence_id,relevance
0,1,0,not-relevant
1,1,1,essential
2,1,2,not-relevant
3,1,3,not-relevant
4,1,4,not-relevant


**Mapping-File:**
- Maps case IDs to specific documents from MIMIC-III.
- Shows where the context for each question is located.

Load from json file

In [8]:
# Read the case-to-document mapping. The file is opened explicitly as UTF-8 so
# that decoding does not depend on the platform's default text encoding.
mapping_file_path = input_dir / mapping_file_name
with open(mapping_file_path, "r", encoding="utf-8") as f:
    mapping_data = json.load(f)

Convert to dataframe

In [9]:
# Keep only the three fields needed downstream, one record per case entry.
structured_mapping_data = [
    {
        "case_id": entry["case_id"],
        "document_id": entry["document_id"],
        "document_source": entry["document_source"],
    }
    for entry in mapping_data
]

# Tabular view of the mapping
mapping_df = pd.DataFrame(structured_mapping_data)

In [10]:
mapping_df.head()

Unnamed: 0,case_id,document_id,document_source
0,1,179164_41762,mimic-iii
1,2,191708_35669,mimic-iii
2,3,132786_42206,mimic-iii
3,4,180932_37135,mimic-iii
4,5,190179_39908,mimic-iii


**Train-set:**

Contains cases, each with a unique id.
Stores the questions and the associated clinical context for each case.

Load from XML file

In [11]:
def _element_text(parent, path, default):
    """Return the text of the first element matching ``path`` under ``parent``.

    ``default`` is returned both when no element matches and when the matched
    element carries no text (e.g. ``<tag/>``), so callers never get ``None``.
    """
    node = parent.find(path)
    if node is None or node.text is None:
        return default
    return node.text


def parse_xml_to_dataframe(path_to_xml):
    """Flatten the case XML into a DataFrame with one row per excerpt sentence.

    Parameters
    ----------
    path_to_xml : str, path-like or file object
        Source handed to ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``case_id``, ``patient_narrative``, ``patient_question``,
        ``clinician_question``, ``note_excerpt``, ``sentence_id``,
        ``sentence_text``, ``paragraph_id``, ``start_char_index``, ``length``.
        Values are the raw strings read from the XML (no numeric conversion).
        Case-level fields are repeated on every sentence row of that case; a
        case without any ``note_excerpt_sentences/sentence`` element yields no
        rows.

    Raises
    ------
    KeyError
        If a ``case`` has no ``id`` attribute, or a ``sentence`` lacks one of
        ``id``, ``paragraph_id``, ``start_char_index`` or ``length``.
    """
    root = ET.parse(path_to_xml).getroot()

    structured_data = []

    for case in root.findall("case"):
        # Case-level fields, shared by every sentence row of this case.
        # Missing or empty elements fall back to a readable placeholder.
        case_fields = {
            "case_id": case.attrib["id"],
            "patient_narrative": _element_text(case, "patient_narrative", "No patient narrative"),
            # NOTE(review): find() only returns the first <phrase>; any further
            # phrases under patient_question are not read - confirm intended.
            "patient_question": _element_text(case, "patient_question/phrase", "No patient question"),
            "clinician_question": _element_text(case, "clinician_question", "No clinician question"),
            "note_excerpt": _element_text(case, "note_excerpt", "No note excerpt"),
        }

        # One output row per annotated sentence of the note excerpt.
        for sentence in case.findall("note_excerpt_sentences/sentence"):
            attributes = sentence.attrib
            structured_data.append({
                **case_fields,
                "sentence_id": attributes["id"],
                "sentence_text": sentence.text if sentence.text is not None else "No sentence text",
                "paragraph_id": attributes["paragraph_id"],
                "start_char_index": attributes["start_char_index"],
                "length": attributes["length"],
            })

    return pd.DataFrame(structured_data)

In [12]:
# Parse the raw XML into one row per note-excerpt sentence.
data_file_path = input_dir / data_file_name
data_df = parse_xml_to_dataframe(path_to_xml=data_file_path)

Cast attributes to int for merging

In [13]:
# Give the join keys the same integer dtype in all three frames.
data_df = data_df.astype({"case_id": int, "sentence_id": int})
key_df = key_df.astype({"case_id": int, "sentence_id": int})
mapping_df = mapping_df.astype({"case_id": int})

Merge the dataframes into one

In [14]:
# Merge XML data with answer relevance labels
temp_df = data_df.merge(key_df, on=["case_id", "sentence_id"], how="left")

# Merge with document mapping
all_df = temp_df.merge(mapping_df, on="case_id", how="left")

In [15]:
all_df.head()

Unnamed: 0,case_id,patient_narrative,patient_question,clinician_question,note_excerpt,sentence_id,sentence_text,paragraph_id,start_char_index,length,relevance,document_id,document_source
0,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,0,\nBrief Hospital Course:\n,0,0,22,not-relevant,179164_41762,mimic-iii
1,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,1,\nDuring the ERCP a pancreatic stent was requi...,1,0,243,essential,179164_41762,mimic-iii
2,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,2,"\nHowever, due to the patient's elevated INR, ...",1,244,93,not-relevant,179164_41762,mimic-iii
3,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,3,\nFrank pus was noted to be draining\nfrom the...,1,338,151,not-relevant,179164_41762,mimic-iii
4,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,4,\nThe\nVancomycin was discontinued.\n,1,490,32,not-relevant,179164_41762,mimic-iii


# Data View

Routine check for missing values

In [16]:
all_df.isna().sum()

case_id               0
patient_narrative     0
patient_question      0
clinician_question    0
note_excerpt          0
sentence_id           0
sentence_text         0
paragraph_id          0
start_char_index      0
length                0
relevance             0
document_id           0
document_source       0
dtype: int64

List categories of relevance

In [17]:
all_df["relevance"].value_counts()

relevance
not-relevant     239
essential        138
supplementary     51
Name: count, dtype: int64

List distribution of different documents

In [18]:
all_df["document_id"].value_counts()

document_id
169976_23897    54
26520572        38
100035_41331    32
137866_53960    30
22086761        30
22805349        27
104041_30389    25
190179_39908    23
23831520        23
139801_38724    21
180932_37135    21
20746225        18
25926743        14
21866822        12
27422858        12
191708_35669    11
132786_42206    10
179164_41762     9
20361094         9
22494097         9
Name: count, dtype: int64

In [19]:
observation = all_df.iloc[0]

In [20]:
observation

case_id                                                               1
patient_narrative     \nI had severe abdomen pain and was hospitalis...
patient_question      \nMy question is if the sludge was there does ...
clinician_question    \nWhy was ERCP recommended to him over continu...
note_excerpt          \nBrief Hospital Course:\n\nDuring the ERCP a ...
sentence_id                                                           0
sentence_text                    \nBrief Hospital Course:\n            
paragraph_id                                                          0
start_char_index                                                      0
length                                                               22
relevance                                                  not-relevant
document_id                                                179164_41762
document_source                                               mimic-iii
Name: 0, dtype: object

In [21]:
observation.patient_narrative

'\nI had severe abdomen pain and was hospitalised for 15 days in ICU, diagnoised with CBD sludge thereafter on udiliv. Doctor advised for ERCP. My question is if the sludge was there does not the medication help in flushing it out? Whether ERCP was the only cure?\n        '

In [22]:
observation.patient_question

'\nMy question is if the sludge was there does not the medication help in flushing it out? Whether ERCP was the only cure?\n            '

In [23]:
observation.clinician_question

'\nWhy was ERCP recommended to him over continuing a medication-based treatment?\n        '

In [24]:
observation.note_excerpt

"\nBrief Hospital Course:\n\nDuring the ERCP a pancreatic stent was required to facilitate\naccess to the biliary system (removed at the end of the\nprocedure), and a common bile duct stent was placed to allow\ndrainage of the biliary obstruction caused by stones and sludge.\nHowever, due to the patient's elevated INR, no sphincterotomy or\nstone removal was performed. Frank pus was noted to be draining\nfrom the common bile duct, and post-ERCP it was recommended that\nthe patient remain on IV Zosyn for at least a week. The\nVancomycin was discontinued.\n\nOn hospital day 4 (post-procedure day 3) the patient returned to\nERCP for re-evaluation of her biliary stent as her LFTs and\nbilirubin continued an upward trend. On ERCP the previous\nbiliary stent was noted to be acutely obstructed by biliary\nsludge and stones. As the patient's INR was normalized to 1.2, a\nsphincterotomy was safely performed, with removal of several\nbiliary stones in addition to the common bile duct stent. At t

In [25]:
observation.sentence_text

'\nBrief Hospital Course:\n            '

# Data Cleaning

**We apply:**

- Lowercasing
- Removing extra newlines & whitespace
- Unicode normalization
- Removing punctuation

Select the free-text columns and run them through the cleaning helper

In [26]:
text_columns = ["patient_narrative", "patient_question", "clinician_question", "note_excerpt", "sentence_text"]

In [27]:
cleaned_data = util.clean_text_df(all_df, text_columns)

In [28]:
cleaned_data.head()

Unnamed: 0,case_id,patient_narrative,patient_question,clinician_question,note_excerpt,sentence_id,sentence_text,paragraph_id,start_char_index,length,relevance,document_id,document_source
0,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,0,brief hospital course,0,0,22,not-relevant,179164_41762,mimic-iii
1,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,1,during the ercp a pancreatic stent was require...,1,0,243,essential,179164_41762,mimic-iii
2,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,2,however due to the patients elevated inr no sp...,1,244,93,not-relevant,179164_41762,mimic-iii
3,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,3,frank pus was noted to be draining from the co...,1,338,151,not-relevant,179164_41762,mimic-iii
4,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,4,the vancomycin was discontinued,1,490,32,not-relevant,179164_41762,mimic-iii


The processed data is saved as CSV rather than pickle for now, since the dataset is fairly small; the file is written after the attribute selection below.

# Attribute Selection 

In [29]:
relevant_columns = ["case_id", "patient_question", "note_excerpt", "sentence_id", "sentence_text", "relevance", "start_char_index", "length"]

In [30]:
data = cleaned_data[relevant_columns]

In [31]:
data.head()

Unnamed: 0,case_id,patient_question,note_excerpt,sentence_id,sentence_text,relevance,start_char_index,length
0,1,my question is if the sludge was there does no...,brief hospital course during the ercp a pancre...,0,brief hospital course,not-relevant,0,22
1,1,my question is if the sludge was there does no...,brief hospital course during the ercp a pancre...,1,during the ercp a pancreatic stent was require...,essential,0,243
2,1,my question is if the sludge was there does no...,brief hospital course during the ercp a pancre...,2,however due to the patients elevated inr no sp...,not-relevant,244,93
3,1,my question is if the sludge was there does no...,brief hospital course during the ercp a pancre...,3,frank pus was noted to be draining from the co...,not-relevant,338,151
4,1,my question is if the sludge was there does no...,brief hospital course during the ercp a pancre...,4,the vancomycin was discontinued,not-relevant,490,32


In [32]:
# Create the target folder first: to_csv does not create missing directories
# and would fail on a fresh checkout where "processed" does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)
data.to_csv(output_dir / "medical_data.csv", index=False)
print("Cleaned dataset saved successfully!")

Cleaned dataset saved successfully!
