In [1]:
%load_ext autoreload

In [2]:
import re
import unicodedata
import pandas as pd
from pathlib import Path
import xml.etree.ElementTree as ET

import sys
sys.path.append('../../')

In [3]:
DATA_DIR = Path("../data/dev")
input_dir = DATA_DIR / "raw"
output_dir = DATA_DIR / "processed"
data_file_name = "archehr-qa.xml"
key_file_name = "archehr-qa_key.json"
mapping_file_name = "archehr-qa_mapping.json"

# Extract & Organize

**Key-File:**
- Provides sentence-level answer annotations.
- Labels answers as "essential" or "not-relevant".

Load from json file

In [4]:
import json

json_file = input_dir / key_file_name
with open(json_file, "r") as f:
    key_data = json.load(f)

# Print the first few key-value pairs
print(json.dumps(key_data, indent=2))

FileNotFoundError: [Errno 2] No such file or directory: '../../data/dev/raw/archehr-qa_key.json'

Convert to pandas dataframe

In [5]:
# Convert JSON into a structured DataFrame
structured_key_data = []

for case in key_data:
    case_id = case["case_id"]
    for answer in case["answers"]:
        structured_key_data.append({
            "case_id": case_id,
            "sentence_id": answer["sentence_id"],
            "relevance": answer["relevance"]
        })

# Create Pandas DataFrame
df_key = pd.DataFrame(structured_key_data)

In [6]:
df_key.head()

Unnamed: 0,case_id,sentence_id,relevance
0,1,0,not-relevant
1,1,1,essential
2,1,2,not-relevant
3,1,3,not-relevant
4,1,4,not-relevant


**Mapping-File:**
- Maps case IDs to specific documents from MIMIC-III.
- Shows where the context for each question is located.

Load from json file

In [7]:
json_mapping_file = input_dir / mapping_file_name
with open(json_mapping_file, "r") as f:
    mapping_data = json.load(f)

# Print first few mappings
print(json.dumps(mapping_data, indent=2))

[
  {
    "case_id": "1",
    "document_id": "179164_41762",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "2",
    "document_id": "191708_35669",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "3",
    "document_id": "132786_42206",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "4",
    "document_id": "180932_37135",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "5",
    "document_id": "190179_39908",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "6",
    "document_id": "104041_30389",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "7",
    "document_id": "137866_53960",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "8",
    "document_id": "169976_23897",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "9",
    "document_id": "139801_38724",
    "document_source": "mimic-iii"
  },
  {
    "case_id": "10",
    "document_id": "100035_41331",
    "document_source": "mimic-iii"
  },
  {
   

Convert to pandas dataframe

In [8]:
# Convert JSON into a structured DataFrame
structured_mapping_data = []

for case in mapping_data:
    structured_mapping_data.append({
        "case_id": case["case_id"],
        "document_id": case["document_id"],
        "document_source": case["document_source"]
    })

# Create Pandas DataFrame
df_mapping = pd.DataFrame(structured_mapping_data)

In [9]:
df_mapping.head()

Unnamed: 0,case_id,document_id,document_source
0,1,179164_41762,mimic-iii
1,2,191708_35669,mimic-iii
2,3,132786_42206,mimic-iii
3,4,180932_37135,mimic-iii
4,5,190179_39908,mimic-iii


**Train-set:**
Contains cases with unique ids.
Stores questions, associated clinical context

Load from XML file

In [10]:
def parse_xml_to_dataframe(path_to_xml):
    tree = ET.parse(path_to_xml)
    root = tree.getroot()

    structured_data = []

    for case in root.findall("case"):
        case_id = case.attrib["id"]

        # Extract patient details
        patient_narrative = case.find("patient_narrative").text if case.find("patient_narrative") is not None else "No patient narrative"
        patient_question = case.find("patient_question/phrase").text if case.find("patient_question/phrase") is not None else "No patient question"
        clinician_question = case.find("clinician_question").text if case.find("clinician_question") is not None else "No clinician question"

        # Extract clinical note excerpts
        note_excerpt = case.find("note_excerpt").text if case.find("note_excerpt") is not None else "No note excerpt"

        # Extract sentence-level details from note excerpts
        for sentence in case.findall("note_excerpt_sentences/sentence"):
            sentence_id = sentence.attrib["id"]
            paragraph_id = sentence.attrib["paragraph_id"]
            start_char_index = sentence.attrib["start_char_index"]
            length = sentence.attrib["length"]
            sentence_text = sentence.text if sentence.text is not None else "No sentence text"

            structured_data.append({
                "case_id": case_id,
                "patient_narrative": patient_narrative,
                "patient_question": patient_question,
                "clinician_question": clinician_question,
                "note_excerpt": note_excerpt,
                "sentence_id": sentence_id,
                "sentence_text": sentence_text,
                "paragraph_id": paragraph_id,
                "start_char_index": start_char_index,
                "length": length
            })

    return pd.DataFrame(structured_data)

In [11]:
df_data = parse_xml_to_dataframe(input_dir / data_file_name)

In [12]:
df_data.head(10)

Unnamed: 0,case_id,patient_narrative,patient_question,clinician_question,note_excerpt,sentence_id,sentence_text,paragraph_id,start_char_index,length
0,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,0,\nBrief Hospital Course:\n,0,0,22
1,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,1,\nDuring the ERCP a pancreatic stent was requi...,1,0,243
2,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,2,"\nHowever, due to the patient's elevated INR, ...",1,244,93
3,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,3,\nFrank pus was noted to be draining\nfrom the...,1,338,151
4,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,4,\nThe\nVancomycin was discontinued.\n,1,490,32
5,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,5,\nOn hospital day 4 (post-procedure day 3) the...,2,0,161
6,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,6,\nOn ERCP the previous\nbiliary stent was note...,2,162,99
7,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,7,"\nAs the patient's INR was normalized to 1.2, ...",2,262,164
8,1,\nI had severe abdomen pain and was hospitalis...,\nMy question is if the sludge was there does ...,\nWhy was ERCP recommended to him over continu...,\nBrief Hospital Course:\n\nDuring the ERCP a ...,8,"\nAt the\nconclusion of the procedure, retrogr...",2,427,94
9,2,\nI just wrote about my dad given multiple sho...,\ndad given multiple shots of lasciks after he...,\nWhy was he given lasix and his oxygen flow r...,\nBrief Hospital Course:\n\nAcute diastolic he...,0,\nBrief Hospital Course:\n,0,0,22


Cast types to int for merging

In [13]:
df_data["case_id"] = df_data["case_id"].astype(int)
df_data["sentence_id"] = df_data["sentence_id"].astype(int)
df_key["case_id"] = df_key["case_id"].astype(int)
df_key["sentence_id"] = df_key["sentence_id"].astype(int)
df_mapping["case_id"] = df_mapping["case_id"].astype(int)

Merge the dataframes into one

In [14]:
# Merge XML data with answer relevance labels
df_temp = df_data.merge(df_key, on=["case_id", "sentence_id"], how="left")

# Merge with document mapping
df_all = df_temp.merge(df_mapping, on="case_id", how="left")

# Data View

Routine check for missing values

In [15]:
df_all.isna().sum();

List categories of relevance

In [28]:
df_all["relevance"].value_counts()

relevance
not-relevant     239
essential        138
supplementary     51
Name: count, dtype: int64

List distribution of different documents

In [17]:
df_all["document_id"].value_counts();

In [18]:
observation = df_all.iloc[0]

In [19]:
observation

case_id                                                               1
patient_narrative     \nI had severe abdomen pain and was hospitalis...
patient_question      \nMy question is if the sludge was there does ...
clinician_question    \nWhy was ERCP recommended to him over continu...
note_excerpt          \nBrief Hospital Course:\n\nDuring the ERCP a ...
sentence_id                                                           0
sentence_text                    \nBrief Hospital Course:\n            
paragraph_id                                                          0
start_char_index                                                      0
length                                                               22
relevance                                                  not-relevant
document_id                                                179164_41762
document_source                                               mimic-iii
Name: 0, dtype: object

In [20]:
observation.patient_narrative

'\nI had severe abdomen pain and was hospitalised for 15 days in ICU, diagnoised with CBD sludge thereafter on udiliv. Doctor advised for ERCP. My question is if the sludge was there does not the medication help in flushing it out? Whether ERCP was the only cure?\n        '

In [21]:
observation.patient_question

'\nMy question is if the sludge was there does not the medication help in flushing it out? Whether ERCP was the only cure?\n            '

In [22]:
observation.clinician_question

'\nWhy was ERCP recommended to him over continuing a medication-based treatment?\n        '

In [23]:
observation.note_excerpt

"\nBrief Hospital Course:\n\nDuring the ERCP a pancreatic stent was required to facilitate\naccess to the biliary system (removed at the end of the\nprocedure), and a common bile duct stent was placed to allow\ndrainage of the biliary obstruction caused by stones and sludge.\nHowever, due to the patient's elevated INR, no sphincterotomy or\nstone removal was performed. Frank pus was noted to be draining\nfrom the common bile duct, and post-ERCP it was recommended that\nthe patient remain on IV Zosyn for at least a week. The\nVancomycin was discontinued.\n\nOn hospital day 4 (post-procedure day 3) the patient returned to\nERCP for re-evaluation of her biliary stent as her LFTs and\nbilirubin continued an upward trend. On ERCP the previous\nbiliary stent was noted to be acutely obstructed by biliary\nsludge and stones. As the patient's INR was normalized to 1.2, a\nsphincterotomy was safely performed, with removal of several\nbiliary stones in addition to the common bile duct stent. At t

In [24]:
observation.sentence_text

'\nBrief Hospital Course:\n            '

# Data Cleaning

**We apply:**

- Lowercasing
- Removing extra newlines & whitespace
- Unicode normalization
- Removing punctuation

Convert text to lowercase

In [188]:
text_columns = ["patient_narrative", "patient_question", "clinician_question", "note_excerpt", "sentence_text"]

# Apply lowercasing to all text columns
df_all[text_columns] = df_all[text_columns].apply(lambda x: x.str.lower())

Remove extra newlines (\n) and excessive whitespace

In [189]:
# Function to clean newlines & extra spaces
def clean_text(text):
    if isinstance(text, str):  # Ensure it's a string before processing
        text = text.replace("\n", " ")  # Replace newlines with space
        text = " ".join(text.split())   # Remove extra spaces
    return text

In [190]:
df_all[text_columns] = df_all[text_columns].apply(lambda col: col.map(clean_text))

In [191]:
df_all

Unnamed: 0,case_id,patient_narrative,patient_question,clinician_question,note_excerpt,sentence_id,sentence_text,paragraph_id,start_char_index,length,relevance,document_id,document_source
0,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,0,brief hospital course:,0,0,22,not-relevant,179164_41762,mimic-iii
1,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,1,during the ercp a pancreatic stent was require...,1,0,243,essential,179164_41762,mimic-iii
2,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,2,"however, due to the patient's elevated inr, no...",1,244,93,not-relevant,179164_41762,mimic-iii
3,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,3,frank pus was noted to be draining from the co...,1,338,151,not-relevant,179164_41762,mimic-iii
4,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,4,the vancomycin was discontinued.,1,490,32,not-relevant,179164_41762,mimic-iii
...,...,...,...,...,...,...,...,...,...,...,...,...,...
423,20,last week my 61 yr old sister woke up with a t...,so what is the dizziness from i never heard of...,how did they diagnose her with migraine for sp...,discharge instructions you were evaluated in t...,4,- we started you on a new medication called ve...,0,657,54,essential,22494097,mimic-iv
424,20,last week my 61 yr old sister woke up with a t...,so what is the dizziness from i never heard of...,how did they diagnose her with migraine for sp...,discharge instructions you were evaluated in t...,5,take this once daily even if you do not have h...,0,712,76,not-relevant,22494097,mimic-iv
425,20,last week my 61 yr old sister woke up with a t...,so what is the dizziness from i never heard of...,how did they diagnose her with migraine for sp...,discharge instructions you were evaluated in t...,6,"- inform your pcp of this change, and please f...",0,789,116,supplementary,22494097,mimic-iv
426,20,last week my 61 yr old sister woke up with a t...,so what is the dizziness from i never heard of...,how did they diagnose her with migraine for sp...,discharge instructions you were evaluated in t...,7,- you can request your pcp to set up a referra...,0,906,115,not-relevant,22494097,mimic-iv


Normalize Unicode Characters:
- NFKC-Normalization Form KC converts similar-looking characters to a common form

In [192]:
def normalize_unicode(text):
    if isinstance(text, str):  # Ensure it's a string before processing
        return unicodedata.normalize("NFKC", text)  # Normalize to standard form
    return text  # Return unchanged if not a string

In [193]:
df_all[text_columns] = df_all[text_columns].apply(lambda col: col.map(normalize_unicode))

In [194]:
print(df_all.head())

   case_id                                  patient_narrative  \
0        1  i had severe abdomen pain and was hospitalised...   
1        1  i had severe abdomen pain and was hospitalised...   
2        1  i had severe abdomen pain and was hospitalised...   
3        1  i had severe abdomen pain and was hospitalised...   
4        1  i had severe abdomen pain and was hospitalised...   

                                    patient_question  \
0  my question is if the sludge was there does no...   
1  my question is if the sludge was there does no...   
2  my question is if the sludge was there does no...   
3  my question is if the sludge was there does no...   
4  my question is if the sludge was there does no...   

                                  clinician_question  \
0  why was ercp recommended to him over continuin...   
1  why was ercp recommended to him over continuin...   
2  why was ercp recommended to him over continuin...   
3  why was ercp recommended to him over continui

In [195]:
# Function to remove punctuation
def remove_punctuation(text):
    if isinstance(text, str):  # Ensure it's a string before processing
        return re.sub(r"[^\w\s]", "", text)  # Keep only words & spaces
    return text

In [196]:
df_all[text_columns] = df_all[text_columns].apply(lambda col: col.map(remove_punctuation))

In [197]:
df_all.head()

Unnamed: 0,case_id,patient_narrative,patient_question,clinician_question,note_excerpt,sentence_id,sentence_text,paragraph_id,start_char_index,length,relevance,document_id,document_source
0,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,0,brief hospital course,0,0,22,not-relevant,179164_41762,mimic-iii
1,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,1,during the ercp a pancreatic stent was require...,1,0,243,essential,179164_41762,mimic-iii
2,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,2,however due to the patients elevated inr no sp...,1,244,93,not-relevant,179164_41762,mimic-iii
3,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,3,frank pus was noted to be draining from the co...,1,338,151,not-relevant,179164_41762,mimic-iii
4,1,i had severe abdomen pain and was hospitalised...,my question is if the sludge was there does no...,why was ercp recommended to him over continuin...,brief hospital course during the ercp a pancre...,4,the vancomycin was discontinued,1,490,32,not-relevant,179164_41762,mimic-iii


Stemming and lemmatization are not necessary for your use case because ModernBERT and other transformer-based models already handle word variations through subword tokenization (e.g., WordPiece). These models learn contextual word meanings, making additional text normalization redundant and potentially harmful by removing important morphological distinctions.

**Note:** But might experiment with such things later on

Save the processed data (using csv instead of pickle for now since dataset is fairly small)

In [None]:
'''
df_all.to_csv(output_dir / "medical_data.csv", index=False)
print("Cleaned dataset saved successfully!")
''';