# Dev: Extract Report

In [1]:
import logging
import sys
from pathlib import Path

import pandas as pd
from markdownify import markdownify as md

from radreportparser import RadReportExtractor
# Add the parent of the current working directory to the module search path (at position 1).
# NOTE(review): this runs after the `radreportparser` import above, so on a fresh
# kernel it cannot influence how that import was resolved — confirm the intended order.
sys.path.insert(1, str(Path.cwd().parent)) 


## Example Data

In [2]:
# Example report table used by the cells below (relative path).
report1_df = pd.read_csv(filepath_or_buffer="../_data-test/ReportTable_Ex1.csv")

## Fn: Extract Report 

In [25]:
from typing import Literal

def extract_report(df: pd.DataFrame, 
                   report_col: str | None = None, 
                   report_format: Literal["html", "plain"] = "plain",
                   **kwargs):
    """Extract report text from a given column of DF with option to choose report formatting"""
    extractor = RadReportExtractor()
    # Convert column to string
    df[report_col] = df[report_col].astype(str)
    
    def convert_formatting(text: str, report_format: str):
        return md(text) if report_format == "html" else text
    
    # Extract to `__dict__` column
    try:
        df["__dict__"] = df.apply(lambda row: extractor.extract_all(convert_formatting(row[report_col], report_format), **kwargs).to_dict(), axis=1)
        ## Unnest
        df_out = pd.concat(
            [df.drop(columns='__dict__'), df['__dict__'].apply(pd.Series)], axis=1
        )
    except ValueError as e:
        logging.warning(f"ValueEror: {e}")
        
    return df_out
    

In [21]:
# HTML column: report_format="html" converts each report with markdownify before extraction.
extract_report(
    report1_df,
    report_col="RESULT_TEXT_HTML",
    report_format="html",
    verbose=False,
)

Unnamed: 0.1,Unnamed: 0,HN,RESULT_TEXT_HTML,RESULT_TEXT_MD,RESULT_TEXT_PLAIN,title,history,technique,comparison,findings,impression
0,0,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE NECK** \n \n **H...,MDCT SCAN OF THE NECK History: Right...,**MDCT SCAN OF THE NECK**,"Right preauricular mass for 2 months, FNA---> ...",Plain and enhanced axial CT scans of the neck ...,,The study shows a large lobulated heterogenous...,- A large lobulated heterogenously enhancing i...
1,1,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,Chest upright \n Comparison: None \n Hear...,Chest upright Comparison: None Heart: Norma...,Chest upright,,,None \n Heart: Normal cardiothoracic ratio. ...,,
2,2,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**ULTRASOUND OF THE UPPER ABDOMEN** \n \n ...,ULTRASOUND OF THE UPPER ABDOMEN HISTORY: Hig...,**ULTRASOUND OF THE UPPER ABDOMEN**,High grade mucoepidermoid cancer at right paro...,,None.,**Liver**: Normal size with increased and coar...,- Several small hyperechoic lesions in both he...
3,3,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST** \n \n **History...,MDCT SCAN OF THE CHEST History: RUL mass T...,**MDCT SCAN OF THE CHEST**,RUL mass,Plain and enhanced axial scans of the chest wa...,CT neck on 01/09/2023,**Lungs and pleural cavity**: Centrilobular em...,- A 5.2x4.6x7.5-cm heterogeneous enhancing mas...
4,4,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE BRAIN** \n \n **Hi...,MDCT SCAN OF THE BRAIN History: Case...,**MDCT SCAN OF THE BRAIN**,Case of CA lung with brain metastasis.,Plain axial and enhanced CT scan of the brain ...,,The study reveals a small peripheral enhancing...,1.The 1.4x1.4-cm peripheral enhancing lesion a...
5,5,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR...,US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR M...,**US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR...,"R/O CA lung with bone metastasis, right subman...","Under sterile technique, supine position, loca...",,The US reveals a large hypervascular mass at r...,
6,6,4780844,<font face='Microsoft Sans Serif' size='3'><fo...,Consultation for body interventional radiology...,Consultation for body interventional radiology...,Consultation for body interventional radiology...,,,,,
7,7,6121929,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST** \n \n **HISTORY...,MDCT SCAN OF THE CHEST HISTORY: A 68-year-ol...,**MDCT SCAN OF THE CHEST**,A 68-year-old male with history of heavy smoki...,Plain and enhanced axial helical scan of the c...,None.,**Tubes and lines**: None. \n **Lung and ai...,- A 7.7-cm lobulated mass with spiculate margi...
8,8,4923059,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST:** \n \n **HISTOR...,MDCT SCAN OF THE CHEST: HISTORY: A 71-year-o...,**MDCT SCAN OF THE CHEST:**,A 71-year-old woman was sent for evaluation of...,Non-enhanced and contrast-enhanced axial CT sc...,Chest radiograph dated 29-Nov-2023.,**Lungs and airways**: There is a 3.1x2.0x2.0-...,**- A 3.1x2.0x2.0-cm lobulated enhancing mass ...
9,9,4601040,<font face='Microsoft Sans Serif' size='3'><fo...,**MRI OF THE BRAIN** \n \n **History:** U...,MRI OF THE BRAIN History: Underlying right...,**MRI OF THE BRAIN**,Underlying right breast cancer with a lung mas...,"oSagittal SE T1W, 3D FSE FLAIR/FS+Gd with coro...",,The study of the brain shows a small enhancing...,1.The 0.8x0.5x0.6-cm enhancing extra-axial les...


In [22]:
# Markdown column: report_format="plain" passes the text to the extractor unchanged.
extract_report(
    report1_df,
    report_col="RESULT_TEXT_MD",
    report_format="plain",
    verbose=False,
)

Unnamed: 0.1,Unnamed: 0,HN,RESULT_TEXT_HTML,RESULT_TEXT_MD,RESULT_TEXT_PLAIN,title,history,technique,comparison,findings,impression
0,0,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE NECK** \n \n **H...,MDCT SCAN OF THE NECK History: Right...,**MDCT SCAN OF THE NECK**,"Right preauricular mass for 2 months, FNA---> ...",Plain and enhanced axial CT scans of the neck ...,,The study shows a large lobulated heterogenous...,- A large lobulated heterogenously enhancing i...
1,1,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,Chest upright \n Comparison: None \n Hear...,Chest upright Comparison: None Heart: Norma...,Chest upright,,,None \n Heart: Normal cardiothoracic ratio. ...,,
2,2,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**ULTRASOUND OF THE UPPER ABDOMEN** \n \n ...,ULTRASOUND OF THE UPPER ABDOMEN HISTORY: Hig...,**ULTRASOUND OF THE UPPER ABDOMEN**,High grade mucoepidermoid cancer at right paro...,,None.,**Liver**: Normal size with increased and coar...,- Several small hyperechoic lesions in both he...
3,3,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST** \n \n **History...,MDCT SCAN OF THE CHEST History: RUL mass T...,**MDCT SCAN OF THE CHEST**,RUL mass,Plain and enhanced axial scans of the chest wa...,CT neck on 01/09/2023,**Lungs and pleural cavity**: Centrilobular em...,- A 5.2x4.6x7.5-cm heterogeneous enhancing mas...
4,4,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE BRAIN** \n \n **Hi...,MDCT SCAN OF THE BRAIN History: Case...,**MDCT SCAN OF THE BRAIN**,Case of CA lung with brain metastasis.,Plain axial and enhanced CT scan of the brain ...,,The study reveals a small peripheral enhancing...,1.The 1.4x1.4-cm peripheral enhancing lesion a...
5,5,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR...,US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR M...,**US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR...,"R/O CA lung with bone metastasis, right subman...","Under sterile technique, supine position, loca...",,The US reveals a large hypervascular mass at r...,
6,6,4780844,<font face='Microsoft Sans Serif' size='3'><fo...,Consultation for body interventional radiology...,Consultation for body interventional radiology...,Consultation for body interventional radiology...,,,,,
7,7,6121929,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST** \n \n **HISTORY...,MDCT SCAN OF THE CHEST HISTORY: A 68-year-ol...,**MDCT SCAN OF THE CHEST**,A 68-year-old male with history of heavy smoki...,Plain and enhanced axial helical scan of the c...,None.,**Tubes and lines**: None. \n **Lung and ai...,- A 7.7-cm lobulated mass with spiculate margi...
8,8,4923059,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST:** \n \n **HISTOR...,MDCT SCAN OF THE CHEST: HISTORY: A 71-year-o...,**MDCT SCAN OF THE CHEST:**,A 71-year-old woman was sent for evaluation of...,Non-enhanced and contrast-enhanced axial CT sc...,Chest radiograph dated 29-Nov-2023.,**Lungs and airways**: There is a 3.1x2.0x2.0-...,**- A 3.1x2.0x2.0-cm lobulated enhancing mass ...
9,9,4601040,<font face='Microsoft Sans Serif' size='3'><fo...,**MRI OF THE BRAIN** \n \n **History:** U...,MRI OF THE BRAIN History: Underlying right...,**MRI OF THE BRAIN**,Underlying right breast cancer with a lung mas...,"oSagittal SE T1W, 3D FSE FLAIR/FS+Gd with coro...",,The study of the brain shows a small enhancing...,1.The 0.8x0.5x0.6-cm enhancing extra-axial les...


In [24]:
# Non-text column: the values are cast to str before being handed to the extractor.
extract_report(
    report1_df,
    report_col="Unnamed: 0",
    report_format="plain",
    verbose=False,
)

Unnamed: 0.1,Unnamed: 0,HN,RESULT_TEXT_HTML,RESULT_TEXT_MD,RESULT_TEXT_PLAIN,title,history,technique,comparison,findings,impression
0,0,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE NECK** \n \n **H...,MDCT SCAN OF THE NECK History: Right...,0,,,,,
1,1,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,Chest upright \n Comparison: None \n Hear...,Chest upright Comparison: None Heart: Norma...,1,,,,,
2,2,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**ULTRASOUND OF THE UPPER ABDOMEN** \n \n ...,ULTRASOUND OF THE UPPER ABDOMEN HISTORY: Hig...,2,,,,,
3,3,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST** \n \n **History...,MDCT SCAN OF THE CHEST History: RUL mass T...,3,,,,,
4,4,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE BRAIN** \n \n **Hi...,MDCT SCAN OF THE BRAIN History: Case...,4,,,,,
5,5,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR...,US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR M...,5,,,,,
6,6,4780844,<font face='Microsoft Sans Serif' size='3'><fo...,Consultation for body interventional radiology...,Consultation for body interventional radiology...,6,,,,,
7,7,6121929,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST** \n \n **HISTORY...,MDCT SCAN OF THE CHEST HISTORY: A 68-year-ol...,7,,,,,
8,8,4923059,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST:** \n \n **HISTOR...,MDCT SCAN OF THE CHEST: HISTORY: A 71-year-o...,8,,,,,
9,9,4601040,<font face='Microsoft Sans Serif' size='3'><fo...,**MRI OF THE BRAIN** \n \n **History:** U...,MRI OF THE BRAIN History: Underlying right...,9,,,,,


## Extract Report Multiple

In [None]:
# Extractor instance used by the row-wise extraction below.
extractor = RadReportExtractor()

In [None]:
# Add a `dict_out` column holding, for every row, the dict produced by
# running the extractor on that row's Markdown report text.
report1_ext_df = report1_df.assign(
    dict_out=lambda frame: frame.apply(
        lambda record: extractor.extract_all(record["RESULT_TEXT_MD"]).to_dict(),
        axis=1,
    )
)

Start pattern [^\w\n]*Findings?[^\w\n]* appear 2 times in text, only the first one will be matched.
Start pattern [^\w\n]*Techniques?[^\w\n]* appear 2 times in text, only the first one will be matched.
Start pattern [^\w\n]*History[^\w\n]* appear 2 times in text, only the first one will be matched.
Start pattern [^\w\n]*Findings?[^\w\n]* appear 2 times in text, only the first one will be matched.
Start pattern [^\w\n]*Findings?[^\w\n]* appear 2 times in text, only the first one will be matched.


In [None]:
# Unnest: expand every `dict_out` dictionary into its own columns and place
# them next to the remaining columns (the raw `dict_out` column is dropped).
pd.concat(
    objs=[
        report1_ext_df.drop(columns="dict_out"),
        report1_ext_df["dict_out"].apply(pd.Series),
    ],
    axis="columns",
)

Unnamed: 0.1,Unnamed: 0,HN,RESULT_TEXT_HTML,RESULT_TEXT_MD,RESULT_TEXT_PLAIN,title,history,technique,comparison,findings,impression
0,0,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE NECK** \n \n **H...,MDCT SCAN OF THE NECK History: Right...,**MDCT SCAN OF THE NECK**,"Right preauricular mass for 2 months, FNA---> ...",Plain and enhanced axial CT scans of the neck ...,,The study shows a large lobulated heterogenous...,- A large lobulated heterogenously enhancing i...
1,1,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,Chest upright \n Comparison: None \n Hear...,Chest upright Comparison: None Heart: Norma...,Chest upright,,,None \n Heart: Normal cardiothoracic ratio. ...,,
2,2,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**ULTRASOUND OF THE UPPER ABDOMEN** \n \n ...,ULTRASOUND OF THE UPPER ABDOMEN HISTORY: Hig...,**ULTRASOUND OF THE UPPER ABDOMEN**,High grade mucoepidermoid cancer at right paro...,,None.,**Liver**: Normal size with increased and coar...,- Several small hyperechoic lesions in both he...
3,3,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST** \n \n **History...,MDCT SCAN OF THE CHEST History: RUL mass T...,**MDCT SCAN OF THE CHEST**,RUL mass,Plain and enhanced axial scans of the chest wa...,CT neck on 01/09/2023,**Lungs and pleural cavity**: Centrilobular em...,- A 5.2x4.6x7.5-cm heterogeneous enhancing mas...
4,4,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE BRAIN** \n \n **Hi...,MDCT SCAN OF THE BRAIN History: Case...,**MDCT SCAN OF THE BRAIN**,Case of CA lung with brain metastasis.,Plain axial and enhanced CT scan of the brain ...,,The study reveals a small peripheral enhancing...,1.The 1.4x1.4-cm peripheral enhancing lesion a...
5,5,6092203,<font face='Microsoft Sans Serif' size='3'><fo...,**US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR...,US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR M...,**US- GUIDED BIOPSY OF THE RIGHT SUBMANDIBULAR...,"R/O CA lung with bone metastasis, right subman...","Under sterile technique, supine position, loca...",,The US reveals a large hypervascular mass at r...,
6,6,4780844,<font face='Microsoft Sans Serif' size='3'><fo...,Consultation for body interventional radiology...,Consultation for body interventional radiology...,Consultation for body interventional radiology...,,,,,
7,7,6121929,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST** \n \n **HISTORY...,MDCT SCAN OF THE CHEST HISTORY: A 68-year-ol...,**MDCT SCAN OF THE CHEST**,A 68-year-old male with history of heavy smoki...,Plain and enhanced axial helical scan of the c...,None.,**Tubes and lines**: None. \n **Lung and ai...,- A 7.7-cm lobulated mass with spiculate margi...
8,8,4923059,<font face='Microsoft Sans Serif' size='3'><fo...,**MDCT SCAN OF THE CHEST:** \n \n **HISTOR...,MDCT SCAN OF THE CHEST: HISTORY: A 71-year-o...,**MDCT SCAN OF THE CHEST:**,A 71-year-old woman was sent for evaluation of...,Non-enhanced and contrast-enhanced axial CT sc...,Chest radiograph dated 29-Nov-2023.,**Lungs and airways**: There is a 3.1x2.0x2.0-...,**- A 3.1x2.0x2.0-cm lobulated enhancing mass ...
9,9,4601040,<font face='Microsoft Sans Serif' size='3'><fo...,**MRI OF THE BRAIN** \n \n **History:** U...,MRI OF THE BRAIN History: Underlying right...,**MRI OF THE BRAIN**,Underlying right breast cancer with a lung mas...,"oSagittal SE T1W, 3D FSE FLAIR/FS+Gd with coro...",,The study of the brain shows a small enhancing...,1.The 0.8x0.5x0.6-cm enhancing extra-axial les...


## Extract Report Single

In [None]:
# Rebind `extractor` to a fresh instance for the single-report examples in this section.
extractor = RadReportExtractor()

In [None]:
# Text as stored -> extract_all(): parse the Markdown report of the row labelled 0
# and show the result as a plain dict.
extractor.extract_all(report1_df.loc[0, "RESULT_TEXT_MD"]).to_dict()

{'title': '**MDCT SCAN OF THE NECK**',
 'history': 'Right preauricular mass for 2 months, FNA---> suspicious for high grade mucoepidermoid CA',
 'technique': 'Plain and enhanced axial CT scans of the neck were performed with 1.0 mm slice thickness from the skull base to the root of neck with coronal and sagittal reconstruction.',
 'comparison': '',
 'findings': 'The study shows a large lobulated heterogenously enhancing isodense mass causing expansile osteolytic destruction of the right mandibular angle-ramus, measuring 3.9x4.6x5.7 cm in transaxial and CC dimensions, probably a bony metastasis.    \n The right masseter muscle, anterior part of right parotid gland ( superficial and deep lobes), right medial pterygoid muscle, right submandibular gland appear to be compressed, Hypdense change of right masseter muscle is seen, could be due to edema.    \n   \n A 0.5 cm rim enhancing nodule, likely a necrotic LN at right level Ib is noted.    \n Small nonspecific LNs at bilateral level I-V 

In [None]:
# HTML -> Markdown (markdownify) -> extract_all(): same report as above, starting
# from the HTML column; the returned report object is displayed as-is.
extractor.extract_all(md(report1_df.loc[0, "RESULT_TEXT_HTML"]))

RadReport(title='**MDCT SCAN OF THE NECK**', history='Right preauricular mass for 2 months, FNA---> suspicious for high grade mucoepidermoid CA', technique='Plain and enhanced axial CT scans of the neck were performed with 1.0 mm slice thickness from the skull base to the root of neck with coronal and sagittal reconstruction.', comparison='', findings='The study shows a large lobulated heterogenously enhancing isodense mass causing expansile osteolytic destruction of the right mandibular angle-ramus, measuring 3.9x4.6x5.7 cm in transaxial and CC dimensions, probably a bony metastasis.    \n The right masseter muscle, anterior part of right parotid gland ( superficial and deep lobes), right medial pterygoid muscle, right submandibular gland appear to be compressed, Hypdense change of right masseter muscle is seen, could be due to edema.    \n   \n A 0.5 cm rim enhancing nodule, likely a necrotic LN at right level Ib is noted.    \n Small nonspecific LNs at bilateral level I-V are observ

## HowTo