# Rads report regex extract

In [1]:
from pyhere import here
import sys
import os
import re

sys.path.append(os.path.abspath('..')) # adding the absolute path

from radreportextract import ReportRegexExtractor

In [2]:
ext = ReportRegexExtractor()

## Extract History

In [3]:
text_hx_1 = """
MRI OF THE BRAIN AND ORBITS 

History: MALT lymphoma at the right orbit S/P chemotherapy was sent to follow-up. 

Technique: 
Sagittal SE T1W 
3D FSE FLAIR FS +Gd with MPR 

Comparison: Limited comparison to the MRI brain on 4-6-2022 
"""

# No Next Section KeyWord
text_hx_2 = """
MRI OF THE BRAIN AND ORBITS 

history: MALT lymphoma at the right orbit S/P chemotherapy was sent to follow-up. 

Sagittal SE T1W 
"""

# No Newline
text_hx_3 = """
MDCT OF THE NECK   Indication: A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1) with lung metastasis   Technique: Post contrast enhanced axial scan of the neck using 1.0 mm slice thickness with 3.0 mm axial, coronal and sagittal reformation   Comparison: The prior CT of the neck taken on October 4, 2015   Findings:  	The current study reveals slight shrinkage but no significant change in extension of the preexisting ill-defined hypodense lesion with partial mild enhancement in some portions, epicenter at the left-sided nasopharynx. Extension of the lesion is described as follow;  ... Superior: No interval change of extension into the left foramen Ovale and left inferior orbital fissure. No significant change of few enhancing foci in the left inferior temporal lobe.  ... Anterior: Involvement of the left infratemporal fossa, left PPF, left retroantral space, and left masticator space. Erosion and sclerotic change of the left pterygoid bone and posterior wall of the left maxillary sinus, unchanged.
"""

# No Newline and Next Section KeyWord
text_hx_4 = """
Indications: A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1).
"""

# No History or Indication
text_hx_5 = """
A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1).
"""

# No History or Indication with end pattern 
text_hx_6 = """
A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1).

technique:
"""

# No History or Indication with no end pattern 
text_hx_7 = """Some key: A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1)."""

# This pattern ensures that it captures the "History" section and stops when it encounters another major section header or the end of the text.
# `|[\n\t]+`: Fallback logic, if none of the section headers are found, the match will stop at one or more consecutive newlines or tab characters
# pattern = r"((history|indication).*?)(?=\b(?:technique|findings|comparison|impression|$)|[\n\t]+)"

start_key = ["history", "indication"]
end_key = ["technique", "comparison", "finding", "impression"]

pattern = fr"(({"|".join(start_key)}).*?)(?=\b(?:{"|".join(end_key)}|$)|[\n\t]+)"
# pattern = r"(^.*)(?=\b(?:technique|findings|comparison|impression|$)|[\n\t]+)"
print(pattern)

# Extract the "History" section

## 1. Simple
match = re.search(pattern, text_hx_1, re.DOTALL | re.IGNORECASE)
print(match.group(0).strip())

## 2. No Next Section KeyWord
match = re.search(pattern, text_hx_2, re.DOTALL | re.IGNORECASE)
print(match.group(0).strip())


## 3. No Newline
match = re.search(pattern, text_hx_3, re.DOTALL | re.IGNORECASE)
print(match.group(0).strip())

## 4. No Newline and Next Section KeyWord
match = re.search(pattern, text_hx_4, re.DOTALL | re.IGNORECASE)
print(match.group(0).strip())

## 5. No Newline and Next Section KeyWord
match = re.search(pattern, text_hx_5, re.DOTALL | re.IGNORECASE)
if match:
    print(match.group(0).strip())
else:
    print("No Match")

((history|indication).*?)(?=\b(?:technique|comparison|finding|impression|$)|[\n\t]+)
History: MALT lymphoma at the right orbit S/P chemotherapy was sent to follow-up.
history: MALT lymphoma at the right orbit S/P chemotherapy was sent to follow-up.
Indication: A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1) with lung metastasis
Indications: A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1).
No Match


In [4]:
# Whole Word vs Regex

match = re.findall(r"\b(history|indication)\b", " history: historyclub indication indications indicationclub", re.DOTALL | re.IGNORECASE)
print(match)

match = re.findall(r"(history|indication)", " history: historyclub indication indications indicationclub", re.DOTALL | re.IGNORECASE)
print(match)

['history', 'indication']
['history', 'history', 'indication', 'indication', 'indication']


In [5]:
["A", "B"] + [ x + "s" for x in ["A", "B"]]

['A', 'B', 'As', 'Bs']

- `start_key`: "history" or "indication"
- `end_key`: "technique", "comparison", "finding", "impression"
- `end_char`: `\n` or `\t`

1. If there is `start_key`, match the input text until found (which ever comes first) `end_key`, `end_char`, or the end of text string.
2. If there is no `start_key`, match from the beginning of input text until found (which ever comes first) `end_key`, `end_char`, or the end of text string.

In [8]:
print(ext.extract_hx(text_hx_1))
print(ext.extract_hx(text_hx_2))
print(ext.extract_hx(text_hx_3))
print(ext.extract_hx(text_hx_4))

# Strick Mode
assert ext.extract_hx(text_hx_5) == ""
assert ext.extract_hx(text_hx_6) == ""
assert ext.extract_hx(text_hx_7) == ""


# Non-Strick Mode
print("\nNon-Strick Mode")
print(ext.extract_hx(text_hx_5, strick = False))
print(ext.extract_hx(text_hx_6, strick = False))
print(ext.extract_hx(text_hx_7, strick = False))

History: MALT lymphoma at the right orbit S/P chemotherapy was sent to follow-up.
history: MALT lymphoma at the right orbit S/P chemotherapy was sent to follow-up.
Indication: A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1) with lung metastasis
Indications: A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1).

Non-Strick Mode
A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1)
A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1).
Some key: A 58-year-old man, known case of nasopharyngeal cancer (T4N2M1)


## Extract Impression

In [7]:
ext.extract_imp()

TypeError: ReportRegexExtractor.extract_imp() missing 1 required positional argument: 'text'