# Dev: Section

In [38]:
import re
import sys
import logging
from pathlib import Path
sys.path.insert(1, str(Path.cwd().parent)) 


from radreportparser._pattern import _pattern_keys

## Fn: Find Start & End Position

### Start Position

In [39]:
def _find_start_position(text: str, 
                        keys: list[str] | None,
                        word_boundary: bool = True,
                        flags: re.RegexFlag = re.IGNORECASE,
                        ) -> tuple[int, int]:
    """Helper function to find start position of the section"""
    if keys is None:
        return 0, 0
    # Warn if start pattern appears more than once
    for key in keys:
        x = re.findall(key, text, flags)
        count = len(x)
        if count >= 2:
            logging.warning("Start pattern `%s` appear %d times in text, only the first one will be matched.", key, count)
            
    start_match = _pattern_keys(keys, word_boundary, flags).search(text)
    if not start_match:
        return -1, -1  # Indicate no match found
    return start_match.start(), start_match.end()

In [40]:
txt1 = """CT History: A patient presented with these symptoms:
- Chest Pain
- Dyspnea
History: Another History

technique: 

Comparison: None
"""

_find_start_position(txt1, ["history", "technique"])
# _pattern_keys(["history", "technique"]).search(txt1)



(3, 10)

In [41]:
import re
from typing import List, Tuple

def _find_start_position_all(
    text: str,
    keys: list[str] | None,
    word_boundary: bool = False,
    flags: re.RegexFlag = re.IGNORECASE,
) -> List[Tuple[int, int]]:
    """Helper function to find all start positions of the sections.
    
    Parameters
    ----------
    text : str
        The input text to search through
    keys : list[str] | None
        List of possible section start markers
    word_boundary : bool, optional
        Whether to use word boundaries in pattern matching
    flags : re.RegexFlag, optional
        Regex flags to use in pattern matching
        
    Returns
    -------
    List[Tuple[int, int]]
        List of tuples containing (start, end) positions of all matches.
        Returns [(0, 0)] if keys is None.
        Returns [] if no matches found.
    """
    if keys is None:
        return [(0, 0)]
        
    pattern = _pattern_keys(keys, word_boundary, flags)
    matches = list(pattern.finditer(text))
    if not matches:
        return []
        
    return [(m.start(), m.end()) for m in matches]


In [42]:
txt2 = """
Human: Hello
AI: Hi, How can I help you?

User: None
AI: Bye
"""

_find_start_position_all(txt2, ["Human", "User"])
# _pattern_keys(["history", "technique"]).search(txt1)

[(1, 6), (43, 47)]

### End Position

In [43]:
def _find_end_position_greedy(text: str, 
                            keys: list[str] | None, 
                            start_pos: int,
                            word_boundary: bool = True,
                            flags: re.RegexFlag = re.IGNORECASE,
                            ) -> int:
    """Find the end position of a section using greedy matching.
    
    Searches for any of the end keys and returns the position of the first match found.
    This is faster but less precise when order matters.
    
    Parameters
    ----------
    text : str
        The input text to search through
    keys : list[str] | None
        List of possible end markers
    start_pos : int
        Position in text to start searching from
    word_boundary : bool, optional
        Whether to use word boundaries in pattern matching
    flags : re.RegexFlag, optional
        Regex flags to use in pattern matching
        
    Returns
    -------
    int
        The ending position in the text
    """
    if keys is None:
        return len(text)
    end_match = _pattern_keys(keys, word_boundary, flags).search(text[start_pos:])
    return len(text) if not end_match else start_pos + end_match.start()


def _find_end_position_sequential(text: str, 
                                keys: list[str] | None, 
                                start_pos: int,
                                word_boundary: bool = True,
                                flags: re.RegexFlag = re.IGNORECASE,
                                ) -> int:
    """Find the end position of a section using sequential matching.
    
    Tries each end key in order and returns the position of the first successful match.
    More precise when the order of keys matters.
    
    Parameters
    ----------
    text : str
        The input text to search through
    keys : list[str] | None
        List of possible end markers, tried in order
    start_pos : int
        Position in text to start searching from
    word_boundary : bool, optional
        Whether to use word boundaries in pattern matching
    flags : re.RegexFlag, optional
        Regex flags to use in pattern matching
        
    Returns
    -------
    int
        The ending position in the text
    """
    if keys is None:
        return len(text)
        
    search_text = text[start_pos:]
    
    # Try each key in sequence
    for key in keys:
        # Create pattern for single key
        pattern = _pattern_keys([key], word_boundary, flags)
        match = pattern.search(search_text)
        
        if match:
            # Return position relative to original text
            return start_pos + match.start()
            
    # If no matches found, return end of text
    return len(text)

In [44]:
txt1 = """CT of the chest
History: A patient presented with these symptoms:
- Chest Pain
- Dyspnea
History: Another

technique: CT chest

Comparison: None
"""

_find_end_position_greedy(txt1, ["history", "technique"], start_pos=1)
_find_end_position_sequential(txt1, ["technique", "history"], start_pos=1)

107

## Class: Extract Section 

In [45]:
import re
from typing import Literal

class SectionExtractor:
    """Extract sections from text based on start and end keys.
    
    This class provides functionality to extract sections of text that begin with
    specified start keys and end with specified end keys. It encapsulates the
    pattern matching configuration and provides a reusable interface for text extraction.
    
    Parameters
    ----------
    start_keys : list[str] | None
        List of possible section start markers. If None, the section will be
        extracted from the beginning of the text.
    end_keys : list[str] | None
        List of possible section end markers. If None, the section will be
        extracted until the end of the text.
    include_start_keys : bool, optional
        Whether to include the start key in the extracted section.
        Default is False.
    word_boundary : bool, optional
        Whether to wrap word boundary `\b` around the keys.
        Default is True.
    flags : re.RegexFlag, optional
        Regex flags to use in pattern matching.
        Default is re.IGNORECASE.
    match_strategy : {"greedy", "sequential"}, optional
        Strategy for matching end keys:
        - "greedy": Use first matching end key (faster)
        - "sequential": Try end keys in order (more precise)
        Default is "greedy".
    
    Examples
    --------
    ```{python}
    # Create an extractor for finding text between headers
    extractor = SectionExtractor(
        start_keys=["FINDINGS:"],
        end_keys=["IMPRESSION:", "CONCLUSION:"]
    )
    
    # Extract section from text
    text = "FINDINGS: Normal study. IMPRESSION: No abnormality."
    section = extractor.extract(text)
    print(section)  # Output: "Normal study."
    ```
    """
    
    def __init__(
        self,
        start_keys: list[str] | None,
        end_keys: list[str] | None,
        include_start_keys: bool = False,
        word_boundary: bool = False,
        flags: re.RegexFlag = re.IGNORECASE,
        match_strategy: Literal["greedy", "sequential"] = "greedy",
    ):
        self.start_keys = start_keys
        self.end_keys = end_keys
        self.include_start_keys = include_start_keys
        self.word_boundary = word_boundary
        self.flags = flags
        
        # Validate match strategy
        match_strategy_options = frozenset({"greedy", "sequential"})
        if match_strategy not in match_strategy_options:
            raise ValueError(
                f"Invalid value: {match_strategy}. "
                f"Must be one of: {', '.join(match_strategy_options)}"
            )
        self.match_strategy = match_strategy

    def __repr__(self) -> str:
        """Return a detailed string representation of the SectionExtractor.
        """
        # Format start_keys and end_keys lists
        start_keys_str = f"[{', '.join(repr(k) for k in self.start_keys)}]" if self.start_keys else "None"
        end_keys_str = f"[{', '.join(repr(k) for k in self.end_keys)}]" if self.end_keys else "None"
        
        # Format flags 
        flags_name = self.flags.name if hasattr(self.flags, 'name') else str(self.flags)
        
        return (
            f"{self.__class__.__name__}("
            f"start_keys={start_keys_str}, "
            f"end_keys={end_keys_str}, "
            f"include_start_keys={self.include_start_keys=}, "
            f"word_boundary={self.word_boundary}, "
            f"flags=re.{flags_name}, "
            f"match_strategy='{self.match_strategy}')"
        )

    def extract(self, text: str) -> str:
        """Extract a section from the text using configured patterns.
        
        Parameters
        ----------
        text : str
            The input text to extract section from.
            
        Returns
        -------
        str
            The extracted section text. Returns empty string if section not found.
            
        Examples
        --------
        ```{python}
        extractor = SectionExtractor(
            start_keys=["FINDINGS:"], 
            end_keys=["IMPRESSION:"]
        )
        text = "FINDINGS: Normal. IMPRESSION: Clear."
        section = extractor.extract(text)
        print(section) 
        ```
        """
        # Find start position
        start_idx_start, start_idx_end = _find_start_position(text, self.start_keys)
        if start_idx_start == -1:  # No start match found
            return ""
        
        # Find end position based on strategy
        if self.match_strategy == "greedy":
            end_idx = _find_end_position_greedy(text, self.end_keys, start_idx_start)
        else:
            end_idx = _find_end_position_sequential(text, self.end_keys, start_idx_start)
        
        # Extract the section
        section_start = start_idx_start if self.include_start_keys else start_idx_end
        return text[section_start:end_idx].strip()


    def extract_all(self, text: str) -> List[str]:
            """Extract all sections from the text that match the configured patterns.
            
            Parameters
            ----------
            text : str
                The input text to extract sections from
                
            Returns
            -------
            List[str]
                List of extracted section texts. Returns empty list if no sections found.
                
            Examples
            --------
            ```{python}
            extractor = SectionExtractor(
                start_keys=["FINDING:"],
                end_keys=["IMPRESSION:"]
            )
            text = '''
            FINDING: First observation
            IMPRESSION: OK
            FINDING: Second observation
            IMPRESSION: Also OK
            '''
            sections = extractor.extract_all(text)
            print(sections)  # ['First observation', 'Second observation']
            ```
            """
            # Find all start positions
            start_positions = _find_start_position_all(
                text,
                self.start_keys,
                self.word_boundary,
                self.flags
            )
            
            if not start_positions:
                return []
                
            sections = []
            
            # Process each start position
            for start_idx_start, start_idx_end in start_positions:
                # Find end position based on strategy
                if self.match_strategy == "greedy":
                    end_idx = _find_end_position_greedy(
                        text,
                        self.end_keys,
                        start_idx_start,
                        self.word_boundary,
                        self.flags
                    )
                else:
                    end_idx = _find_end_position_sequential(
                        text,
                        self.end_keys,
                        start_idx_start,
                        self.word_boundary,
                        self.flags
                    )
                
                # Extract the section
                section_start = start_idx_start if self.include_start_keys else start_idx_end
                section = text[section_start:end_idx].strip()
                
                if section:  # Only add non-empty sections
                    sections.append(section)

            return sections

In [46]:
hx_extractor_1 = SectionExtractor(["history", "technique"], ["comparison"], word_boundary = False)
hx_extractor_1.extract(txt1)



': A patient presented with these symptoms:\n- Chest Pain\n- Dyspnea\nHistory: Another\n\ntechnique: CT chest'

In [59]:
txt2 = """
Human: Hello
AI: Hi, How can I help you?

User: None
AI: Bye
"""

SectionExtractor(["Human", "User"], ["AI"], word_boundary = False).extract_all(txt2)

[': Hello', ': None']

## Main: `extract_section()`

### V2

In [48]:
from typing import Literal

def extract_section(text: str,
                   start_keys: list[str] | None,
                   end_keys: list[str] | None,
                   include_start_keys: bool = False,
                   word_boundary: bool = True,
                   flags: re.RegexFlag = re.IGNORECASE,
                   match_strategy: Literal["greedy", "sequential"] = "greedy",
                   ) -> str | Literal[""]:
    """Extract a section of text between specified start and end keys.
    
    [previous docstring content]
    
    Parameters
    ----------
    [previous parameters]
    match_strategy : MatchStrategy, optional
        Strategy for matching end keys:
        - "greedy": Use first matching end key (faster)
        - "sequential": Try end keys in order (more precise)
        Default is GREEDY
    
    Examples
    --------
    >>> text = "FINDINGS: Normal. TECHNIQUE: MRI. IMPRESSION: Clear."
    >>> # Using sequential matching
    >>> extract_section(text, ["FINDINGS:"], 
    ...                ["TECHNIQUE:", "IMPRESSION:"],
    ...                match_strategy="sequential")
    'Normal.'
    """
    # Find start position
    start_idx_start, start_idx_end = _find_start_position(text, start_keys)
    if start_idx_start == -1:  # No start match found
        return ""
    
    # Find end position based on strategy
    match_strategy_options =  frozenset({"greedy", "sequential"})
    if match_strategy not in match_strategy_options:
        raise ValueError(f"Invalid value: {match_strategy}. Must be one of: {', '.join(match_strategy_options)}")
    
    if match_strategy == "greedy":
        end_idx = _find_end_position_greedy(text, end_keys, start_idx_start)
    else:
        end_idx = _find_end_position_sequential(text, end_keys, start_idx_start)
    
    # Extract the section
    section_start = start_idx_start if include_start_keys else start_idx_end
    return text[section_start:end_idx].strip()

In [49]:
extract_section(txt1, ["history", "technique"], ["comparison"])
# extract_section(txt1, None, ["History"])



': A patient presented with these symptoms:\n- Chest Pain\n- Dyspnea\nHistory: Another\n\ntechnique: CT chest'

In [50]:
extract_section(txt1, 
                start_keys=[r"history\W*"], 
                end_keys=[r"Comparison\W*"], 
                word_boundary= False,
                include_start_keys=False)



'A patient presented with these symptoms:\n- Chest Pain\n- Dyspnea\nHistory: Another\n\ntechnique: CT chest'

In [51]:
t_rep_md1 = """
History: A patient presented with these symptoms:
- Chest Pain
- Dyspnea

**technique:** CT chest
"""

extract_section(t_rep_md1, 
                start_keys=[r"\W*History\W*"], 
                end_keys=[r"\W*technique\W*"],
                word_boundary= True,
                include_start_keys=False)

'A patient presented with these symptoms:\n- Chest Pain\n- Dyspnea'

In [52]:
t_rep_md2 = """
Finding: 
- A
- B
- C

**Impression:**
- D
- E
- F
"""

extract_section(t_rep_md2, 
                start_keys=[r"\W*Finding(s?)\W*"], 
                end_keys=[r"\W*Impression\W*"],
                word_boundary= True,
                include_start_keys=False)

'A\n- B\n- C'

In [53]:
t_rep_md3 = """
Clinical Indications: 
- A
- B
- C

**Impression:**
- D
- E
- F
"""

extract_section(t_rep_md3, 
                start_keys= [r"\W*history\W*", r"\W*indication(s?)\W*", *[rf"\W*clinical\s+{h}\W*" for h in ["history", r"indication(s?)"]]],
                end_keys=[r"\W*Impression\W*"],
                word_boundary= True,
                include_start_keys=False)

'A\n- B\n- C'

In [54]:
## Title
t_rep_md4 = """
CT CHEST WITH CONTRAST
CT WHOLE ABODMEN

History: blah blah blah

Comparison: None

Impression: blah blah blah
"""

extract_section(t_rep_md4, 
                start_keys= None,
                end_keys=[r"\W*History\W*"],
                word_boundary= True,
                include_start_keys=False)


'CT CHEST WITH CONTRAST\nCT WHOLE ABODMEN'

In [55]:
## Title
t_rep_md4 = """
CT CHEST WITH CONTRAST
CT WHOLE ABODMEN

Blah
"""

extract_section(t_rep_md4, 
                start_keys= None,
                end_keys=[r"\W*Impression\W*"],
                word_boundary= True,
                include_start_keys=False)


'CT CHEST WITH CONTRAST\nCT WHOLE ABODMEN\n\nBlah'

## Deprecated

In [56]:
from typing import Literal

def extract_section2(text: str,
                   start_keys: list[str] | None,
                   end_keys: list[str] | None,
                   include_start_keys: bool = False,
                   word_boundary: bool = True,
                   flags: re.RegexFlag = re.IGNORECASE,
                   ) -> str | Literal[""]:
    """Deprecated predecessor of ``extract_section`` (no ``match_strategy``).

    Extracts the text between the first start key and the first end key that
    follows it. Returns an empty string when no start key matches.

    NOTE(review): the original body called ``_find_end_position``, which is
    not defined in this file; ``_find_end_position_greedy`` is used here on
    the assumption that it is the renamed helper -- confirm.
    """
    # Find start position
    start_idx_start, start_idx_end = _find_start_position(text, start_keys, word_boundary, flags)
    if start_idx_start == -1:  # No start match found
        return ""
    
    # Find end position, searching only after the start key
    end_idx = _find_end_position_greedy(text, end_keys, start_idx_end, word_boundary, flags)
    
    # Extract the section
    section_start = start_idx_start if include_start_keys else start_idx_end
    return text[section_start:end_idx].strip()

In [57]:

MINIMAL_REPORT_MD = """**EMERGENCY CT BRAIN**

**HISTORY:** 25F, dizziness and LOC

**TECHNIQUE:** CT brain without contrast

**FINDINGS:** Normal study
- No hemorrhage
- No mass

**IMPRESSION:** No acute abnormality"""

def test_extract_section_markdown(report_md):
    """Test section extraction from markdown formatted text"""
    # Keys are used as regular expressions; a bare leading `*` is an invalid
    # quantifier ("nothing to repeat"), so the markdown markers are escaped.
    history = extract_section(
        report_md,
        start_keys=[re.escape("**HISTORY:**")],
        end_keys=[re.escape("**TECHNIQUE:**")],
        include_start_keys=True,
        word_boundary=False)

    # Minimal check that extraction ran and produced text.
    # TODO(review): pin the expected HISTORY content once the fixture is fixed.
    assert isinstance(history, str)
    
    


In [58]:
extract_section(
        MINIMAL_REPORT_MD,
        start_keys=[r"ssss"],
        end_keys=[r"svvv"],
        include_start_keys=True,
        word_boundary=False,
        )

''