# Dev: Pattern & Keys

In [1]:
import re

In [72]:
from typing import Any

## Pattern

### `_pattern_start_keys()`

- No word boundary: `(history).*?`
- With word boundary: `\b(history)\b.*?`

In [447]:
def _pattern_start_keys(
    keys: list[str], 
    word_boundary: bool = True,
    flags: re.RegexFlag = re.DOTALL | re.IGNORECASE
    ) -> Any:
    """
    Creates a regex pattern that matches the start of a string with any of the keys in the list.
    """
    if word_boundary:
        # \b is a word boundary, which matches the position where a word starts or ends
        pattern = rf"\b({'|'.join(keys)})\b.*?"
    else:
        # Regex pattern that matches any of the keys in the list
        pattern = rf"({'|'.join(keys)}).*?"
    return re.compile(pattern, flags = flags)


In [448]:
# Keys are regex fragments: `\W*hello\W*` also consumes the ": " that follows "hello".
# The second positional argument (False) disables the word-boundary anchors.
p1 = _pattern_start_keys([r"\W*hello\W*", "world"], False)
s1 = p1.search("vvvvhello: vvvv world vvvv")
s1

<re.Match object; span=(4, 11), match='hello: '>

### `_pattern_keys()`

In [553]:
def _pattern_keys(
    keys: list[str], 
    word_boundary: bool = True,
    flags: re.RegexFlag = re.IGNORECASE
    ) -> Any:
    """
    Create regex pattern for matching given keys.
    """
    if len(keys) == 0:
        raise ValueError("keys must have at least one element")
    
    if word_boundary:
        # \b is a word boundary, which matches the position where a word starts or ends
        pattern = rf"\b({'|'.join(keys)})\b"
    else:
        # Regex pattern that matches any of the keys in the list
        pattern = rf"({'|'.join(keys)})"
    return re.compile(pattern, flags = flags)



In [538]:
# `_pattern_keys` rejects an empty key list, so building a pattern from []
# must raise instead of returning a pattern that matches the empty string.
try:
    _pattern_keys([])
except ValueError:
    pass
else:
    raise AssertionError("_pattern_keys([]) should raise ValueError")

In [550]:
# NOTE(review): `_pattern_keys` raises ValueError for an empty list, so both
# calls fail when this cell is re-run. The recorded output appears to come
# from an earlier definition without that check; confirm, then update or drop.
_pattern_keys([]).match('test').span() 
_pattern_keys([]).search('test')

<re.Match object; span=(0, 0), match=''>

In [562]:
# Without word boundaries the key is found inside a longer word.
pattern = _pattern_keys(['hist'], word_boundary=False)
pattern.search('hhisttt')

<re.Match object; span=(1, 5), match='hist'>

In [575]:
# Keys are regexes: the trailing `\W*` absorbs the punctuation after the key,
# and the default IGNORECASE flag lets "a" match "A".
pattern = _pattern_keys([r'a\W*'], word_boundary=False)
pattern.search('A--')

<re.Match object; span=(0, 3), match='A--'>

## Extract Section: `extract_section()`

In [508]:
from typing import Literal

### v2

In [509]:
def extract_section(text: str,
                   start_keys: list[str] | None,
                   end_keys: list[str] | None,
                   include_start_keys: bool = False,
                   word_boundary: bool = True,
                   flags: re.RegexFlag = re.IGNORECASE,
                   ) -> str:
    """
    Extract the part of `text` between a start key and the next end key.

    Args:
        text: Text to search.
        start_keys: Regex fragments marking the section start. None means the
            section starts at the beginning of `text`.
        end_keys: Regex fragments marking the section end. None, or no match
            after the start key, means the section runs to the end of `text`.
        include_start_keys: If True, the matched start key is kept in the result.
        word_boundary: Passed to `_pattern_keys` for both key lists.
        flags: Regex flags passed to `_pattern_keys` for both key lists.

    Returns:
        The section with surrounding whitespace stripped, or "" when
        `start_keys` is given but none of them matches.

    Raises:
        ValueError: Propagated from `_pattern_keys` if a key list is empty.
    """

    def find_start_position(text: str, keys: list[str] | None) -> tuple[int, int]:
        """Return the (start, end) span of the first start-key match, or (-1, -1) if none."""
        if keys is None:
            return 0, 0
        start_match = _pattern_keys(keys, word_boundary, flags).search(text)
        if start_match is None:
            return -1, -1  # Indicate no match found
        return start_match.span()

    def find_end_position(text: str, keys: list[str] | None, search_from: int) -> int:
        """Return the index where the first end key at or after `search_from` begins."""
        if keys is None:
            return len(text)
        # Search the full string with `pos` instead of a slice: slicing would
        # make the cut look like a string start / word boundary to the regex.
        end_match = _pattern_keys(keys, word_boundary, flags).search(text, search_from)
        return len(text) if end_match is None else end_match.start()

    # Find start position
    start_idx_start, start_idx_end = find_start_position(text, start_keys)
    if start_idx_start == -1:  # No start match found
        return ""

    # Look for the end key only AFTER the start key. Searching from the
    # beginning of the start match lets an end key that also occurs inside the
    # start key (e.g. start "Clinical History", end "History") terminate the
    # section before it begins, yielding an empty result.
    end_idx = find_end_position(text, end_keys, start_idx_end)

    # Extract the section
    section_start = start_idx_start if include_start_keys else start_idx_end
    return text[section_start:end_idx].strip()

### v1

In [None]:
def extract_section2(text: str, 
                    start_keys: list[str] | None,
                    end_keys: list[str] | None,
                    include_start_keys: bool = False,
                    word_boundary: bool = True,
                    flags: re.RegexFlag = re.IGNORECASE,
                    ) -> str:
    """
    Extract the part of `text` between a start key and the next end key (v1, flat form).

    Args:
        text: Text to search.
        start_keys: Regex fragments marking the section start; None starts at
            the beginning of `text`.
        end_keys: Regex fragments marking the section end; None, or no match
            after the start key, ends at the end of `text`.
        include_start_keys: If True, the matched start key is kept in the result.
        word_boundary: Passed to `_pattern_keys` for both key lists.
        flags: Regex flags passed to `_pattern_keys` for both key lists.

    Returns:
        The stripped section, or "" when `start_keys` is given but nothing matches.

    Raises:
        ValueError: Propagated from `_pattern_keys` if a key list is empty.
    """
    if start_keys is None:
        start_idx_start = start_idx_end = 0
    else:
        # First find the starting point
        start_match = _pattern_keys(start_keys, word_boundary, flags).search(text)
        if start_match is None:
            return ""
        # Span of the matched start key in the original text
        start_idx_start, start_idx_end = start_match.span()

    if end_keys is None:
        # If there are no end keys, extract the section from the start key to the end of the text
        end_idx = len(text)
    else:
        # Search only after the start key, on the full string (via `pos`), so
        # an end key occurring inside the start key cannot end the section
        # before it begins and word boundaries see the real preceding character.
        end_match = _pattern_keys(end_keys, word_boundary, flags).search(text, start_idx_end)
        end_idx = len(text) if end_match is None else end_match.start()

    # Keep or drop the start key itself
    section_start = start_idx_start if include_start_keys else start_idx_end
    return text[section_start:end_idx].strip()

In [518]:

# Plain-text report fixture with History / technique / Comparison sections.
t_rep1 = """
History: A patient presented with these symptoms:
- Chest Pain
- Dyspnea

technique: CT chest

Comparison: None
"""
# Extract the text after the first start key, up to the first end key that
# follows it ("technique" here). Matching ignores case by default.
section = extract_section(t_rep1, 
                          start_keys=["History:", "indication"], 
                          end_keys=["comparison", "technique"], 
                          word_boundary= False,
                          include_start_keys=False)
print(section)

A patient presented with these symptoms:
- Chest Pain
- Dyspnea


In [511]:
# No start key and no end key: the whole text is returned, stripped.
section = extract_section(t_rep1, 
                          start_keys=None, 
                          end_keys=None, 
                          word_boundary= False,
                          include_start_keys=False)
print(section)

History: A patient presented with these symptoms:
- Chest Pain
- Dyspnea

technique: CT chest

Comparison: None


In [512]:

# A bare key matches only the word itself, so the separator after it stays in
# the section (the result starts with ": "). No end key: runs to end of text.
extract_section(t_rep1, 
                start_keys=["History"], 
                end_keys=None, 
                word_boundary= True,
                include_start_keys=False)

': A patient presented with these symptoms:\n- Chest Pain\n- Dyspnea\n\ntechnique: CT chest\n\nComparison: None'

In [513]:
# Markdown-style report: the end key is wrapped in bold markers.
t_rep_md1 = """
History: A patient presented with these symptoms:
- Chest Pain
- Dyspnea

**technique:** CT chest
"""

# Wrapping each key in `\W*` lets the match absorb surrounding punctuation
# such as ":" and "**", so none of it leaks into the extracted section.
extract_section(t_rep_md1, 
                start_keys=[r"\W*History\W*"], 
                end_keys=[r"\W*technique\W*"],
                word_boundary= True,
                include_start_keys=False)

'A patient presented with these symptoms:\n- Chest Pain\n- Dyspnea'

In [514]:
# Report whose sections are bullet lists.
t_rep_md2 = """
Finding: 
- A
- B
- C

**Impression:**
- D
- E
- F
"""

# `(s?)` makes the plural optional. The trailing `\W*` of the start key also
# consumes the "- " of the first bullet, so the result begins with "A".
extract_section(t_rep_md2, 
                start_keys=[r"\W*Finding(s?)\W*"], 
                end_keys=[r"\W*Impression\W*"],
                word_boundary= True,
                include_start_keys=False)

'A\n- B\n- C'

In [515]:
# Report whose first heading is a two-word key ("Clinical Indications").
t_rep_md3 = """
Clinical Indications: 
- A
- B
- C

**Impression:**
- D
- E
- F
"""

# Several alternative start keys: "history", "indication(s)", and the
# "clinical <history|indication(s)>" variants built by the comprehension.
extract_section(t_rep_md3, 
                start_keys= [r"\W*history\W*", r"\W*indication(s?)\W*", *[rf"\W*clinical\s+{h}\W*" for h in ["history", r"indication(s?)"]]],
                end_keys=[r"\W*Impression\W*"],
                word_boundary= True,
                include_start_keys=False)

'A\n- B\n- C'

In [521]:
# Title block: with no start key the section runs from the top of the text
# to the first end key ("History" here).
t_rep_md4 = """
CT CHEST WITH CONTRAST
CT WHOLE ABODMEN

History: blah blah blah

Comparison: None

Impression: blah blah blah
"""

extract_section(t_rep_md4, 
                start_keys= None,
                end_keys=[r"\W*History\W*"],
                word_boundary= True,
                include_start_keys=False)


'CT CHEST WITH CONTRAST\nCT WHOLE ABODMEN'

In [524]:
# Title block, end key absent: when no end key is found the section extends
# to the end of the text. Note this cell rebinds `t_rep_md4`.
t_rep_md4 = """
CT CHEST WITH CONTRAST
CT WHOLE ABODMEN

Blah
"""

extract_section(t_rep_md4, 
                start_keys= None,
                end_keys=[r"\W*Impression\W*"],
                word_boundary= True,
                include_start_keys=False)


'CT CHEST WITH CONTRAST\nCT WHOLE ABODMEN\n\nBlah'

## Get Match

### `_get_first_key_match()`

In [516]:
def _get_first_key_match(
    text: str,
    keys: list[str], 
    regex: bool = False,
    **kwargs,
    ) -> str | Any | None:
    """
    Find the first occurrence of any key in `text`.

    The pattern comes from `_pattern_start_keys`, whose default flags make the
    search case-insensitive and let "." match newlines.

    NOTE(review): `regex` is forwarded positionally as the `word_boundary`
    argument of `_pattern_start_keys`; it does not switch between literal and
    regex interpretation of the keys. Confirm whether that is intended.

    Args:
        text: Text to search.
        keys: Regex fragments to look for.
        regex: Passed as the second positional argument of `_pattern_start_keys`.
        **kwargs: Extra keyword arguments for `_pattern_start_keys` (e.g. `flags`).

    Returns:
        The text captured by group 1 of the first match, or None if nothing matches.
    """
    first_hit = _pattern_start_keys(keys, regex, **kwargs).search(text)
    if first_hit is None:
        return None
    return first_hit.group(1)

In [268]:
# Both keys occur; the one that appears earliest in the text is returned.
_get_first_key_match("hello world", keys=["hello", "world"], regex=False)

'hello'

### `_get_all_key_matches()`

In [None]:
def _get_all_key_matches(
    text: str,
    keys: list[str], 
    regex: bool = False,
    **kwargs,
    ) -> list[str]:
    """
    Return every key match found in `text`, in order of appearance.

    The pattern comes from `_pattern_start_keys`, whose default flags make the
    search case-insensitive and let "." match newlines.

    NOTE(review): `regex` is forwarded positionally as the `word_boundary`
    argument of `_pattern_start_keys`; confirm whether that is intended.

    Args:
        text: Text to search.
        keys: Regex fragments to look for.
        regex: Passed as the second positional argument of `_pattern_start_keys`.
        **kwargs: Extra keyword arguments for `_pattern_start_keys` (e.g. `flags`).

    Returns:
        The text captured by group 1 for each match; an empty list if nothing matches.
    """
    pattern = _pattern_start_keys(keys, regex, **kwargs)
    # `findall` switches to returning tuples as soon as a key contains its own
    # capture group (e.g. r"indication(s?)"). Reading group 1 from each match
    # always yields the matched key as a plain string, which is also what
    # `_get_first_key_match` returns.
    return [found.group(1) for found in pattern.finditer(text)]

In [196]:
# Repeated keys are reported once per occurrence, in text order.
_get_all_key_matches("hello world hello", keys=["hello", "world"], regex=False)

['hello', 'world', 'hello']

## Tester

In [199]:
# Fixture with a bold key, a key glued to another word ("ClinicalHistory"),
# and a plain key.
t_sk1 = """
**History:** Rule out pneumonia
ClinicalHistory: Patient with chest pain
Indication: blah blah

Comparison: None
"""

# Fixture containing only an abbreviation ("Hist") of the key.
t_sk2 = """
Hist: Patient with chest pain
Comparison: None
"""

# The third positional argument is forwarded as `word_boundary`, so True
# requires whole-word matches.
assert _get_first_key_match(t_sk1, ["History", "Indication"], True) == "History"
# "Hist" does not contain the full key "History", so nothing matches.
assert _get_first_key_match(t_sk2, ["History"], ) is None

# With word boundaries on, "ClinicalHistory" is skipped: there is no boundary
# before its "History".
assert _get_all_key_matches(t_sk1, ["History", "Indication"], True) == ["History", "Indication"]

## How To

In [202]:
# Fixture for the raw-regex walkthrough.
t_sk3 = """
**History:** Patient with chest pain
Indication: blah blah

Comparison: None
"""

# The same pattern `_pattern_start_keys` builds with word boundaries on. The
# trailing lazy `.*?` matches nothing, so the match is just the key itself.
re.search(r"\b(History|indication)\b.*?", t_sk3, re.DOTALL | re.IGNORECASE)

<re.Match object; span=(3, 10), match='History'>

In [92]:
# Form without word boundaries; group(1) is the matched key.
# NOTE(review): the f-string has no placeholders, so the `f` prefix is
# redundant. `.group(1)` would fail if nothing matched.
re.search(f"(History|Indication).*?", t_sk1, re.DOTALL | re.IGNORECASE).group(1)

'History'