# Dev: Pattern & Keys

In [1]:
import re

In [72]:
from typing import Any

## Pattern

### `_pattern_start_keys()`

- No word boundary (`regex=True`): `(history).*?`
- With word boundary (`regex=False`, the default): `\b(history)\b.*?`

In [None]:
def _pattern_start_keys(
    keys: list[str], 
    regex: bool = False,
    flags: re.RegexFlag = re.DOTALL | re.IGNORECASE
    ) -> Any:
    """
    Creates a regex pattern that matches the start of a string with any of the keys in the list.
    """
    if regex:
        # Regex pattern that matches any of the keys in the list
        pattern = rf"({'|'.join(keys)}).*?"
    else:
        # \b is a word boundary, which matches the position where a word starts or ends
        pattern = rf"\b({'|'.join(keys)})\b.*?"
    return re.compile(pattern, flags = flags)


In [263]:
# Demo: with regex=True the keys are joined without word boundaries.
p1 = _pattern_start_keys(["hello", "world"], regex=True)
# search() returns the leftmost match, i.e. "hello" rather than "world"
s1 = p1.search("vvvv hello vvvv world vvvv")
# Evaluated but not displayed: only the last expression of a cell is rendered
s1.start()
# End offset of the match; the trailing lazy `.*?` adds nothing to the span
s1.end()

10

### `_pattern_keys()`

In [357]:

def _pattern_keys(
    keys: list[str], 
    regex: bool = False,
    flags: re.RegexFlag = re.IGNORECASE
    ) -> Any:
    if regex:
        # Regex pattern that matches any of the keys in the list
        pattern = rf"({'|'.join(keys)})"
    else:
        # \b is a word boundary, which matches the position where a word starts or ends
        pattern = rf"\b({'|'.join(keys)})\b"
    return re.compile(pattern, flags = flags)


## Extract Section: `extract_section()`

In [347]:
def extract_section(text: str,
                    start_keys: list[str],
                    end_keys: list[str] | None,
                    include_start_keys: bool = False,
                    regex: bool = True,
                    flags: re.RegexFlag = re.IGNORECASE,
                    ) -> str:
    """
    Extract the part of `text` between a start key and the next end key.

    Args:
        text: Text to search.
        start_keys: Keys that may open the section; the leftmost match wins.
        end_keys: Keys that may close the section. The section ends just
            before the first end key found after the start key. If None or
            empty, or if no end key is found, the section runs to the end of
            the text.
        include_start_keys: If True, the matched start key is kept at the
            beginning of the result; otherwise the result begins right after it.
        regex: Passed to `_pattern_keys`: True means keys are regular
            expressions, False means whole-word matching.
        flags: Regex flags passed to `_pattern_keys`.

    Returns:
        The section with surrounding whitespace stripped, or "" when no start
        key is found.
    """
    # First find the starting point
    start_match = _pattern_keys(start_keys, regex, flags).search(text)
    if not start_match:
        return ""

    # Default: the section runs to the end of the text
    end_idx = len(text)
    if end_keys:
        # Look for the end key only AFTER the start key, so an end key that
        # also occurs inside the start key cannot close the section before it
        # opens. Passing `pos` (instead of slicing) keeps the match offsets
        # expressed in the coordinates of the original text.
        end_match = _pattern_keys(end_keys, regex, flags).search(text, start_match.end())
        if end_match:
            end_idx = end_match.start()

    # Keep or drop the start key itself
    begin_idx = start_match.start() if include_start_keys else start_match.end()
    return text[begin_idx:end_idx].strip()

In [407]:

# Sample report that contains a "History" heading
t_rep1 = """
History: A patient presented with these symptoms:
- Chest Pain
- Dyspnea

technique: CT chest

Comparison: None
"""
# Sample report without any start heading
t_rep2 = """
A patient presented with these symptoms:
- Chest Pain
- Dyspnea

technique: CT chest
"""

# Extract the history section; "His..ry:" is a regex whose dots match any character
section = extract_section(
    text=t_rep1,
    start_keys=["His..ry:", "indication"],
    end_keys=["comparison", "technique"],
    include_start_keys=False,
    regex=True,
)
print(section)

A patient presented with these symptoms:
- Chest Pain
- Dyspnea


In [356]:
# Without end keys the section runs to the end of the text
extract_section(
    text=t_rep1,
    start_keys=["History"],
    end_keys=None,
    include_start_keys=False,
    regex=True,
)

'A patient presented with these symptoms:\n- Chest Pain\n- Dyspnea\n\ntechnique: CT chest'

In [345]:
# Report whose end heading is wrapped in markdown bold markers
t_rep_md1 = """
History: A patient presented with these symptoms:
- Chest Pain
- Dyspnea

**technique:** CT chest
"""

# Surrounding \W* lets each key absorb adjacent punctuation, markup and whitespace
extract_section(
    text=t_rep_md1,
    start_keys=[r"\W*History\W*"],
    end_keys=[r"\W*technique\W*"],
    include_start_keys=False,
    regex=True,
)

'A patient presented with these symptoms:\n- Chest Pain\n- Dyspnea'

In [384]:
# Report with a bulleted findings list followed by a bold "Impression" heading
t_rep_md2 = """
Finding: 
- A
- B
- C

**Impression:**
- D
- E
- F
"""

# The trailing \W* of the start key also consumes the first bullet marker ("- ")
extract_section(
    text=t_rep_md2,
    start_keys=[r"\W*Finding(s?)\W*"],
    end_keys=[r"\W*Impression\W*"],
    include_start_keys=False,
    regex=True,
)

'A\n- B\n- C'

In [399]:
# Report whose start heading is a two-word variant ("Clinical Indications")
t_rep_md3 = """
Clinical Indications: 
- A
- B
- C

**Impression:**
- D
- E
- F
"""

# Start keys: the bare headings plus their "clinical <heading>" variants
extract_section(
    text=t_rep_md3,
    start_keys=[
        r"\W*history\W*",
        r"\W*indication(s?)\W*",
        *[
            rf"\W*clinical\s+{h}\W*"
            for h in ["history", r"indication(s?)"]
        ],
    ],
    end_keys=[r"\W*Impression\W*"],
    include_start_keys=False,
    regex=True,
)

'A\n- B\n- C'

## Get Match

### `_get_first_key_match()`

In [267]:
def _get_first_key_match(
    text: str,
    keys: list[str], 
    regex: bool = False,
    **kwargs,
    ) -> str | Any | None:
    """
    Find the leftmost occurrence of any key in `text`.

    The pattern comes from `_pattern_start_keys`, whose default flags ignore
    case and let "." match newlines; extra keyword arguments (e.g. `flags`)
    are forwarded to it.

    Returns:
        The matched key text (capture group 1), or None when nothing matches.
    """
    first_hit = _pattern_start_keys(keys, regex, **kwargs).search(text)
    if first_hit is None:
        return None
    return first_hit.group(1)

In [268]:
# Both keys occur; the leftmost one is returned
_get_first_key_match(
    text="hello world",
    keys=["hello", "world"],
    regex=False,
)

'hello'

### `_get_all_key_matches()`

In [None]:
def _get_all_key_matches(
    text: str,
    keys: list[str],
    regex: bool = False,
    **kwargs,
    ) -> list[str]:
    """
    Return every key match in `text`, in order of appearance.

    The pattern comes from `_pattern_start_keys`, whose default flags ignore
    case and let "." match newlines; extra keyword arguments (e.g. `flags`)
    are forwarded to it.

    Returns:
        The matched key texts (capture group 1 of each match), or an empty
        list when nothing matches.
    """
    pattern = _pattern_start_keys(keys, regex, **kwargs)
    # `findall` would return tuples as soon as a regex key contains its own
    # capture group (e.g. "indication(s?)"). Reading group 1 of each match
    # always yields the whole key as a string, as `_get_first_key_match` does.
    return [match.group(1) for match in pattern.finditer(text)]

In [196]:
# Every occurrence is reported, in order of appearance
_get_all_key_matches(
    text="hello world hello",
    keys=["hello", "world"],
    regex=False,
)

['hello', 'world', 'hello']

## Tester

In [199]:
# Report with a bold heading, a heading embedded in a longer word, and a plain heading
t_sk1 = """
**History:** Rule out pneumonia
ClinicalHistory: Patient with chest pain
Indication: blah blah

Comparison: None
"""

# Report whose heading is only a prefix of the key
t_sk2 = """
Hist: Patient with chest pain
Comparison: None
"""

# The leftmost key is returned
assert _get_first_key_match(t_sk1, ["History", "Indication"], True) == "History"
# Whole-word matching: "Hist" is not "History"
assert _get_first_key_match(t_sk2, ["History"], ) is None

# regex=True adds no word boundaries, so the "History" inside "ClinicalHistory" is matched too
assert _get_all_key_matches(t_sk1, ["History", "Indication"], True) == ["History", "History", "Indication"]
# regex=False requires word boundaries, which excludes the embedded "History"
assert _get_all_key_matches(t_sk1, ["History", "Indication"], False) == ["History", "Indication"]

## How To

In [202]:
# Report with a bold "History" heading followed by an "Indication" heading
t_sk3 = """
**History:** Patient with chest pain
Indication: blah blah

Comparison: None
"""

# Raw `re` equivalent of the whole-word pattern: the leftmost key wins
re.search(
    r"\b(History|indication)\b.*?",
    t_sk3,
    flags=re.DOTALL | re.IGNORECASE,
)

<re.Match object; span=(3, 10), match='History'>

In [92]:
# Raw `re` equivalent of the pattern without word boundaries; group 1 is the matched key
re.search(
    f"(History|Indication).*?",
    t_sk1,
    flags=re.DOTALL | re.IGNORECASE,
).group(1)

'History'