# Dev: Pattern & Keys

In [1]:
import re

In [72]:
from typing import Any

## Pattern

### `_pattern_start_keys()`

- No word boundary: `(history).*?`
- With word boundary: `\b(history)\b.*?`

In [447]:
def _pattern_start_keys(
    keys: list[str], 
    word_boundary: bool = True,
    flags: re.RegexFlag = re.DOTALL | re.IGNORECASE
    ) -> Any:
    """
    Creates a regex pattern that matches the start of a string with any of the keys in the list.
    """
    if word_boundary:
        # \b is a word boundary, which matches the position where a word starts or ends
        pattern = rf"\b({'|'.join(keys)})\b.*?"
    else:
        # Regex pattern that matches any of the keys in the list
        pattern = rf"({'|'.join(keys)}).*?"
    return re.compile(pattern, flags = flags)


In [583]:
# Without word boundaries the first key, \W*hello\W*, also consumes the
# non-word characters next to "hello" (here the trailing ": ").
p1 = _pattern_start_keys([r"\W*hello\W*", "world"], False)
# search() reports only the first match, so "world" further right is not returned.
s1 = p1.search("vvvvhello: vvvv world vvvv")
s1

<re.Match object; span=(4, 11), match='hello: '>

### `_pattern_keys()`

In [587]:
def _pattern_keys(
    keys: list[str], 
    word_boundary: bool = True,
    flags: re.RegexFlag = re.IGNORECASE
    ) -> Any:
    """
    Create regex pattern for matching given keys.
    """
    if len(keys) == 0:
        raise ValueError("keys must have at least one element")
    
    if word_boundary:
        # \b is a word boundary, which matches the position where a word starts or ends
        pattern = rf"\b({'|'.join(keys)})\b"
    else:
        # Regex pattern that matches any of the keys in the list
        pattern = rf"({'|'.join(keys)})"
    return re.compile(pattern, flags = flags)



In [589]:
# word_boundary=False lets the key match inside a longer word ('hist' within 'hhisttt').
pattern = _pattern_keys(['hist'], word_boundary=False)
pattern.search('hhisttt')

<re.Match object; span=(1, 5), match='hist'>

In [590]:
# Keys are regex fragments: \W* absorbs the trailing dashes, and the default
# IGNORECASE flag lets the key 'a' match the upper-case 'A'.
pattern = _pattern_keys([r'a\W*'], word_boundary=False)
pattern.search('A--')

<re.Match object; span=(0, 3), match='A--'>

### v1

In [518]:

# Sample report with "History", "technique" and "Comparison" section labels.
t_rep1 = """
History: A patient presented with these symptoms:
- Chest Pain
- Dyspnea

technique: CT chest

Comparison: None
"""
# Extract the section that begins at a start key and stops at an end key.
# Per the printed output, the "History:" label itself is left out
# (include_start_keys=False) and the text stops before "technique".
# NOTE(review): extract_section is not defined in the visible part of this
# notebook — confirm it is defined or imported before this cell runs.
section = extract_section(t_rep1, 
                          start_keys=["History:", "indication"], 
                          end_keys=["comparison", "technique"], 
                          word_boundary= False,
                          include_start_keys=False)
print(section)

A patient presented with these symptoms:
- Chest Pain
- Dyspnea


In [511]:
# No start keys and no end keys: per the printed output, the whole report
# comes back unchanged.
section = extract_section(t_rep1, 
                          start_keys=None, 
                          end_keys=None, 
                          word_boundary= False,
                          include_start_keys=False)
print(section)

History: A patient presented with these symptoms:
- Chest Pain
- Dyspnea

technique: CT chest

Comparison: None


## Get Match

### `_get_first_key_match()`

In [595]:
def _get_first_key_match(
    text: str,
    keys: list[str] | None, 
    word_boundary: bool = True,
    **kwargs,
    ) -> str | Any | None:
    """
    Return the first key match (ignore case and include newline) in the text. If no match is found, return None.
    """
    if keys is None:
        return None
    pattern = _pattern_keys(keys, word_boundary, **kwargs) 
    match = pattern.search(text)
    return match.group(1) if match else None

In [599]:
# With word_boundary=False the key is found inside "hhelloo"; only the first match is returned.
_get_first_key_match("hhelloo world", keys=["hello", "world"], word_boundary= False)

'hello'

### `_get_all_key_matches()`

In [None]:
def _get_all_key_matches(
    text: str,
    keys: list[str], 
    regex: bool = False,
    **kwargs,
    ) -> list[Any]:
    """
    Return all key matches (ignore case and include newline) in the text. If no match is found, return an empty list.
    """
    pattern = _pattern_start_keys(keys, regex, **kwargs) 
    matches = pattern.findall(text)
    return matches

In [196]:
# regex=False is forwarded as word_boundary=False; every occurrence is returned in text order.
_get_all_key_matches("hello world hello", keys=["hello", "world"], regex=False)

['hello', 'world', 'hello']

## Tester

In [199]:
# Fixture with three labelled lines. "ClinicalHistory" has no word boundary
# before "History", so it is skipped whenever word boundaries are required.
t_sk1 = """
**History:** Rule out pneumonia
ClinicalHistory: Patient with chest pain
Indication: blah blah

Comparison: None
"""

# Fixture that only contains the abbreviation "Hist", never the full key "History".
t_sk2 = """
Hist: Patient with chest pain
Comparison: None
"""

## Check if any key match
# The first match wins: "History" (in the "**History:**" line) comes before "Indication".
assert _get_first_key_match(t_sk1, ["History", "Indication"], True) == "History"
# "History" does not occur in t_sk2, so there is no match.
assert _get_first_key_match(t_sk2, ["History"], ) is None

# The third positional argument (True) is forwarded as word_boundary, so
# "ClinicalHistory" does not contribute a match.
assert _get_all_key_matches(t_sk1, ["History", "Indication"], True) == ["History", "Indication"]

## How To

In [202]:
# Fixture: the "History" label is wrapped in markdown bold markers.
t_sk3 = """
**History:** Patient with chest pain
Indication: blah blah

Comparison: None
"""

# Hand-written equivalent of the pattern that _pattern_start_keys builds with
# word_boundary=True and its default flags; the "**" before "History" still
# leaves a word boundary, so the label matches.
re.search(r"\b(History|indication)\b.*?", t_sk3, re.DOTALL | re.IGNORECASE)

<re.Match object; span=(3, 10), match='History'>

In [92]:
# Same search without word boundaries; group(1) is the key text that matched.
# A raw string replaces the placeholder-less f-string, matching the cell above.
re.search(r"(History|Indication).*?", t_sk1, re.DOTALL | re.IGNORECASE).group(1)

'History'