# Solutions II: Regex Tokenizer

In [1]:
import re

### Regex entity recognition

Write a regex to find entities:

- Any word that is all capitals, for example `ECB`.
- Any sequence of consecutive words that each start with a capital, for example `European Central Bank`.

In [None]:
# Regex to find entities (written in verbose mode, so it must be used with re.X).
pattern = r"""
    (
        (?<!^)              # Not at the start of the text.
        \b                  # Start on a word boundary, never in the middle of a word.
        (?:[A-Z]\w+\s)*     # Leading words starting with a capital, each followed by whitespace.
        (?:[A-Z]+\w+)       # Final word: one or more capitals followed by more word characters.
    )
"""

# Find entity in the example.
re.search(pattern, "The European Central Bank increased its main interest rate.", re.X)

In [None]:
# An entity written entirely in capital letters.
re.search(pattern, "The ECB increased its main interest rate.", flags=re.VERBOSE)

In [None]:
# An all-capitals abbreviation that forms part of a longer name.
re.search(pattern, "The ING Group increased its main interest rate.", flags=re.VERBOSE)

### Regex tokenizer

In [5]:
# Define regex patterns for tokens.
# The tokenizer tries the patterns in insertion order, so specific patterns
# must come before general ones. Each pattern wraps the token in capturing
# group 1, which is what the tokenizer reads.
TOKEN_PATTERNS = {
    # Entity: capitalised word(s), not at the very start of the text and
    # starting on a word boundary (so never in the middle of a word).
    "ENTITY":     r"(?<!^)\b((?:[A-Z]\w+\s)*(?:[A-Z]+\w+))",

    # Date written as day-month-year, e.g. 5-10-2023 or 05-10-2023.
    "DATE":       r"((0?[1-9]|[12][0-9]|3[01])-(0?[1-9]|1[012])-\d{4})",

    # Number with optional '.' or ',' separators, e.g. 4, 0.25 or 1,000.50.
    "NUMBER":     r"(([0-9]*[\.,])*[0-9]+)",

    # General patterns.
    # Punctuation: every ASCII punctuation character, including '^'. The
    # hyphen is escaped so it is a literal rather than a character range.
    "PUNCT":      r"([!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~])",
    "WORD":       r"(\w+)",
    "WHITESPACE": r"(\s+)",

    # Catch all pattern: any single character not matched above.
    "UNKNOWN":    r"(.)",
}


def tokenizer(text, patterns=None):
    """Tokenize the provided string.

    The text is scanned from left to right. At each position the patterns
    are tried in order and the first one that yields a non-empty token wins.

    Args:
        text: The string to tokenize.
        patterns: Optional mapping of token name to regex string. Every
            regex must contain a capturing group 1 holding the token text.
            Defaults to the module-level ``TOKEN_PATTERNS``.

    Returns:
        A list of ``(token, name)`` tuples in order of appearance.

    Raises:
        ValueError: If no pattern produces a non-empty token at some
            position (previously this situation looped forever).
    """
    if patterns is None:
        patterns = TOKEN_PATTERNS

    # Compile once up front; a compiled pattern is required to use the
    # `pos` argument of match(), and compiling inside the loop is wasted work.
    compiled = [(name, re.compile(pattern)) for name, pattern in patterns.items()]

    position = 0
    tokens = []

    # Walk over text.
    while position < len(text):

        # Detect token patterns, first match wins.
        for name, regex in compiled:
            m = regex.match(text, pos=position)
            if not m:
                continue

            token = m.group(1)

            # An empty (or non-participating) group would not advance the
            # position, so treat it as "no match" and try the next pattern.
            if not token:
                continue

            tokens.append((token, name))

            # Update position with token length.
            position += len(token)
            break
        else:
            # Nothing matched: fail loudly instead of spinning forever.
            raise ValueError(
                f"No token pattern matches at position {position}: {text[position]!r}"
            )

    return tokens


In [None]:
# Tokenize a sentence containing an entity, numbers and punctuation.
tokenizer(text="The European Central Bank increased its interest rate by 0.25% to a total of 4%.")

In [None]:
# Tokenize a sentence that contains a date.
tokenizer(text="This example was created on 5-10-2023.")