In [1]:
import spacy
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load spaCy's small Dutch pipeline; it is used by the tokenizer further down.
nlp = spacy.load("nl_core_news_sm")

In [3]:
# Two Dutch example sentences. They share the noun "metro" and use two
# different forms of the same verb ("nam" / "neemt").
data = [
    "Hij nam de metro naar de Autoriteit Financiele Markten.",
    "Zij neemt altijd de metro vanaf het Centraal Station."
]

## Default tokenizer

In [4]:
# Baseline: CountVectorizer with all default settings (built-in tokenizer).
cvs = CountVectorizer()

In [5]:
# Show the document-term matrix: one row per sentence, one column per token.
# Keyword arguments are evaluated in order, so the vectorizer is fitted by
# `fit_transform` before `get_feature_names_out()` is called.
pd.DataFrame(
    # `.toarray()` returns a plain ndarray; `.todense()` would return the
    # discouraged `numpy.matrix` type.
    data=cvs.fit_transform(data).toarray(),
    columns=cvs.get_feature_names_out(),
)

Unnamed: 0,altijd,autoriteit,centraal,de,financiele,het,hij,markten,metro,naar,nam,neemt,station,vanaf,zij
0,0,1,0,2,1,0,1,1,1,1,1,0,0,0,0
1,1,0,1,1,0,1,0,0,1,0,0,1,1,1,1


## spaCy as tokenizer

In [6]:
# Create the language model once, outside the tokenizer function, so that
# the model is not reloaded on every call.
# NOTE(review): the same model is already loaded into `nlp` in an earlier
# cell, so this second load looks redundant.
nlp = spacy.load("nl_core_news_sm")


# Define the tokenizer as a function.
def spacy_tokenizer(text: str) -> list:
    """Tokenize a text string into lemmas using spaCy.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list
        The lemma (``token.lemma_``) of every token produced by the
        module-level ``nlp`` pipeline, skipping punctuation and
        whitespace-only tokens.
    """
    return [
        token.lemma_
        for token in nlp(text)
        # Whitespace-only tokens (e.g. from repeated spaces or newlines) are
        # not punctuation, so filter them explicitly; otherwise they would
        # end up as features in the vocabulary.
        if not (token.is_punct or token.is_space)
    ]

In [7]:
# Test the tokenizer function on the first record: the result is a list of
# lemmas without the trailing punctuation.
spacy_tokenizer(data[0])

['hij',
 'nemen',
 'de',
 'metro',
 'naar',
 'de',
 'autoriteit',
 'Financiele',
 'Markten']

In [8]:
# Initialize CountVectorizer with the spaCy tokenizer.
# token_pattern=None: the regex token pattern is not used when a custom
# tokenizer is supplied; setting it to None avoids scikit-learn's warning
# about the unused parameter.
cvs = CountVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)

In [9]:
# Inspect the output.
# Note: Both forms of the verb now converge to the same token.
# Keyword arguments are evaluated in order, so the vectorizer is fitted by
# `fit_transform` before `get_feature_names_out()` is called.
pd.DataFrame(
    # `.toarray()` returns a plain ndarray; `.todense()` would return the
    # discouraged `numpy.matrix` type.
    data=cvs.fit_transform(data).toarray(),
    columns=cvs.get_feature_names_out(),
)

Unnamed: 0,altijd,autoriteit,centraal,de,financieel,het,hij,markt,metro,naar,nemen,station,vanaf,zij
0,0,1,0,2,1,0,1,1,1,1,1,0,0,0
1,1,0,1,1,0,1,0,0,1,0,1,1,1,1


## Regex

In [140]:
import re

In [141]:
# Sample string with two words wrapped in <b>...</b> tags.
text = "<b>Python</b> is a <b>great</b> language!"

In [142]:
# Replace each match by its first capture group, i.e. drop the <b>...</b>
# tags but keep the text between them.
re.sub(
    r"<b>(.+?)</b>",   # Non-greedy search for text between bold tags.
    r"\1",             # Backreference to Match.group(1).
    text
)

'Python is a great language!'

In [143]:
# Same replacement as above, but referring to the capture group by name
# instead of by number.
re.sub(
    r"<b>(?P<bold>.+?)</b>",    # (?P<name>...) defines the named group.
    r"\g<bold>",                # \g<name> refers to the named group.
    text
)

'Python is a great language!'

In [144]:
# The second argument can also be a (lambda) function: it is called with
# each Match object and its return value is used as the replacement.
re.sub(
    r"<b>(.+?)</b>",
    lambda m: m.group(1).upper(),   # Upper-case the text between the tags.
    text
)

'PYTHON is a GREAT language!'

### Look ahead / behind

In [86]:
# Sample string with a percentage (12.5%) and a plain number (5).
text = "An ROI of 12.5% over 5 years."

In [87]:
# Positive look ahead: matches a number only if it is followed by a %
# character; the % itself is not part of the match.
re.search(r"(\d+\.?\d*)(?=%)", text)

<re.Match object; span=(10, 14), match='12.5'>

In [88]:
# Negative look ahead: number NOT followed by % character.
# Note: the match stops at "12." because that prefix is not followed by %
# but by another digit...
re.search(r"([0-9\.]+)(?!%)", text)

<re.Match object; span=(10, 13), match='12.'>

In [89]:
# Need to include digits and dots in the look ahead as well, so the match
# cannot stop in the middle of a number.
# Any number not followed by a digit, dot or percent sign.
re.search(r"([0-9\.]+)(?![0-9\.%])", text)

<re.Match object; span=(21, 22), match='5'>