In [6]:
import re
import string

import spacy
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
nlp = spacy.load("nl_core_news_sm")

In [3]:
data = [
    "Hij nam de metro naar de Autoriteit Financiele Markten.",
    "Zij neemt altijd de metro vanaf het Centraal Station."
]

In [7]:
lines = [
    "The European Central Bank increased its interest rate by 0.25% to a total of 4%.",
    "Inflation was surging at 5.6% last year; the highest rate in 10 years.",
    "This article was updated on 24-03-2023.",
    "For more information, contact: finance@fake.tld",
]

### Finding numbers

In [346]:
# Very simple form.
pattern = r"[0-9\.,]+"
re.search(pattern, "The costs were 2,345.67 euro.")

<re.Match object; span=(15, 23), match='2,345.67'>

In [347]:
# However, this also grabs the end-of-sentence period...
re.search(pattern, "The year is now 2023.")

<re.Match object; span=(16, 21), match='2023.'>

In [348]:
# Bit more precise: a number may contain separators, but must end on a digit,
# so a trailing sentence period is no longer captured.
pattern = r"""(
    (?:[0-9]*[\.,])*        # Zero or more groups of (optional) digits followed by a separator.
    [0-9]+                  # One final, mandatory run of digits.
)"""
re.search(pattern, "The year is now 2023.", re.X)

<re.Match object; span=(16, 20), match='2023'>

In [349]:
# Also captures numbers
re.search(pattern, "The costs were 2,345.67 euro.", re.X)

<re.Match object; span=(15, 23), match='2,345.67'>

In [350]:
# Still not perfect...
re.search(pattern, "Localhost has IP address 127.0.0.1.", re.X)

<re.Match object; span=(25, 34), match='127.0.0.1'>

### Finding dates

In [313]:
# Somewhat naive approach...
pattern = r"\d{1,2}-\d{1,2}-\d{4}"
re.match(pattern, "31-1-2023")

<re.Match object; span=(0, 9), match='31-1-2023'>

In [316]:
# Whoops... an impossible date is matched as well.
re.match(pattern, "33-22-1111")

<re.Match object; span=(0, 10), match='33-22-1111'>

In [17]:
# Detect dates (day-month-year, to be compiled with re.X).
# The digit lookarounds prevent a match inside a longer run of digits, e.g. the
# "9-12-2023" part of the invalid "99-12-2023" when using re.search.
pattern = r"""(
    (?<!\d)                                # Not preceded by another digit
    (?: 0?[1-9] | [12][0-9] | 3[01] ) -    # Day:   01-09 | 10 - 29 | 30 - 31
    (?: 0?[1-9] | 1[012] ) -               # Month: 01-09 | 10 - 12
    \d{4}                                  # Year
    (?!\d)                                 # Not followed by another digit
)"""

In [19]:
# Invalid date is no longer matched!
re.search(pattern, "33-22-1111", re.X)

In [22]:
# While valid dates are...
re.search(pattern, "31-01-2023", re.X)

<re.Match object; span=(0, 10), match='31-01-2023'>

### Finding entities

In [None]:
# Detect entities.
m = re.search(r"((?:[A-Z]+\w+\s)*(?:[A-Z]+\w+))", "the European Central Bank authorized")
m

<re.Match object; span=(4, 25), match='European Central Bank'>

In [None]:
m.groups()

('European Central Bank',)

### Detect e-mail

In [16]:
# Detect e-mail.
re.search(r"([a-zA-Z0-9\-.]+@[a-zA-Z0-9-.]+\.[a-zA-Z]{2,3})", "bla-bla.bla@test.blaat-testing.com")

<re.Match object; span=(0, 34), match='bla-bla.bla@test.blaat-testing.com'>

## Regex Tokenizer

In [164]:

# Token patterns, tried in this order at the current position; first match wins.
# - EMAIL goes first so an address starting with digits is not split by NUMBER.
#   The domain part accepts digits and the TLD may be longer than three letters.
# - NUMBER accepts any number of digit/separator groups but must end on a digit,
#   so "2,345.67" is a single token and a trailing sentence period is left alone.
_TOKEN_PATTERNS = (
    (re.compile(r"[\w\-.]+@[a-zA-Z0-9\-.]+\.[a-zA-Z]{2,}"), "EMAIL"),
    (re.compile(r"(?:[0-9]*[.,])*[0-9]+"), "NUMBER"),
    (re.compile(r"\w+"), "WORD"),
)


def tokenizer(text):
    """Tokenize the provided string.

    Parameters
    ----------
    text : str
        The text to tokenize. An empty (or falsy) value yields an empty list.

    Returns
    -------
    list of tuple
        ``(token, label)`` pairs in order of appearance, where ``label`` is one
        of ``"EMAIL"``, ``"NUMBER"``, ``"WORD"`` or ``"PUNCT"``. Whitespace and
        any other character that fits none of these categories is skipped.
    """
    tokens = []
    if not text:
        return tokens

    # Walk through the text by position instead of re-slicing the string on
    # every token, which would copy the remaining text each time.
    pos = 0
    end = len(text)
    while pos < end:
        for pattern, label in _TOKEN_PATTERNS:
            if m := pattern.match(text, pos):
                tokens.append((m.group(), label))
                pos = m.end()
                break
        else:
            # No pattern matched: keep single punctuation characters and
            # silently skip everything else (e.g. whitespace).
            char = text[pos]
            if char in string.punctuation:
                tokens.append((char, "PUNCT"))
            pos += 1

    return tokens


In [165]:
# Tokenize the sentence containing the e-mail address (the last entry of `lines`).
tokenizer(lines[3])

[('For', 'WORD'),
 ('more', 'WORD'),
 ('information', 'WORD'),
 (',', 'PUNCT'),
 ('contact', 'WORD'),
 (':', 'PUNCT'),
 ('finance@fake.tld', 'EMAIL')]

In [271]:
nlp = spacy.load("en_core_web_sm")

In [278]:
doc = nlp(lines[2])

In [279]:
for token in doc:
    print(token)

This
article
was
updated
on
24
-
03
-
2023
.


In [280]:
for ent in doc.ents:
    print(f"{ent} | {ent.label_} | {ent.start} - {ent.end}")

24-03-2023 | DATE | 5 - 10


## Default tokenizer

In [4]:
cvs = CountVectorizer()

In [5]:
pd.DataFrame(
    data=cvs.fit_transform(data).todense(),
    columns=cvs.get_feature_names_out(),
)

Unnamed: 0,altijd,autoriteit,centraal,de,financiele,het,hij,markten,metro,naar,nam,neemt,station,vanaf,zij
0,0,1,0,2,1,0,1,1,1,1,1,0,0,0,0
1,1,0,1,1,0,1,0,0,1,0,0,1,1,1,1


## Spacy as tokenizer

In [6]:
# Create the language model once.
nlp = spacy.load("nl_core_news_sm")


# Define the tokenizer as a function.
def spacy_tokenizer(text: str) -> list:
    """Return the lemma of every non-punctuation token spaCy finds in `text`.

    The text is processed with the module-level `nlp` pipeline.
    """
    lemmas = []
    for tok in nlp(text):
        # Punctuation carries no meaning for the bag-of-words features.
        if tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [7]:
# Test tokenizer function on first record.
spacy_tokenizer(data[0])

['hij',
 'nemen',
 'de',
 'metro',
 'naar',
 'de',
 'autoriteit',
 'Financiele',
 'Markten']

In [8]:
# Initialize CountVectorizer with the spaCy tokenizer.
cvs = CountVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)

In [9]:
# Inspect the output.
# Note: Both forms of the verb now converge to the same token.
pd.DataFrame(
    data=cvs.fit_transform(data).todense(),
    columns=cvs.get_feature_names_out(),
)

Unnamed: 0,altijd,autoriteit,centraal,de,financieel,het,hij,markt,metro,naar,nemen,station,vanaf,zij
0,0,1,0,2,1,0,1,1,1,1,1,0,0,0
1,1,0,1,1,0,1,0,0,1,0,1,1,1,1


## Similarity

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
ref = "De jaarrekening geeft een getrouw beeld van het vermogen en het resultaat."

In [33]:
docs = [
    "De jaarrekening geeft een getrouw beeld van het vermogen en het resultaat.",
    "De jaarrekening geeft een getrouw beeld van onze bedrijfsvoering.",
    "Het vermogen is gegroeid en het jaarresultaat was positief.",
]

In [34]:
ref_vector = nlp(ref).vector.reshape(1, -1)

for doc in docs:
    doc_vector = nlp(doc).vector.reshape(1, -1)
    similarity = cosine_similarity(ref_vector, doc_vector)[0 ,0]
    
    print(f"Comparing: {doc}")
    print(f"Similarity: {similarity:0.3f}")

Comparing: De jaarrekening geeft een getrouw beeld van het vermogen en het resultaat.
Similarity: 1.000
Comparing: De jaarrekening geeft een getrouw beeld van onze bedrijfsvoering.
Similarity: 0.840
Comparing: Het vermogen is gegroeid en het jaarresultaat was positief.
Similarity: 0.656


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
nlp = spacy.load("en_core_web_sm")

In [26]:
# Define target to match to.
target = "cash"

In [71]:
# Define documents to match.
docs = [
    "cash",
    "money",
    "shoes",
]

In [72]:
# Vectorizer to convert to numeric data.
vectorizer = CountVectorizer().fit(docs)
vectorizer.vocabulary_

{'cash': 0, 'money': 1, 'shoes': 2}

In [73]:
# Transform the target.
target_vector = vectorizer.transform([target])
target_vector.todense()

matrix([[1, 0, 0]], dtype=int64)

In [74]:
# Transform the documents.
doc_vectors = vectorizer.transform(docs)
doc_vectors.todense()

matrix([[1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]], dtype=int64)

In [75]:
# Compute similarities between the target and every document.
# Note: the label must use `target`; `ref` is the Dutch reference sentence from
# the previous example and is unrelated to this comparison.
for doc, doc_vector in zip(docs, doc_vectors):
    similarity = cosine_similarity(target_vector, doc_vector)[0, 0]
    print(f"Similarity {target} - {doc:10s} {similarity:.2f}")

Similarity cash - cash       1.00
Similarity cash - money      0.00
Similarity cash - shoes      0.00


In [91]:
# Transform the target.
target_vector = nlp(target).vector.reshape(1, -1)
target_vector

array([[-1.397754  ,  0.07070416, -0.39039838,  0.5617517 , -0.34965605,
         0.13661712,  0.57871896,  1.5256443 ,  0.65602285, -0.65689296,
         0.18999839, -1.1277145 , -0.5164149 ,  0.50789267, -0.09047326,
        -0.00300154, -0.48194283, -1.1704302 ,  0.16177908,  0.7935656 ,
         0.5065979 ,  0.46046025, -0.46592283, -0.5427428 , -0.01515184,
         1.1252439 , -0.21802486,  1.1335628 ,  0.12234014,  0.44552493,
        -0.76876795, -0.26393777,  0.56515276,  0.42321578,  0.05211889,
        -1.145031  ,  0.6928235 ,  0.24417129,  1.2999567 , -0.58716494,
        -0.3146794 ,  0.00832057, -0.90186   ,  0.08301586, -0.6861465 ,
        -0.4924449 , -0.94279224, -0.33799934, -0.15532967, -0.56868625,
         0.13251549,  0.2655836 ,  0.488851  , -1.129028  ,  0.5697172 ,
        -0.19400054, -0.00783122,  0.29177633, -0.4797179 , -0.21585448,
        -0.27525264,  0.02882797,  0.27929398, -0.48936877,  1.1565417 ,
         0.94858444, -0.53587013, -0.18130594,  0.5

In [92]:
# Transform the documents.
doc_vectors = [nlp(doc).vector.reshape(1, -1) for doc in docs]

In [93]:
# Compute similarities between the target and every document.
# Note: the label must use `target`; `ref` is the Dutch reference sentence from
# the previous example and is unrelated to this comparison.
for doc, doc_vector in zip(docs, doc_vectors):
    similarity = cosine_similarity(target_vector, doc_vector)[0, 0]
    print(f"Similarity {target} - {doc:10s} {similarity:.2f}")

Similarity cash - cash       1.00
Similarity cash - money      0.80
Similarity cash - shoes      0.29


## Regex

In [140]:
import re

In [141]:
text = "<b>Python</b> is a <b>great</b> language!"

In [142]:
# Replace pattern by captured group.
re.sub(
    r"<b>(.+?)</b>",   # Search for text between bold tags.
    r"\1",             # Refers to Match.group(1) 
    text
)

'Python is a great language!'

In [143]:
# Replace pattern by named capture group.
re.sub(
    r"<b>(?P<bold>.+?)</b>",    # (?P<name>...) defines the named group.
    r"\g<bold>",                # \g<name> refers to the named group.
    text
)

'Python is a great language!'

In [144]:
# Second argument can be a (lambda) function.
re.sub(
    r"<b>(.+?)</b>",
    lambda m: m.group(1).upper(),   # Input is a Match object.
    text
)

'PYTHON is a GREAT language!'

### Look ahead / behind

In [86]:
text = "An ROI of 12.5% over 5 years."

In [87]:
# Look ahead: matches a number only when it is followed by a % character.
re.search(r"(\d+\.?\d*)(?=%)", text)

<re.Match object; span=(10, 14), match='12.5'>

In [88]:
# Negative look ahead: number NOT followed by % character.
# Note: "12." is not followed by % but by another digit, so it still matches...
re.search(r"([0-9\.]+)(?!%)", text)

<re.Match object; span=(10, 13), match='12.'>

In [89]:
# Need to include fractional numbers in look ahead...
# Any number not followed by a number, dot or percentage.
re.search(r"([0-9\.]+)(?![0-9\.%])", text)

<re.Match object; span=(21, 22), match='5'>