In [None]:
import re
import string

import spacy
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the spaCy pipeline named "en_core_web_sm"; the resulting `nlp` callable
# is reused by the cells below to turn raw text into Doc objects.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample sentences containing percentages, a date and an e-mail address.
# NOTE(review): `lines` is not referenced by any other cell visible in this
# part of the notebook — confirm it is still needed.
lines = [
    "The European Central Bank increased its interest rate by 0.25% to a total of 4%.",
    "Inflation was surging at 5.6% last year; the highest rate in 10 years.",
    "This article was updated on 24-03-2023.",
    "For more information, contact: finance@fake.tld",
]

In [None]:
# Example sentence (mentions an organisation name, a percentage and a date);
# it is parsed with spaCy below and later used to fit the CountVectorizer.
example = "The European Central Bank raised its key interest rates by 0.25% on the 20th of September 2023."

In [None]:
# Run the loaded pipeline on the example sentence; calling nlp() returns a
# Doc instance that the next cells retokenize and inspect.
doc = nlp(example)

In [None]:
# Merge every named entity into a single token, and store the full entity
# text as that token's lemma.
with doc.retokenize() as retokenizer:
    for ent in doc.ents:
        retokenizer.merge(
            doc[ent.start : ent.end],
            attrs={"LEMMA": ent.text},
        )

In [None]:
# Show text, lemma, part-of-speech tag and entity type for each token in doc.
for token in doc:
    columns = [token.text, token.lemma_, token.pos_, token.ent_type_]
    print(" | ".join(columns))

In [None]:
def tokenizer(text):
    """Split ``text`` into lemma strings, for use as a ``CountVectorizer`` tokenizer.

    The text is processed with the notebook-level spaCy pipeline ``nlp``.
    Every named entity is merged into a single token whose lemma is the full
    entity text; punctuation and stop-word tokens are then dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The lemma of each remaining token, in document order. Merged
        entities contribute their complete entity text as one item.
    """
    doc = nlp(text)

    # Merge multi-token entities so they end up as a single feature; the
    # LEMMA attribute carries the entity text into the returned list below.
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(doc[ent.start:ent.end], attrs={"LEMMA": ent.text})

    # Return lemma strings rather than Token objects: the vectorizer uses the
    # returned items as vocabulary entries, and the LEMMA set during the merge
    # above would otherwise never be used.
    return [t.lemma_ for t in doc if not (t.is_punct or t.is_stop)]

In [None]:
# Build a bag-of-words vocabulary using the spaCy-based tokenizer.
# - token_pattern=None: the regex pattern is unused when a tokenizer is
#   supplied; passing None avoids the warning about that.
# - lowercase=False: by default CountVectorizer lowercases the text *before*
#   calling the tokenizer, so spaCy would see case-stripped input, which hurts
#   entity recognition and therefore the entity merge in the tokenizer.
cvs = CountVectorizer(tokenizer=tokenizer, token_pattern=None, lowercase=False)
cvs.fit([example])
cvs.get_feature_names_out()

## Similarity

In [78]:
from sklearn.metrics.pairwise import cosine_similarity

In [79]:
# Reference sentence (Dutch) that each candidate sentence is compared against.
ref = "De jaarrekening geeft een getrouw beeld van het vermogen en het resultaat."

In [80]:
# Candidate sentences to compare with `ref`; the first one is identical to it.
# Note: the name `docs` is rebound to a different list further down.
docs = [
    "De jaarrekening geeft een getrouw beeld van het vermogen en het resultaat.",
    "De jaarrekening geeft een getrouw beeld van onze bedrijfsvoering.",
    "Het vermogen is gegroeid en het jaarresultaat was positief.",
]

In [81]:
# Compare the spaCy document vector of the reference with that of every
# candidate sentence using cosine similarity.
# NOTE(review): in the cells visible here `nlp` is loaded from
# "en_core_web_sm", while these sentences are Dutch — confirm which model is
# intended for this section.
ref_vector = nlp(ref).vector.reshape(1, -1)  # single row, as cosine_similarity expects 2-D input

# The loop variable is deliberately not called `doc`: that name holds the
# spaCy Doc created earlier in the notebook, and rebinding it to a string
# would break the token cells if they are re-run.
for sentence in docs:
    sentence_vector = nlp(sentence).vector.reshape(1, -1)
    similarity = cosine_similarity(ref_vector, sentence_vector)[0, 0]

    print(f"Comparing: {sentence}")
    print(f"Similarity: {similarity:0.3f}")

Comparing: De jaarrekening geeft een getrouw beeld van het vermogen en het resultaat.
Similarity: 1.000
Comparing: De jaarrekening geeft een getrouw beeld van onze bedrijfsvoering.
Similarity: 0.905
Comparing: Het vermogen is gegroeid en het jaarresultaat was positief.
Similarity: 0.651


In [82]:
from sklearn.feature_extraction.text import CountVectorizer

In [83]:
# Rebind `nlp` to the "en_core_web_sm" pipeline.
# NOTE(review): the same model name is already loaded near the top of the
# notebook, so this looks redundant on a top-to-bottom run — confirm.
nlp = spacy.load("en_core_web_sm")

In [93]:
# Target word that every entry in `docs` is compared against below.
target = "cash"

In [94]:
# Words to compare with the target. This rebinds `docs`, replacing the
# sentence list defined earlier in the notebook.
docs = [
    "cash",
    "money",
    "shoes",
]

In [95]:
# Learn a bag-of-words vocabulary from the documents, then display the
# mapping from each word to its column index.
vectorizer = CountVectorizer()
vectorizer.fit(docs)
vectorizer.vocabulary_

{'cash': 0, 'money': 1, 'shoes': 2}

In [96]:
# Encode the target as a count vector over the fitted vocabulary (one row);
# todense() is called only to display the values.
target_vector = vectorizer.transform([target])
target_vector.todense()

matrix([[1, 0, 0]], dtype=int64)

In [97]:
# Encode every document as a count vector (one row per document);
# todense() is called only to display the values.
doc_vectors = vectorizer.transform(docs)
doc_vectors.todense()

matrix([[1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]], dtype=int64)

In [98]:
# Cosine similarity between the target's count vector and each document's
# count vector. The loop variable is named `word` rather than `doc` so it
# does not rebind the spaCy Doc held in the notebook-level name `doc`.
for word, word_vector in zip(docs, doc_vectors):
    similarity = cosine_similarity(target_vector, word_vector)[0, 0]
    print(f"Similarity {target} - {word:10s} {similarity:.2f}")

Similarity cash - cash       1.00
Similarity cash - money      0.00
Similarity cash - shoes      0.00


In [99]:
# Rebind `target_vector` to the spaCy vector of the target, reshaped to a
# single row so it can be passed to cosine_similarity.
target_vector = nlp(target).vector.reshape(1, -1)
target_vector

array([[-1.397754  ,  0.07070416, -0.39039838,  0.5617517 , -0.34965605,
         0.13661712,  0.57871896,  1.5256443 ,  0.65602285, -0.65689296,
         0.18999839, -1.1277145 , -0.5164149 ,  0.50789267, -0.09047326,
        -0.00300154, -0.48194283, -1.1704302 ,  0.16177908,  0.7935656 ,
         0.5065979 ,  0.46046025, -0.46592283, -0.5427428 , -0.01515184,
         1.1252439 , -0.21802486,  1.1335628 ,  0.12234014,  0.44552493,
        -0.76876795, -0.26393777,  0.56515276,  0.42321578,  0.05211889,
        -1.145031  ,  0.6928235 ,  0.24417129,  1.2999567 , -0.58716494,
        -0.3146794 ,  0.00832057, -0.90186   ,  0.08301586, -0.6861465 ,
        -0.4924449 , -0.94279224, -0.33799934, -0.15532967, -0.56868625,
         0.13251549,  0.2655836 ,  0.488851  , -1.129028  ,  0.5697172 ,
        -0.19400054, -0.00783122,  0.29177633, -0.4797179 , -0.21585448,
        -0.27525264,  0.02882797,  0.27929398, -0.48936877,  1.1565417 ,
         0.94858444, -0.53587013, -0.18130594,  0.5

In [100]:
# Rebind `doc_vectors` to a list with one single-row spaCy vector per document.
doc_vectors = [nlp(doc).vector.reshape(1, -1) for doc in docs]

In [101]:
# Cosine similarity between the target's spaCy vector and each document's
# spaCy vector. The loop variable is named `word` rather than `doc` so it
# does not rebind the spaCy Doc held in the notebook-level name `doc`.
for word, word_vector in zip(docs, doc_vectors):
    similarity = cosine_similarity(target_vector, word_vector)[0, 0]
    print(f"Similarity {target} - {word:10s} {similarity:.2f}")

Similarity cash - cash       1.00
Similarity cash - money      0.80
Similarity cash - shoes      0.29
