In [None]:
import re
import string

import spacy
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the "en_core_web_sm" spaCy pipeline once; `nlp` is reused by the cells below.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample English sentences containing percentages, a date and an e-mail address.
# NOTE(review): `lines` is not referenced by any other cell shown here.
lines = [
    "The European Central Bank increased its interest rate by 0.25% to a total of 4%.",
    "Inflation was surging at 5.6% last year; the highest rate in 10 years.",
    "This article was updated on 24-03-2023.",
    "For more information, contact: finance@fake.tld",
]

In [None]:
# Example sentence used by the tokenization and vectorization cells that follow.
example = "The European Central Bank raised its key interest rates by 0.25% on the 20th of September 2023."

In [None]:
# Run the pipeline on the example sentence; calling nlp() returns a Doc instance.
doc = nlp(example)

In [None]:
# Collapse every named-entity span of `doc` into a single token, storing the
# entity's surface text as the merged token's lemma.
with doc.retokenize() as retokenizer:
    for ent in doc.ents:
        entity_span = doc[ent.start:ent.end]
        merge_attrs = {"LEMMA": ent.text}
        retokenizer.merge(entity_span, attrs=merge_attrs)

In [None]:
# Show one line per token: text, lemma, part-of-speech tag and entity type.
for token in doc:
    row = f"{token.text} | {token.lemma_} | {token.pos_} | {token.ent_type_}"
    print(row)

In [None]:
def tokenizer(text):
    """Split ``text`` into lemma strings, keeping each named entity as one token.

    The text is run through the module-level spaCy pipeline ``nlp``. Every
    entity span the pipeline finds is merged into a single token whose lemma
    is set to the entity's surface text, so multi-word entities survive as one
    feature. Punctuation and stop words are dropped afterwards.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The lemma of every remaining token, in document order. Plain strings
        are returned rather than spaCy ``Token`` objects: a ``Token`` belongs
        to a single ``Doc`` and is not interchangeable with a string, so it
        does not work as a shared vocabulary key in ``CountVectorizer``.
    """
    doc = nlp(text)

    # Merge each entity span into one token; overriding LEMMA keeps the
    # entity text intact so it is emitted verbatim below.
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(doc[ent.start:ent.end], attrs={"LEMMA": ent.text})

    # Drop stop words and punctuation, and emit the lemma strings.
    return [t.lemma_ for t in doc if not (t.is_punct or t.is_stop)]

In [None]:
# Build a bag-of-words vocabulary from the example using the custom tokenizer.
# lowercase=False: by default CountVectorizer lowercases the text *before* it
# reaches the tokenizer, which would strip the capitalization that the
# tokenizer's entity merging relies on.
# token_pattern=None: the regex pattern is not used when a tokenizer is given;
# passing None silences the warning about the ignored parameter.
cvs = CountVectorizer(tokenizer=tokenizer, lowercase=False, token_pattern=None)
cvs.fit([example])
cvs.get_feature_names_out()

## Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Reference sentence (Dutch) that the candidate sentences are compared against.
ref = "De jaarrekening geeft een getrouw beeld van het vermogen en het resultaat."

In [None]:
# Candidate sentences (Dutch) to compare with `ref`; the first one is identical
# to `ref`.
docs = [
    "De jaarrekening geeft een getrouw beeld van het vermogen en het resultaat.",
    "De jaarrekening geeft een getrouw beeld van onze bedrijfsvoering.",
    "Het vermogen is gegroeid en het jaarresultaat was positief.",
]

In [None]:
# Embed the reference sentence once; reshape to a single row (1, n) because
# cosine_similarity works on 2-D inputs.
ref_vector = nlp(ref).vector.reshape(1, -1)

# The loop variables are deliberately not called `doc`: at notebook top level a
# loop variable is a global, and reusing `doc` would overwrite the spaCy Doc
# created earlier with a plain string.
# NOTE(review): the sentences are Dutch while `nlp` is loaded from
# "en_core_web_sm" — confirm that an English pipeline is intended here.
for sentence in docs:
    sentence_vector = nlp(sentence).vector.reshape(1, -1)
    # cosine_similarity returns a 1x1 matrix here; take its single entry.
    similarity = cosine_similarity(ref_vector, sentence_vector)[0, 0]

    print(f"Comparing: {sentence}")
    print(f"Similarity: {similarity:0.3f}")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# NOTE(review): this loads the same "en_core_web_sm" pipeline that an earlier
# cell already bound to `nlp`; the reload looks redundant — confirm before removing.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Target word that every document below is scored against.
target = "cash"

In [None]:
# Single-word documents to compare with `target`.
# This rebinds `docs`, which held the Dutch sentences in the earlier section.
docs = [
    "cash",
    "money",
    "shoes",
]

In [None]:
# Learn a word-count vocabulary from the documents so text can be turned into
# numeric vectors, then display the learned term -> column-index mapping.
vectorizer = CountVectorizer()
vectorizer.fit(docs)
vectorizer.vocabulary_

In [None]:
# Encode the target with the fitted vectorizer. transform() takes an iterable of
# documents, hence the one-element list. The result is shown as a dense matrix.
target_vector = vectorizer.transform([target])
target_vector.todense()

In [None]:
# Encode all documents with the same fitted vectorizer (one row per document)
# and show the result as a dense matrix.
doc_vectors = vectorizer.transform(docs)
doc_vectors.todense()

In [None]:
# Compute the cosine similarity between the target and every document, using
# the count-based vectors.
# Each line is labelled with `target` (the string actually being compared),
# not `ref`, which holds the reference sentence of the earlier section.
# The loop variable is not called `doc`, so it cannot overwrite the spaCy Doc
# bound to that name at notebook top level.
for text, doc_vector in zip(docs, doc_vectors):
    # cosine_similarity returns a 1x1 matrix here; take its single entry.
    similarity = cosine_similarity(target_vector, doc_vector)[0, 0]
    print(f"Similarity {target} - {text:10s} {similarity:.2f}")

In [None]:
# Re-encode the target, this time with the spaCy document vector instead of word
# counts; reshape to a single row (1, n) so it can be passed to cosine_similarity.
# This rebinds `target_vector` from the count-based cell above.
# NOTE(review): small ("sm") spaCy pipelines reportedly ship without static word
# vectors — confirm that `.vector` is meaningful for this model.
target_vector = nlp(target).vector.reshape(1, -1)
target_vector

In [None]:
# Encode every document as a spaCy vector reshaped to a single row (1, n).
# This rebinds `doc_vectors` to a Python list of arrays.
doc_vectors = [nlp(doc).vector.reshape(1, -1) for doc in docs]

In [None]:
# Compute the cosine similarity between the target and every document, using
# the spaCy vectors.
# Each line is labelled with `target` (the string actually being compared),
# not `ref`, which holds the reference sentence of the earlier section.
# The loop variable is not called `doc`, so it cannot overwrite the spaCy Doc
# bound to that name at notebook top level.
for text, doc_vector in zip(docs, doc_vectors):
    # cosine_similarity returns a 1x1 matrix here; take its single entry.
    similarity = cosine_similarity(target_vector, doc_vector)[0, 0]
    print(f"Similarity {target} - {text:10s} {similarity:.2f}")