In [1]:
from devtools import debug

%load_ext autoreload
%autoreload 2

!export PYTHONPATH=":./python"

In [19]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [20]:
from functools import cache
import bm25s
import Stemmer  # optional: for stemming
import spacy


@cache
def spacy_model(model="en_core_web_sm") -> tuple[spacy.language.Language, set[str]]:
    nlp = spacy.load(model)
    stop_words = nlp.Defaults.stop_words
    stop_words.update(
        {
            ",",
            ";",
            "(",
            ")",
            ":",
            "[",
            "]",
            "master",
            "mastère",
            "formation",
            "diplome" "\n",
        }
    )
    return nlp, stop_words


spacy_model()

(<spacy.lang.en.English at 0x7f04be077c40>,
 {"'d",
  "'ll",
  "'m",
  "'re",
  "'s",
  "'ve",
  '(',
  ')',
  ',',
  ':',
  ';',
  '[',
  ']',
  'a',
  'about',
  'above',
  'across',
  'after',
  'afterwards',
  'again',
  'against',
  'all',
  'almost',
  'alone',
  'along',
  'already',
  'also',
  'although',
  'always',
  'am',
  'among',
  'amongst',
  'amount',
  'an',
  'and',
  'another',
  'any',
  'anyhow',
  'anyone',
  'anything',
  'anyway',
  'anywhere',
  'are',
  'around',
  'as',
  'at',
  'back',
  'be',
  'became',
  'because',
  'become',
  'becomes',
  'becoming',
  'been',
  'before',
  'beforehand',
  'behind',
  'being',
  'below',
  'beside',
  'besides',
  'between',
  'beyond',
  'both',
  'bottom',
  'but',
  'by',
  'ca',
  'call',
  'can',
  'cannot',
  'could',
  'did',
  'diplome\n',
  'do',
  'does',
  'doing',
  'done',
  'down',
  'due',
  'during',
  'each',
  'eight',
  'either',
  'eleven',
  'else',
  'elsewhere',
  'empty',
  'enough',
  'even'

In [31]:
def preprocess_text(text) -> list[str]:
    nlp, stop_words = spacy_model()
    lemmas = [token.lemma_.lower() for token in nlp(text)]
    filtered = [token for token in lemmas if token not in stop_words]
    return filtered


# Create your corpus here
corpus = [
    "a cat is a feline and likes to purr",
    "a dog is the human's best friend and loves to play",
    "a bird is a beautiful animal that can fly",
    "a fish is a creature that lives in water and swims",
]

corpus_tokens = [preprocess_text(doc) for doc in corpus]
retriever = bm25s.BM25()
retriever.index(corpus_tokens, show_progress=False)

# Query the corpus
query = "does the fish purr like a cat?"
query_tokens = preprocess_text(query)

# Get top-k results as a tuple of (doc ids, scores). Both are arrays of shape (n_queries, k)
results = retriever.retrieve(
    [query_tokens], corpus=corpus, k=2, show_progress=False, return_as="documents"
)
return_docs = [results[0, i] for i in range(results.shape[1])]
print(return_docs)

['a cat is a feline and likes to purr', 'a fish is a creature that lives in water and swims']


In [28]:
debug(results)

/tmp/ipykernel_55702/3298553790.py:1 <module>
    results: (
        array([['a cat is a feline and likes to purr',
                'a fish is a creature that lives in water and swims']],
              dtype='<U50')
    ) (ndarray) len=1


array([['a cat is a feline and likes to purr',
        'a fish is a creature that lives in water and swims']],
      dtype='<U50')

In [33]:
def get_spacy_preprocess_fn(model: str, more_stop_words: list[str] = []):
    import spacy

    nlp = spacy.load(model)
    stop_words = nlp.Defaults.stop_words
    stop_words.update(more_stop_words)

    def preprocess_text(text) -> list[str]:
        nlp, stop_words = spacy_model()
        lemmas = [token.lemma_.lower() for token in nlp(text)]
        filtered = [token for token in lemmas if token not in stop_words]
        return filtered

    return preprocess_text

In [38]:
fn = get_spacy_preprocess_fn(model="fr_core_news_sm")

fn(
    "hello world Here, (we just invoke the assistant), [assistant_runnable], with a prompt and check if the resulting tool call is as expected."
)

['hello',
 'world',
 'invoke',
 'assistant',
 'assistant_runnable',
 'prompt',
 'check',
 'result',
 'tool',
 'expect',
 '.']