From aefb5536426c20a729986a44fa3b467bd19481c7 Mon Sep 17 00:00:00 2001
From: stephantul
Date: Thu, 23 Jan 2025 09:32:07 +0100
Subject: [PATCH] Fix issue with tokenizer, add token pattern to _distill

---
 model2vec/distill/distillation.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/model2vec/distill/distillation.py b/model2vec/distill/distillation.py
index 4da4e503..db1f0b3e 100644
--- a/model2vec/distill/distillation.py
+++ b/model2vec/distill/distillation.py
@@ -185,6 +185,10 @@ def _remove_tokens_and_embeddings(
     # Remove the unused tokens from the tokenizer.
     new_tokenizer = remove_tokens(tokenizer.backend_tokenizer, wrong_tokens)
 
+    if new_tokenizer.get_vocab_size() == tokenizer.backend_tokenizer.get_vocab_size():
+        # This happens if we didn't remove any tokens.
+        return new_tokenizer, embeddings
+
     # Remove the embeddings of the unused tokens.
     embeddings = np.delete(embeddings, wrong_token_ids, axis=0)
     logger.info(f"Removed {len(wrong_tokens)} unused tokens from the tokenizer and embeddings.")
@@ -199,6 +203,7 @@ def distill(
     pca_dims: PCADimType = 256,
     apply_zipf: bool = True,
     use_subword: bool = True,
+    token_remove_pattern: str | None = r"\[unused\d+\]",
 ) -> StaticModel:
     """
     Distill a staticmodel from a sentence transformer.
@@ -217,6 +222,7 @@ def distill(
         If this is 'auto', we don't reduce dimenionality, but still apply PCA.
     :param apply_zipf: Whether to apply Zipf weighting to the embeddings.
     :param use_subword: Whether to keep subword tokens in the vocabulary. If this is False, you must pass a vocabulary, and the returned tokenizer will only detect full words.
+    :param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary.
     :return: A StaticModel
     """
 
@@ -231,6 +237,7 @@ def distill(
         pca_dims=pca_dims,
         apply_zipf=apply_zipf,
         use_subword=use_subword,
+        token_remove_pattern=token_remove_pattern,
     )
 
 
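
For reference, a minimal usage sketch of the new `token_remove_pattern` argument. This assumes model2vec's public `distill(model_name=...)` entry point; the checkpoint name and the custom regex are only examples, and matching semantics are as described in the docstring above.

    from model2vec.distill import distill

    # Default pattern: drop BERT-style "[unused0]", "[unused1]", ... placeholder
    # tokens (and their embedding rows) from the distilled vocabulary.
    m2v_model = distill(model_name="BAAI/bge-base-en-v1.5")

    # Custom pattern (hypothetical): additionally drop purely numeric tokens.
    m2v_model = distill(
        model_name="BAAI/bge-base-en-v1.5",
        token_remove_pattern=r"\[unused\d+\]|\d+",
    )

    # Pass None to disable pattern-based token removal entirely.
    m2v_model = distill(model_name="BAAI/bge-base-en-v1.5", token_remove_pattern=None)

If the pattern matches no tokens, the guard added to `_remove_tokens_and_embeddings` returns the tokenizer and embeddings unchanged instead of attempting to delete embeddings for tokens that were never removed.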