From 426aa4544d1025ba95252a1e90a028d238e687af Mon Sep 17 00:00:00 2001
From: stephantul <stephantul@gmail.com>
Date: Wed, 15 Jan 2025 19:52:35 +0100
Subject: [PATCH 1/3] Add loading from st

---
 model2vec/hf_utils.py | 69 +++++++++++++++++++++++++++++++++++++++++++
 model2vec/model.py    | 21 +++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/model2vec/hf_utils.py b/model2vec/hf_utils.py
index 0fa35148..a976ba98 100644
--- a/model2vec/hf_utils.py
+++ b/model2vec/hf_utils.py
@@ -83,6 +83,75 @@ def _create_model_card(
     model_card.save(folder_path / "README.md")
 
 
+def load_from_st(
+    folder_or_repo_path: str | Path, token: str | None = None
+) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
+    """
+    Loads a sentence-transformers model from a folder.
+
+    :param folder_or_repo_path: The folder or repo path to load from.
+        - If this is a local path, we will load from the local path.
+        - If the local path is not found, we will attempt to load from the huggingface hub.
+    :param token: The huggingface token to use.
+    :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
+    :return: The embeddings, tokenizer, config, and metadata.
+    """
+    folder_or_repo_path = Path(folder_or_repo_path)
+    prepend_path = "0_StaticEmbedding"
+    if folder_or_repo_path.exists():
+        embeddings_path = folder_or_repo_path / prepend_path / "model.safetensors"
+        if not embeddings_path.exists():
+            raise FileNotFoundError(f"Embeddings file does not exist in {folder_or_repo_path}")
+        config_path = folder_or_repo_path / "config_sentence_transformers.json"
+        if not config_path.exists():
+            raise FileNotFoundError(f"Config file does not exist in {folder_or_repo_path}")
+
+        tokenizer_path = folder_or_repo_path / prepend_path / "tokenizer.json"
+        if not tokenizer_path.exists():
+            raise FileNotFoundError(f"Tokenizer file does not exist in {folder_or_repo_path}")
+
+        # README is optional, so this is a bit finicky.
+        readme_path = folder_or_repo_path / "README.md"
+        metadata = _get_metadata_from_readme(readme_path)
+
+    else:
+        logger.info("Folder does not exist locally, attempting to use huggingface hub.")
+        try:
+            embeddings_path = huggingface_hub.hf_hub_download(
+                folder_or_repo_path.as_posix(), "0_StaticEmbedding/model.safetensors", token=token
+            )
+        except huggingface_hub.utils.EntryNotFoundError as e:
+            # Raise original exception.
+            raise e
+
+        try:
+            readme_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "README.md", token=token)
+            metadata = _get_metadata_from_readme(Path(readme_path))
+        except huggingface_hub.utils.EntryNotFoundError:
+            logger.info("No README found in the model folder. No model card loaded.")
+            metadata = {}
+
+        config_path = huggingface_hub.hf_hub_download(
+            folder_or_repo_path.as_posix(), "config_sentence_transformers.json", token=token
+        )
+        tokenizer_path = huggingface_hub.hf_hub_download(
+            folder_or_repo_path.as_posix(), "0_StaticEmbedding/tokenizer.json", token=token
+        )
+
+    opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
+    embeddings = opened_tensor_file.get_tensor("embedding.weight")
+
+    tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
+    config = json.load(open(config_path))
+
+    if len(tokenizer.get_vocab()) != len(embeddings):
+        logger.warning(
+            f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
+        )
+
+    return embeddings, tokenizer, config, metadata
+
+
 def load_pretrained(
     folder_or_repo_path: str | Path, token: str | None = None
 ) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
diff --git a/model2vec/model.py b/model2vec/model.py
index 8afeb220..bce335f1 100644
--- a/model2vec/model.py
+++ b/model2vec/model.py
@@ -166,6 +166,27 @@ def from_pretrained(
             embeddings, tokenizer, config, base_model_name=metadata.get("base_model"), language=metadata.get("language")
         )
 
+    @classmethod
+    def from_sentence_transformers(
+        cls: type[StaticModel],
+        path: PathLike,
+        token: str | None = None,
+    ) -> StaticModel:
+        """
+        Load a StaticModel trained with sentence transformers from a local path or huggingface hub path.
+
+        NOTE: if you load a private model from the huggingface hub, you need to pass a token.
+
+        :param path: The path to load your static model from.
+        :param token: The huggingface token to use.
+        :return: A StaticModel
+        """
+        from model2vec.hf_utils import load_from_st
+
+        embeddings, tokenizer, config, metadata = load_from_st(path, token=token)
+
+        return cls(embeddings, tokenizer, config, base_model_name=None, language=None)
+
     def encode_as_sequence(
         self,
         sentences: list[str] | str,

From 9479b72567d72a5a133677df0318da6029a559d7 Mon Sep 17 00:00:00 2001
From: stephantul <stephantul@gmail.com>
Date: Wed, 15 Jan 2025 20:04:58 +0100
Subject: [PATCH 2/3] fix: make single function

---
 model2vec/hf_utils.py | 118 ++++++++----------------------------------
 model2vec/model.py    |   6 +--
 2 files changed, 26 insertions(+), 98 deletions(-)

diff --git a/model2vec/hf_utils.py b/model2vec/hf_utils.py
index a976ba98..b9e853fa 100644
--- a/model2vec/hf_utils.py
+++ b/model2vec/hf_utils.py
@@ -83,77 +83,8 @@ def _create_model_card(
     model_card.save(folder_path / "README.md")
 
 
-def load_from_st(
-    folder_or_repo_path: str | Path, token: str | None = None
-) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
-    """
-    Loads a sentence-transformers model from a folder.
-
-    :param folder_or_repo_path: The folder or repo path to load from.
-        - If this is a local path, we will load from the local path.
-        - If the local path is not found, we will attempt to load from the huggingface hub.
-    :param token: The huggingface token to use.
-    :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
-    :return: The embeddings, tokenizer, config, and metadata.
-    """
-    folder_or_repo_path = Path(folder_or_repo_path)
-    prepend_path = "0_StaticEmbedding"
-    if folder_or_repo_path.exists():
-        embeddings_path = folder_or_repo_path / prepend_path / "model.safetensors"
-        if not embeddings_path.exists():
-            raise FileNotFoundError(f"Embeddings file does not exist in {folder_or_repo_path}")
-        config_path = folder_or_repo_path / "config_sentence_transformers.json"
-        if not config_path.exists():
-            raise FileNotFoundError(f"Config file does not exist in {folder_or_repo_path}")
-
-        tokenizer_path = folder_or_repo_path / prepend_path / "tokenizer.json"
-        if not tokenizer_path.exists():
-            raise FileNotFoundError(f"Tokenizer file does not exist in {folder_or_repo_path}")
-
-        # README is optional, so this is a bit finicky.
-        readme_path = folder_or_repo_path / "README.md"
-        metadata = _get_metadata_from_readme(readme_path)
-
-    else:
-        logger.info("Folder does not exist locally, attempting to use huggingface hub.")
-        try:
-            embeddings_path = huggingface_hub.hf_hub_download(
-                folder_or_repo_path.as_posix(), "0_StaticEmbedding/model.safetensors", token=token
-            )
-        except huggingface_hub.utils.EntryNotFoundError as e:
-            # Raise original exception.
-            raise e
-
-        try:
-            readme_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "README.md", token=token)
-            metadata = _get_metadata_from_readme(Path(readme_path))
-        except huggingface_hub.utils.EntryNotFoundError:
-            logger.info("No README found in the model folder. No model card loaded.")
-            metadata = {}
-
-        config_path = huggingface_hub.hf_hub_download(
-            folder_or_repo_path.as_posix(), "config_sentence_transformers.json", token=token
-        )
-        tokenizer_path = huggingface_hub.hf_hub_download(
-            folder_or_repo_path.as_posix(), "0_StaticEmbedding/tokenizer.json", token=token
-        )
-
-    opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    embeddings = opened_tensor_file.get_tensor("embedding.weight")
-
-    tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
-    config = json.load(open(config_path))
-
-    if len(tokenizer.get_vocab()) != len(embeddings):
-        logger.warning(
-            f"Number of tokens does not match number of embeddings: `{len(tokenizer.get_vocab())}` vs `{len(embeddings)}`"
-        )
-
-    return embeddings, tokenizer, config, metadata
-
-
 def load_pretrained(
-    folder_or_repo_path: str | Path, token: str | None = None
+    folder_or_repo_path: str | Path, token: str | None = None, from_sentence_transformers: bool = False
 ) -> tuple[np.ndarray, Tokenizer, dict[str, Any], dict[str, Any]]:
     """
     Loads a pretrained model from a folder.
@@ -162,26 +93,31 @@ def load_pretrained(
         - If this is a local path, we will load from the local path.
         - If the local path is not found, we will attempt to load from the huggingface hub.
     :param token: The huggingface token to use.
+    :param from_sentence_transformers: Whether to load the model from a sentence transformers model.
     :raises: FileNotFoundError if the folder exists, but the file does not exist locally.
     :return: The embeddings, tokenizer, config, and metadata.
 
     """
+    if from_sentence_transformers:
+        model_file = "0_StaticEmbedding/model.safetensors"
+        tokenizer_file = "0_StaticEmbedding/tokenizer.json"
+        config_name = "config_sentence_transformers.json"
+    else:
+        model_file = "model.safetensors"
+        tokenizer_file = "tokenizer.json"
+        config_name = "config.json"
+
     folder_or_repo_path = Path(folder_or_repo_path)
     if folder_or_repo_path.exists():
-        embeddings_path = folder_or_repo_path / "model.safetensors"
+        embeddings_path = folder_or_repo_path / model_file
         if not embeddings_path.exists():
-            old_embeddings_path = folder_or_repo_path / "embeddings.safetensors"
-            if old_embeddings_path.exists():
-                logger.warning("Old embeddings file found. Please rename to `model.safetensors` and re-save.")
-                embeddings_path = old_embeddings_path
-            else:
-                raise FileNotFoundError(f"Embeddings file does not exist in {folder_or_repo_path}")
-
-        config_path = folder_or_repo_path / "config.json"
+            raise FileNotFoundError(f"Embeddings file does not exist in {folder_or_repo_path}")
+
+        config_path = folder_or_repo_path / config_name
         if not config_path.exists():
             raise FileNotFoundError(f"Config file does not exist in {folder_or_repo_path}")
 
-        tokenizer_path = folder_or_repo_path / "tokenizer.json"
+        tokenizer_path = folder_or_repo_path / tokenizer_file
         if not tokenizer_path.exists():
             raise FileNotFoundError(f"Tokenizer file does not exist in {folder_or_repo_path}")
 
@@ -191,18 +127,7 @@ def load_pretrained(
 
     else:
         logger.info("Folder does not exist locally, attempting to use huggingface hub.")
-        try:
-            embeddings_path = huggingface_hub.hf_hub_download(
-                folder_or_repo_path.as_posix(), "model.safetensors", token=token
-            )
-        except huggingface_hub.utils.EntryNotFoundError as e:
-            try:
-                embeddings_path = huggingface_hub.hf_hub_download(
-                    folder_or_repo_path.as_posix(), "embeddings.safetensors", token=token
-                )
-            except huggingface_hub.utils.EntryNotFoundError:
-                # Raise original exception.
-                raise e
+        embeddings_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), model_file, token=token)
 
         try:
             readme_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "README.md", token=token)
@@ -211,11 +136,14 @@ def load_pretrained(
             logger.info("No README found in the model folder. No model card loaded.")
             metadata = {}
 
-        config_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "config.json", token=token)
-        tokenizer_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), "tokenizer.json", token=token)
+        config_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), config_name, token=token)
+        tokenizer_path = huggingface_hub.hf_hub_download(folder_or_repo_path.as_posix(), tokenizer_file, token=token)
 
     opened_tensor_file = cast(SafeOpenProtocol, safetensors.safe_open(embeddings_path, framework="numpy"))
-    embeddings = opened_tensor_file.get_tensor("embeddings")
+    if from_sentence_transformers:
+        embeddings = opened_tensor_file.get_tensor("embedding.weight")
+    else:
+        embeddings = opened_tensor_file.get_tensor("embeddings")
 
     tokenizer: Tokenizer = Tokenizer.from_file(str(tokenizer_path))
     config = json.load(open(config_path))
diff --git a/model2vec/model.py b/model2vec/model.py
index bce335f1..5d60d8b7 100644
--- a/model2vec/model.py
+++ b/model2vec/model.py
@@ -160,7 +160,7 @@ def from_pretrained(
         """
         from model2vec.hf_utils import load_pretrained
 
-        embeddings, tokenizer, config, metadata = load_pretrained(path, token=token)
+        embeddings, tokenizer, config, metadata = load_pretrained(path, token=token, from_sentence_transformers=False)
 
         return cls(
             embeddings, tokenizer, config, base_model_name=metadata.get("base_model"), language=metadata.get("language")
@@ -181,9 +181,9 @@ def from_sentence_transformers(
         :param token: The huggingface token to use.
         :return: A StaticModel
         """
-        from model2vec.hf_utils import load_from_st
+        from model2vec.hf_utils import load_pretrained
 
-        embeddings, tokenizer, config, metadata = load_from_st(path, token=token)
+        embeddings, tokenizer, config, _ = load_pretrained(path, token=token, from_sentence_transformers=True)
 
         return cls(embeddings, tokenizer, config, base_model_name=None, language=None)
 

From 3f283bbdee898415d9aeac5b02d32694af89cbeb Mon Sep 17 00:00:00 2001
From: stephantul <stephantul@gmail.com>
Date: Wed, 15 Jan 2025 20:46:53 +0100
Subject: [PATCH 3/3] lock file

---
 uv.lock | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/uv.lock b/uv.lock
index 596a7f1a..ee17e1ec 100644
--- a/uv.lock
+++ b/uv.lock
@@ -160,7 +160,7 @@ name = "click"
 version = "8.1.8"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "colorama", marker = "platform_system == 'Windows'" },
+    { name = "colorama", marker = "sys_platform == 'win32'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 }
 wheels = [
@@ -536,7 +536,7 @@ wheels = [
 
 [[package]]
 name = "model2vec"
-version = "0.3.4"
+version = "0.3.5"
 source = { editable = "." }
 dependencies = [
     { name = "jinja2" },
@@ -1627,19 +1627,19 @@ dependencies = [
     { name = "jinja2" },
     { name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" },
     { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" },
-    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
-    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and platform_system == 'Linux'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "sympy" },
-    { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and platform_system == 'Linux'" },
+    { name = "triton", marker = "python_full_version < '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "typing-extensions" },
 ]
 wheels = [
@@ -1666,7 +1666,7 @@ name = "tqdm"
 version = "4.67.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "colorama", marker = "platform_system == 'Windows'" },
+    { name = "colorama", marker = "sys_platform == 'win32'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 }
 wheels = [