Commit 3810f03
Do not save sparse vectors for OSNeuralSparseDocV3GTE
Why these changes are being introduced:

Our initial pass with the embedding class OSNeuralSparseDocV3GTE saved both the sparse vector and the decoded token:weights. Each sparse vector was the length of the model vocabulary, about 30k entries, mostly zeros. While this could technically support analysis beyond the decoded token:weights given to OpenSearch, the data transfer and storage overhead exceeds any known use case at the moment.

How this addresses that need:

The OSNeuralSparseDocV3GTE embedding model is updated to exclude the sparse vector from the Embedding.embedding_vector property on output. This can easily be turned back on later; an inline code comment shows how to toggle it.

Side effects of this change:

* No sparse vectors are stored for now, so storage is decreased.

Relevant ticket(s):

* None
1 parent 3dd6529 commit 3810f03
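The storage tradeoff described in the commit message can be sketched as follows. This is an illustrative example, not code from the repository: the vocabulary, function, and token names are made up, but it shows why a mostly-zero dense vector over a ~30k vocabulary is far larger than the compact token:weights mapping that is actually sent to OpenSearch.

```python
# Illustrative sketch (names are hypothetical, not from the repository).
# A learned-sparse model scores every term in a ~30k-entry vocabulary,
# but only a handful of terms have nonzero weight for a given document.

VOCAB_SIZE = 30_000


def to_token_weights(dense_vector: list[float], vocab: list[str]) -> dict[str, float]:
    """Keep only nonzero entries as a compact token -> weight mapping."""
    return {vocab[i]: w for i, w in enumerate(dense_vector) if w != 0.0}


vocab = [f"tok_{i}" for i in range(VOCAB_SIZE)]
dense = [0.0] * VOCAB_SIZE
dense[7] = 1.25  # only a few active terms
dense[4321] = 0.4

weights = to_token_weights(dense, vocab)
print(len(dense), len(weights))  # 30000 entries vs 2 entries
```

Persisting only the dict avoids serializing tens of thousands of zeros per record, which is the overhead this commit removes.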

File tree

3 files changed (+10, -7 lines)


embeddings/embedding.py

Lines changed: 2 additions & 2 deletions
@@ -48,8 +48,8 @@ class Embedding:
     run_record_offset: int
     model_uri: str
     embedding_strategy: str
-    embedding_vector: list[float]
-    embedding_token_weights: dict
+    embedding_vector: list[float] | None
+    embedding_token_weights: dict | None
 
     timestamp: datetime.datetime = field(
         default_factory=lambda: datetime.datetime.now(datetime.UTC)

embeddings/models/os_neural_sparse_doc_v3_gte.py

Lines changed: 5 additions & 2 deletions
@@ -247,8 +247,11 @@ def _get_embedding_from_sparse_vector(
     decoded_token_weights = cast("list[tuple[str, float]]", decoded_token_weights)
     embedding_token_weights = dict(decoded_token_weights)
 
-    # prepare sparse vector for JSON serialization
-    embedding_vector = sparse_vector.to_dense().tolist()
+    # # prepare sparse vector for JSON serialization
+    # NOTE: at this time we are NOT including the sparse vector for output. This
+    # block can be uncommented in the future to include it when wanted.
+    # embedding_vector = sparse_vector.to_dense().tolist()  # noqa: ERA001
+    embedding_vector = None
 
     return Embedding(
         timdex_record_id=embedding_input.timdex_record_id,

tests/test_os_neural_sparse_doc_v3_gte.py

Lines changed: 3 additions & 3 deletions
@@ -217,7 +217,7 @@ def test_create_embedding_returns_embedding_object(tmp_path):
     assert embedding.run_record_offset == 42
     assert embedding.model_uri == model.model_uri
     assert embedding.embedding_strategy == "title_only"
-    assert embedding.embedding_vector == pytest.approx([0.1, 0.2])
+    assert embedding.embedding_vector is None
     assert embedding.embedding_token_weights == {"sum": pytest.approx(0.3)}
 
 
@@ -257,6 +257,6 @@ def test_create_embeddings_consumes_iterator_and_returns_embeddings(
     assert len(embeddings) == 2
     assert embeddings[0].timdex_record_id == "id-1"
-    assert embeddings[0].embedding_vector == pytest.approx([0.1, 0.2])
+    assert embeddings[0].embedding_vector is None
     assert embeddings[1].timdex_record_id == "id-2"
-    assert embeddings[1].embedding_vector == pytest.approx([0.3, 0.4])
+    assert embeddings[1].embedding_vector is None
