Skip to content

Commit

Permalink
Handle invalid metadata for SQLDocumentStore (deepset-ai#2868)
Browse files Browse the repository at this point in the history
* modify notebook

* skip invalid metadata

* Update Documentation & Code Style

* fix nonetype

* fix nonetype

* drop nonetype from valid types

* drop nonetype from valid types

* fix

* Update sql.py

* sqlalchemy validation

* removed newlines

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
2 people authored and andrch-FS committed Jul 26, 2022
1 parent 0caceee commit 12477c6
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 2 deletions.
20 changes: 18 additions & 2 deletions haystack/document_stores/sql.py
Expand Up @@ -21,7 +21,7 @@
ForeignKeyConstraint,
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker
from sqlalchemy.orm import relationship, sessionmaker, validates
from sqlalchemy.sql import case, null
except (ImportError, ModuleNotFoundError) as ie:
from haystack.utils.import_utils import _optional_component_not_installed
Expand Down Expand Up @@ -73,6 +73,17 @@ class MetaDocumentORM(ORMBase):
{},
) # type: ignore

# Python types accepted for a metadata value; validate_value raises TypeError for anything else.
valid_metadata_types = (str, int, float, bool, bytes, bytearray, type(None))

@validates("value")
def validate_value(self, key, value):
    """
    Check the type of a metadata value before it is assigned to the ``value`` attribute.

    :param key: Name of the attribute being validated (not used by the check itself).
    :param value: The metadata value about to be stored.
    :return: ``value`` unchanged, when it is an instance of one of ``self.valid_metadata_types``.
    :raises TypeError: If ``value`` has any other type. The message names the metadata
                       entry (``self.name``), the offending type and the accepted types.
    """
    # Happy path first: accepted values pass through untouched.
    if isinstance(value, self.valid_metadata_types):
        return value

    accepted_names = ", ".join(allowed.__name__ for allowed in self.valid_metadata_types)
    raise TypeError(
        f"Discarded metadata '{self.name}', since it has invalid type: {type(value).__name__}.\n"
        f"SQLDocumentStore can accept and cast to string only the following types: {accepted_names}"
    )


class LabelORM(ORMBase):
__tablename__ = "label"
Expand Down Expand Up @@ -386,7 +397,12 @@ def write_documents(
for doc in document_objects[i : i + batch_size]:
meta_fields = doc.meta or {}
vector_id = meta_fields.pop("vector_id", None)
meta_orms = [MetaDocumentORM(name=key, value=value) for key, value in meta_fields.items()]
meta_orms = []
for key, value in meta_fields.items():
try:
meta_orms.append(MetaDocumentORM(name=key, value=value))
except TypeError as ex:
logger.error(f"Document {doc.id} - {ex}")
doc_mapping = {
"id": doc.id,
"content": doc.to_dict()["content"],
Expand Down
24 changes: 24 additions & 0 deletions test/document_stores/test_document_store.py
Expand Up @@ -479,6 +479,30 @@ def test_write_document_meta(document_store: BaseDocumentStore):
assert document_store.get_document_by_id("4").meta["meta_field"] == "test4"


@pytest.mark.parametrize("document_store", ["sql"], indirect=True)
def test_write_document_sql_invalid_meta(document_store: BaseDocumentStore):
    """
    Writing documents whose metadata contains a list value must still store the documents,
    but without the list-valued metadata field.
    """
    # One document given as a plain dict, one as a Document object; both carry a list-valued field.
    dict_document = {
        "content": "dict_with_invalid_meta",
        "valid_meta_field": "test1",
        "invalid_meta_field": [1, 2, 3],
        "name": "filename1",
        "id": "1",
    }
    object_document = Document(
        content="document_object_with_invalid_meta",
        meta={"valid_meta_field": "test2", "invalid_meta_field": [1, 2, 3], "name": "filename2"},
        id="2",
    )

    document_store.write_documents([dict_document, object_document])

    # Both documents are stored even though part of their metadata was rejected.
    stored_documents = document_store.get_all_documents()
    assert len(stored_documents) == 2

    # Only the valid metadata fields are expected to come back.
    expected_meta_by_id = {
        "1": {"name": "filename1", "valid_meta_field": "test1"},
        "2": {"name": "filename2", "valid_meta_field": "test2"},
    }
    for doc_id, expected_meta in expected_meta_by_id.items():
        assert document_store.get_document_by_id(doc_id).meta == expected_meta


def test_write_document_index(document_store: BaseDocumentStore):
document_store.delete_index("haystack_test_one")
document_store.delete_index("haystack_test_two")
Expand Down

0 comments on commit 12477c6

Please sign in to comment.