From 4f317873599d23c63d423d322c283a988c20a2ca Mon Sep 17 00:00:00 2001 From: aCampello Date: Tue, 20 Oct 2020 22:05:06 +0100 Subject: [PATCH 01/43] Add NER filth --- scrubadub/filth/ner.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 scrubadub/filth/ner.py diff --git a/scrubadub/filth/ner.py b/scrubadub/filth/ner.py new file mode 100644 index 00000000..d84fbe2a --- /dev/null +++ b/scrubadub/filth/ner.py @@ -0,0 +1,5 @@ +from .base import Filth + + +class NERFilth(Filth): + type = 'ner' From 449d150192f60b50375d7cbf885c3bd344f130b3 Mon Sep 17 00:00:00 2001 From: aCampello Date: Tue, 20 Oct 2020 22:05:35 +0100 Subject: [PATCH 02/43] Add NER filth to __init__ --- scrubadub/filth/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrubadub/filth/__init__.py b/scrubadub/filth/__init__.py index 63697370..f24f180c 100644 --- a/scrubadub/filth/__init__.py +++ b/scrubadub/filth/__init__.py @@ -4,6 +4,7 @@ from .email import EmailFilth from .known import KnownFilth from .name import NameFilth +from .ner import NERFilth from .organization import OrganizationFilth from .phone import PhoneFilth from .postalcode import PostalCodeFilth From ed77716491270b8437d7961ea759eed31a5c70db Mon Sep 17 00:00:00 2001 From: aCampello Date: Tue, 20 Oct 2020 22:34:57 +0100 Subject: [PATCH 03/43] Add NER detector --- scrubadub/detectors/ner.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 scrubadub/detectors/ner.py diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py new file mode 100644 index 00000000..acf6907f --- /dev/null +++ b/scrubadub/detectors/ner.py @@ -0,0 +1,31 @@ +import spacy + +from typing import List, Optional, Generator + +from .base import Detector +from ..filth import NERFilth, Filth +from ..utils import CanonicalStringSet + + +class SpacyDetector(Detector): + """Use spacy's named entity recognition to clean named entities. + List specific entities to include passing ``named_entities`` + """ + filth_cls = NERFilth + name = 'spacy_ner' + + disallowed_nouns = CanonicalStringSet(["skype"]) + + def __init__(self, named_entities: Optional[List[str]] = None, model: str = "en_core_web_trf", **kwargs): + self.named_entities = named_entities + if model not in spacy.info()['pipelines']: + raise OSError(f"Can't find model '{model}'. If it is a valid Spacy model, " + f"download it (e.g. with the CLI command " + f"`python -m spacy download {model}`).") + self.nlp = spacy.load(model) + # Only enable necessary pipes + self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"]) + super(SpacyDetector, self).__init__(**kwargs) + + def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]: + pass From b20c212c9a471bad0610fcc9f094af8826e98630 Mon Sep 17 00:00:00 2001 From: aCampello Date: Thu, 22 Oct 2020 22:59:11 +0100 Subject: [PATCH 04/43] Add iter_filth_documents basic logic --- scrubadub/detectors/ner.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py index acf6907f..4c7bf4e6 100644 --- a/scrubadub/detectors/ner.py +++ b/scrubadub/detectors/ner.py @@ -1,6 +1,6 @@ import spacy -from typing import List, Optional, Generator +from typing import Dict, Generator, List, Optional, Set, Sequence, Union from .base import Detector from ..filth import NERFilth, Filth @@ -9,15 +9,18 @@ class SpacyDetector(Detector): """Use spacy's named entity recognition to clean named entities. - List specific entities to include passing ``named_entities`` + List specific entities to include passing ``named_entities``, e.g. + (PERSON) """ filth_cls = NERFilth name = 'spacy_ner' disallowed_nouns = CanonicalStringSet(["skype"]) - def __init__(self, named_entities: Optional[List[str]] = None, model: str = "en_core_web_trf", **kwargs): - self.named_entities = named_entities + def __init__(self, named_entities: Optional[Union[List[str], Set[str]]] = {'PERSON'}, + model: str = "en_core_web_trf", **kwargs): + # Spacy NER are all upper cased + self.named_entities = {entity.upper() for entity in named_entities} if model not in spacy.info()['pipelines']: raise OSError(f"Can't find model '{model}'. If it is a valid Spacy model, " f"download it (e.g. with the CLI command " @@ -27,5 +30,18 @@ def __init__(self, named_entities: Optional[List[str]] = None, model: str = "en_ self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"]) super(SpacyDetector, self).__init__(**kwargs) + def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]: + if isinstance(documents, list): + for doc in self.nlp.pipe(documents): + for ent in doc.ents: + if ent.label_ in self.named_entities: + yield self.filth_cls(beg=ent.start_char, + end=ent.end_char, + text=ent.text, + detector_name=self.name) + def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]: pass + + + From e372b0771b77b06a704fd35454c10afce79e3e13 Mon Sep 17 00:00:00 2001 From: aCampello Date: Thu, 22 Oct 2020 23:27:57 +0100 Subject: [PATCH 05/43] Add support to different types for documents --- scrubadub/detectors/ner.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py index 4c7bf4e6..3196498d 100644 --- a/scrubadub/detectors/ner.py +++ b/scrubadub/detectors/ner.py @@ -1,6 +1,6 @@ import spacy -from typing import Dict, Generator, List, Optional, Set, Sequence, Union +from typing import Dict, Generator, List, Optional, Set, Sequence, Tuple, Union from .base import Detector from ..filth import NERFilth, Filth @@ -17,7 +17,7 @@ class SpacyDetector(Detector): disallowed_nouns = CanonicalStringSet(["skype"]) - def __init__(self, named_entities: Optional[Union[List[str], Set[str]]] = {'PERSON'}, + def __init__(self, named_entities: Union[List[str], Set[str]] = {'PERSON'}, model: str = "en_core_web_trf", **kwargs): # Spacy NER are all upper cased self.named_entities = {entity.upper() for entity in named_entities} @@ -32,13 +32,20 @@ def __init__(self, named_entities: Optional[Union[List[str], Set[str]]] = {'PERS def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]: if isinstance(documents, list): - for doc in self.nlp.pipe(documents): - for ent in doc.ents: - if ent.label_ in self.named_entities: - yield self.filth_cls(beg=ent.start_char, - end=ent.end_char, - text=ent.text, - detector_name=self.name) + doc_names, doc_list = zip(*enumerate(documents)) + elif isinstance(documents, dict): + doc_names, doc_list = zip(*documents.items()) + else: + raise TypeError('documents must be one of a string, list of strings or dict of strings.') + + for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)): + for ent in doc.ents: + if ent.label_ in self.named_entities: + yield self.filth_cls(beg=ent.start_char, + end=ent.end_char, + text=ent.text, + document_name=str(doc_name), + detector_name=self.name) def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]: pass From cd8b48987bda91e3889fed5ea072fcda8b6785d8 Mon Sep 17 00:00:00 2001 From: aCampello Date: Thu, 22 Oct 2020 23:32:51 +0100 Subject: [PATCH 06/43] Flake8 tweaks --- scrubadub/detectors/ner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py index 3196498d..51b2e96e 100644 --- a/scrubadub/detectors/ner.py +++ b/scrubadub/detectors/ner.py @@ -1,6 +1,6 @@ import spacy -from typing import Dict, Generator, List, Optional, Set, Sequence, Tuple, Union +from typing import Dict, Generator, List, Optional, Set, Sequence, Union from .base import Detector from ..filth import NERFilth, Filth @@ -49,6 +49,3 @@ def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]: pass - - - From 7e5b3d5d858525f1add4abb78c5562f9a9386a38 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 23 Oct 2020 08:46:46 +0100 Subject: [PATCH 07/43] Edit f-string for 3.5 compatibility --- scrubadub/detectors/ner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py index 51b2e96e..a78c7f6a 100644 --- a/scrubadub/detectors/ner.py +++ b/scrubadub/detectors/ner.py @@ -22,9 +22,9 @@ def __init__(self, named_entities: Union[List[str], Set[str]] = {'PERSON'}, # Spacy NER are all upper cased self.named_entities = {entity.upper() for entity in named_entities} if model not in spacy.info()['pipelines']: - raise OSError(f"Can't find model '{model}'. If it is a valid Spacy model, " - f"download it (e.g. with the CLI command " - f"`python -m spacy download {model}`).") + raise OSError("Can't find model '{}'. If it is a valid Spacy model, " + "download it (e.g. with the CLI command " + "`python -m spacy download {}`).".format(model, model)) self.nlp = spacy.load(model) # Only enable necessary pipes self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"]) From 3d9a888b7100fc52bbe121e8fc03027d39433545 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 23 Oct 2020 18:38:51 +0100 Subject: [PATCH 08/43] Add iter_filth --- scrubadub/detectors/ner.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py index a78c7f6a..3a6f4d07 100644 --- a/scrubadub/detectors/ner.py +++ b/scrubadub/detectors/ner.py @@ -30,6 +30,16 @@ def __init__(self, named_entities: Union[List[str], Set[str]] = {'PERSON'}, self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"]) super(SpacyDetector, self).__init__(**kwargs) + def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Sequence[str]): + for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)): + for ent in doc.ents: + if ent.label_ in self.named_entities: + yield self.filth_cls(beg=ent.start_char, + end=ent.end_char, + text=ent.text, + document_name=(str(doc_name) if doc_name else None), + detector_name=self.name) + def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]: if isinstance(documents, list): doc_names, doc_list = zip(*enumerate(documents)) @@ -38,14 +48,7 @@ def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) else: raise TypeError('documents must be one of a string, list of strings or dict of strings.') - for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)): - for ent in doc.ents: - if ent.label_ in self.named_entities: - yield self.filth_cls(beg=ent.start_char, - end=ent.end_char, - text=ent.text, - document_name=str(doc_name), - detector_name=self.name) + yield from self._iter_spacy_pipeline(doc_names, doc_list) def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]: - pass + yield from self._iter_spacy_pipeline([document_name], [text]) From 3aaebb44c431364e1b52d94b498cb1fa96a8434e Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 23 Oct 2020 18:39:21 +0100 Subject: [PATCH 09/43] Simplify name_entities type --- scrubadub/detectors/ner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py index 3a6f4d07..bea48ff5 100644 --- a/scrubadub/detectors/ner.py +++ b/scrubadub/detectors/ner.py @@ -1,6 +1,6 @@ import spacy -from typing import Dict, Generator, List, Optional, Set, Sequence, Union +from typing import Dict, Generator, Iterable, Optional, Sequence, Union from .base import Detector from ..filth import NERFilth, Filth @@ -17,7 +17,7 @@ class SpacyDetector(Detector): disallowed_nouns = CanonicalStringSet(["skype"]) - def __init__(self, named_entities: Union[List[str], Set[str]] = {'PERSON'}, + def __init__(self, named_entities: Iterable[str] = {'PERSON'}, model: str = "en_core_web_trf", **kwargs): # Spacy NER are all upper cased self.named_entities = {entity.upper() for entity in named_entities} From 7a3c679f65bc185d437303020b52c904c4ef5923 Mon Sep 17 00:00:00 2001 From: aCampello Date: Fri, 23 Oct 2020 18:42:04 +0100 Subject: [PATCH 10/43] Rename named-entity-filth --- scrubadub/detectors/ner.py | 4 ++-- scrubadub/filth/__init__.py | 2 +- scrubadub/filth/named_entity.py | 5 +++++ scrubadub/filth/ner.py | 5 ----- 4 files changed, 8 insertions(+), 8 deletions(-) create mode 100644 scrubadub/filth/named_entity.py delete mode 100644 scrubadub/filth/ner.py diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py index bea48ff5..222bf7af 100644 --- a/scrubadub/detectors/ner.py +++ b/scrubadub/detectors/ner.py @@ -3,7 +3,7 @@ from typing import Dict, Generator, Iterable, Optional, Sequence, Union from .base import Detector -from ..filth import NERFilth, Filth +from ..filth import NamedEntityFilth, Filth from ..utils import CanonicalStringSet @@ -12,7 +12,7 @@ class SpacyDetector(Detector): List specific entities to include passing ``named_entities``, e.g. (PERSON) """ - filth_cls = NERFilth + filth_cls = NamedEntityFilth name = 'spacy_ner' disallowed_nouns = CanonicalStringSet(["skype"]) diff --git a/scrubadub/filth/__init__.py b/scrubadub/filth/__init__.py index f24f180c..dde59c71 100644 --- a/scrubadub/filth/__init__.py +++ b/scrubadub/filth/__init__.py @@ -4,7 +4,7 @@ from .email import EmailFilth from .known import KnownFilth from .name import NameFilth -from .ner import NERFilth +from .named_entity import NamedEntityFilth from .organization import OrganizationFilth from .phone import PhoneFilth from .postalcode import PostalCodeFilth diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py new file mode 100644 index 00000000..57b5ba8c --- /dev/null +++ b/scrubadub/filth/named_entity.py @@ -0,0 +1,5 @@ +from .base import Filth + + +class NamedEntityFilth(Filth): + type = 'named_entity' diff --git a/scrubadub/filth/ner.py b/scrubadub/filth/ner.py deleted file mode 100644 index d84fbe2a..00000000 --- a/scrubadub/filth/ner.py +++ /dev/null @@ -1,5 +0,0 @@ -from .base import Filth - - -class NERFilth(Filth): - type = 'ner' From c2b9aeb283d93e34cf9fb9fb94caaf2c81478613 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 00:22:23 +0100 Subject: [PATCH 11/43] Figure out which detectors can run on a batch of documents --- scrubadub/scrubbers.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py index 6e7d72b8..7fb1c424 100644 --- a/scrubadub/scrubbers.py +++ b/scrubadub/scrubbers.py @@ -263,6 +263,16 @@ def iter_filth_documents( if not isinstance(documents, (dict, list)): raise TypeError('documents must be one of a string, list of strings or dict of strings.') + # Figures out which detectors can run on a list of documents + + batch_detector_names = [name for name, detector in self._detectors + if callable(hasattr(detector, 'iter_filth_documents', None))] + + filth_list = [] + for name in batch_detector_names: + for filth in self._detectors[name].iter_filth_documents(documents): + filth_list.append(filth) + if run_post_processors: # Only collect the filts into a list if we need to do post processing filth_list = [] # type: List[Filth] From c12b3d677950e8df6786f9795071c79f5d32c8bd Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 00:30:08 +0100 Subject: [PATCH 12/43] Add possibility to disable detector --- scrubadub/scrubbers.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py index 7fb1c424..2c9d6298 100644 --- a/scrubadub/scrubbers.py +++ b/scrubadub/scrubbers.py @@ -222,7 +222,8 @@ def _post_process_filth_list(self, filth_list: Sequence[Filth]) -> Sequence[Filt return filth_list def iter_filth( - self, text: str, document_name: Optional[str] = None, run_post_processors: bool = True + self, text: str, document_name: Optional[str] = None, run_post_processors: bool = True, + exclude_detectors: Optional[List[str]] = None ) -> Generator[Filth, None, None]: """Iterate over the different types of filth that can exist. """ @@ -234,11 +235,12 @@ def iter_filth( # over all detectors simultaneously. just trying to get something # working right now and we can worry about efficiency later all_filths = [] # type: List[Filth] - for detector in self._detectors.values(): - for filth in detector.iter_filth(text, document_name=document_name): - if not isinstance(filth, Filth): - raise TypeError('iter_filth must always yield Filth') - all_filths.append(filth) + for name, detector in self._detectors.items(): + if exclude_detectors is None or name not in exclude_detectors: + for filth in detector.iter_filth(text, document_name=document_name): + if not isinstance(filth, Filth): + raise TypeError('iter_filth must always yield Filth') + all_filths.append(filth) # This is split up so that we only have to use lists if we have to post_process Filth if run_post_processors: @@ -280,7 +282,7 @@ def iter_filth_documents( filth_list = [ filth for name, text in documents.items() - for filth in self.iter_filth(text, document_name=name, run_post_processors=False) + for filth in self.iter_filth(text, document_name=name, run_post_processors=False, exclude_detectors=[]) ] elif isinstance(documents, list): filth_list = [ From 7f05d28c5a5bf56ac3c882c929bc1ba60ec2153f Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 14:10:11 +0100 Subject: [PATCH 13/43] Logic to scrubbers to detect if a detector has document iterator --- scrubadub/scrubbers.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py index 2c9d6298..63c2c440 100644 --- a/scrubadub/scrubbers.py +++ b/scrubadub/scrubbers.py @@ -265,30 +265,33 @@ def iter_filth_documents( if not isinstance(documents, (dict, list)): raise TypeError('documents must be one of a string, list of strings or dict of strings.') - # Figures out which detectors can run on a list of documents - - batch_detector_names = [name for name, detector in self._detectors - if callable(hasattr(detector, 'iter_filth_documents', None))] + # Figures out which detectors have iter_filth_documents and applies to them + document_detectors_names = [] filth_list = [] - for name in batch_detector_names: - for filth in self._detectors[name].iter_filth_documents(documents): - filth_list.append(filth) + + for name, detector in self._detectors.items(): + document_iterator = getattr(detector, 'iter_filth_documents', None) + if callable(document_iterator): + document_detectors_names.append(name) + for filth in document_iterator(documents): + filth_list.append(filth) if run_post_processors: # Only collect the filts into a list if we need to do post processing - filth_list = [] # type: List[Filth] if isinstance(documents, dict): - filth_list = [ + filth_list += [ filth for name, text in documents.items() - for filth in self.iter_filth(text, document_name=name, run_post_processors=False, exclude_detectors=[]) + for filth in self.iter_filth(text, document_name=name, run_post_processors=False, + exclude_detectors=document_detectors_names) ] elif isinstance(documents, list): - filth_list = [ + filth_list += [ filth for i_name, text in enumerate(documents) - for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False) + for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False, + exclude_detectors=document_detectors_names) ] for filth in self._post_process_filth_list(filth_list): @@ -297,11 +300,13 @@ def iter_filth_documents( # Use generators when we dont post process the Filth if isinstance(documents, dict): for name, text in documents.items(): - for filth in self.iter_filth(text, document_name=name, run_post_processors=False): + for filth in self.iter_filth(text, document_name=name, run_post_processors=False, + exclude_detectors=document_detectors_names): yield filth elif isinstance(documents, list): for i_name, text in enumerate(documents): - for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False): + for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False, + exclude_detectors=document_detectors_names): yield filth @staticmethod From f413757ffe18d465a6913042b1c92ad7e65d2cba Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 14:41:17 +0100 Subject: [PATCH 14/43] Scrubbers to merge with document detectors --- scrubadub/detectors/ner.py | 2 +- scrubadub/scrubbers.py | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py index 222bf7af..b648819f 100644 --- a/scrubadub/detectors/ner.py +++ b/scrubadub/detectors/ner.py @@ -37,7 +37,7 @@ def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Seq yield self.filth_cls(beg=ent.start_char, end=ent.end_char, text=ent.text, - document_name=(str(doc_name) if doc_name else None), + document_name=None or str(doc_name), # None if no doc_name provid detector_name=self.name) def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]: diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py index 63c2c440..ada23e9d 100644 --- a/scrubadub/scrubbers.py +++ b/scrubadub/scrubbers.py @@ -223,7 +223,7 @@ def _post_process_filth_list(self, filth_list: Sequence[Filth]) -> Sequence[Filt def iter_filth( self, text: str, document_name: Optional[str] = None, run_post_processors: bool = True, - exclude_detectors: Optional[List[str]] = None + run_merge: bool = True, exclude_detectors: Optional[List[str]] = None ) -> Generator[Filth, None, None]: """Iterate over the different types of filth that can exist. """ @@ -244,7 +244,9 @@ def iter_filth( # This is split up so that we only have to use lists if we have to post_process Filth if run_post_processors: - all_filths = list(self._merge_filths(all_filths)) + if run_merge: + all_filths = list(self._merge_filths(all_filths)) + all_filths = list(self._post_process_filth_list(all_filths)) # Here we loop over a list of Filth... @@ -253,8 +255,12 @@ def iter_filth( else: # ... but here, we're using a generator. If we try to use the same variable it would have two types and # fail static typing in mypy - for filth in self._merge_filths(all_filths): - yield filth + if run_merge: + for filth in self._merge_filths(all_filths): + yield filth + else: + for filth in all_filths: + yield filth def iter_filth_documents( self, @@ -284,16 +290,18 @@ def iter_filth_documents( filth for name, text in documents.items() for filth in self.iter_filth(text, document_name=name, run_post_processors=False, - exclude_detectors=document_detectors_names) + run_merge=False, exclude_detectors=document_detectors_names) ] elif isinstance(documents, list): filth_list += [ filth for i_name, text in enumerate(documents) for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False, - exclude_detectors=document_detectors_names) + run_merge=False, exclude_detectors=document_detectors_names) ] + filth_list = list(self._merge_filths(filth_list)) + for filth in self._post_process_filth_list(filth_list): yield filth else: @@ -301,12 +309,12 @@ def iter_filth_documents( if isinstance(documents, dict): for name, text in documents.items(): for filth in self.iter_filth(text, document_name=name, run_post_processors=False, - exclude_detectors=document_detectors_names): + run_merge=False, exclude_detectors=document_detectors_names): yield filth elif isinstance(documents, list): for i_name, text in enumerate(documents): for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False, - exclude_detectors=document_detectors_names): + run_merge=False, exclude_detectors=document_detectors_names): yield filth @staticmethod From 00c3343fb917c123debb209f152de3ab5188d9a0 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 16:13:17 +0100 Subject: [PATCH 15/43] Tidy document processors merge --- scrubadub/scrubbers.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py index ada23e9d..096291d8 100644 --- a/scrubadub/scrubbers.py +++ b/scrubadub/scrubbers.py @@ -1,5 +1,7 @@ +from collections import defaultdict + import warnings -from typing import Optional, Sequence, Generator, Dict, Type, Union, List +from typing import Optional, Sequence, Generator, DefaultDict, Dict, Type, Union, List from . import detectors from . import post_processors @@ -283,8 +285,10 @@ def iter_filth_documents( for filth in document_iterator(documents): filth_list.append(filth) - if run_post_processors: - # Only collect the filts into a list if we need to do post processing + # We have to now merge with the other processors. To do this we need to collect filth into a list + # Also need this if we need to do post processing + + if run_post_processors or document_detectors_names: if isinstance(documents, dict): filth_list += [ filth @@ -302,8 +306,11 @@ def iter_filth_documents( filth_list = list(self._merge_filths(filth_list)) - for filth in self._post_process_filth_list(filth_list): - yield filth + if run_post_processors: + yield from self._post_process_filth_list(filth_list) + else: + for filth in filth_list: + yield filth else: # Use generators when we dont post process the Filth if isinstance(documents, dict): From 9481159c608c6c32276fdd521173269768587f7f Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 16:30:53 +0100 Subject: [PATCH 16/43] Named entity filth to accept a label --- scrubadub/detectors/ner.py | 5 +++-- scrubadub/filth/named_entity.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py index b648819f..cf337bdd 100644 --- a/scrubadub/detectors/ner.py +++ b/scrubadub/detectors/ner.py @@ -37,8 +37,9 @@ def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Seq yield self.filth_cls(beg=ent.start_char, end=ent.end_char, text=ent.text, - document_name=None or str(doc_name), # None if no doc_name provid - detector_name=self.name) + document_name=None or str(doc_name), # None if no doc_name provided + detector_name=self.name, + label=ent.label_) def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]: if isinstance(documents, list): diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py index 57b5ba8c..6b319cff 100644 --- a/scrubadub/filth/named_entity.py +++ b/scrubadub/filth/named_entity.py @@ -2,4 +2,14 @@ class NamedEntityFilth(Filth): + """ + Named entity filth. Upon initialisation provide a label for named entity (e.g. name, org) + """ type = 'named_entity' + + def __init__(self, *args, label: str, **kwargs): + super(NamedEntityFilth, self).__init__(*args, **kwargs) + self.label = label + + def __repr__(self) -> str: + return self._to_string(['text', 'document_name', 'label']) From e148dfcb68e0471adf8d76e35cb2c80da35420c8 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 16:31:38 +0100 Subject: [PATCH 17/43] Add Spacy detector to init --- scrubadub/detectors/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrubadub/detectors/__init__.py b/scrubadub/detectors/__init__.py index 50a2b9dd..15e955b5 100644 --- a/scrubadub/detectors/__init__.py +++ b/scrubadub/detectors/__init__.py @@ -13,6 +13,7 @@ from .credential import CredentialDetector from .email import EmailDetector, NewEmailDetector from .name import NameDetector +from .ner import SpacyDetector from .phone import PhoneDetector from .postalcode import PostalCodeDetector from .known import KnownFilthDetector From af80edd24a928ae63c49951884cf2fda42ec4999 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 16:33:56 +0100 Subject: [PATCH 18/43] Change detector name to follow the pattern --- scrubadub/detectors/__init__.py | 2 +- scrubadub/detectors/ner.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scrubadub/detectors/__init__.py b/scrubadub/detectors/__init__.py index 15e955b5..65c01d1c 100644 --- a/scrubadub/detectors/__init__.py +++ b/scrubadub/detectors/__init__.py @@ -13,7 +13,7 @@ from .credential import CredentialDetector from .email import EmailDetector, NewEmailDetector from .name import NameDetector -from .ner import SpacyDetector +from .ner import NamedEntityDetector from .phone import PhoneDetector from .postalcode import PostalCodeDetector from .known import KnownFilthDetector diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py index cf337bdd..385be073 100644 --- a/scrubadub/detectors/ner.py +++ b/scrubadub/detectors/ner.py @@ -7,13 +7,13 @@ from ..utils import CanonicalStringSet -class SpacyDetector(Detector): +class NamedEntityDetector(Detector): """Use spacy's named entity recognition to clean named entities. List specific entities to include passing ``named_entities``, e.g. (PERSON) """ filth_cls = NamedEntityFilth - name = 'spacy_ner' + name = 'named_entity' disallowed_nouns = CanonicalStringSet(["skype"]) @@ -28,7 +28,7 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'}, self.nlp = spacy.load(model) # Only enable necessary pipes self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"]) - super(SpacyDetector, self).__init__(**kwargs) + super(NamedEntityDetector, self).__init__(**kwargs) def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Sequence[str]): for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)): From 86f6e6bb7875143efd4fec09dd0a31b7fe01d8db Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 16:34:22 +0100 Subject: [PATCH 19/43] Update module name --- scrubadub/detectors/__init__.py | 2 +- scrubadub/detectors/{ner.py => named_entity.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename scrubadub/detectors/{ner.py => named_entity.py} (100%) diff --git a/scrubadub/detectors/__init__.py b/scrubadub/detectors/__init__.py index 65c01d1c..c53273ab 100644 --- a/scrubadub/detectors/__init__.py +++ b/scrubadub/detectors/__init__.py @@ -13,7 +13,7 @@ from .credential import CredentialDetector from .email import EmailDetector, NewEmailDetector from .name import NameDetector -from .ner import NamedEntityDetector +from .named_entity import NamedEntityDetector from .phone import PhoneDetector from .postalcode import PostalCodeDetector from .known import KnownFilthDetector diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/named_entity.py similarity index 100% rename from scrubadub/detectors/ner.py rename to scrubadub/detectors/named_entity.py From ae42eb05704f7121c491081fb3decfe8b64038e6 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 16:35:29 +0100 Subject: [PATCH 20/43] Remove unecessary imports --- scrubadub/scrubbers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py index 096291d8..58bf0689 100644 --- a/scrubadub/scrubbers.py +++ b/scrubadub/scrubbers.py @@ -1,7 +1,5 @@ -from collections import defaultdict - import warnings -from typing import Optional, Sequence, Generator, DefaultDict, Dict, Type, Union, List +from typing import Optional, Sequence, Generator, Dict, Type, Union, List from . import detectors from . import post_processors From 86b45328e6deff6ee1645b9f2449f81f038897bb Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 17:11:50 +0100 Subject: [PATCH 21/43] Change type for NamedEntityFilth depending on label --- scrubadub/detectors/named_entity.py | 4 ++-- scrubadub/filth/named_entity.py | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py index 385be073..a0bbb2a2 100644 --- a/scrubadub/detectors/named_entity.py +++ b/scrubadub/detectors/named_entity.py @@ -1,7 +1,7 @@ -import spacy - from typing import Dict, Generator, Iterable, Optional, Sequence, Union +import spacy + from .base import Detector from ..filth import NamedEntityFilth, Filth from ..utils import CanonicalStringSet diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py index 6b319cff..7be8c530 100644 --- a/scrubadub/filth/named_entity.py +++ b/scrubadub/filth/named_entity.py @@ -9,7 +9,4 @@ class NamedEntityFilth(Filth): def __init__(self, *args, label: str, **kwargs): super(NamedEntityFilth, self).__init__(*args, **kwargs) - self.label = label - - def __repr__(self) -> str: - return self._to_string(['text', 'document_name', 'label']) + self.type = "{}_{}".format(self.type, label).lower() From 45ee26fae4a8d31b66e3307c0ab4aed21678b4b6 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 17:14:37 +0100 Subject: [PATCH 22/43] Revert NamedEntityFilth name because it was a bad idea --- scrubadub/filth/named_entity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py index 7be8c530..255a331a 100644 --- a/scrubadub/filth/named_entity.py +++ b/scrubadub/filth/named_entity.py @@ -9,4 +9,4 @@ class NamedEntityFilth(Filth): def __init__(self, *args, label: str, **kwargs): super(NamedEntityFilth, self).__init__(*args, **kwargs) - self.type = "{}_{}".format(self.type, label).lower() + self.label = label.lower() From 5dacd62d8d27f6d0d94313ec1bd39857ee314d2f Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 17:21:14 +0100 Subject: [PATCH 23/43] Change replacement string of named entity filth --- scrubadub/filth/named_entity.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py index 255a331a..5358e84a 100644 --- a/scrubadub/filth/named_entity.py +++ b/scrubadub/filth/named_entity.py @@ -10,3 +10,4 @@ class NamedEntityFilth(Filth): def __init__(self, *args, label: str, **kwargs): super(NamedEntityFilth, self).__init__(*args, **kwargs) self.label = label.lower() + self.replacement_string = "{}_{}".format(self.type, self.label) From d319029c0d83e298ee3da03dd692b0a5e5fa602c Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 17:37:52 +0100 Subject: [PATCH 24/43] Add spacy nightly to requirements --- requirements/python | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/python b/requirements/python index 8d00ab70..be22055b 100644 --- a/requirements/python +++ b/requirements/python @@ -3,4 +3,5 @@ argcomplete phonenumbers pandas sklearn -typing_extensions \ No newline at end of file +spacy-nightly[transformers] +typing_extensions From 8a63d479390522801310b37592b50e146cc56f01 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 17:51:49 +0100 Subject: [PATCH 25/43] Add benchmark with spacy accuracy --- tests/benchmark_accuracy.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/benchmark_accuracy.py b/tests/benchmark_accuracy.py index 392125b9..0408ca64 100644 --- a/tests/benchmark_accuracy.py +++ b/tests/benchmark_accuracy.py @@ -11,9 +11,11 @@ def main(): general_docs = [] + named_entity_docs = [] # address_docs = [] # uk_phone_docs = [] known_general_pii = [] + known_named_entity_pii = [] # known_address_pii = [] # known_uk_phone_pii = [] start_time = time.time() @@ -23,6 +25,15 @@ def main(): general_docs.append(new_doc) known_general_pii += new_known_pii + new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['name']) + # Change the filth name to allow for comparison with NamedEntityDetector. Probably there is a better way to do it + + for pii in new_known_pii: + pii['filth_type'] = 'named_entity' + + named_entity_docs.append(new_doc) + known_named_entity_pii += new_known_pii + # new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['gb_address', 'us_address']) # address_docs.append(new_doc) # known_address_pii += new_known_pii @@ -35,7 +46,6 @@ def main(): scrubber_time = time.time() scrubber = scrubadub.Scrubber() - # scrubber.add_detector(scrubadub.detectors.stanford_ner.StanfordNERDetector()) scrubber.add_detector(scrubadub.detectors.KnownFilthDetector(known_filth_items=known_general_pii)) filth_list = list(scrubber.iter_filth_documents(general_docs)) @@ -57,6 +67,15 @@ def main(): print("Scrubbed documents in {:.2f}s".format(end_time-scrubber_time)) print(get_filth_classification_report(filth_list)) + scrubber_time = time.time() + scrubber = scrubadub.Scrubber(detector_list=[scrubadub.detectors.NamedEntityDetector(), + scrubadub.detectors.KnownFilthDetector(known_filth_items=known_named_entity_pii)]) + filth_list = list(scrubber.iter_filth_documents(named_entity_docs)) + end_time = time.time() + print("Documents generated in {:.2f}s".format(scrubber_time-start_time)) + print("Scrubbed documents in {:.2f}s".format(end_time-scrubber_time)) + print(get_filth_classification_report(filth_list)) + sys.exit(0) From 0d0b83911eb2c13ef958300aafc01deb5a3f30e7 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 24 Oct 2020 18:15:57 +0100 Subject: [PATCH 26/43] Comment named entity test code --- tests/benchmark_accuracy.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/benchmark_accuracy.py b/tests/benchmark_accuracy.py index 0408ca64..3e7e8f1f 100644 --- a/tests/benchmark_accuracy.py +++ b/tests/benchmark_accuracy.py @@ -67,14 +67,14 @@ def main(): print("Scrubbed documents in {:.2f}s".format(end_time-scrubber_time)) print(get_filth_classification_report(filth_list)) - scrubber_time = time.time() - scrubber = scrubadub.Scrubber(detector_list=[scrubadub.detectors.NamedEntityDetector(), - scrubadub.detectors.KnownFilthDetector(known_filth_items=known_named_entity_pii)]) - filth_list = list(scrubber.iter_filth_documents(named_entity_docs)) - end_time = time.time() - print("Documents generated in {:.2f}s".format(scrubber_time-start_time)) - print("Scrubbed documents in {:.2f}s".format(end_time-scrubber_time)) - print(get_filth_classification_report(filth_list)) + # scrubber_time = time.time() + # scrubber = scrubadub.Scrubber(detector_list=[scrubadub.detectors.NamedEntityDetector(), + # scrubadub.detectors.KnownFilthDetector(known_filth_items=known_named_entity_pii)]) + # filth_list = list(scrubber.iter_filth_documents(named_entity_docs)) + # end_time = time.time() + # print("Documents generated in {:.2f}s".format(scrubber_time-start_time)) + # print("Scrubbed documents in {:.2f}s".format(end_time-scrubber_time)) + # print(get_filth_classification_report(filth_list)) sys.exit(0) From f6386dd17be4757a948360d9f7ef4362d9d36734 Mon Sep 17 00:00:00 2001 From: aCampello Date: Mon, 26 Oct 2020 22:41:03 +0000 Subject: [PATCH 27/43] NamedEntityDetector to return standard Filth when it is avaliable --- scrubadub/detectors/named_entity.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py index a0bbb2a2..a364c59a 100644 --- a/scrubadub/detectors/named_entity.py +++ b/scrubadub/detectors/named_entity.py @@ -3,7 +3,7 @@ import spacy from .base import Detector -from ..filth import NamedEntityFilth, Filth +from ..filth import NamedEntityFilth, Filth, NameFilth, OrganizationFilth from ..utils import CanonicalStringSet @@ -12,7 +12,10 @@ class NamedEntityDetector(Detector): List specific entities to include passing ``named_entities``, e.g. (PERSON) """ - filth_cls = NamedEntityFilth + filth_cls_map = { + 'PERSON': NameFilth, + 'ORG': OrganizationFilth + } name = 'named_entity' disallowed_nouns = CanonicalStringSet(["skype"]) @@ -32,14 +35,17 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'}, def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Sequence[str]): for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)): + print(doc_name) for ent in doc.ents: if ent.label_ in self.named_entities: - yield self.filth_cls(beg=ent.start_char, - end=ent.end_char, - text=ent.text, - document_name=None or str(doc_name), # None if no doc_name provided - detector_name=self.name, - label=ent.label_) + # If there is no standard 'filth', returns a NamedEntity filth + filth_cls = self.filth_cls_map.get(ent.label_, NamedEntityFilth) + yield filth_cls(beg=ent.start_char, + end=ent.end_char, + text=ent.text, + document_name=(str(doc_name) if doc_name else None), # None if no doc_name provided + detector_name=self.name, + label=ent.label_) def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]: if isinstance(documents, list): From dbdb247ad03ca6b9168f193eadaf28638d718072 Mon Sep 17 00:00:00 2001 From: aCampello Date: Mon, 26 Oct 2020 22:45:13 +0000 Subject: [PATCH 28/43] Change docstring for NamedEntity filth --- scrubadub/filth/named_entity.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py index 5358e84a..436ea992 100644 --- a/scrubadub/filth/named_entity.py +++ b/scrubadub/filth/named_entity.py @@ -3,7 +3,8 @@ class NamedEntityFilth(Filth): """ - Named entity filth. Upon initialisation provide a label for named entity (e.g. name, org) + Default filth type, for named entities (e.g. the ones in https://nightly.spacy.io/models/en#en_core_web_lg-labels), + except the ones represented in any other filth. """ type = 'named_entity' From 509231605f6c93b9519ee549a06295b76c56198b Mon Sep 17 00:00:00 2001 From: aCampello Date: Mon, 26 Oct 2020 22:48:32 +0000 Subject: [PATCH 29/43] Remove accidental print --- scrubadub/detectors/named_entity.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py index a364c59a..353a6978 100644 --- a/scrubadub/detectors/named_entity.py +++ b/scrubadub/detectors/named_entity.py @@ -35,7 +35,6 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'}, def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Sequence[str]): for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)): - print(doc_name) for ent in doc.ents: if ent.label_ in self.named_entities: # If there is no standard 'filth', returns a NamedEntity filth From 6446f44b0543b70fa9de0413287890ef0c856d8b Mon Sep 17 00:00:00 2001 From: aCampello Date: Wed, 28 Oct 2020 22:36:34 +0000 Subject: [PATCH 30/43] Download necessary model if not present in OS --- scrubadub/detectors/named_entity.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py index 353a6978..8aad2f4b 100644 --- a/scrubadub/detectors/named_entity.py +++ b/scrubadub/detectors/named_entity.py @@ -1,6 +1,7 @@ from typing import Dict, Generator, Iterable, Optional, Sequence, Union import spacy +from wasabi import msg from .base import Detector from ..filth import NamedEntityFilth, Filth, NameFilth, OrganizationFilth @@ -25,9 +26,9 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'}, # Spacy NER are all upper cased self.named_entities = {entity.upper() for entity in named_entities} if model not in spacy.info()['pipelines']: - raise OSError("Can't find model '{}'. If it is a valid Spacy model, " - "download it (e.g. with the CLI command " - "`python -m spacy download {}`).".format(model, model)) + msg.warn("Downloading spacy model {}".format(model)) + spacy.cli.download(model) + self.nlp = spacy.load(model) # Only enable necessary pipes self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"]) From b147eb903187d140dbcb622c87e93b0a336c4e0d Mon Sep 17 00:00:00 2001 From: aCampello Date: Wed, 28 Oct 2020 22:59:27 +0000 Subject: [PATCH 31/43] Change iter_filth_documents signature --- scrubadub/detectors/named_entity.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py index 8aad2f4b..f55f73ef 100644 --- a/scrubadub/detectors/named_entity.py +++ b/scrubadub/detectors/named_entity.py @@ -1,4 +1,4 @@ -from typing import Dict, Generator, Iterable, Optional, Sequence, Union +from typing import Generator, Iterable, Optional, Sequence import spacy from wasabi import msg @@ -34,7 +34,8 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'}, self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"]) super(NamedEntityDetector, self).__init__(**kwargs) - def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Sequence[str]): + def iter_filth_documents(self, doc_names: Sequence[Optional[str]], + doc_list: Sequence[str]) -> Generator[Filth, None, None]: for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)): for ent in doc.ents: if ent.label_ in self.named_entities: @@ -47,15 +48,5 @@ def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Seq detector_name=self.name, label=ent.label_) - def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]: - if isinstance(documents, list): - doc_names, doc_list = zip(*enumerate(documents)) - elif isinstance(documents, dict): - doc_names, doc_list = zip(*documents.items()) - else: - raise TypeError('documents must be one of a string, list of strings or dict of strings.') - - yield from self._iter_spacy_pipeline(doc_names, doc_list) - def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]: - yield from self._iter_spacy_pipeline([document_name], [text]) + yield from self.iter_filth_documents([document_name], [text]) From 3edc20d1d278ae650f29248b9de613a0236e146a Mon Sep 17 00:00:00 2001 From: aCampello Date: Wed, 28 Oct 2020 23:18:54 +0000 Subject: [PATCH 32/43] Scrubber simplification --- scrubadub/scrubbers.py | 116 +++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 75 deletions(-) diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py index 58bf0689..85b7d2cd 100644 --- a/scrubadub/scrubbers.py +++ b/scrubadub/scrubbers.py @@ -222,11 +222,36 @@ def _post_process_filth_list(self, filth_list: Sequence[Filth]) -> Sequence[Filt return filth_list def iter_filth( - self, text: str, document_name: Optional[str] = None, run_post_processors: bool = True, - run_merge: bool = True, exclude_detectors: Optional[List[str]] = None + self, text: str, document_name: Optional[str] = None, run_post_processors: bool = True ) -> Generator[Filth, None, None]: """Iterate over the different types of filth that can exist. """ + # Iterates using iter_filth documents. + # If a name is not provided, passes a list with one element, [text] + + yield from self.iter_filth_documents( + documents=({document_name: text} if document_name else [text]), + run_post_processors=run_post_processors + ) + + def iter_filth_documents( + self, + documents: Union[Sequence[str], Dict[str, str]], + run_post_processors: bool = True + ) -> Generator[Filth, None, None]: + """Iterate over the different types of filth that can exist.""" + if not isinstance(documents, (dict, list)): + raise TypeError('documents must be one of a string, list of strings or dict of strings.') + + # Figures out which detectors have iter_filth_documents and applies to them + + if isinstance(documents, dict): + document_names, document_texts = zip(*documents.items()) + elif isinstance(documents, (tuple, list)): + document_texts = documents + document_names = [str(x) for x in range(len(documents))] + + # currently doing this by aggregating all_filths and then sorting # inline instead of with a Filth.__cmp__ method, which is apparently # much slower http://stackoverflow.com/a/988728/564709 @@ -234,19 +259,24 @@ def iter_filth( # NOTE: we could probably do this in a more efficient way by iterating # over all detectors simultaneously. just trying to get something # working right now and we can worry about efficiency later - all_filths = [] # type: List[Filth] + filth_list = [] # type: List[Filth] for name, detector in self._detectors.items(): - if exclude_detectors is None or name not in exclude_detectors: - for filth in detector.iter_filth(text, document_name=document_name): + document_iterator = getattr(detector, 'iter_filth_documents', None) + if callable(document_iterator): + for filth in document_iterator(document_names, document_texts): if not isinstance(filth, Filth): raise TypeError('iter_filth must always yield Filth') - all_filths.append(filth) + filth_list.append(filth) + else: + for document_name, text in zip(document_names, document_texts): + for filth in detector.iter_filth(text, document_name=document_name): + if not isinstance(filth, Filth): + raise TypeError('iter_filth must always yield Filth') + filth_list.append(filth) # This is split up so that we only have to use lists if we have to post_process Filth if run_post_processors: - if run_merge: - all_filths = list(self._merge_filths(all_filths)) - + all_filths = list(self._merge_filths(filth_list)) all_filths = list(self._post_process_filth_list(all_filths)) # Here we loop over a list of Filth... @@ -255,72 +285,8 @@ def iter_filth( else: # ... but here, we're using a generator. If we try to use the same variable it would have two types and # fail static typing in mypy - if run_merge: - for filth in self._merge_filths(all_filths): - yield filth - else: - for filth in all_filths: - yield filth - - def iter_filth_documents( - self, - documents: Union[Sequence[str], Dict[str, str]], - run_post_processors: bool = True - ) -> Generator[Filth, None, None]: - """Iterate over the different types of filth that can exist.""" - if not isinstance(documents, (dict, list)): - raise TypeError('documents must be one of a string, list of strings or dict of strings.') - - # Figures out which detectors have iter_filth_documents and applies to them - - document_detectors_names = [] - filth_list = [] - - for name, detector in self._detectors.items(): - document_iterator = getattr(detector, 'iter_filth_documents', None) - if callable(document_iterator): - document_detectors_names.append(name) - for filth in document_iterator(documents): - filth_list.append(filth) - - # We have to now merge with the other processors. To do this we need to collect filth into a list - # Also need this if we need to do post processing - - if run_post_processors or document_detectors_names: - if isinstance(documents, dict): - filth_list += [ - filth - for name, text in documents.items() - for filth in self.iter_filth(text, document_name=name, run_post_processors=False, - run_merge=False, exclude_detectors=document_detectors_names) - ] - elif isinstance(documents, list): - filth_list += [ - filth - for i_name, text in enumerate(documents) - for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False, - run_merge=False, exclude_detectors=document_detectors_names) - ] - - filth_list = list(self._merge_filths(filth_list)) - - if run_post_processors: - yield from self._post_process_filth_list(filth_list) - else: - for filth in filth_list: - yield filth - else: - # Use generators when we dont post process the Filth - if isinstance(documents, dict): - for name, text in documents.items(): - for filth in self.iter_filth(text, document_name=name, run_post_processors=False, - run_merge=False, exclude_detectors=document_detectors_names): - yield filth - elif isinstance(documents, list): - for i_name, text in enumerate(documents): - for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False, - run_merge=False, exclude_detectors=document_detectors_names): - yield filth + for filth in self._merge_filths(filth_list): + yield filth @staticmethod def _sort_filths(filth_list: Sequence[Filth]) -> List[Filth]: From 0ee6c4a90a1503a34645676c77a2b2ad87a1f86f Mon Sep 17 00:00:00 2001 From: aCampello Date: Wed, 28 Oct 2020 23:22:39 +0000 Subject: [PATCH 33/43] Fix types for document_names and text --- scrubadub/scrubbers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py index 85b7d2cd..afe55478 100644 --- a/scrubadub/scrubbers.py +++ b/scrubadub/scrubbers.py @@ -246,12 +246,12 @@ def iter_filth_documents( # Figures out which detectors have iter_filth_documents and applies to them if isinstance(documents, dict): - document_names, document_texts = zip(*documents.items()) + document_names = list(documents.keys()) + document_texts = list(documents.values()) elif isinstance(documents, (tuple, list)): document_texts = documents document_names = [str(x) for x in range(len(documents))] - # currently doing this by aggregating all_filths and then sorting # inline instead of with a Filth.__cmp__ method, which is apparently # much slower http://stackoverflow.com/a/988728/564709 From 1b80110a3e93089cb8e807800a8fe95d5943e341 Mon Sep 17 00:00:00 2001 From: aCampello Date: Wed, 28 Oct 2020 23:45:56 +0000 Subject: [PATCH 34/43] Fix types for document dictionary to include None --- scrubadub/__init__.py | 4 ++-- scrubadub/scrubbers.py | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/scrubadub/__init__.py b/scrubadub/__init__.py index 01315e82..21ab2221 100644 --- a/scrubadub/__init__.py +++ b/scrubadub/__init__.py @@ -1,5 +1,5 @@ -from typing import Union, List, Dict, Sequence +from typing import Union, List, Dict, Sequence, Optional # convenient imports from .scrubbers import Scrubber @@ -82,7 +82,7 @@ def list_filth(text: str, **kwargs) -> List[Filth]: return list(scrubber.iter_filth(text, **kwargs)) -def list_filth_documents(documents: Union[List[str], Dict[str, str]], **kwargs) -> List[Filth]: +def list_filth_documents(documents: Union[List[str], Dict[Optional[str], str]], **kwargs) -> List[Filth]: """Return a list of `Filth` that was detected in the string `text`. `documents` can be in a dict, in the format of ``{'document_name': 'document'}``, or as a list of strings diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py index afe55478..079c83c0 100644 --- a/scrubadub/scrubbers.py +++ b/scrubadub/scrubbers.py @@ -229,14 +229,12 @@ def iter_filth( # Iterates using iter_filth documents. # If a name is not provided, passes a list with one element, [text] - yield from self.iter_filth_documents( - documents=({document_name: text} if document_name else [text]), - run_post_processors=run_post_processors - ) + yield from self.iter_filth_documents(documents={document_name: text}, + run_post_processors=run_post_processors) def iter_filth_documents( self, - documents: Union[Sequence[str], Dict[str, str]], + documents: Union[Sequence[str], Dict[Optional[str], str]], run_post_processors: bool = True ) -> Generator[Filth, None, None]: """Iterate over the different types of filth that can exist.""" From ffb6ed93a85536dced40b8ef41b6f00fefdb2170 Mon Sep 17 00:00:00 2001 From: aCampello Date: Thu, 29 Oct 2020 09:32:44 +0000 Subject: [PATCH 35/43] Update requirements to nightly 3.0.0rc1 --- requirements/python | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/python b/requirements/python index be22055b..0069fd44 100644 --- a/requirements/python +++ b/requirements/python @@ -3,5 +3,5 @@ argcomplete phonenumbers pandas sklearn -spacy-nightly[transformers] +spacy-nightly[transformers]>=3.0.0rc1 typing_extensions From ecd965471656b266bbfad024f09f6e2a82223d12 Mon Sep 17 00:00:00 2001 From: aCampello Date: Thu, 29 Oct 2020 09:33:11 +0000 Subject: [PATCH 36/43] Comment unecessary piece of code --- tests/benchmark_accuracy.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/benchmark_accuracy.py b/tests/benchmark_accuracy.py index 3e7e8f1f..7d31d5bb 100644 --- a/tests/benchmark_accuracy.py +++ b/tests/benchmark_accuracy.py @@ -25,14 +25,14 @@ def main(): general_docs.append(new_doc) known_general_pii += new_known_pii - new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['name']) + #new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['name']) # Change the filth name to allow for comparison with NamedEntityDetector. Probably there is a better way to do it - for pii in new_known_pii: - pii['filth_type'] = 'named_entity' + #for pii in new_known_pii: + # pii['filth_type'] = 'named_entity' - named_entity_docs.append(new_doc) - known_named_entity_pii += new_known_pii + #named_entity_docs.append(new_doc) + #known_named_entity_pii += new_known_pii # new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['gb_address', 'us_address']) # address_docs.append(new_doc) From aaf36a0f9f5bbd516eb521aa7c50ddf8a0a542de Mon Sep 17 00:00:00 2001 From: aCampello Date: Thu, 29 Oct 2020 16:34:52 +0000 Subject: [PATCH 37/43] Initial tests to named entity detector --- tests/test_detector_named_entity.py | 67 +++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 tests/test_detector_named_entity.py diff --git a/tests/test_detector_named_entity.py b/tests/test_detector_named_entity.py new file mode 100644 index 00000000..c23f968d --- /dev/null +++ b/tests/test_detector_named_entity.py @@ -0,0 +1,67 @@ +import unittest + +from scrubadub.detectors import NamedEntityDetector +from scrubadub.filth import NameFilth, OrganizationFilth, NamedEntityFilth +import scrubadub + +from base import BaseTestCase + + +class NamedEntityTestCase(unittest.TestCase, BaseTestCase): + """ + Tests whether the detector is performing correctly from a function point of view. + For accuracy tests use .benchmark_accuracy instead + """ + + def setUp(self): + self.detector = NamedEntityDetector() + + def _assert_filth_type_and_pos(self, doc_list, beg_end_list, filth_class): + doc_names = [str(x) for x in range(len(doc_list))] + + filth_list = list(self.detector.iter_filth_documents(doc_names, doc_list)) + + for filth, beg_end in zip(filth_list, beg_end_list): + self.assertIsInstance(filth, filth_class) + self.assertEqual((filth.beg, filth.end), beg_end) + + def test_names(self): + doc_list = ["John is a cat", + "When was Maria born?", + "john is a cat", + "when was maria born"] + beg_end_list = [(0, 4), + (9, 14), + (0, 4), + (9, 14)] + + self._assert_filth_type_and_pos(doc_list, beg_end_list, NameFilth) + + def test_organisations(self): + doc_list = ["She started working for Apple this year", + "But used to work for Google"] + beg_end_list = [(24, 30), + (21, 27)] + + self._assert_filth_type_and_pos(doc_list, beg_end_list, OrganizationFilth) + + def test_other_entity(self): + self.detector.named_entities = {"GPE"} + doc_list = ["London is a city in England"] + beg_end_list = [(0, 6), + (20, 27)] + + self._assert_filth_type_and_pos(doc_list, beg_end_list, NamedEntityFilth) + + def test_wrong_model(self): + """Test that it raises an error if user inputs invalid spacy model""" + with self.assertRaises(SystemExit): + NamedEntityDetector(model='not_a_valid_spacy_model') + + def test_iter_filth(self): + doc = "John is a cat" + + output_iter_docs = list(self.detector.iter_filth_documents(doc_list=[doc], doc_names=["0"])) + output_iter = list(self.detector.iter_filth(text=doc, document_name="0")) + + self.assertListEqual(output_iter, output_iter_docs) From 2c8eedb91e66fc7374911c4c0f88412dd4cabce3 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 31 Oct 2020 14:22:59 +0000 Subject: [PATCH 38/43] Skip tests if python version < 3.6 --- tests/test_detector_named_entity.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_detector_named_entity.py b/tests/test_detector_named_entity.py index c23f968d..a1b4ce2f 100644 --- a/tests/test_detector_named_entity.py +++ b/tests/test_detector_named_entity.py @@ -1,3 +1,4 @@ +import sys import unittest from scrubadub.detectors import NamedEntityDetector @@ -15,6 +16,10 @@ class NamedEntityTestCase(unittest.TestCase, BaseTestCase): def setUp(self): self.detector = NamedEntityDetector() + unittest.TestCase.skipTest( + (sys.version_info.major, sys.version_info.minor) < (3, 6), + "Named entity detector not supported for python<3.6" + ) def _assert_filth_type_and_pos(self, doc_list, beg_end_list, filth_class): doc_names = [str(x) for x in range(len(doc_list))] From c83829ad4b861e0c283f7794cfd10e43b5708064 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 31 Oct 2020 14:36:22 +0000 Subject: [PATCH 39/43] Add spacy as extra --- requirements/python | 1 - requirements/python-dev | 1 + requirements/python-extras | 1 + setup.py | 20 ++++++++++++++------ 4 files changed, 16 insertions(+), 7 deletions(-) create mode 100644 requirements/python-extras diff --git a/requirements/python b/requirements/python index 0069fd44..10c04814 100644 --- a/requirements/python +++ b/requirements/python @@ -3,5 +3,4 @@ argcomplete phonenumbers pandas sklearn -spacy-nightly[transformers]>=3.0.0rc1 typing_extensions diff --git a/requirements/python-dev b/requirements/python-dev index 7a521401..3a141ad6 100644 --- a/requirements/python-dev +++ b/requirements/python-dev @@ -1,5 +1,6 @@ # install everything in the python requirements too. -r python +-r python-extras # needed for tests/run.py script to read .travis.yml file PyYAML diff --git a/requirements/python-extras b/requirements/python-extras new file mode 100644 index 00000000..ac0b741d --- /dev/null +++ b/requirements/python-extras @@ -0,0 +1 @@ +spacy-nightly[transformers]>=3.0.0rc1 diff --git a/setup.py b/setup.py index dde13e88..50ed6e5b 100644 --- a/setup.py +++ b/setup.py @@ -8,14 +8,21 @@ github_url = 'https://github.com/LeapBeyond/scrubadub' + +def read_packages_from_file(filename): + with open(filename, 'r') as stream: + for line in stream: + package = line.strip().split('#')[0] + if package: + yield package + # read in the dependencies from the virtualenv requirements file -dependencies = [] filename = os.path.join("requirements", "python") -with open(filename, 'r') as stream: - for line in stream: - package = line.strip().split('#')[0] - if package: - dependencies.append(package) +dependencies = list(read_packages_from_file(filename)) + +# read extra spacy dependencies from python-extras requirements file +filename = os.path.join("requirements", "python-extras") +extras = list(read_packages_from_file(filename)) # get the version version = None @@ -60,5 +67,6 @@ 'Topic :: Utilities', ], install_requires=dependencies, + extras_require={"spacy": extras}, zip_safe=False, ) From dd5a7be9dc7072f49d90075e8a813b1024aa0945 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 31 Oct 2020 14:46:13 +0000 Subject: [PATCH 40/43] Tweak travis for python3.8 --- .travis.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.travis.yml b/.travis.yml index 740c4fe9..d30c1afd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,6 +23,11 @@ script: - python3 ./tests/benchmark_time.py - cd docs && make html && cd - +jobs: + include: + - python: "3.8" + before_script: pip install -r requirements/python-extras + # commands to run after the tests successfully complete after_success: - coveralls From 491c10d3be40c51100050b79a6f3b01d020b9700 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sat, 31 Oct 2020 16:48:07 +0000 Subject: [PATCH 41/43] Revert CI and add environment marker to requirements --- .travis.yml | 6 +----- requirements/python-dev | 1 - requirements/python-extras | 3 ++- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index d30c1afd..4b210326 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,7 @@ python: # virtualenv install: - pip install -r requirements/python-dev + - pip install -r requirements/python-extras - python -m textblob.download_corpora - pip install . # - apt-get install curl autoconf automake libtool pkg-config @@ -23,11 +24,6 @@ script: - python3 ./tests/benchmark_time.py - cd docs && make html && cd - -jobs: - include: - - python: "3.8" - before_script: pip install -r requirements/python-extras - # commands to run after the tests successfully complete after_success: - coveralls diff --git a/requirements/python-dev b/requirements/python-dev index 3a141ad6..7a521401 100644 --- a/requirements/python-dev +++ b/requirements/python-dev @@ -1,6 +1,5 @@ # install everything in the python requirements too. -r python --r python-extras # needed for tests/run.py script to read .travis.yml file PyYAML diff --git a/requirements/python-extras b/requirements/python-extras index ac0b741d..56d551bb 100644 --- a/requirements/python-extras +++ b/requirements/python-extras @@ -1 +1,2 @@ -spacy-nightly[transformers]>=3.0.0rc1 +spacy-nightly[transformers]>=3.0.0rc1; python_version >= '3.6' + From 3822cf9bc00ec222a8fbd7707f4e9627de5dd19a Mon Sep 17 00:00:00 2001 From: aCampello Date: Sun, 1 Nov 2020 15:48:34 +0000 Subject: [PATCH 42/43] Add check for extras --- scrubadub/detectors/named_entity.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py index f55f73ef..824f3522 100644 --- a/scrubadub/detectors/named_entity.py +++ b/scrubadub/detectors/named_entity.py @@ -1,7 +1,14 @@ +import warnings from typing import Generator, Iterable, Optional, Sequence -import spacy -from wasabi import msg +try: + import spacy + from wasabi import msg +except ModuleNotFoundError as e: + if getattr(e, 'name', None) == 'spacy': + warnings.warn("Could not find module 'spacy'. If you want to use extras," + " make sure you install scrubadub with 'pip install scrubadub[spacy]'") + from .base import Detector from ..filth import NamedEntityFilth, Filth, NameFilth, OrganizationFilth @@ -26,7 +33,7 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'}, # Spacy NER are all upper cased self.named_entities = {entity.upper() for entity in named_entities} if model not in spacy.info()['pipelines']: - msg.warn("Downloading spacy model {}".format(model)) + msg.info("Downloading spacy model {}".format(model)) spacy.cli.download(model) self.nlp = spacy.load(model) From f1b29cdef8be42355c464039a30f51fb2fb34fd8 Mon Sep 17 00:00:00 2001 From: aCampello Date: Sun, 1 Nov 2020 15:52:54 +0000 Subject: [PATCH 43/43] Fix test skipping --- tests/test_detector_named_entity.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_detector_named_entity.py b/tests/test_detector_named_entity.py index a1b4ce2f..f308527f 100644 --- a/tests/test_detector_named_entity.py +++ b/tests/test_detector_named_entity.py @@ -3,8 +3,6 @@ from scrubadub.detectors import NamedEntityDetector from scrubadub.filth import NameFilth, OrganizationFilth, NamedEntityFilth -import scrubadub - from base import BaseTestCase @@ -15,11 +13,13 @@ class NamedEntityTestCase(unittest.TestCase, BaseTestCase): """ def setUp(self): - self.detector = NamedEntityDetector() + unsupported_version = (sys.version_info.major, sys.version_info.minor) < (3, 6) unittest.TestCase.skipTest( - (sys.version_info.major, sys.version_info.minor) < (3, 6), + unsupported_version, "Named entity detector not supported for python<3.6" ) + if not unsupported_version: + self.detector = NamedEntityDetector() def _assert_filth_type_and_pos(self, doc_list, beg_end_list, filth_class): doc_names = [str(x) for x in range(len(doc_list))]