Skip to content

Commit

Permalink
Merge ef3b3c6 into 0eec66e
Browse files Browse the repository at this point in the history
  • Loading branch information
aCampello committed Nov 1, 2020
2 parents 0eec66e + ef3b3c6 commit cda8127
Show file tree
Hide file tree
Showing 13 changed files with 264 additions and 59 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ python:
# virtualenv
install:
- pip install -r requirements/python-dev
- pip install -r requirements/python-extras
- python -m textblob.download_corpora
- pip install .
# - apt-get install curl autoconf automake libtool pkg-config
Expand Down
2 changes: 1 addition & 1 deletion requirements/python
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ argcomplete
phonenumbers
pandas
sklearn
typing_extensions
typing_extensions
2 changes: 2 additions & 0 deletions requirements/python-extras
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
spacy-nightly[transformers]>=3.0.0rc1; python_version >= '3.6'

4 changes: 2 additions & 2 deletions scrubadub/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

from typing import Union, List, Dict, Sequence
from typing import Union, List, Dict, Sequence, Optional

# convenient imports
from .scrubbers import Scrubber
Expand Down Expand Up @@ -82,7 +82,7 @@ def list_filth(text: str, **kwargs) -> List[Filth]:
return list(scrubber.iter_filth(text, **kwargs))


def list_filth_documents(documents: Union[List[str], Dict[str, str]], **kwargs) -> List[Filth]:
def list_filth_documents(documents: Union[List[str], Dict[Optional[str], str]], **kwargs) -> List[Filth]:
"""Return a list of `Filth` that was detected in the string `text`.
`documents` can be in a dict, in the format of ``{'document_name': 'document'}``, or as a list of strings
Expand Down
1 change: 1 addition & 0 deletions scrubadub/detectors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .credential import CredentialDetector
from .email import EmailDetector, NewEmailDetector
from .name import NameDetector
from .named_entity import NamedEntityDetector
from .phone import PhoneDetector
from .postalcode import PostalCodeDetector
from .known import KnownFilthDetector
Expand Down
59 changes: 59 additions & 0 deletions scrubadub/detectors/named_entity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import warnings
from typing import Generator, Iterable, Optional, Sequence

try:
import spacy
from wasabi import msg
except ImportError as e:
if getattr(e, 'name', None) == 'spacy':
warnings.warn("Could not find module 'spacy'. If you want to use extras,"
" make sure you install scrubadub with 'pip install scrubadub[spacy]'")


from .base import Detector
from ..filth import NamedEntityFilth, Filth, NameFilth, OrganizationFilth
from ..utils import CanonicalStringSet


class NamedEntityDetector(Detector):
    """Use spacy's named entity recognition to detect named entities as filth.

    Select which entity labels to report by passing ``named_entities``,
    e.g. ``{'PERSON', 'ORG'}``. Labels with a dedicated Filth class
    (see ``filth_cls_map``) yield that class; any other selected label
    yields a generic ``NamedEntityFilth``.
    """
    # Maps a spacy entity label to the specific Filth class it produces.
    filth_cls_map = {
        'PERSON': NameFilth,
        'ORG': OrganizationFilth,
    }
    name = 'named_entity'

    disallowed_nouns = CanonicalStringSet(["skype"])

    def __init__(self, named_entities: Iterable[str] = ('PERSON',),
                 model: str = "en_core_web_trf", **kwargs):
        """Create a detector backed by the given spacy ``model``.

        :param named_entities: entity labels to report (case-insensitive);
            default is a tuple rather than a set literal so the default
            argument is immutable.
        :param model: name of the spacy pipeline to load; downloaded on
            demand if it is not already installed.
        """
        # Spacy's NER labels are all upper-cased; normalise user input to match.
        self.named_entities = {entity.upper() for entity in named_entities}
        if model not in spacy.info()['pipelines']:
            msg.info("Downloading spacy model {}".format(model))
            spacy.cli.download(model)

        self.nlp = spacy.load(model)
        # Only enable necessary pipes
        self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"])
        super().__init__(**kwargs)

    def iter_filth_documents(self, doc_names: Sequence[Optional[str]],
                             doc_list: Sequence[str]) -> Generator[Filth, None, None]:
        """Yield filth found in ``doc_list``, batching texts through spacy's pipe.

        ``doc_names`` and ``doc_list`` are parallel sequences; each yielded
        Filth carries the corresponding document name (or None).
        """
        for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)):
            for ent in doc.ents:
                if ent.label_ in self.named_entities:
                    # Fall back to the generic NamedEntityFilth when the label
                    # has no dedicated Filth class.
                    filth_cls = self.filth_cls_map.get(ent.label_, NamedEntityFilth)
                    # Use an identity check so an empty-string document name is
                    # preserved instead of being silently mapped to None.
                    yield filth_cls(beg=ent.start_char,
                                    end=ent.end_char,
                                    text=ent.text,
                                    document_name=(str(doc_name) if doc_name is not None else None),
                                    detector_name=self.name,
                                    label=ent.label_)

    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Yield filth found in a single ``text`` (thin wrapper over the batch API)."""
        yield from self.iter_filth_documents([document_name], [text])
1 change: 1 addition & 0 deletions scrubadub/filth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .email import EmailFilth
from .known import KnownFilth
from .name import NameFilth
from .named_entity import NamedEntityFilth
from .organization import OrganizationFilth
from .phone import PhoneFilth
from .postalcode import PostalCodeFilth
Expand Down
14 changes: 14 additions & 0 deletions scrubadub/filth/named_entity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from .base import Filth


class NamedEntityFilth(Filth):
    """Catch-all filth for named entities without a dedicated Filth class.

    Used for spacy entity labels (e.g. the ones in
    https://nightly.spacy.io/models/en#en_core_web_lg-labels) that are not
    represented by any other filth type.
    """
    type = 'named_entity'

    def __init__(self, *args, label: str, **kwargs):
        super().__init__(*args, **kwargs)
        # Normalise the label to lower case and expose it in the replacement
        # text, e.g. "named_entity_gpe".
        self.label = label.lower()
        self.replacement_string = "{}_{}".format(self.type, self.label)
94 changes: 45 additions & 49 deletions scrubadub/scrubbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,17 +181,19 @@ def clean_documents(self, documents: Union[Sequence[str], Dict[str, str]], **kwa

filth_list = self._post_process_filth_list(filth_list)

clean_documents: Union[Dict[str, str], Sequence[str]]
if isinstance(documents, list):
return [
clean_documents = [
self._replace_text(text=text, filth_list=filth_list, document_name=str(name), **kwargs)
for name, text in enumerate(documents)
]
elif isinstance(documents, dict):
return {
clean_documents = {
name: self._replace_text(text=text, filth_list=filth_list, document_name=name, **kwargs)
for name, text in documents.items()
}
return []

return clean_documents

def _replace_text(
self, text: str, filth_list: Sequence[Filth], document_name: Optional[str] = None, **kwargs
Expand Down Expand Up @@ -226,23 +228,55 @@ def iter_filth(
) -> Generator[Filth, None, None]:
"""Iterate over the different types of filth that can exist.
"""
# Iterates using iter_filth documents.
# If a name is not provided, passes a list with one element, [text]

yield from self.iter_filth_documents(documents={document_name: text},
run_post_processors=run_post_processors)

def iter_filth_documents(
self,
documents: Union[Sequence[str], Dict[Optional[str], str]],
run_post_processors: bool = True
) -> Generator[Filth, None, None]:
"""Iterate over the different types of filth that can exist."""
if not isinstance(documents, (dict, list)):
raise TypeError('documents must be one of a string, list of strings or dict of strings.')

# Figures out which detectors have iter_filth_documents and applies to them

if isinstance(documents, dict):
document_names = list(documents.keys())
document_texts = list(documents.values())
elif isinstance(documents, (tuple, list)):
document_texts = documents
document_names = [str(x) for x in range(len(documents))]

# currently doing this by aggregating all_filths and then sorting
# inline instead of with a Filth.__cmp__ method, which is apparently
# much slower http://stackoverflow.com/a/988728/564709
#
# NOTE: we could probably do this in a more efficient way by iterating
# over all detectors simultaneously. just trying to get something
# working right now and we can worry about efficiency later
all_filths = [] # type: List[Filth]
for detector in self._detectors.values():
for filth in detector.iter_filth(text, document_name=document_name):
if not isinstance(filth, Filth):
raise TypeError('iter_filth must always yield Filth')
all_filths.append(filth)
filth_list = [] # type: List[Filth]
for name, detector in self._detectors.items():
document_iterator = getattr(detector, 'iter_filth_documents', None)
if callable(document_iterator):
for filth in document_iterator(document_names, document_texts):
if not isinstance(filth, Filth):
raise TypeError('iter_filth must always yield Filth')
filth_list.append(filth)
else:
for document_name, text in zip(document_names, document_texts):
for filth in detector.iter_filth(text, document_name=document_name):
if not isinstance(filth, Filth):
raise TypeError('iter_filth must always yield Filth')
filth_list.append(filth)

# This is split up so that we only have to use lists if we have to post_process Filth
if run_post_processors:
all_filths = list(self._merge_filths(all_filths))
all_filths = list(self._merge_filths(filth_list))
all_filths = list(self._post_process_filth_list(all_filths))

# Here we loop over a list of Filth...
Expand All @@ -251,47 +285,9 @@ def iter_filth(
else:
# ... but here, we're using a generator. If we try to use the same variable it would have two types and
# fail static typing in mypy
for filth in self._merge_filths(all_filths):
for filth in self._merge_filths(filth_list):
yield filth

def iter_filth_documents(
self,
documents: Union[Sequence[str], Dict[str, str]],
run_post_processors: bool = True
) -> Generator[Filth, None, None]:
"""Iterate over the different types of filth that can exist."""
if not isinstance(documents, (dict, list)):
raise TypeError('documents must be one of a string, list of strings or dict of strings.')

if run_post_processors:
# Only collect the filts into a list if we need to do post processing
filth_list = [] # type: List[Filth]
if isinstance(documents, dict):
filth_list = [
filth
for name, text in documents.items()
for filth in self.iter_filth(text, document_name=name, run_post_processors=False)
]
elif isinstance(documents, list):
filth_list = [
filth
for i_name, text in enumerate(documents)
for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False)
]

for filth in self._post_process_filth_list(filth_list):
yield filth
else:
# Use generators when we dont post process the Filth
if isinstance(documents, dict):
for name, text in documents.items():
for filth in self.iter_filth(text, document_name=name, run_post_processors=False):
yield filth
elif isinstance(documents, list):
for i_name, text in enumerate(documents):
for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False):
yield filth

@staticmethod
def _sort_filths(filth_list: Sequence[Filth]) -> List[Filth]:
"""Sorts a list of filths, needed before merging and concatenating"""
Expand Down
20 changes: 14 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,21 @@

github_url = 'https://github.com/LeapBeyond/scrubadub'


def read_packages_from_file(filename):
    """Yield the package requirement strings listed in ``filename``.

    Blank lines and ``#`` comments (both whole-line and inline) are skipped.

    :param filename: path to a pip-style requirements file
    :yields: requirement strings with surrounding whitespace removed
    """
    with open(filename, 'r') as stream:
        for line in stream:
            # Drop any inline comment first, then strip, so that a line like
            # "numpy  # comment" yields "numpy" rather than "numpy  " with
            # trailing whitespace (the original strip-then-split left it in).
            package = line.split('#')[0].strip()
            if package:
                yield package

# read in the dependencies from the virtualenv requirements file
dependencies = []
filename = os.path.join("requirements", "python")
with open(filename, 'r') as stream:
for line in stream:
package = line.strip().split('#')[0]
if package:
dependencies.append(package)
dependencies = list(read_packages_from_file(filename))

# read extra spacy dependencies from python-extras requirements file
filename = os.path.join("requirements", "python-extras")
extras = list(read_packages_from_file(filename))

# get the version
version = None
Expand Down Expand Up @@ -60,5 +67,6 @@
'Topic :: Utilities',
],
install_requires=dependencies,
extras_require={"spacy": extras},
zip_safe=False,
)
21 changes: 20 additions & 1 deletion tests/benchmark_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@

def main():
general_docs = []
named_entity_docs = []
# address_docs = []
# uk_phone_docs = []
known_general_pii = []
known_named_entity_pii = []
# known_address_pii = []
# known_uk_phone_pii = []
start_time = time.time()
Expand All @@ -23,6 +25,15 @@ def main():
general_docs.append(new_doc)
known_general_pii += new_known_pii

#new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['name'])
# Change the filth name to allow for comparison with NamedEntityDetector. Probably there is a better way to do it

#for pii in new_known_pii:
# pii['filth_type'] = 'named_entity'

#named_entity_docs.append(new_doc)
#known_named_entity_pii += new_known_pii

# new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['gb_address', 'us_address'])
# address_docs.append(new_doc)
# known_address_pii += new_known_pii
Expand All @@ -35,7 +46,6 @@ def main():

scrubber_time = time.time()
scrubber = scrubadub.Scrubber()
# scrubber.add_detector(scrubadub.detectors.stanford_ner.StanfordNERDetector())
scrubber.add_detector(scrubadub.detectors.KnownFilthDetector(known_filth_items=known_general_pii))
filth_list = list(scrubber.iter_filth_documents(general_docs))

Expand All @@ -57,6 +67,15 @@ def main():
print("Scrubbed documents in {:.2f}s".format(end_time-scrubber_time))
print(get_filth_classification_report(filth_list))

# scrubber_time = time.time()
# scrubber = scrubadub.Scrubber(detector_list=[scrubadub.detectors.NamedEntityDetector(),
# scrubadub.detectors.KnownFilthDetector(known_filth_items=known_named_entity_pii)])
# filth_list = list(scrubber.iter_filth_documents(named_entity_docs))
# end_time = time.time()
# print("Documents generated in {:.2f}s".format(scrubber_time-start_time))
# print("Scrubbed documents in {:.2f}s".format(end_time-scrubber_time))
# print(get_filth_classification_report(filth_list))

sys.exit(0)


Expand Down
Loading

0 comments on commit cda8127

Please sign in to comment.