From 4f317873599d23c63d423d322c283a988c20a2ca Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Tue, 20 Oct 2020 22:05:06 +0100
Subject: [PATCH 01/43] Add NER filth

---
 scrubadub/filth/ner.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 scrubadub/filth/ner.py

diff --git a/scrubadub/filth/ner.py b/scrubadub/filth/ner.py
new file mode 100644
index 00000000..d84fbe2a
--- /dev/null
+++ b/scrubadub/filth/ner.py
@@ -0,0 +1,5 @@
+from .base import Filth
+
+
+class NERFilth(Filth):
+    type = 'ner'

From 449d150192f60b50375d7cbf885c3bd344f130b3 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Tue, 20 Oct 2020 22:05:35 +0100
Subject: [PATCH 02/43] Add NER filth to __init__

---
 scrubadub/filth/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrubadub/filth/__init__.py b/scrubadub/filth/__init__.py
index 63697370..f24f180c 100644
--- a/scrubadub/filth/__init__.py
+++ b/scrubadub/filth/__init__.py
@@ -4,6 +4,7 @@
 from .email import EmailFilth
 from .known import KnownFilth
 from .name import NameFilth
+from .ner import NERFilth
 from .organization import OrganizationFilth
 from .phone import PhoneFilth
 from .postalcode import PostalCodeFilth

From ed77716491270b8437d7961ea759eed31a5c70db Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Tue, 20 Oct 2020 22:34:57 +0100
Subject: [PATCH 03/43] Add NER detector

---
 scrubadub/detectors/ner.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 scrubadub/detectors/ner.py

diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
new file mode 100644
index 00000000..acf6907f
--- /dev/null
+++ b/scrubadub/detectors/ner.py
@@ -0,0 +1,31 @@
+import spacy
+
+from typing import List, Optional, Generator
+
+from .base import Detector
+from ..filth import NERFilth, Filth
+from ..utils import CanonicalStringSet
+
+
+class SpacyDetector(Detector):
+    """Use spacy's named entity recognition to clean named entities.
+     List specific entities to include passing ``named_entities``
+    """
+    filth_cls = NERFilth
+    name = 'spacy_ner'
+
+    disallowed_nouns = CanonicalStringSet(["skype"])
+
+    def __init__(self, named_entities: Optional[List[str]] = None, model: str = "en_core_web_trf", **kwargs):
+        self.named_entities = named_entities
+        if model not in spacy.info()['pipelines']:
+            raise OSError(f"Can't find model '{model}'. If it is a valid Spacy model, "
+                          f"download it (e.g. with the CLI command "
+                          f"`python -m spacy download {model}`).")
+        self.nlp = spacy.load(model)
+        # Only enable necessary pipes
+        self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"])
+        super(SpacyDetector, self).__init__(**kwargs)
+
+    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
+        pass

From b20c212c9a471bad0610fcc9f094af8826e98630 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Thu, 22 Oct 2020 22:59:11 +0100
Subject: [PATCH 04/43] Add iter_filth_documents basic logic

---
 scrubadub/detectors/ner.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
index acf6907f..4c7bf4e6 100644
--- a/scrubadub/detectors/ner.py
+++ b/scrubadub/detectors/ner.py
@@ -1,6 +1,6 @@
 import spacy
 
-from typing import List, Optional, Generator
+from typing import Dict, Generator, List, Optional, Set, Sequence, Union
 
 from .base import Detector
 from ..filth import NERFilth, Filth
@@ -9,15 +9,18 @@
 
 class SpacyDetector(Detector):
     """Use spacy's named entity recognition to clean named entities.
-     List specific entities to include passing ``named_entities``
+     List specific entities to include passing ``named_entities``, e.g.
+     (PERSON)
     """
     filth_cls = NERFilth
     name = 'spacy_ner'
 
     disallowed_nouns = CanonicalStringSet(["skype"])
 
-    def __init__(self, named_entities: Optional[List[str]] = None, model: str = "en_core_web_trf", **kwargs):
-        self.named_entities = named_entities
+    def __init__(self, named_entities: Optional[Union[List[str], Set[str]]] = {'PERSON'},
+                 model: str = "en_core_web_trf", **kwargs):
+        # Spacy NER are all upper cased
+        self.named_entities = {entity.upper() for entity in named_entities}
         if model not in spacy.info()['pipelines']:
             raise OSError(f"Can't find model '{model}'. If it is a valid Spacy model, "
                           f"download it (e.g. with the CLI command "
@@ -27,5 +30,18 @@ def __init__(self, named_entities: Optional[List[str]] = None, model: str = "en_
         self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"])
         super(SpacyDetector, self).__init__(**kwargs)
 
+    def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]:
+        if isinstance(documents, list):
+            for doc in self.nlp.pipe(documents):
+                for ent in doc.ents:
+                    if ent.label_ in self.named_entities:
+                        yield self.filth_cls(beg=ent.start_char,
+                                             end=ent.end_char,
+                                             text=ent.text,
+                                             detector_name=self.name)
+
     def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
         pass
+
+
+

From e372b0771b77b06a704fd35454c10afce79e3e13 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Thu, 22 Oct 2020 23:27:57 +0100
Subject: [PATCH 05/43] Add support to different types for documents

---
 scrubadub/detectors/ner.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
index 4c7bf4e6..3196498d 100644
--- a/scrubadub/detectors/ner.py
+++ b/scrubadub/detectors/ner.py
@@ -1,6 +1,6 @@
 import spacy
 
-from typing import Dict, Generator, List, Optional, Set, Sequence, Union
+from typing import Dict, Generator, List, Optional, Set, Sequence, Tuple, Union
 
 from .base import Detector
 from ..filth import NERFilth, Filth
@@ -17,7 +17,7 @@ class SpacyDetector(Detector):
 
     disallowed_nouns = CanonicalStringSet(["skype"])
 
-    def __init__(self, named_entities: Optional[Union[List[str], Set[str]]] = {'PERSON'},
+    def __init__(self, named_entities: Union[List[str], Set[str]] = {'PERSON'},
                  model: str = "en_core_web_trf", **kwargs):
         # Spacy NER are all upper cased
         self.named_entities = {entity.upper() for entity in named_entities}
@@ -32,13 +32,20 @@ def __init__(self, named_entities: Optional[Union[List[str], Set[str]]] = {'PERS
 
     def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]:
         if isinstance(documents, list):
-            for doc in self.nlp.pipe(documents):
-                for ent in doc.ents:
-                    if ent.label_ in self.named_entities:
-                        yield self.filth_cls(beg=ent.start_char,
-                                             end=ent.end_char,
-                                             text=ent.text,
-                                             detector_name=self.name)
+            doc_names, doc_list = zip(*enumerate(documents))
+        elif isinstance(documents, dict):
+            doc_names, doc_list = zip(*documents.items())
+        else:
+            raise TypeError('documents must be one of a string, list of strings or dict of strings.')
+
+        for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)):
+            for ent in doc.ents:
+                if ent.label_ in self.named_entities:
+                    yield self.filth_cls(beg=ent.start_char,
+                                         end=ent.end_char,
+                                         text=ent.text,
+                                         document_name=str(doc_name),
+                                         detector_name=self.name)
 
     def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
         pass

From cd8b48987bda91e3889fed5ea072fcda8b6785d8 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Thu, 22 Oct 2020 23:32:51 +0100
Subject: [PATCH 06/43] Flake8 tweaks

---
 scrubadub/detectors/ner.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
index 3196498d..51b2e96e 100644
--- a/scrubadub/detectors/ner.py
+++ b/scrubadub/detectors/ner.py
@@ -1,6 +1,6 @@
 import spacy
 
-from typing import Dict, Generator, List, Optional, Set, Sequence, Tuple, Union
+from typing import Dict, Generator, List, Optional, Set, Sequence, Union
 
 from .base import Detector
 from ..filth import NERFilth, Filth
@@ -49,6 +49,3 @@ def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]])
 
     def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
         pass
-
-
-

From 7e5b3d5d858525f1add4abb78c5562f9a9386a38 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Fri, 23 Oct 2020 08:46:46 +0100
Subject: [PATCH 07/43] Edit f-string for 3.5 compatibility

---
 scrubadub/detectors/ner.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
index 51b2e96e..a78c7f6a 100644
--- a/scrubadub/detectors/ner.py
+++ b/scrubadub/detectors/ner.py
@@ -22,9 +22,9 @@ def __init__(self, named_entities: Union[List[str], Set[str]] = {'PERSON'},
         # Spacy NER are all upper cased
         self.named_entities = {entity.upper() for entity in named_entities}
         if model not in spacy.info()['pipelines']:
-            raise OSError(f"Can't find model '{model}'. If it is a valid Spacy model, "
-                          f"download it (e.g. with the CLI command "
-                          f"`python -m spacy download {model}`).")
+            raise OSError("Can't find model '{}'. If it is a valid Spacy model, "
+                          "download it (e.g. with the CLI command "
+                          "`python -m spacy download {}`).".format(model, model))
         self.nlp = spacy.load(model)
         # Only enable necessary pipes
         self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"])

From 3d9a888b7100fc52bbe121e8fc03027d39433545 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Fri, 23 Oct 2020 18:38:51 +0100
Subject: [PATCH 08/43] Add iter_filth

---
 scrubadub/detectors/ner.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
index a78c7f6a..3a6f4d07 100644
--- a/scrubadub/detectors/ner.py
+++ b/scrubadub/detectors/ner.py
@@ -30,6 +30,16 @@ def __init__(self, named_entities: Union[List[str], Set[str]] = {'PERSON'},
         self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"])
         super(SpacyDetector, self).__init__(**kwargs)
 
+    def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Sequence[str]):
+        for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)):
+            for ent in doc.ents:
+                if ent.label_ in self.named_entities:
+                    yield self.filth_cls(beg=ent.start_char,
+                                         end=ent.end_char,
+                                         text=ent.text,
+                                         document_name=(str(doc_name) if doc_name else None),
+                                         detector_name=self.name)
+
     def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]:
         if isinstance(documents, list):
             doc_names, doc_list = zip(*enumerate(documents))
@@ -38,14 +48,7 @@ def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]])
         else:
             raise TypeError('documents must be one of a string, list of strings or dict of strings.')
 
-        for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)):
-            for ent in doc.ents:
-                if ent.label_ in self.named_entities:
-                    yield self.filth_cls(beg=ent.start_char,
-                                         end=ent.end_char,
-                                         text=ent.text,
-                                         document_name=str(doc_name),
-                                         detector_name=self.name)
+        yield from self._iter_spacy_pipeline(doc_names, doc_list)
 
     def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
-        pass
+        yield from self._iter_spacy_pipeline([document_name], [text])

From 3aaebb44c431364e1b52d94b498cb1fa96a8434e Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Fri, 23 Oct 2020 18:39:21 +0100
Subject: [PATCH 09/43] Simplify named_entities type

---
 scrubadub/detectors/ner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
index 3a6f4d07..bea48ff5 100644
--- a/scrubadub/detectors/ner.py
+++ b/scrubadub/detectors/ner.py
@@ -1,6 +1,6 @@
 import spacy
 
-from typing import Dict, Generator, List, Optional, Set, Sequence, Union
+from typing import Dict, Generator, Iterable, Optional, Sequence, Union
 
 from .base import Detector
 from ..filth import NERFilth, Filth
@@ -17,7 +17,7 @@ class SpacyDetector(Detector):
 
     disallowed_nouns = CanonicalStringSet(["skype"])
 
-    def __init__(self, named_entities: Union[List[str], Set[str]] = {'PERSON'},
+    def __init__(self, named_entities: Iterable[str] = {'PERSON'},
                  model: str = "en_core_web_trf", **kwargs):
         # Spacy NER are all upper cased
         self.named_entities = {entity.upper() for entity in named_entities}

From 7a3c679f65bc185d437303020b52c904c4ef5923 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Fri, 23 Oct 2020 18:42:04 +0100
Subject: [PATCH 10/43] Rename named-entity-filth

---
 scrubadub/detectors/ner.py      | 4 ++--
 scrubadub/filth/__init__.py     | 2 +-
 scrubadub/filth/named_entity.py | 5 +++++
 scrubadub/filth/ner.py          | 5 -----
 4 files changed, 8 insertions(+), 8 deletions(-)
 create mode 100644 scrubadub/filth/named_entity.py
 delete mode 100644 scrubadub/filth/ner.py

diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
index bea48ff5..222bf7af 100644
--- a/scrubadub/detectors/ner.py
+++ b/scrubadub/detectors/ner.py
@@ -3,7 +3,7 @@
 from typing import Dict, Generator, Iterable, Optional, Sequence, Union
 
 from .base import Detector
-from ..filth import NERFilth, Filth
+from ..filth import NamedEntityFilth, Filth
 from ..utils import CanonicalStringSet
 
 
@@ -12,7 +12,7 @@ class SpacyDetector(Detector):
      List specific entities to include passing ``named_entities``, e.g.
      (PERSON)
     """
-    filth_cls = NERFilth
+    filth_cls = NamedEntityFilth
     name = 'spacy_ner'
 
     disallowed_nouns = CanonicalStringSet(["skype"])
diff --git a/scrubadub/filth/__init__.py b/scrubadub/filth/__init__.py
index f24f180c..dde59c71 100644
--- a/scrubadub/filth/__init__.py
+++ b/scrubadub/filth/__init__.py
@@ -4,7 +4,7 @@
 from .email import EmailFilth
 from .known import KnownFilth
 from .name import NameFilth
-from .ner import NERFilth
+from .named_entity import NamedEntityFilth
 from .organization import OrganizationFilth
 from .phone import PhoneFilth
 from .postalcode import PostalCodeFilth
diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py
new file mode 100644
index 00000000..57b5ba8c
--- /dev/null
+++ b/scrubadub/filth/named_entity.py
@@ -0,0 +1,5 @@
+from .base import Filth
+
+
+class NamedEntityFilth(Filth):
+    type = 'named_entity'
diff --git a/scrubadub/filth/ner.py b/scrubadub/filth/ner.py
deleted file mode 100644
index d84fbe2a..00000000
--- a/scrubadub/filth/ner.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from .base import Filth
-
-
-class NERFilth(Filth):
-    type = 'ner'

From c2b9aeb283d93e34cf9fb9fb94caaf2c81478613 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 00:22:23 +0100
Subject: [PATCH 11/43] Figure out which detectors can run on a batch of
 documents

---
 scrubadub/scrubbers.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py
index 6e7d72b8..7fb1c424 100644
--- a/scrubadub/scrubbers.py
+++ b/scrubadub/scrubbers.py
@@ -263,6 +263,16 @@ def iter_filth_documents(
         if not isinstance(documents, (dict, list)):
             raise TypeError('documents must be one of a string, list of strings or dict of strings.')
 
+        # Figures out which detectors can run on a list of documents
+
+        batch_detector_names = [name for name, detector in self._detectors
+                                if callable(hasattr(detector, 'iter_filth_documents', None))]
+
+        filth_list = []
+        for name in batch_detector_names:
+            for filth in self._detectors[name].iter_filth_documents(documents):
+                filth_list.append(filth)
+
         if run_post_processors:
             # Only collect the filts into a list if we need to do post processing
             filth_list = []  # type: List[Filth]

From c12b3d677950e8df6786f9795071c79f5d32c8bd Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 00:30:08 +0100
Subject: [PATCH 12/43] Add possibility to disable detector

---
 scrubadub/scrubbers.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py
index 7fb1c424..2c9d6298 100644
--- a/scrubadub/scrubbers.py
+++ b/scrubadub/scrubbers.py
@@ -222,7 +222,8 @@ def _post_process_filth_list(self, filth_list: Sequence[Filth]) -> Sequence[Filt
         return filth_list
 
     def iter_filth(
-            self, text: str, document_name: Optional[str] = None, run_post_processors: bool = True
+            self, text: str, document_name: Optional[str] = None, run_post_processors: bool = True,
+            exclude_detectors: Optional[List[str]] = None
     ) -> Generator[Filth, None, None]:
         """Iterate over the different types of filth that can exist.
         """
@@ -234,11 +235,12 @@ def iter_filth(
         # over all detectors simultaneously. just trying to get something
         # working right now and we can worry about efficiency later
         all_filths = []  # type: List[Filth]
-        for detector in self._detectors.values():
-            for filth in detector.iter_filth(text, document_name=document_name):
-                if not isinstance(filth, Filth):
-                    raise TypeError('iter_filth must always yield Filth')
-                all_filths.append(filth)
+        for name, detector in self._detectors.items():
+            if exclude_detectors is None or name not in exclude_detectors:
+                for filth in detector.iter_filth(text, document_name=document_name):
+                    if not isinstance(filth, Filth):
+                        raise TypeError('iter_filth must always yield Filth')
+                    all_filths.append(filth)
 
         # This is split up so that we only have to use lists if we have to post_process Filth
         if run_post_processors:
@@ -280,7 +282,7 @@ def iter_filth_documents(
                 filth_list = [
                     filth
                     for name, text in documents.items()
-                    for filth in self.iter_filth(text, document_name=name, run_post_processors=False)
+                    for filth in self.iter_filth(text, document_name=name, run_post_processors=False, exclude_detectors=[])
                 ]
             elif isinstance(documents, list):
                 filth_list = [

From 7f05d28c5a5bf56ac3c882c929bc1ba60ec2153f Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 14:10:11 +0100
Subject: [PATCH 13/43] Add logic to scrubbers to detect if a detector has a
 document iterator

---
 scrubadub/scrubbers.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py
index 2c9d6298..63c2c440 100644
--- a/scrubadub/scrubbers.py
+++ b/scrubadub/scrubbers.py
@@ -265,30 +265,33 @@ def iter_filth_documents(
         if not isinstance(documents, (dict, list)):
             raise TypeError('documents must be one of a string, list of strings or dict of strings.')
 
-        # Figures out which detectors can run on a list of documents
-
-        batch_detector_names = [name for name, detector in self._detectors
-                                if callable(hasattr(detector, 'iter_filth_documents', None))]
+        # Figures out which detectors have iter_filth_documents and applies to them
 
+        document_detectors_names = []
         filth_list = []
-        for name in batch_detector_names:
-            for filth in self._detectors[name].iter_filth_documents(documents):
-                filth_list.append(filth)
+
+        for name, detector in self._detectors.items():
+            document_iterator = getattr(detector, 'iter_filth_documents', None)
+            if callable(document_iterator):
+                document_detectors_names.append(name)
+                for filth in document_iterator(documents):
+                    filth_list.append(filth)
 
         if run_post_processors:
             # Only collect the filts into a list if we need to do post processing
-            filth_list = []  # type: List[Filth]
             if isinstance(documents, dict):
-                filth_list = [
+                filth_list += [
                     filth
                     for name, text in documents.items()
-                    for filth in self.iter_filth(text, document_name=name, run_post_processors=False, exclude_detectors=[])
+                    for filth in self.iter_filth(text, document_name=name, run_post_processors=False,
+                                                 exclude_detectors=document_detectors_names)
                 ]
             elif isinstance(documents, list):
-                filth_list = [
+                filth_list += [
                     filth
                     for i_name, text in enumerate(documents)
-                    for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False)
+                    for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False,
+                                                 exclude_detectors=document_detectors_names)
                 ]
 
             for filth in self._post_process_filth_list(filth_list):
@@ -297,11 +300,13 @@ def iter_filth_documents(
             # Use generators when we dont post process the Filth
             if isinstance(documents, dict):
                 for name, text in documents.items():
-                    for filth in self.iter_filth(text, document_name=name, run_post_processors=False):
+                    for filth in self.iter_filth(text, document_name=name, run_post_processors=False,
+                                                 exclude_detectors=document_detectors_names):
                         yield filth
             elif isinstance(documents, list):
                 for i_name, text in enumerate(documents):
-                    for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False):
+                    for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False,
+                                                 exclude_detectors=document_detectors_names):
                         yield filth
 
     @staticmethod

From f413757ffe18d465a6913042b1c92ad7e65d2cba Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 14:41:17 +0100
Subject: [PATCH 14/43] Scrubbers to merge with document detectors

---
 scrubadub/detectors/ner.py |  2 +-
 scrubadub/scrubbers.py     | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
index 222bf7af..b648819f 100644
--- a/scrubadub/detectors/ner.py
+++ b/scrubadub/detectors/ner.py
@@ -37,7 +37,7 @@ def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Seq
                     yield self.filth_cls(beg=ent.start_char,
                                          end=ent.end_char,
                                          text=ent.text,
-                                         document_name=(str(doc_name) if doc_name else None),
+                                         document_name=None or str(doc_name),  # None if no doc_name provid
                                          detector_name=self.name)
 
     def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]:
diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py
index 63c2c440..ada23e9d 100644
--- a/scrubadub/scrubbers.py
+++ b/scrubadub/scrubbers.py
@@ -223,7 +223,7 @@ def _post_process_filth_list(self, filth_list: Sequence[Filth]) -> Sequence[Filt
 
     def iter_filth(
             self, text: str, document_name: Optional[str] = None, run_post_processors: bool = True,
-            exclude_detectors: Optional[List[str]] = None
+            run_merge: bool = True, exclude_detectors: Optional[List[str]] = None
     ) -> Generator[Filth, None, None]:
         """Iterate over the different types of filth that can exist.
         """
@@ -244,7 +244,9 @@ def iter_filth(
 
         # This is split up so that we only have to use lists if we have to post_process Filth
         if run_post_processors:
-            all_filths = list(self._merge_filths(all_filths))
+            if run_merge:
+                all_filths = list(self._merge_filths(all_filths))
+
             all_filths = list(self._post_process_filth_list(all_filths))
 
             # Here we loop over a list of Filth...
@@ -253,8 +255,12 @@ def iter_filth(
         else:
             # ... but here, we're using a generator. If we try to use the same variable it would have two types and
             # fail static typing in mypy
-            for filth in self._merge_filths(all_filths):
-                yield filth
+            if run_merge:
+                for filth in self._merge_filths(all_filths):
+                    yield filth
+            else:
+                for filth in all_filths:
+                    yield filth
 
     def iter_filth_documents(
             self,
@@ -284,16 +290,18 @@ def iter_filth_documents(
                     filth
                     for name, text in documents.items()
                     for filth in self.iter_filth(text, document_name=name, run_post_processors=False,
-                                                 exclude_detectors=document_detectors_names)
+                                                 run_merge=False, exclude_detectors=document_detectors_names)
                 ]
             elif isinstance(documents, list):
                 filth_list += [
                     filth
                     for i_name, text in enumerate(documents)
                     for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False,
-                                                 exclude_detectors=document_detectors_names)
+                                                 run_merge=False, exclude_detectors=document_detectors_names)
                 ]
 
+            filth_list = list(self._merge_filths(filth_list))
+
             for filth in self._post_process_filth_list(filth_list):
                 yield filth
         else:
@@ -301,12 +309,12 @@ def iter_filth_documents(
             if isinstance(documents, dict):
                 for name, text in documents.items():
                     for filth in self.iter_filth(text, document_name=name, run_post_processors=False,
-                                                 exclude_detectors=document_detectors_names):
+                                                 run_merge=False, exclude_detectors=document_detectors_names):
                         yield filth
             elif isinstance(documents, list):
                 for i_name, text in enumerate(documents):
                     for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False,
-                                                 exclude_detectors=document_detectors_names):
+                                                 run_merge=False, exclude_detectors=document_detectors_names):
                         yield filth
 
     @staticmethod

From 00c3343fb917c123debb209f152de3ab5188d9a0 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 16:13:17 +0100
Subject: [PATCH 15/43] Tidy document processors merge

---
 scrubadub/scrubbers.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py
index ada23e9d..096291d8 100644
--- a/scrubadub/scrubbers.py
+++ b/scrubadub/scrubbers.py
@@ -1,5 +1,7 @@
+from collections import defaultdict
+
 import warnings
-from typing import Optional, Sequence, Generator, Dict, Type, Union, List
+from typing import Optional, Sequence, Generator, DefaultDict, Dict, Type, Union, List
 
 from . import detectors
 from . import post_processors
@@ -283,8 +285,10 @@ def iter_filth_documents(
                 for filth in document_iterator(documents):
                     filth_list.append(filth)
 
-        if run_post_processors:
-            # Only collect the filts into a list if we need to do post processing
+        # We have to now merge with the other processors. To do this we need to collect filth into a list
+        # Also need this if we need to do post processing
+
+        if run_post_processors or document_detectors_names:
             if isinstance(documents, dict):
                 filth_list += [
                     filth
@@ -302,8 +306,11 @@ def iter_filth_documents(
 
             filth_list = list(self._merge_filths(filth_list))
 
-            for filth in self._post_process_filth_list(filth_list):
-                yield filth
+            if run_post_processors:
+                yield from self._post_process_filth_list(filth_list)
+            else:
+                for filth in filth_list:
+                    yield filth
         else:
             # Use generators when we dont post process the Filth
             if isinstance(documents, dict):

From 9481159c608c6c32276fdd521173269768587f7f Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 16:30:53 +0100
Subject: [PATCH 16/43] Named entity filth to accept a label

---
 scrubadub/detectors/ner.py      |  5 +++--
 scrubadub/filth/named_entity.py | 10 ++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
index b648819f..cf337bdd 100644
--- a/scrubadub/detectors/ner.py
+++ b/scrubadub/detectors/ner.py
@@ -37,8 +37,9 @@ def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Seq
                     yield self.filth_cls(beg=ent.start_char,
                                          end=ent.end_char,
                                          text=ent.text,
-                                         document_name=None or str(doc_name),  # None if no doc_name provid
-                                         detector_name=self.name)
+                                         document_name=None or str(doc_name),  # None if no doc_name provided
+                                         detector_name=self.name,
+                                         label=ent.label_)
 
     def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]:
         if isinstance(documents, list):
diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py
index 57b5ba8c..6b319cff 100644
--- a/scrubadub/filth/named_entity.py
+++ b/scrubadub/filth/named_entity.py
@@ -2,4 +2,14 @@
 
 
 class NamedEntityFilth(Filth):
+    """
+    Named entity filth. Upon initialisation provide a label for named entity (e.g. name, org)
+    """
     type = 'named_entity'
+
+    def __init__(self, *args, label: str, **kwargs):
+        super(NamedEntityFilth, self).__init__(*args, **kwargs)
+        self.label = label
+
+    def __repr__(self) -> str:
+        return self._to_string(['text', 'document_name', 'label'])

From e148dfcb68e0471adf8d76e35cb2c80da35420c8 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 16:31:38 +0100
Subject: [PATCH 17/43] Add Spacy detector to init

---
 scrubadub/detectors/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrubadub/detectors/__init__.py b/scrubadub/detectors/__init__.py
index 50a2b9dd..15e955b5 100644
--- a/scrubadub/detectors/__init__.py
+++ b/scrubadub/detectors/__init__.py
@@ -13,6 +13,7 @@
 from .credential import CredentialDetector
 from .email import EmailDetector, NewEmailDetector
 from .name import NameDetector
+from .ner import SpacyDetector
 from .phone import PhoneDetector
 from .postalcode import PostalCodeDetector
 from .known import KnownFilthDetector

From af80edd24a928ae63c49951884cf2fda42ec4999 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 16:33:56 +0100
Subject: [PATCH 18/43] Change detector name to follow the pattern

---
 scrubadub/detectors/__init__.py | 2 +-
 scrubadub/detectors/ner.py      | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scrubadub/detectors/__init__.py b/scrubadub/detectors/__init__.py
index 15e955b5..65c01d1c 100644
--- a/scrubadub/detectors/__init__.py
+++ b/scrubadub/detectors/__init__.py
@@ -13,7 +13,7 @@
 from .credential import CredentialDetector
 from .email import EmailDetector, NewEmailDetector
 from .name import NameDetector
-from .ner import SpacyDetector
+from .ner import NamedEntityDetector
 from .phone import PhoneDetector
 from .postalcode import PostalCodeDetector
 from .known import KnownFilthDetector
diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/ner.py
index cf337bdd..385be073 100644
--- a/scrubadub/detectors/ner.py
+++ b/scrubadub/detectors/ner.py
@@ -7,13 +7,13 @@
 from ..utils import CanonicalStringSet
 
 
-class SpacyDetector(Detector):
+class NamedEntityDetector(Detector):
     """Use spacy's named entity recognition to clean named entities.
      List specific entities to include passing ``named_entities``, e.g.
      (PERSON)
     """
     filth_cls = NamedEntityFilth
-    name = 'spacy_ner'
+    name = 'named_entity'
 
     disallowed_nouns = CanonicalStringSet(["skype"])
 
@@ -28,7 +28,7 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'},
         self.nlp = spacy.load(model)
         # Only enable necessary pipes
         self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"])
-        super(SpacyDetector, self).__init__(**kwargs)
+        super(NamedEntityDetector, self).__init__(**kwargs)
 
     def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Sequence[str]):
         for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)):

From 86f6e6bb7875143efd4fec09dd0a31b7fe01d8db Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 16:34:22 +0100
Subject: [PATCH 19/43] Update module name

---
 scrubadub/detectors/__init__.py                 | 2 +-
 scrubadub/detectors/{ner.py => named_entity.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename scrubadub/detectors/{ner.py => named_entity.py} (100%)

diff --git a/scrubadub/detectors/__init__.py b/scrubadub/detectors/__init__.py
index 65c01d1c..c53273ab 100644
--- a/scrubadub/detectors/__init__.py
+++ b/scrubadub/detectors/__init__.py
@@ -13,7 +13,7 @@
 from .credential import CredentialDetector
 from .email import EmailDetector, NewEmailDetector
 from .name import NameDetector
-from .ner import NamedEntityDetector
+from .named_entity import NamedEntityDetector
 from .phone import PhoneDetector
 from .postalcode import PostalCodeDetector
 from .known import KnownFilthDetector
diff --git a/scrubadub/detectors/ner.py b/scrubadub/detectors/named_entity.py
similarity index 100%
rename from scrubadub/detectors/ner.py
rename to scrubadub/detectors/named_entity.py

From ae42eb05704f7121c491081fb3decfe8b64038e6 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 16:35:29 +0100
Subject: [PATCH 20/43] Remove unnecessary imports

---
 scrubadub/scrubbers.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py
index 096291d8..58bf0689 100644
--- a/scrubadub/scrubbers.py
+++ b/scrubadub/scrubbers.py
@@ -1,7 +1,5 @@
-from collections import defaultdict
-
 import warnings
-from typing import Optional, Sequence, Generator, DefaultDict, Dict, Type, Union, List
+from typing import Optional, Sequence, Generator, Dict, Type, Union, List
 
 from . import detectors
 from . import post_processors

From 86b45328e6deff6ee1645b9f2449f81f038897bb Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 17:11:50 +0100
Subject: [PATCH 21/43] Change type for NamedEntityFilth depending on label

---
 scrubadub/detectors/named_entity.py | 4 ++--
 scrubadub/filth/named_entity.py     | 5 +----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py
index 385be073..a0bbb2a2 100644
--- a/scrubadub/detectors/named_entity.py
+++ b/scrubadub/detectors/named_entity.py
@@ -1,7 +1,7 @@
-import spacy
-
 from typing import Dict, Generator, Iterable, Optional, Sequence, Union
 
+import spacy
+
 from .base import Detector
 from ..filth import NamedEntityFilth, Filth
 from ..utils import CanonicalStringSet
diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py
index 6b319cff..7be8c530 100644
--- a/scrubadub/filth/named_entity.py
+++ b/scrubadub/filth/named_entity.py
@@ -9,7 +9,4 @@ class NamedEntityFilth(Filth):
 
     def __init__(self, *args, label: str, **kwargs):
         super(NamedEntityFilth, self).__init__(*args, **kwargs)
-        self.label = label
-
-    def __repr__(self) -> str:
-        return self._to_string(['text', 'document_name', 'label'])
+        self.type = "{}_{}".format(self.type, label).lower()

From 45ee26fae4a8d31b66e3307c0ab4aed21678b4b6 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 17:14:37 +0100
Subject: [PATCH 22/43] Revert NamedEntityFilth name because it was a bad idea

---
 scrubadub/filth/named_entity.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py
index 7be8c530..255a331a 100644
--- a/scrubadub/filth/named_entity.py
+++ b/scrubadub/filth/named_entity.py
@@ -9,4 +9,4 @@ class NamedEntityFilth(Filth):
 
     def __init__(self, *args, label: str, **kwargs):
         super(NamedEntityFilth, self).__init__(*args, **kwargs)
-        self.type = "{}_{}".format(self.type, label).lower()
+        self.label = label.lower()

From 5dacd62d8d27f6d0d94313ec1bd39857ee314d2f Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 17:21:14 +0100
Subject: [PATCH 23/43] Change replacement string of named entity filth

---
 scrubadub/filth/named_entity.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py
index 255a331a..5358e84a 100644
--- a/scrubadub/filth/named_entity.py
+++ b/scrubadub/filth/named_entity.py
@@ -10,3 +10,4 @@ class NamedEntityFilth(Filth):
     def __init__(self, *args, label: str, **kwargs):
         super(NamedEntityFilth, self).__init__(*args, **kwargs)
         self.label = label.lower()
+        self.replacement_string = "{}_{}".format(self.type, self.label)

From d319029c0d83e298ee3da03dd692b0a5e5fa602c Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 17:37:52 +0100
Subject: [PATCH 24/43] Add spacy nightly to requirements

---
 requirements/python | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements/python b/requirements/python
index 8d00ab70..be22055b 100644
--- a/requirements/python
+++ b/requirements/python
@@ -3,4 +3,5 @@ argcomplete
 phonenumbers
 pandas
 sklearn
-typing_extensions
\ No newline at end of file
+spacy-nightly[transformers]
+typing_extensions

From 8a63d479390522801310b37592b50e146cc56f01 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 17:51:49 +0100
Subject: [PATCH 25/43] Add benchmark with spacy accuracy

---
 tests/benchmark_accuracy.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/tests/benchmark_accuracy.py b/tests/benchmark_accuracy.py
index 392125b9..0408ca64 100644
--- a/tests/benchmark_accuracy.py
+++ b/tests/benchmark_accuracy.py
@@ -11,9 +11,11 @@
 
 def main():
     general_docs = []
+    named_entity_docs = []
     # address_docs = []
     # uk_phone_docs = []
     known_general_pii = []
+    known_named_entity_pii = []
     # known_address_pii = []
     # known_uk_phone_pii = []
     start_time = time.time()
@@ -23,6 +25,15 @@ def main():
         general_docs.append(new_doc)
         known_general_pii += new_known_pii
 
+        new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['name'])
+        # Change the filth name to allow for comparison with NamedEntityDetector. Probably there is a better way to do it
+
+        for pii in new_known_pii:
+            pii['filth_type'] = 'named_entity'
+
+        named_entity_docs.append(new_doc)
+        known_named_entity_pii += new_known_pii
+
         # new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['gb_address', 'us_address'])
         # address_docs.append(new_doc)
         # known_address_pii += new_known_pii
@@ -35,7 +46,6 @@ def main():
 
     scrubber_time = time.time()
     scrubber = scrubadub.Scrubber()
-    # scrubber.add_detector(scrubadub.detectors.stanford_ner.StanfordNERDetector())
     scrubber.add_detector(scrubadub.detectors.KnownFilthDetector(known_filth_items=known_general_pii))
     filth_list = list(scrubber.iter_filth_documents(general_docs))
 
@@ -57,6 +67,15 @@ def main():
     print("Scrubbed documents in  {:.2f}s".format(end_time-scrubber_time))
     print(get_filth_classification_report(filth_list))
 
+    scrubber_time = time.time()
+    scrubber = scrubadub.Scrubber(detector_list=[scrubadub.detectors.NamedEntityDetector(),
+                                                 scrubadub.detectors.KnownFilthDetector(known_filth_items=known_named_entity_pii)])
+    filth_list = list(scrubber.iter_filth_documents(named_entity_docs))
+    end_time = time.time()
+    print("Documents generated in {:.2f}s".format(scrubber_time-start_time))
+    print("Scrubbed documents in  {:.2f}s".format(end_time-scrubber_time))
+    print(get_filth_classification_report(filth_list))
+
     sys.exit(0)
 
 

From 0d0b83911eb2c13ef958300aafc01deb5a3f30e7 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 24 Oct 2020 18:15:57 +0100
Subject: [PATCH 26/43] Comment named entity test code

---
 tests/benchmark_accuracy.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/benchmark_accuracy.py b/tests/benchmark_accuracy.py
index 0408ca64..3e7e8f1f 100644
--- a/tests/benchmark_accuracy.py
+++ b/tests/benchmark_accuracy.py
@@ -67,14 +67,14 @@ def main():
     print("Scrubbed documents in  {:.2f}s".format(end_time-scrubber_time))
     print(get_filth_classification_report(filth_list))
 
-    scrubber_time = time.time()
-    scrubber = scrubadub.Scrubber(detector_list=[scrubadub.detectors.NamedEntityDetector(),
-                                                 scrubadub.detectors.KnownFilthDetector(known_filth_items=known_named_entity_pii)])
-    filth_list = list(scrubber.iter_filth_documents(named_entity_docs))
-    end_time = time.time()
-    print("Documents generated in {:.2f}s".format(scrubber_time-start_time))
-    print("Scrubbed documents in  {:.2f}s".format(end_time-scrubber_time))
-    print(get_filth_classification_report(filth_list))
+    # scrubber_time = time.time()
+    # scrubber = scrubadub.Scrubber(detector_list=[scrubadub.detectors.NamedEntityDetector(),
+    #                                              scrubadub.detectors.KnownFilthDetector(known_filth_items=known_named_entity_pii)])
+    # filth_list = list(scrubber.iter_filth_documents(named_entity_docs))
+    # end_time = time.time()
+    # print("Documents generated in {:.2f}s".format(scrubber_time-start_time))
+    # print("Scrubbed documents in  {:.2f}s".format(end_time-scrubber_time))
+    # print(get_filth_classification_report(filth_list))
 
     sys.exit(0)
 

From f6386dd17be4757a948360d9f7ef4362d9d36734 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Mon, 26 Oct 2020 22:41:03 +0000
Subject: [PATCH 27/43] NamedEntityDetector to return standard Filth when it is
 available

---
 scrubadub/detectors/named_entity.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py
index a0bbb2a2..a364c59a 100644
--- a/scrubadub/detectors/named_entity.py
+++ b/scrubadub/detectors/named_entity.py
@@ -3,7 +3,7 @@
 import spacy
 
 from .base import Detector
-from ..filth import NamedEntityFilth, Filth
+from ..filth import NamedEntityFilth, Filth, NameFilth, OrganizationFilth
 from ..utils import CanonicalStringSet
 
 
@@ -12,7 +12,10 @@ class NamedEntityDetector(Detector):
      List specific entities to include passing ``named_entities``, e.g.
      (PERSON)
     """
-    filth_cls = NamedEntityFilth
+    filth_cls_map = {
+        'PERSON': NameFilth,
+        'ORG': OrganizationFilth
+    }
     name = 'named_entity'
 
     disallowed_nouns = CanonicalStringSet(["skype"])
@@ -32,14 +35,17 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'},
 
     def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Sequence[str]):
         for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)):
+            print(doc_name)
             for ent in doc.ents:
                 if ent.label_ in self.named_entities:
-                    yield self.filth_cls(beg=ent.start_char,
-                                         end=ent.end_char,
-                                         text=ent.text,
-                                         document_name=None or str(doc_name),  # None if no doc_name provided
-                                         detector_name=self.name,
-                                         label=ent.label_)
+                    # If there is no standard 'filth', returns a NamedEntity filth
+                    filth_cls = self.filth_cls_map.get(ent.label_, NamedEntityFilth)
+                    yield filth_cls(beg=ent.start_char,
+                                    end=ent.end_char,
+                                    text=ent.text,
+                                    document_name=(str(doc_name) if doc_name else None),  # None if no doc_name provided
+                                    detector_name=self.name,
+                                    label=ent.label_)
 
     def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]:
         if isinstance(documents, list):

From dbdb247ad03ca6b9168f193eadaf28638d718072 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Mon, 26 Oct 2020 22:45:13 +0000
Subject: [PATCH 28/43] Change docstring for NamedEntity filth

---
 scrubadub/filth/named_entity.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scrubadub/filth/named_entity.py b/scrubadub/filth/named_entity.py
index 5358e84a..436ea992 100644
--- a/scrubadub/filth/named_entity.py
+++ b/scrubadub/filth/named_entity.py
@@ -3,7 +3,8 @@
 
 class NamedEntityFilth(Filth):
     """
-    Named entity filth. Upon initialisation provide a label for named entity (e.g. name, org)
+    Default filth type, for named entities (e.g. the ones in https://nightly.spacy.io/models/en#en_core_web_lg-labels),
+    except the ones represented in any other filth.
     """
     type = 'named_entity'
 

From 509231605f6c93b9519ee549a06295b76c56198b Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Mon, 26 Oct 2020 22:48:32 +0000
Subject: [PATCH 29/43] Remove accidental print

---
 scrubadub/detectors/named_entity.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py
index a364c59a..353a6978 100644
--- a/scrubadub/detectors/named_entity.py
+++ b/scrubadub/detectors/named_entity.py
@@ -35,7 +35,6 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'},
 
     def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Sequence[str]):
         for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)):
-            print(doc_name)
             for ent in doc.ents:
                 if ent.label_ in self.named_entities:
                     # If there is no standard 'filth', returns a NamedEntity filth

From 6446f44b0543b70fa9de0413287890ef0c856d8b Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Wed, 28 Oct 2020 22:36:34 +0000
Subject: [PATCH 30/43] Download necessary model if not present in OS

---
 scrubadub/detectors/named_entity.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py
index 353a6978..8aad2f4b 100644
--- a/scrubadub/detectors/named_entity.py
+++ b/scrubadub/detectors/named_entity.py
@@ -1,6 +1,7 @@
 from typing import Dict, Generator, Iterable, Optional, Sequence, Union
 
 import spacy
+from wasabi import msg
 
 from .base import Detector
 from ..filth import NamedEntityFilth, Filth, NameFilth, OrganizationFilth
@@ -25,9 +26,9 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'},
         # Spacy NER are all upper cased
         self.named_entities = {entity.upper() for entity in named_entities}
         if model not in spacy.info()['pipelines']:
-            raise OSError("Can't find model '{}'. If it is a valid Spacy model, "
-                          "download it (e.g. with the CLI command "
-                          "`python -m spacy download {}`).".format(model, model))
+            msg.warn("Downloading spacy model {}".format(model))
+            spacy.cli.download(model)
+
         self.nlp = spacy.load(model)
         # Only enable necessary pipes
         self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"])

From b147eb903187d140dbcb622c87e93b0a336c4e0d Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Wed, 28 Oct 2020 22:59:27 +0000
Subject: [PATCH 31/43] Change iter_filth_documents signature

---
 scrubadub/detectors/named_entity.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py
index 8aad2f4b..f55f73ef 100644
--- a/scrubadub/detectors/named_entity.py
+++ b/scrubadub/detectors/named_entity.py
@@ -1,4 +1,4 @@
-from typing import Dict, Generator, Iterable, Optional, Sequence, Union
+from typing import Generator, Iterable, Optional, Sequence
 
 import spacy
 from wasabi import msg
@@ -34,7 +34,8 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'},
         self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"])
         super(NamedEntityDetector, self).__init__(**kwargs)
 
-    def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Sequence[str]):
+    def iter_filth_documents(self, doc_names: Sequence[Optional[str]],
+                             doc_list: Sequence[str]) -> Generator[Filth, None, None]:
         for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)):
             for ent in doc.ents:
                 if ent.label_ in self.named_entities:
@@ -47,15 +48,5 @@ def _iter_spacy_pipeline(self, doc_names: Sequence[Optional[str]], doc_list: Seq
                                     detector_name=self.name,
                                     label=ent.label_)
 
-    def iter_filth_documents(self, documents: Union[Sequence[str], Dict[str, str]]) -> Generator[Filth, None, None]:
-        if isinstance(documents, list):
-            doc_names, doc_list = zip(*enumerate(documents))
-        elif isinstance(documents, dict):
-            doc_names, doc_list = zip(*documents.items())
-        else:
-            raise TypeError('documents must be one of a string, list of strings or dict of strings.')
-
-        yield from self._iter_spacy_pipeline(doc_names, doc_list)
-
     def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
-        yield from self._iter_spacy_pipeline([document_name], [text])
+        yield from self.iter_filth_documents([document_name], [text])

From 3edc20d1d278ae650f29248b9de613a0236e146a Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Wed, 28 Oct 2020 23:18:54 +0000
Subject: [PATCH 32/43] Scrubber simplification

---
 scrubadub/scrubbers.py | 116 +++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 75 deletions(-)

diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py
index 58bf0689..85b7d2cd 100644
--- a/scrubadub/scrubbers.py
+++ b/scrubadub/scrubbers.py
@@ -222,11 +222,36 @@ def _post_process_filth_list(self, filth_list: Sequence[Filth]) -> Sequence[Filt
         return filth_list
 
     def iter_filth(
-            self, text: str, document_name: Optional[str] = None, run_post_processors: bool = True,
-            run_merge: bool = True, exclude_detectors: Optional[List[str]] = None
+            self, text: str, document_name: Optional[str] = None, run_post_processors: bool = True
     ) -> Generator[Filth, None, None]:
         """Iterate over the different types of filth that can exist.
         """
+        # Iterates using iter_filth documents.
+        # If a name is not provided, passes a list with one element, [text]
+
+        yield from self.iter_filth_documents(
+            documents=({document_name: text} if document_name else [text]),
+            run_post_processors=run_post_processors
+        )
+
+    def iter_filth_documents(
+            self,
+            documents: Union[Sequence[str], Dict[str, str]],
+            run_post_processors: bool = True
+    ) -> Generator[Filth, None, None]:
+        """Iterate over the different types of filth that can exist."""
+        if not isinstance(documents, (dict, list)):
+            raise TypeError('documents must be one of a string, list of strings or dict of strings.')
+
+        # Figures out which detectors have iter_filth_documents and applies to them
+
+        if isinstance(documents, dict):
+            document_names, document_texts = zip(*documents.items())
+        elif isinstance(documents, (tuple, list)):
+            document_texts = documents
+            document_names = [str(x) for x in range(len(documents))]
+
+
         # currently doing this by aggregating all_filths and then sorting
         # inline instead of with a Filth.__cmp__ method, which is apparently
         # much slower http://stackoverflow.com/a/988728/564709
@@ -234,19 +259,24 @@ def iter_filth(
         # NOTE: we could probably do this in a more efficient way by iterating
         # over all detectors simultaneously. just trying to get something
         # working right now and we can worry about efficiency later
-        all_filths = []  # type: List[Filth]
+        filth_list = []  # type: List[Filth]
         for name, detector in self._detectors.items():
-            if exclude_detectors is None or name not in exclude_detectors:
-                for filth in detector.iter_filth(text, document_name=document_name):
+            document_iterator = getattr(detector, 'iter_filth_documents', None)
+            if callable(document_iterator):
+                for filth in document_iterator(document_names, document_texts):
                     if not isinstance(filth, Filth):
                         raise TypeError('iter_filth must always yield Filth')
-                    all_filths.append(filth)
+                    filth_list.append(filth)
+            else:
+                for document_name, text in zip(document_names, document_texts):
+                    for filth in detector.iter_filth(text, document_name=document_name):
+                        if not isinstance(filth, Filth):
+                            raise TypeError('iter_filth must always yield Filth')
+                        filth_list.append(filth)
 
         # This is split up so that we only have to use lists if we have to post_process Filth
         if run_post_processors:
-            if run_merge:
-                all_filths = list(self._merge_filths(all_filths))
-
+            all_filths = list(self._merge_filths(filth_list))
             all_filths = list(self._post_process_filth_list(all_filths))
 
             # Here we loop over a list of Filth...
@@ -255,72 +285,8 @@ def iter_filth(
         else:
             # ... but here, we're using a generator. If we try to use the same variable it would have two types and
             # fail static typing in mypy
-            if run_merge:
-                for filth in self._merge_filths(all_filths):
-                    yield filth
-            else:
-                for filth in all_filths:
-                    yield filth
-
-    def iter_filth_documents(
-            self,
-            documents: Union[Sequence[str], Dict[str, str]],
-            run_post_processors: bool = True
-    ) -> Generator[Filth, None, None]:
-        """Iterate over the different types of filth that can exist."""
-        if not isinstance(documents, (dict, list)):
-            raise TypeError('documents must be one of a string, list of strings or dict of strings.')
-
-        # Figures out which detectors have iter_filth_documents and applies to them
-
-        document_detectors_names = []
-        filth_list = []
-
-        for name, detector in self._detectors.items():
-            document_iterator = getattr(detector, 'iter_filth_documents', None)
-            if callable(document_iterator):
-                document_detectors_names.append(name)
-                for filth in document_iterator(documents):
-                    filth_list.append(filth)
-
-        # We have to now merge with the other processors. To do this we need to collect filth into a list
-        # Also need this if we need to do post processing
-
-        if run_post_processors or document_detectors_names:
-            if isinstance(documents, dict):
-                filth_list += [
-                    filth
-                    for name, text in documents.items()
-                    for filth in self.iter_filth(text, document_name=name, run_post_processors=False,
-                                                 run_merge=False, exclude_detectors=document_detectors_names)
-                ]
-            elif isinstance(documents, list):
-                filth_list += [
-                    filth
-                    for i_name, text in enumerate(documents)
-                    for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False,
-                                                 run_merge=False, exclude_detectors=document_detectors_names)
-                ]
-
-            filth_list = list(self._merge_filths(filth_list))
-
-            if run_post_processors:
-                yield from self._post_process_filth_list(filth_list)
-            else:
-                for filth in filth_list:
-                    yield filth
-        else:
-            # Use generators when we dont post process the Filth
-            if isinstance(documents, dict):
-                for name, text in documents.items():
-                    for filth in self.iter_filth(text, document_name=name, run_post_processors=False,
-                                                 run_merge=False, exclude_detectors=document_detectors_names):
-                        yield filth
-            elif isinstance(documents, list):
-                for i_name, text in enumerate(documents):
-                    for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False,
-                                                 run_merge=False, exclude_detectors=document_detectors_names):
-                        yield filth
+            for filth in self._merge_filths(filth_list):
+                yield filth
 
     @staticmethod
     def _sort_filths(filth_list: Sequence[Filth]) -> List[Filth]:

From 0ee6c4a90a1503a34645676c77a2b2ad87a1f86f Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Wed, 28 Oct 2020 23:22:39 +0000
Subject: [PATCH 33/43] Fix types for document_names and text

---
 scrubadub/scrubbers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py
index 85b7d2cd..afe55478 100644
--- a/scrubadub/scrubbers.py
+++ b/scrubadub/scrubbers.py
@@ -246,12 +246,12 @@ def iter_filth_documents(
         # Figures out which detectors have iter_filth_documents and applies to them
 
         if isinstance(documents, dict):
-            document_names, document_texts = zip(*documents.items())
+            document_names = list(documents.keys())
+            document_texts = list(documents.values())
         elif isinstance(documents, (tuple, list)):
             document_texts = documents
             document_names = [str(x) for x in range(len(documents))]
 
-
         # currently doing this by aggregating all_filths and then sorting
         # inline instead of with a Filth.__cmp__ method, which is apparently
         # much slower http://stackoverflow.com/a/988728/564709

From 1b80110a3e93089cb8e807800a8fe95d5943e341 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Wed, 28 Oct 2020 23:45:56 +0000
Subject: [PATCH 34/43] Fix types for document dictionary to include None

---
 scrubadub/__init__.py  | 4 ++--
 scrubadub/scrubbers.py | 8 +++-----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/scrubadub/__init__.py b/scrubadub/__init__.py
index 01315e82..21ab2221 100644
--- a/scrubadub/__init__.py
+++ b/scrubadub/__init__.py
@@ -1,5 +1,5 @@
 
-from typing import Union, List, Dict, Sequence
+from typing import Union, List, Dict, Sequence, Optional
 
 # convenient imports
 from .scrubbers import Scrubber
@@ -82,7 +82,7 @@ def list_filth(text: str, **kwargs) -> List[Filth]:
     return list(scrubber.iter_filth(text, **kwargs))
 
 
-def list_filth_documents(documents: Union[List[str], Dict[str, str]], **kwargs) -> List[Filth]:
+def list_filth_documents(documents: Union[List[str], Dict[Optional[str], str]], **kwargs) -> List[Filth]:
     """Return a list of `Filth` that was detected in the string `text`.
 
     `documents` can be in a dict, in the format of ``{'document_name': 'document'}``, or as a list of strings
diff --git a/scrubadub/scrubbers.py b/scrubadub/scrubbers.py
index afe55478..079c83c0 100644
--- a/scrubadub/scrubbers.py
+++ b/scrubadub/scrubbers.py
@@ -229,14 +229,12 @@ def iter_filth(
         # Iterates using iter_filth documents.
         # If a name is not provided, passes a list with one element, [text]
 
-        yield from self.iter_filth_documents(
-            documents=({document_name: text} if document_name else [text]),
-            run_post_processors=run_post_processors
-        )
+        yield from self.iter_filth_documents(documents={document_name: text},
+                                             run_post_processors=run_post_processors)
 
     def iter_filth_documents(
             self,
-            documents: Union[Sequence[str], Dict[str, str]],
+            documents: Union[Sequence[str], Dict[Optional[str], str]],
             run_post_processors: bool = True
     ) -> Generator[Filth, None, None]:
         """Iterate over the different types of filth that can exist."""

From ffb6ed93a85536dced40b8ef41b6f00fefdb2170 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Thu, 29 Oct 2020 09:32:44 +0000
Subject: [PATCH 35/43] Update requirements to nightly 3.0.0rc1

---
 requirements/python | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/python b/requirements/python
index be22055b..0069fd44 100644
--- a/requirements/python
+++ b/requirements/python
@@ -3,5 +3,5 @@ argcomplete
 phonenumbers
 pandas
 sklearn
-spacy-nightly[transformers]
+spacy-nightly[transformers]>=3.0.0rc1
 typing_extensions

From ecd965471656b266bbfad024f09f6e2a82223d12 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Thu, 29 Oct 2020 09:33:11 +0000
Subject: [PATCH 36/43] Comment out unnecessary piece of code

---
 tests/benchmark_accuracy.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/benchmark_accuracy.py b/tests/benchmark_accuracy.py
index 3e7e8f1f..7d31d5bb 100644
--- a/tests/benchmark_accuracy.py
+++ b/tests/benchmark_accuracy.py
@@ -25,14 +25,14 @@ def main():
         general_docs.append(new_doc)
         known_general_pii += new_known_pii
 
-        new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['name'])
+        #new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['name'])
         # Change the filth name to allow for comparison with NamedEntityDetector. Probably there is a better way to do it
 
-        for pii in new_known_pii:
-            pii['filth_type'] = 'named_entity'
+        #for pii in new_known_pii:
+       #     pii['filth_type'] = 'named_entity'
 
-        named_entity_docs.append(new_doc)
-        known_named_entity_pii += new_known_pii
+        #named_entity_docs.append(new_doc)
+        #known_named_entity_pii += new_known_pii
 
         # new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['gb_address', 'us_address'])
         # address_docs.append(new_doc)

From aaf36a0f9f5bbd516eb521aa7c50ddf8a0a542de Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Thu, 29 Oct 2020 16:34:52 +0000
Subject: [PATCH 37/43] Initial tests for named entity detector

---
 tests/test_detector_named_entity.py | 67 +++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 tests/test_detector_named_entity.py

diff --git a/tests/test_detector_named_entity.py b/tests/test_detector_named_entity.py
new file mode 100644
index 00000000..c23f968d
--- /dev/null
+++ b/tests/test_detector_named_entity.py
@@ -0,0 +1,67 @@
+import unittest
+
+from scrubadub.detectors import NamedEntityDetector
+from scrubadub.filth import NameFilth, OrganizationFilth, NamedEntityFilth
+import scrubadub
+
+from base import BaseTestCase
+
+
+class NamedEntityTestCase(unittest.TestCase, BaseTestCase):
+    """
+    Tests whether the detector is performing correctly from a function point of view.
+    For accuracy tests use .benchmark_accuracy instead
+    """
+
+    def setUp(self):
+        self.detector = NamedEntityDetector()
+
+    def _assert_filth_type_and_pos(self, doc_list, beg_end_list, filth_class):
+        doc_names = [str(x) for x in range(len(doc_list))]
+
+        filth_list = list(self.detector.iter_filth_documents(doc_names, doc_list))
+
+        for filth, beg_end in zip(filth_list, beg_end_list):
+            self.assertIsInstance(filth, filth_class)
+            self.assertEqual((filth.beg, filth.end), beg_end)
+
+    def test_names(self):
+        doc_list = ["John is a cat",
+                    "When was Maria born?",
+                    "john is a cat",
+                    "when was maria born"]
+        beg_end_list = [(0, 4),
+                        (9, 14),
+                        (0, 4),
+                        (9, 14)]
+
+        self._assert_filth_type_and_pos(doc_list, beg_end_list, NameFilth)
+
+    def test_organisations(self):
+        doc_list = ["She started working for Apple this year",
+                    "But used to work for Google"]
+        beg_end_list = [(24, 30),
+                        (21, 27)]
+
+        self._assert_filth_type_and_pos(doc_list, beg_end_list, OrganizationFilth)
+
+    def test_other_entity(self):
+        self.detector.named_entities = {"GPE"}
+        doc_list = ["London is a city in England"]
+        beg_end_list = [(0, 6),
+                        (20, 27)]
+
+        self._assert_filth_type_and_pos(doc_list, beg_end_list, NamedEntityFilth)
+
+    def test_wrong_model(self):
+        """Test that it raises an error if user inputs invalid spacy model"""
+        with self.assertRaises(SystemExit):
+            NamedEntityDetector(model='not_a_valid_spacy_model')
+
+    def test_iter_filth(self):
+        doc = "John is a cat"
+
+        output_iter_docs = list(self.detector.iter_filth_documents(doc_list=[doc], doc_names=["0"]))
+        output_iter = list(self.detector.iter_filth(text=doc, document_name="0"))
+
+        self.assertListEqual(output_iter, output_iter_docs)

From 2c8eedb91e66fc7374911c4c0f88412dd4cabce3 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 31 Oct 2020 14:22:59 +0000
Subject: [PATCH 38/43] Skip tests if python version < 3.6

---
 tests/test_detector_named_entity.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_detector_named_entity.py b/tests/test_detector_named_entity.py
index c23f968d..a1b4ce2f 100644
--- a/tests/test_detector_named_entity.py
+++ b/tests/test_detector_named_entity.py
@@ -1,3 +1,4 @@
+import sys
 import unittest
 
 from scrubadub.detectors import NamedEntityDetector
@@ -15,6 +16,10 @@ class NamedEntityTestCase(unittest.TestCase, BaseTestCase):
 
     def setUp(self):
         self.detector = NamedEntityDetector()
+        unittest.TestCase.skipTest(
+            (sys.version_info.major, sys.version_info.minor) < (3, 6),
+            "Named entity detector not supported for python<3.6"
+        )
 
     def _assert_filth_type_and_pos(self, doc_list, beg_end_list, filth_class):
         doc_names = [str(x) for x in range(len(doc_list))]

From c83829ad4b861e0c283f7794cfd10e43b5708064 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 31 Oct 2020 14:36:22 +0000
Subject: [PATCH 39/43] Add spacy as extra

---
 requirements/python        |  1 -
 requirements/python-dev    |  1 +
 requirements/python-extras |  1 +
 setup.py                   | 20 ++++++++++++++------
 4 files changed, 16 insertions(+), 7 deletions(-)
 create mode 100644 requirements/python-extras

diff --git a/requirements/python b/requirements/python
index 0069fd44..10c04814 100644
--- a/requirements/python
+++ b/requirements/python
@@ -3,5 +3,4 @@ argcomplete
 phonenumbers
 pandas
 sklearn
-spacy-nightly[transformers]>=3.0.0rc1
 typing_extensions
diff --git a/requirements/python-dev b/requirements/python-dev
index 7a521401..3a141ad6 100644
--- a/requirements/python-dev
+++ b/requirements/python-dev
@@ -1,5 +1,6 @@
 # install everything in the python requirements too.
 -r python
+-r python-extras
 
 # needed for tests/run.py script to read .travis.yml file
 PyYAML
diff --git a/requirements/python-extras b/requirements/python-extras
new file mode 100644
index 00000000..ac0b741d
--- /dev/null
+++ b/requirements/python-extras
@@ -0,0 +1 @@
+spacy-nightly[transformers]>=3.0.0rc1
diff --git a/setup.py b/setup.py
index dde13e88..50ed6e5b 100644
--- a/setup.py
+++ b/setup.py
@@ -8,14 +8,21 @@
 
 github_url = 'https://github.com/LeapBeyond/scrubadub'
 
+
+def read_packages_from_file(filename):
+    with open(filename, 'r') as stream:
+        for line in stream:
+            package = line.strip().split('#')[0]
+            if package:
+                yield package
+
 # read in the dependencies from the virtualenv requirements file
-dependencies = []
 filename = os.path.join("requirements", "python")
-with open(filename, 'r') as stream:
-    for line in stream:
-        package = line.strip().split('#')[0]
-        if package:
-            dependencies.append(package)
+dependencies = list(read_packages_from_file(filename))
+
+# read extra spacy dependencies from python-extras requirements file
+filename = os.path.join("requirements", "python-extras")
+extras = list(read_packages_from_file(filename))
 
 # get the version
 version = None
@@ -60,5 +67,6 @@
         'Topic :: Utilities',
     ],
     install_requires=dependencies,
+    extras_require={"spacy": extras},
     zip_safe=False,
 )

From dd5a7be9dc7072f49d90075e8a813b1024aa0945 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 31 Oct 2020 14:46:13 +0000
Subject: [PATCH 40/43] Tweak travis for python3.8

---
 .travis.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 740c4fe9..d30c1afd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -23,6 +23,11 @@ script:
   - python3 ./tests/benchmark_time.py
   - cd docs && make html && cd -
 
+jobs:
+  include:
+    - python: "3.8"
+      before_script: pip install -r requirements/python-extras
+
 # commands to run after the tests successfully complete
 after_success:
   - coveralls

From 491c10d3be40c51100050b79a6f3b01d020b9700 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sat, 31 Oct 2020 16:48:07 +0000
Subject: [PATCH 41/43] Revert CI and add environment marker to requirements

---
 .travis.yml                | 6 +-----
 requirements/python-dev    | 1 -
 requirements/python-extras | 3 ++-
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index d30c1afd..4b210326 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,6 +7,7 @@ python:
 # virtualenv
 install:
   - pip install -r requirements/python-dev
+  - pip install -r requirements/python-extras
   - python -m textblob.download_corpora
   - pip install .
 #  - apt-get install curl autoconf automake libtool pkg-config
@@ -23,11 +24,6 @@ script:
   - python3 ./tests/benchmark_time.py
   - cd docs && make html && cd -
 
-jobs:
-  include:
-    - python: "3.8"
-      before_script: pip install -r requirements/python-extras
-
 # commands to run after the tests successfully complete
 after_success:
   - coveralls
diff --git a/requirements/python-dev b/requirements/python-dev
index 3a141ad6..7a521401 100644
--- a/requirements/python-dev
+++ b/requirements/python-dev
@@ -1,6 +1,5 @@
 # install everything in the python requirements too.
 -r python
--r python-extras
 
 # needed for tests/run.py script to read .travis.yml file
 PyYAML
diff --git a/requirements/python-extras b/requirements/python-extras
index ac0b741d..56d551bb 100644
--- a/requirements/python-extras
+++ b/requirements/python-extras
@@ -1 +1,2 @@
-spacy-nightly[transformers]>=3.0.0rc1
+spacy-nightly[transformers]>=3.0.0rc1; python_version >= '3.6'
+

From 3822cf9bc00ec222a8fbd7707f4e9627de5dd19a Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sun, 1 Nov 2020 15:48:34 +0000
Subject: [PATCH 42/43] Add check for extras

---
 scrubadub/detectors/named_entity.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/scrubadub/detectors/named_entity.py b/scrubadub/detectors/named_entity.py
index f55f73ef..824f3522 100644
--- a/scrubadub/detectors/named_entity.py
+++ b/scrubadub/detectors/named_entity.py
@@ -1,7 +1,14 @@
+import warnings
 from typing import Generator, Iterable, Optional, Sequence
 
-import spacy
-from wasabi import msg
+try:
+    import spacy
+    from wasabi import msg
+except ImportError as e:  # not ModuleNotFoundError: that name only exists on Python >= 3.6
+    if getattr(e, 'name', None) == 'spacy':
+        warnings.warn("Could not find module 'spacy'. If you want to use extras,"
+                      " make sure you install scrubadub with 'pip install scrubadub[spacy]'")
+
 
 from .base import Detector
 from ..filth import NamedEntityFilth, Filth, NameFilth, OrganizationFilth
@@ -26,7 +33,7 @@ def __init__(self, named_entities: Iterable[str] = {'PERSON'},
         # Spacy NER are all upper cased
         self.named_entities = {entity.upper() for entity in named_entities}
         if model not in spacy.info()['pipelines']:
-            msg.warn("Downloading spacy model {}".format(model))
+            msg.info("Downloading spacy model {}".format(model))
             spacy.cli.download(model)
 
         self.nlp = spacy.load(model)

From f1b29cdef8be42355c464039a30f51fb2fb34fd8 Mon Sep 17 00:00:00 2001
From: aCampello <a.campello@wellcome.ac.uk>
Date: Sun, 1 Nov 2020 15:52:54 +0000
Subject: [PATCH 43/43] Fix test skipping

---
 tests/test_detector_named_entity.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/tests/test_detector_named_entity.py b/tests/test_detector_named_entity.py
index a1b4ce2f..f308527f 100644
--- a/tests/test_detector_named_entity.py
+++ b/tests/test_detector_named_entity.py
@@ -3,8 +3,6 @@
 
 from scrubadub.detectors import NamedEntityDetector
 from scrubadub.filth import NameFilth, OrganizationFilth, NamedEntityFilth
-import scrubadub
-
 from base import BaseTestCase
 
 
@@ -15,11 +13,10 @@ class NamedEntityTestCase(unittest.TestCase, BaseTestCase):
     """
 
     def setUp(self):
-        self.detector = NamedEntityDetector()
-        unittest.TestCase.skipTest(
-            (sys.version_info.major, sys.version_info.minor) < (3, 6),
-            "Named entity detector not supported for python<3.6"
-        )
+        # skipTest() always raises SkipTest, so call it only on unsupported versions.
+        if sys.version_info < (3, 6):
+            self.skipTest("Named entity detector not supported for python<3.6")
+        self.detector = NamedEntityDetector()
 
     def _assert_filth_type_and_pos(self, doc_list, beg_end_list, filth_class):
         doc_names = [str(x) for x in range(len(doc_list))]