Skip to content

Commit

Permalink
Merge ef3b3c6 into 0eec66e
Browse files Browse the repository at this point in the history
  • Loading branch information
aCampello committed Nov 1, 2020
2 parents 0eec66e + ef3b3c6 commit cda8127
Show file tree
Hide file tree
Showing 13 changed files with 264 additions and 59 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ python:
# virtualenv
install:
- pip install -r requirements/python-dev
- pip install -r requirements/python-extras
- python -m textblob.download_corpora
- pip install .
# - apt-get install curl autoconf automake libtool pkg-config
Expand Down
2 changes: 1 addition & 1 deletion requirements/python
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ argcomplete
phonenumbers
pandas
sklearn
typing_extensions
typing_extensions
2 changes: 2 additions & 0 deletions requirements/python-extras
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
spacy-nightly[transformers]>=3.0.0rc1; python_version >= '3.6'

4 changes: 2 additions & 2 deletions scrubadub/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

from typing import Union, List, Dict, Sequence
from typing import Union, List, Dict, Sequence, Optional

# convenient imports
from .scrubbers import Scrubber
Expand Down Expand Up @@ -82,7 +82,7 @@ def list_filth(text: str, **kwargs) -> List[Filth]:
return list(scrubber.iter_filth(text, **kwargs))


def list_filth_documents(documents: Union[List[str], Dict[str, str]], **kwargs) -> List[Filth]:
def list_filth_documents(documents: Union[List[str], Dict[Optional[str], str]], **kwargs) -> List[Filth]:
"""Return a list of `Filth` that was detected in the string `text`.
`documents` can be in a dict, in the format of ``{'document_name': 'document'}``, or as a list of strings
Expand Down
1 change: 1 addition & 0 deletions scrubadub/detectors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .credential import CredentialDetector
from .email import EmailDetector, NewEmailDetector
from .name import NameDetector
from .named_entity import NamedEntityDetector
from .phone import PhoneDetector
from .postalcode import PostalCodeDetector
from .known import KnownFilthDetector
Expand Down
59 changes: 59 additions & 0 deletions scrubadub/detectors/named_entity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import warnings
from typing import Generator, Iterable, Optional, Sequence

try:
import spacy
from wasabi import msg
except ImportError as e:
if getattr(e, 'name', None) == 'spacy':
warnings.warn("Could not find module 'spacy'. If you want to use extras,"
" make sure you install scrubadub with 'pip install scrubadub[spacy]'")


from .base import Detector
from ..filth import NamedEntityFilth, Filth, NameFilth, OrganizationFilth
from ..utils import CanonicalStringSet


class NamedEntityDetector(Detector):
    """Use spacy's named entity recognition to detect named entities as filth.

    Select which entity labels to report by passing ``named_entities``,
    e.g. ``{'PERSON', 'ORG'}``. Labels with a dedicated Filth class
    (see ``filth_cls_map``) yield that class; any other selected label
    yields a generic ``NamedEntityFilth``.
    """
    # Maps a spacy entity label to the specific Filth class it produces.
    filth_cls_map = {
        'PERSON': NameFilth,
        'ORG': OrganizationFilth,
    }
    name = 'named_entity'

    disallowed_nouns = CanonicalStringSet(["skype"])

    def __init__(self, named_entities: Iterable[str] = ('PERSON',),
                 model: str = "en_core_web_trf", **kwargs):
        """Create a detector backed by the given spacy ``model``.

        :param named_entities: entity labels to report (case-insensitive);
            default is a tuple rather than a set literal so the default
            argument is immutable.
        :param model: name of the spacy pipeline to load; downloaded on
            demand if it is not already installed.
        """
        # Spacy's NER labels are all upper-cased; normalise user input to match.
        self.named_entities = {entity.upper() for entity in named_entities}
        if model not in spacy.info()['pipelines']:
            msg.info("Downloading spacy model {}".format(model))
            spacy.cli.download(model)

        self.nlp = spacy.load(model)
        # Only enable necessary pipes
        self.nlp.select_pipes(enable=["transformer", "tagger", "parser", "ner"])
        super().__init__(**kwargs)

    def iter_filth_documents(self, doc_names: Sequence[Optional[str]],
                             doc_list: Sequence[str]) -> Generator[Filth, None, None]:
        """Yield filth found in ``doc_list``, batching texts through spacy's pipe.

        ``doc_names`` and ``doc_list`` are parallel sequences; each yielded
        Filth carries the corresponding document name (or None).
        """
        for doc_name, doc in zip(doc_names, self.nlp.pipe(doc_list)):
            for ent in doc.ents:
                if ent.label_ in self.named_entities:
                    # Fall back to the generic NamedEntityFilth when the label
                    # has no dedicated Filth class.
                    filth_cls = self.filth_cls_map.get(ent.label_, NamedEntityFilth)
                    # Use an identity check so an empty-string document name is
                    # preserved instead of being silently mapped to None.
                    yield filth_cls(beg=ent.start_char,
                                    end=ent.end_char,
                                    text=ent.text,
                                    document_name=(str(doc_name) if doc_name is not None else None),
                                    detector_name=self.name,
                                    label=ent.label_)

    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Yield filth found in a single ``text`` (thin wrapper over the batch API)."""
        yield from self.iter_filth_documents([document_name], [text])
1 change: 1 addition & 0 deletions scrubadub/filth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .email import EmailFilth
from .known import KnownFilth
from .name import NameFilth
from .named_entity import NamedEntityFilth
from .organization import OrganizationFilth
from .phone import PhoneFilth
from .postalcode import PostalCodeFilth
Expand Down
14 changes: 14 additions & 0 deletions scrubadub/filth/named_entity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from .base import Filth


class NamedEntityFilth(Filth):
    """Catch-all filth for named entities without a dedicated Filth class.

    Used for spacy entity labels (e.g. the ones in
    https://nightly.spacy.io/models/en#en_core_web_lg-labels) that are not
    represented by any other filth type.
    """
    type = 'named_entity'

    def __init__(self, *args, label: str, **kwargs):
        super().__init__(*args, **kwargs)
        # Normalise the label to lower case and expose it in the replacement
        # text, e.g. "named_entity_gpe".
        self.label = label.lower()
        self.replacement_string = "{}_{}".format(self.type, self.label)
94 changes: 45 additions & 49 deletions scrubadub/scrubbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,17 +181,19 @@ def clean_documents(self, documents: Union[Sequence[str], Dict[str, str]], **kwa

filth_list = self._post_process_filth_list(filth_list)

clean_documents: Union[Dict[str, str], Sequence[str]]
if isinstance(documents, list):
return [
clean_documents = [
self._replace_text(text=text, filth_list=filth_list, document_name=str(name), **kwargs)
for name, text in enumerate(documents)
]
elif isinstance(documents, dict):
return {
clean_documents = {
name: self._replace_text(text=text, filth_list=filth_list, document_name=name, **kwargs)
for name, text in documents.items()
}
return []

return clean_documents

def _replace_text(
self, text: str, filth_list: Sequence[Filth], document_name: Optional[str] = None, **kwargs
Expand Down Expand Up @@ -226,23 +228,55 @@ def iter_filth(
) -> Generator[Filth, None, None]:
"""Iterate over the different types of filth that can exist.
"""
# Iterates using iter_filth documents.
# If a name is not provided, passes a list with one element, [text]

yield from self.iter_filth_documents(documents={document_name: text},
run_post_processors=run_post_processors)

def iter_filth_documents(
self,
documents: Union[Sequence[str], Dict[Optional[str], str]],
run_post_processors: bool = True
) -> Generator[Filth, None, None]:
"""Iterate over the different types of filth that can exist."""
if not isinstance(documents, (dict, list)):
raise TypeError('documents must be one of a string, list of strings or dict of strings.')

# Figures out which detectors have iter_filth_documents and applies to them

if isinstance(documents, dict):
document_names = list(documents.keys())
document_texts = list(documents.values())
elif isinstance(documents, (tuple, list)):
document_texts = documents
document_names = [str(x) for x in range(len(documents))]

# currently doing this by aggregating all_filths and then sorting
# inline instead of with a Filth.__cmp__ method, which is apparently
# much slower http://stackoverflow.com/a/988728/564709
#
# NOTE: we could probably do this in a more efficient way by iterating
# over all detectors simultaneously. just trying to get something
# working right now and we can worry about efficiency later
all_filths = [] # type: List[Filth]
for detector in self._detectors.values():
for filth in detector.iter_filth(text, document_name=document_name):
if not isinstance(filth, Filth):
raise TypeError('iter_filth must always yield Filth')
all_filths.append(filth)
filth_list = [] # type: List[Filth]
for name, detector in self._detectors.items():
document_iterator = getattr(detector, 'iter_filth_documents', None)
if callable(document_iterator):
for filth in document_iterator(document_names, document_texts):
if not isinstance(filth, Filth):
raise TypeError('iter_filth must always yield Filth')
filth_list.append(filth)
else:
for document_name, text in zip(document_names, document_texts):
for filth in detector.iter_filth(text, document_name=document_name):
if not isinstance(filth, Filth):
raise TypeError('iter_filth must always yield Filth')
filth_list.append(filth)

# This is split up so that we only have to use lists if we have to post_process Filth
if run_post_processors:
all_filths = list(self._merge_filths(all_filths))
all_filths = list(self._merge_filths(filth_list))
all_filths = list(self._post_process_filth_list(all_filths))

# Here we loop over a list of Filth...
Expand All @@ -251,47 +285,9 @@ def iter_filth(
else:
# ... but here, we're using a generator. If we try to use the same variable it would have two types and
# fail static typing in mypy
for filth in self._merge_filths(all_filths):
for filth in self._merge_filths(filth_list):
yield filth

def iter_filth_documents(
self,
documents: Union[Sequence[str], Dict[str, str]],
run_post_processors: bool = True
) -> Generator[Filth, None, None]:
"""Iterate over the different types of filth that can exist."""
if not isinstance(documents, (dict, list)):
raise TypeError('documents must be one of a string, list of strings or dict of strings.')

if run_post_processors:
# Only collect the filts into a list if we need to do post processing
filth_list = [] # type: List[Filth]
if isinstance(documents, dict):
filth_list = [
filth
for name, text in documents.items()
for filth in self.iter_filth(text, document_name=name, run_post_processors=False)
]
elif isinstance(documents, list):
filth_list = [
filth
for i_name, text in enumerate(documents)
for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False)
]

for filth in self._post_process_filth_list(filth_list):
yield filth
else:
# Use generators when we dont post process the Filth
if isinstance(documents, dict):
for name, text in documents.items():
for filth in self.iter_filth(text, document_name=name, run_post_processors=False):
yield filth
elif isinstance(documents, list):
for i_name, text in enumerate(documents):
for filth in self.iter_filth(text, document_name=str(i_name), run_post_processors=False):
yield filth

@staticmethod
def _sort_filths(filth_list: Sequence[Filth]) -> List[Filth]:
"""Sorts a list of filths, needed before merging and concatenating"""
Expand Down
20 changes: 14 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,21 @@

github_url = 'https://github.com/LeapBeyond/scrubadub'


def read_packages_from_file(filename):
    """Yield the package requirement strings listed in ``filename``.

    Blank lines and ``#`` comments (both whole-line and inline) are skipped.

    :param filename: path to a pip-style requirements file
    :yields: requirement strings with surrounding whitespace removed
    """
    with open(filename, 'r') as stream:
        for line in stream:
            # Drop any inline comment first, then strip, so that a line like
            # "numpy  # comment" yields "numpy" rather than "numpy  " with
            # trailing whitespace (the original strip-then-split left it in).
            package = line.split('#')[0].strip()
            if package:
                yield package

# read in the dependencies from the virtualenv requirements file
dependencies = []
filename = os.path.join("requirements", "python")
with open(filename, 'r') as stream:
for line in stream:
package = line.strip().split('#')[0]
if package:
dependencies.append(package)
dependencies = list(read_packages_from_file(filename))

# read extra spacy dependencies from python-extras requirements file
filename = os.path.join("requirements", "python-extras")
extras = list(read_packages_from_file(filename))

# get the version
version = None
Expand Down Expand Up @@ -60,5 +67,6 @@
'Topic :: Utilities',
],
install_requires=dependencies,
extras_require={"spacy": extras},
zip_safe=False,
)
21 changes: 20 additions & 1 deletion tests/benchmark_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@

def main():
general_docs = []
named_entity_docs = []
# address_docs = []
# uk_phone_docs = []
known_general_pii = []
known_named_entity_pii = []
# known_address_pii = []
# known_uk_phone_pii = []
start_time = time.time()
Expand All @@ -23,6 +25,15 @@ def main():
general_docs.append(new_doc)
known_general_pii += new_known_pii

#new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['name'])
# Change the filth name to allow for comparison with NamedEntityDetector. Probably there is a better way to do it

#for pii in new_known_pii:
# pii['filth_type'] = 'named_entity'

#named_entity_docs.append(new_doc)
#known_named_entity_pii += new_known_pii

# new_doc, new_known_pii = make_fake_document(paragraphs=4, seed=i_doc, filth_types=['gb_address', 'us_address'])
# address_docs.append(new_doc)
# known_address_pii += new_known_pii
Expand All @@ -35,7 +46,6 @@ def main():

scrubber_time = time.time()
scrubber = scrubadub.Scrubber()
# scrubber.add_detector(scrubadub.detectors.stanford_ner.StanfordNERDetector())
scrubber.add_detector(scrubadub.detectors.KnownFilthDetector(known_filth_items=known_general_pii))
filth_list = list(scrubber.iter_filth_documents(general_docs))

Expand All @@ -57,6 +67,15 @@ def main():
print("Scrubbed documents in {:.2f}s".format(end_time-scrubber_time))
print(get_filth_classification_report(filth_list))

# scrubber_time = time.time()
# scrubber = scrubadub.Scrubber(detector_list=[scrubadub.detectors.NamedEntityDetector(),
# scrubadub.detectors.KnownFilthDetector(known_filth_items=known_named_entity_pii)])
# filth_list = list(scrubber.iter_filth_documents(named_entity_docs))
# end_time = time.time()
# print("Documents generated in {:.2f}s".format(scrubber_time-start_time))
# print("Scrubbed documents in {:.2f}s".format(end_time-scrubber_time))
# print(get_filth_classification_report(filth_list))

sys.exit(0)


Expand Down
Loading

0 comments on commit cda8127

Please sign in to comment.