Skip to content

Commit

Permalink
Merge pull request #11 from hugo-quantmetry/master
Browse files Browse the repository at this point in the history
Use unicodedata instead of unidecode to remove accents -> much faster
  • Loading branch information
hugo-quantmetry committed Jul 24, 2019
2 parents a93c7cb + 8a808ab commit 9a9d54d
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 9 deletions.
17 changes: 13 additions & 4 deletions melusine/prepare_email/cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Cleaning of the body and the header
"""

import unidecode
import unidecode, unicodedata
import re
from melusine.config import ConfigJsonReader

Expand Down Expand Up @@ -95,9 +95,18 @@ def text_to_lowercase(text):
return text.lower()


def remove_accents(text):
"""Remove accents from text"""
return unidecode.unidecode(text)
def remove_accents(text, use_unidecode=False):
    """Remove accents from text.

    Two strategies are available:

    * ``unicodedata`` (default): the text is NFKD-normalized, then every
      character that still cannot be encoded as ASCII is dropped.
    * ``unidecode``: the text is transliterated by ``unidecode.unidecode``.
      This is more powerful but much more time consuming.

    Example: the joined 'ae' character (U+00E6) is converted to 'a' + 'e'
    by unidecode, while it is dropped entirely by the unicodedata strategy.

    Parameters
    ----------
    text : str
        Text to process.
    use_unidecode : bool, optional
        If True, use ``unidecode``; otherwise (default) use ``unicodedata``.

    Returns
    -------
    str
        The text without accents.
    """
    if use_unidecode:
        return unidecode.unidecode(text)

    # NFKD splits an accented letter into its base letter followed by
    # combining marks; encoding to ASCII with errors="ignore" then discards
    # the marks (and any other character that has no ASCII form).
    decomposed = unicodedata.normalize("NFKD", text)
    return decomposed.encode("ascii", "ignore").decode("ascii")


def remove_line_break(text):
Expand Down
7 changes: 2 additions & 5 deletions melusine/prepare_email/mail_segmenting.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from unidecode import unidecode
from melusine.config.config import ConfigJsonReader
from melusine.prepare_email.cleaning import remove_accents

conf_reader = ConfigJsonReader()
config = conf_reader.get_config_file()
Expand Down Expand Up @@ -293,13 +293,10 @@ def tag(string):
Examples
--------
"""
def _remove_accents(string):
return unidecode(string)
regex_parts = regex_segmenting_dict.items()
sentence_with_no_accent = _remove_accents(string)
sentence_with_no_accent = remove_accents(string)
for k, reg in regex_parts:
for r in reg:
r = _remove_accents(r)
r = r.replace(" ", regex_tag)
if re.search(r, sentence_with_no_accent, re.I):
return [(string, k)], True
Expand Down

0 comments on commit 9a9d54d

Please sign in to comment.