Skip to content

Commit

Permalink
Fix regression involving the entity &#133; in unescape_html
Browse files Browse the repository at this point in the history
  • Loading branch information
Rob Speer committed Mar 10, 2017
1 parent f88b40b commit 0ba0aa8
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 6 deletions.
14 changes: 13 additions & 1 deletion CHANGELOG.md
@@ -1,3 +1,15 @@
## Version 5.0.1 and 4.4.1 (March 10, 2017)

Bug fix:

- The `unescape_html` fixer will decode entities between `&#128;` and `&#159;`
as what they would be in Windows-1252, even without the help of
`fix_encoding`.

This better matches what Web browsers do, and fixes a regression that version
4.4 introduced in an example that uses `&#133;` as an ellipsis.


## Version 5.0 (February 17, 2017)

Breaking changes:
Expand Down Expand Up @@ -36,7 +48,7 @@ Heuristic changes:

Bug fix:

' `remove_control_chars` was removing U+0D ('\r') prematurely. That's the
- `remove_control_chars` was removing U+0D ('\r') prematurely. That's the
job of `fix_line_breaks`.


Expand Down
2 changes: 1 addition & 1 deletion ftfy/__init__.py
Expand Up @@ -10,7 +10,7 @@
from ftfy import fixes
from ftfy.formatting import display_ljust

__version__ = '5.0'
__version__ = '5.0.1'


# See the docstring for ftfy.bad_codecs to see what we're doing here.
Expand Down
10 changes: 8 additions & 2 deletions ftfy/fixes.py
Expand Up @@ -313,9 +313,15 @@ def fixup(match):
# character reference
try:
if text[:3] == "&#x":
return chr(int(text[3:-1], 16))
codept = int(text[3:-1], 16)
else:
return chr(int(text[2:-1]))
codept = int(text[2:-1])
if 0x80 <= codept < 0xa0:
# Decode this range of characters as Windows-1252, as Web
# browsers do in practice.
return bytes([codept]).decode('sloppy-windows-1252')
else:
return chr(codept)
except ValueError:
pass
else:
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Expand Up @@ -13,7 +13,7 @@
The older version of ftfy, version 4.4, is still available and can run on
Python 2. Try this:
pip install ftfy==4.4
pip install ftfy==4.4.1
"""


Expand All @@ -24,7 +24,7 @@

setup(
name="ftfy",
version='5.0',
version='5.0.1',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
license="MIT",
Expand Down
5 changes: 5 additions & 0 deletions tests/test_entities.py
@@ -1,4 +1,5 @@
from ftfy import fix_text, fix_text_segment
from ftfy.fixes import unescape_html
from nose.tools import eq_


Expand All @@ -18,3 +19,7 @@ def test_entities():
eq_(fix_text_segment('&lt;&gt;'), '<>')
eq_(fix_text_segment('jednocze&sacute;nie'), 'jednocześnie')
eq_(fix_text_segment('JEDNOCZE&Sacute;NIE'), 'JEDNOCZEŚNIE')
eq_(fix_text_segment('ellipsis&#133;', normalization='NFKC'), 'ellipsis...')
eq_(fix_text_segment('ellipsis&#x85;', normalization='NFKC'), 'ellipsis...')
eq_(fix_text_segment('broken&#x81;'), 'broken\x81')
eq_(unescape_html('euro &#x80;'), 'euro €')

0 comments on commit 0ba0aa8

Please sign in to comment.