Permalink
Browse files

Escape non-ASCII characters in hyperlinks.

  • Loading branch information...
SimonSapin committed Oct 4, 2012
1 parent 726faa9 commit c2b2b8c2ea907450a537b7ff77e8775f038af6b2
Showing with 32 additions and 9 deletions.
  1. +2 −2 weasyprint/__init__.py
  2. +2 −2 weasyprint/css/validation.py
  3. +23 −3 weasyprint/tests/test_api.py
  4. +5 −2 weasyprint/urls.py
View
@@ -189,6 +189,8 @@ def __init__(self, guess=None, filename=None, url=None, file_obj=None,
url_fetcher=default_url_fetcher, _check_mime_type=False,
media_type='print'):
from .css import PARSER, preprocess_stylesheet
+ from .urls import wrap_url_fetcher
+ url_fetcher = wrap_url_fetcher(url_fetcher)
source_type, source, base_url, protocol_encoding = _select_source(
guess, filename, url, file_obj, string, tree=None,
@@ -225,8 +227,6 @@ def _select_source(guess=None, filename=None, url=None, file_obj=None,
"""
from .urls import path2url, ensure_url, url_is_absolute
- from .urls import wrap_url_fetcher
- url_fetcher = wrap_url_fetcher(url_fetcher)
if base_url is not None:
base_url = ensure_url(base_url)
@@ -21,7 +21,7 @@
from ..logger import LOGGER
from ..formatting_structure import counters
from ..compat import urljoin, unquote
-from ..urls import url_is_absolute
+from ..urls import url_is_absolute, iri_to_uri
from .properties import (INITIAL_VALUES, KNOWN_PROPERTIES, NOT_PRINT_MEDIA,
Dimension)
from . import computed_values
@@ -906,7 +906,7 @@ def link(token, base_url):
if token.value.startswith('#'):
return 'internal', unquote(token.value[1:])
else:
- return 'external', safe_urljoin(base_url, token.value)
+ return 'external', iri_to_uri(safe_urljoin(base_url, token.value))
function = parse_function(token)
if function:
name, args = function
@@ -695,11 +695,25 @@ def assert_links(html, expected_links_by_page, expected_anchors_by_page,
assert_links('''
<body style="width: 200px">
- <a href="../lipsum" style="display: block; margin: 10px 5px">
+ <a href="../lipsum/é_%E9" style="display: block; margin: 10px 5px">
''', [[
- ('external', 'http://weasyprint.org/foo/lipsum', (5, 10, 190, 0)),
+ ('external', 'http://weasyprint.org/foo/lipsum/%C3%A9_%E9',
+ (5, 10, 190, 0)),
]], [{}], [[
- ('external', 'http://weasyprint.org/foo/lipsum', (5, 10, 190, 0)),
+ ('external', 'http://weasyprint.org/foo/lipsum/%C3%A9_%E9',
+ (5, 10, 190, 0)),
+ ]],
+ base_url='http://weasyprint.org/foo/bar/')
+ assert_links('''
+ <body style="width: 200px">
+ <div style="display: block; margin: 10px 5px;
+ -weasy-link: url(../lipsum/é_%E9)">
+ ''', [[
+ ('external', 'http://weasyprint.org/foo/lipsum/%C3%A9_%E9',
+ (5, 10, 190, 0)),
+ ]], [{}], [[
+ ('external', 'http://weasyprint.org/foo/lipsum/%C3%A9_%E9',
+ (5, 10, 190, 0)),
]],
base_url='http://weasyprint.org/foo/bar/')
@@ -848,3 +862,9 @@ def test(html, blank=False):
test('<body><img src="custom:foo/bar">', blank=True)
assert len(logs) == 1
assert logs[0].startswith('WARNING: Error for image at custom:foo/bar')
+
+ def fetcher(url):
+ assert url == 'weasyprint-custom:%C3%A9_%e9.css'
+ return dict(string='')
+ TestHTML(string='<link rel=stylesheet href="weasyprint-custom:'
+ 'é_%e9.css"><body>', url_fetcher=fetcher).render()
View
@@ -53,6 +53,9 @@ def iri_to_uri(url):
"""Turn an IRI that can contain any Unicode character into an ASCII-only
URI that conforms to RFC 3986.
"""
+ if url.startswith('data:'):
+ # Data URIs can be huge, but don’t need this anyway.
+ return url
# Use UTF-8 as per RFC 3987 (IRI), except for file://
url = url.encode(FILESYSTEM_ENCODING
if url.startswith('file:') else 'utf-8')
@@ -112,9 +115,9 @@ def url_join(base_url, url, context, *args):
"""
if url_is_absolute(url):
- return url
+ return iri_to_uri(url)
elif base_url:
- return urljoin(base_url, url)
+ return iri_to_uri(urljoin(base_url, url))
else:
LOGGER.warn('Relative URI reference without a base URI: ' + context,
*args)

0 comments on commit c2b2b8c

Please sign in to comment.