/
__init__.py
476 lines (395 loc) · 18.1 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
# coding: utf8
"""
WeasyPrint
==========
WeasyPrint converts web documents to PDF.
The public API is what is accessible from this "root" package
without importing sub-modules.
:copyright: Copyright 2011-2012 Simon Sapin and contributors, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
from __future__ import division, unicode_literals
VERSION = '0.15'
__version__ = VERSION
# Used for 'User-Agent' in HTTP and 'Creator' in PDF
VERSION_STRING = 'WeasyPrint %s (http://weasyprint.org/)' % VERSION
import io
import sys
from .urls import default_url_fetcher
# Make sure the logger is configured early:
from .logger import LOGGER
# No other import here. For this module, do them in functions/methods instead.
# (This reduces the work for eg. 'weasyprint --help')
class HTML(object):
    """Represents an HTML document parsed by `lxml <http://lxml.de/>`_.

    You can just create an instance with a positional argument:
    ``doc = HTML(something)``
    The class will try to guess if the input is a filename, an absolute URL,
    or a file-like object.

    Alternatively, use **one** named argument so that no guessing is involved:

    :param filename: A filename, relative to the current directory or absolute.
    :param url: An absolute, fully qualified URL.
    :param file_obj: a file-like: any object with a :meth:`~file.read` method.
    :param string: a string of HTML source. (This argument must be named.)
    :param tree: a parsed lxml tree. (This argument must be named.)

    Specifying multiple inputs is an error: ``HTML(filename=foo, url=bar)``
    will raise.

    You can also pass optional named arguments:

    :param encoding: Force the source character encoding.
    :param base_url: The base used to resolve relative URLs
        (eg. in ``<img src="../foo.png">``). If not provided, try to use
        the input filename, URL, or ``name`` attribute of file objects.
    :param url_fetcher: The URL fetcher function. (See :ref:`url-fetchers`.)
    :param media_type: The media type to use for ``@media``.
        Defaults to ``'print'``. **Note:** In some cases like
        ``HTML(string=foo)`` relative URLs will be invalid if ``base_url``
        is not provided.

    """

    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, tree=None, encoding=None, base_url=None,
                 url_fetcher=default_url_fetcher, media_type='print'):
        # Heavy imports are done lazily to keep importing the package cheap.
        import lxml.html
        from .html import find_base_url
        from .urls import wrap_url_fetcher

        url_fetcher = wrap_url_fetcher(url_fetcher)
        source_type, source, base_url, protocol_encoding = _select_source(
            guess, filename, url, file_obj, string, tree, base_url,
            url_fetcher)

        if source_type == 'tree':
            # Already parsed by the caller: use it as-is.
            result = source
        else:
            if source_type == 'string':
                parse = lxml.html.document_fromstring
            else:
                parse = lxml.html.parse
            # An explicit ``encoding`` wins over the one reported by the
            # source selection (eg. by the URL fetcher).
            if not encoding:
                encoding = protocol_encoding
            parser = lxml.html.HTMLParser(encoding=encoding)
            result = parse(source, parser=parser)
            if result is None:
                raise ValueError('Error while parsing HTML')

        base_url = find_base_url(result, base_url)
        # Record the base URL on the document info, then keep only the
        # root element. Objects exposing ``getroot`` are handled as whole
        # trees; anything else reaches its tree through ``getroottree``.
        if hasattr(result, 'getroot'):
            result.docinfo.URL = base_url
            result = result.getroot()
        else:
            result.getroottree().docinfo.URL = base_url

        self.root_element = result
        self.base_url = base_url
        self.url_fetcher = url_fetcher
        self.media_type = media_type

    def _ua_stylesheet(self):
        """Return the list of default user-agent stylesheets."""
        from .html import HTML5_UA_STYLESHEET
        return [HTML5_UA_STYLESHEET]

    def _get_document(self, stylesheets, enable_hinting, ua_stylesheets=None):
        """Build the internal ``Document`` for this HTML source.

        :param stylesheets:
            An optional list of user stylesheets. Elements that have a
            ``rules`` attribute are used directly; anything else is passed
            to :class:`CSS` as its ``guess`` argument.
        :param enable_hinting: Forwarded unchanged to ``Document``.
        :param ua_stylesheets:
            User-agent stylesheets; defaults to :meth:`_ua_stylesheet`.

        """
        if ua_stylesheets is None:
            ua_stylesheets = self._ua_stylesheet()
        user_stylesheets = [css if hasattr(css, 'rules')
                            else CSS(guess=css, media_type=self.media_type)
                            for css in stylesheets or []]
        from .document import Document
        return Document(self.root_element, enable_hinting, self.url_fetcher,
                        self.media_type, user_stylesheets, ua_stylesheets)

    def render(self, enable_hinting, stylesheets=None, resolution=96):
        """Render the document and return a list of Page objects.

        This is the low-level API. It provides individual pages that can
        paint to any type of cairo surface.

        :type enable_hinting: bool
        :param enable_hinting:
            Whether text, borders and background should be *hinted* to fall
            at device pixel boundaries. Should be enabled for pixel-based
            output (like PNG) but not vector based output (like PDF).
        :param stylesheets:
            An optional list of user stylesheets. (See
            :ref:`stylesheet-origins`\\.) List elements are :class:`CSS`
            objects, filenames, URLs, or file-like objects.
        :type resolution: float
        :param resolution:
            The output resolution in cairo user units per CSS inch. At 96 dpi
            (the default), user units match the CSS ``px`` unit.
            For example, :class:`cairo.PDFSurface`'s device units are
            in PostScript points (72dpi), so ``resolution=72`` will set
            the right scale for physical units.
        :returns: A list of :class:`Page` objects.

        """
        document = self._get_document(stylesheets, enable_hinting)
        return [Page(p, enable_hinting, resolution)
                for p in document.render_pages()]

    def write_pdf(self, target=None, stylesheets=None):
        """Render the document to PDF.

        :param target:
            Where the PDF output is written.
            A filename or a file-like object (anything with a
            :meth:`~file.write` method) or :obj:`None`.
        :param stylesheets:
            An optional list of user stylesheets. (See
            :ref:`stylesheet-origins`\\.) The list's elements are
            :class:`CSS` objects, filenames, URLs, or file-like objects.
        :returns:
            If :obj:`target` is :obj:`None`, a PDF byte string.

        """
        # No hinting for vector output; 72 units per inch are PDF points.
        pages = self.render(enable_hinting=False, stylesheets=stylesheets,
                            resolution=72)
        return pages_to_pdf(pages, target)

    def write_png(self, target=None, stylesheets=None, resolution=96):
        """Render the document to a single PNG image.

        Pages are arranged vertically without any decoration.

        :param target:
            Where the PNG output is written.
            A filename or a file-like object (anything with a
            :meth:`~file.write` method) or :obj:`None`.
        :param stylesheets:
            An optional list of user stylesheets. (See
            :ref:`stylesheet-origins`\\.) The list's elements are
            :class:`CSS` objects, filenames, URLs, or file-like objects.
        :type resolution: float
        :param resolution:
            The output resolution in PNG pixels per CSS inch. At 96 dpi
            (the default), PNG pixels match the CSS ``px`` unit.
        :returns:
            If :obj:`target` is :obj:`None`, a PNG byte string.

        """
        pages = self.render(enable_hinting=True, stylesheets=stylesheets,
                            resolution=resolution)
        return pages_to_png(pages, target)

    def get_png_pages(self, stylesheets=None, resolution=96):
        """Render the document to multiple PNG images, one per page.

        :param stylesheets:
            An optional list of user stylesheets. (See
            :ref:`stylesheet-origins`\\.) The list's elements are
            :class:`CSS` objects, filenames, URLs, or file-like objects.
        :type resolution: float
        :param resolution:
            The output resolution in PNG pixels per CSS inch. At 96 dpi
            (the default), PNG pixels match the CSS ``px`` unit.
        :returns:
            A generator of ``(width, height, png_bytes)`` tuples, one for
            each page, in order.

        """
        for page in self.render(enable_hinting=True, stylesheets=stylesheets,
                                resolution=resolution):
            # Each page gets its own single-page image surface.
            surface = pages_to_image_surface([page])
            yield (surface.get_width(), surface.get_height(),
                   surface_to_png(surface))
class CSS(object):
    """Represents a CSS stylesheet parsed by tinycss.

    An instance is created in the same way as :class:`HTML`, except that
    the ``tree`` parameter is not available. All other parameters are the same.

    ``CSS`` objects have no public attribute or method. They are only meant to
    be used in the ``write_pdf`` or ``write_png`` method. (See above.)

    """

    def __init__(self, guess=None, filename=None, url=None, file_obj=None,
                 string=None, encoding=None, base_url=None,
                 url_fetcher=default_url_fetcher, _check_mime_type=False,
                 media_type='print'):
        from .css import PARSER, preprocess_stylesheet

        source_type, source, base_url, protocol_encoding = _select_source(
            guess, filename, url, file_obj, string, tree=None,
            base_url=base_url, url_fetcher=url_fetcher,
            check_css_mime_type=_check_mime_type,)

        # Encoding hints handed to the parser; dropped below for text input.
        kwargs = dict(linking_encoding=encoding,
                      protocol_encoding=protocol_encoding)
        if source_type == 'string':
            if isinstance(source, bytes):
                method = 'parse_stylesheet_bytes'
            else:
                # unicode, no encoding
                method = 'parse_stylesheet'
                kwargs.clear()
        else:
            # file_obj or filename
            method = 'parse_stylesheet_file'

        # TODO: do not keep this?
        self.stylesheet = getattr(PARSER, method)(source, **kwargs)
        self.base_url = base_url
        self.rules = list(preprocess_stylesheet(
            media_type, base_url, self.stylesheet.rules, url_fetcher))
        # Parse errors are reported through the logger, not raised.
        # ``warning`` is used because ``Logger.warn`` is a deprecated alias.
        for error in self.stylesheet.errors:
            LOGGER.warning(error)
def _select_source(guess=None, filename=None, url=None, file_obj=None,
                   string=None, tree=None, base_url=None,
                   url_fetcher=default_url_fetcher, check_css_mime_type=False):
    """
    Check that only one input is not None, and return it with the
    normalized ``base_url``.

    :param guess:
        A source of unknown kind. It is treated as a file object if it has
        a ``read`` attribute, as a URL if ``url_is_absolute`` accepts it,
        and as a filename otherwise.
    :param check_css_mime_type:
        When fetching a URL, reject results whose ``mime_type`` is not
        ``'text/css'``: a warning is logged and an empty string source
        is returned instead.
    :returns:
        A ``(source_type, source, base_url, protocol_encoding)`` tuple
        where ``source_type`` is one of ``'filename'``, ``'file_obj'``,
        ``'string'`` or ``'tree'``.
    :raises TypeError: if zero or more than one source is given.

    """
    from .urls import path2url, ensure_url, url_is_absolute
    from .urls import wrap_url_fetcher

    url_fetcher = wrap_url_fetcher(url_fetcher)
    if base_url is not None:
        base_url = ensure_url(base_url)

    candidates = (('guess', guess), ('filename', filename), ('url', url),
                  ('file_obj', file_obj), ('string', string), ('tree', tree))
    provided = [name for name, value in candidates if value is not None]
    if len(provided) != 1:
        raise TypeError(
            'Expected exactly one source, got %i' % len(provided))
    kind = provided[0]

    if kind == 'guess':
        if hasattr(guess, 'read'):
            type_ = 'file_obj'
        elif url_is_absolute(guess):
            type_ = 'url'
        else:
            type_ = 'filename'
        # Start over with the guessed value passed as a named source.
        return _select_source(
            base_url=base_url, url_fetcher=url_fetcher,
            check_css_mime_type=check_css_mime_type,
            **{type_: guess})

    if kind == 'filename':
        if base_url is None:
            base_url = path2url(filename)
        return 'filename', filename, base_url, None

    if kind == 'url':
        result = url_fetcher(url)
        if check_css_mime_type and result['mime_type'] != 'text/css':
            # Fall back to the requested URL when the fetcher result has no
            # 'redirected_url', like the base_url computation below does.
            LOGGER.warning('Unsupported stylesheet type %s for %s',
                           result['mime_type'],
                           result.get('redirected_url', url))
            return 'string', '', base_url, None
        protocol_encoding = result.get('encoding')
        if base_url is None:
            base_url = result.get('redirected_url', url)
        if 'string' in result:
            return 'string', result['string'], base_url, protocol_encoding
        return 'file_obj', result['file_obj'], base_url, protocol_encoding

    if kind == 'file_obj':
        if base_url is None:
            # filesystem file objects have a 'name' attribute.
            name = getattr(file_obj, 'name', None)
            # Some streams have a .name like '<stdin>', not a filename.
            if name and not name.startswith('<'):
                base_url = ensure_url(name)
        return 'file_obj', file_obj, base_url, None

    # Only 'string' and 'tree' remain; both are returned unchanged.
    if kind == 'string':
        return 'string', string, base_url, None
    return 'tree', tree, base_url, None
class Page(object):
    """A single rendered page that can be painted on a cairo context."""

    def __init__(self, page, enable_hinting, resolution=96):
        """Wrap the laid-out ``page`` box for painting.

        :param page:
            The page box; its ``margin_width()`` and ``margin_height()``
            give the page size before scaling.
        :param enable_hinting: Whether painting snaps to device pixels.
        :param resolution:
            Cairo user units per CSS inch; 96 means one unit per CSS pixel.

        """
        self._page_box = page
        self._enable_hinting = enable_hinting
        # Scale factor from CSS pixels to cairo user units.
        scale = resolution / 96
        self._dppx = scale
        #: The page width, including margins, in cairo user units.
        self.width = scale * page.margin_width()
        #: The page height, including margins, in cairo user units.
        self.height = scale * page.margin_height()

    def paint(self, cairo_context, left_x=0, top_y=0, clip=False):
        """Paint the page on any cairo Context object.

        :type left_x: float
        :param left_x:
            X coordinate of the left of the page, in user units.
        :type top_y: float
        :param top_y:
            Y coordinate of the top of the page, in user units.
        :type clip: bool
        :param clip:
            Whether to clip/cut content outside the page. If false or
            not provided, content can overflow.

        """
        from .draw import draw_page, stacked

        with stacked(cairo_context):
            if not self._enable_hinting:
                origin_x, origin_y = left_x, top_y
                clip_width, clip_height = self.width, self.height
            else:
                import math

                device_x, device_y = cairo_context.user_to_device(
                    left_x, top_y)
                device_width, device_height = (
                    cairo_context.user_to_device_distance(
                        self.width, self.height))
                # Hint in device space: truncate the origin and round the
                # size up, then convert back to user space.
                origin_x, origin_y = cairo_context.device_to_user(
                    int(device_x), int(device_y))
                clip_width, clip_height = (
                    cairo_context.device_to_user_distance(
                        int(math.ceil(device_width)),
                        int(math.ceil(device_height))))

            # Move the top-left corner of the page to (0, 0).
            cairo_context.translate(origin_x, origin_y)
            if clip:
                cairo_context.rectangle(0, 0, clip_width, clip_height)
                cairo_context.clip()
            # After scaling, user units are CSS pixels.
            cairo_context.scale(self._dppx, self._dppx)
            draw_page(self._page_box, cairo_context, self._enable_hinting)
def pages_to_pdf(pages, target=None):
    """Paint ``pages`` into a PDF document.

    The PDF bytes are written to ``target``, or returned when ``target``
    is ``None``. PDF metadata (bookmarks/outlines, hyperlinks, ...) is
    added as well; files coming straight from :class:`cairo.PDFSurface`
    do not have it.

    :param pages: a list of Page objects
    :param target: a filename, file object, or ``None``
    :returns: a bytestring if ``target`` is ``None``.

    """
    import shutil
    import cairo
    from .pdf import write_pdf_metadata

    # Render into memory first: adding the metadata needs to seek.
    # TODO: avoid this if target can seek? Benchmark first.
    pdf_buffer = io.BytesIO()
    # The initial (1, 1) size is replaced by .set_size() for every page.
    pdf_surface = cairo.PDFSurface(pdf_buffer, 1, 1)
    painter = cairo.Context(pdf_surface)
    for page in pages:
        pdf_surface.set_size(page.width, page.height)
        page.paint(painter)
        pdf_surface.show_page()
    pdf_surface.finish()
    write_pdf_metadata(pages, pdf_buffer)

    if target is None:
        return pdf_buffer.getvalue()

    pdf_buffer.seek(0)
    if hasattr(target, 'write'):
        # File-like target: copy into it and leave it open for the caller.
        shutil.copyfileobj(pdf_buffer, target)
        return None
    with open(target, 'wb') as output:
        shutil.copyfileobj(pdf_buffer, output)
def pages_to_image_surface(pages):
    """Paint pages vertically for pixel output.

    Pages are stacked top to bottom and centered horizontally on a surface
    as wide as the widest page.

    :param pages: an iterable of :class:`~weasyprint.Page` objects
    :returns: a :class:`cairo.ImageSurface` object
    :raises ValueError: if ``pages`` is empty.

    """
    # ``pages`` is walked several times below: materialize it once so that
    # one-shot iterators work too, and reject empty input with a clear
    # message before importing cairo.
    pages = list(pages)
    if not pages:
        raise ValueError(
            'pages_to_image_surface() needs at least one page')

    import math
    import cairo
    from .draw import stacked
    from .compat import izip

    # This duplicates the hinting logic in Page.paint. There is a dependency
    # cycle otherwise: this -> hinting logic -> context -> surface -> this
    # But since we do no transform here, cairo_context.user_to_device and
    # friends are identity functions.
    widths = [int(math.ceil(p.width)) for p in pages]
    heights = [int(math.ceil(p.height)) for p in pages]
    max_width = max(widths)
    surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, max_width, sum(heights))
    context = cairo.Context(surface)
    pos_y = 0
    for page, width, height in izip(pages, widths, heights):
        # Center pages narrower than the widest one.
        pos_x = (max_width - width) // 2
        with stacked(context):
            page.paint(context, pos_x, pos_y, clip=True)
        pos_y += height
    return surface
def surface_to_png(surface, target=None):
    """Encode ``surface`` as PNG.

    The PNG bytes are written to ``target``, or returned when ``target``
    is ``None``.

    :param surface: a :class:`cairo.ImageSurface` object
    :param target: a filename, file object, or ``None``
    :returns: a bytestring if ``target`` is ``None``.

    """
    from .urls import FILESYSTEM_ENCODING

    if target is not None:
        running_python2 = sys.version_info[0] < 3
        if running_python2 and isinstance(target, unicode):
            # py2cairo 1.8 does not support unicode filenames.
            target = target.encode(FILESYSTEM_ENCODING)
        surface.write_to_png(target)
        return None

    # No target: collect the PNG in memory and hand the bytes back.
    png_buffer = io.BytesIO()
    surface.write_to_png(png_buffer)
    return png_buffer.getvalue()
def pages_to_png(pages, target=None):
    """Paint pages vertically and encode the result as a single PNG.

    The PNG bytes are written to ``target``, or returned when ``target``
    is ``None``.

    :param pages: a list of :class:`~weasyprint.Page` objects
    :param target: a filename, file object, or ``None``
    :returns: a bytestring if ``target`` is ``None``.

    """
    image_surface = pages_to_image_surface(pages)
    return surface_to_png(image_surface, target)