Skip to content

Commit

Permalink
Use UTF-8 indices really everywhere
Browse files Browse the repository at this point in the history
Fix #1973.
  • Loading branch information
liZe committed Sep 28, 2023
1 parent 563d775 commit f5640de
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 20 deletions.
31 changes: 31 additions & 0 deletions tests/layout/test_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,37 @@ def test_breaking_linebox_regression_15():
assert pre.width == 4 * 4


@assert_no_logs
def test_breaking_linebox_regression_16():
    """Check line breaking in a ``<pre>`` that contains non-ASCII text.

    A floated ``<p>`` and a four-line ``<pre>``, both containing non-ASCII
    characters, are rendered at ``font-size: 4px``. Each resulting text box
    must keep its full text and have a width of 4 times its number of
    characters.
    """
    # Regression test for https://github.com/Kozea/WeasyPrint/issues/1973
    markup = (
        '<style>'
        ' @font-face {src: url(weasyprint.otf); font-family: weasyprint}'
        ' body {font-family: weasyprint; font-size: 4px}'
        ' p {float: left}'
        '</style>'
        '<p>tést</p>'
        '<pre>ab©\n'
        'déf\n'
        'ghïj\n'
        'klm</pre>'
    )
    (page,) = render_pages(markup)
    (html,) = page.children
    (body,) = html.children
    paragraph, pre = body.children

    # The floated paragraph holds exactly one line.
    (paragraph_line,) = paragraph.children
    assert paragraph_line.children[0].text == 'tést'
    assert paragraph.width == 4 * 4

    # The preformatted block must be split into exactly four lines.
    first, second, third, fourth = pre.children
    pre_lines = (first, second, third, fourth)

    # Texts are checked before widths, in line order.
    for line, expected_text in zip(pre_lines, ('ab©', 'déf', 'ghïj', 'klm')):
        assert line.children[0].text == expected_text
    for line, character_count in zip(pre_lines, (3, 3, 4, 3)):
        assert line.children[0].width == 4 * character_count


@assert_no_logs
def test_linebox_text():
page, = render_pages('''
Expand Down
31 changes: 11 additions & 20 deletions weasyprint/layout/inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,13 +192,15 @@ def skip_first_whitespace(box, skip_stack):
if isinstance(box, boxes.TextBox):
assert next_skip_stack is None
white_space = box.style['white_space']
length = len(box.text)
if index == length:
text = box.text.encode()
if index == len(text):
# Starting at the end of the TextBox, no text to see: Continue
return 'continue'
if white_space in ('normal', 'nowrap', 'pre-line'):
while index < length and box.text[index] == ' ':
text = text[index:]
while text and text.startswith(b' '):
index += 1
text = text[1:]
return {index: None} if index else None

if isinstance(box, (boxes.LineBox, boxes.InlineBox)):
Expand Down Expand Up @@ -480,7 +482,7 @@ def split_inline_level(context, box, position_x, max_x, bottom_space,
if skip is None:
last_letter = box.text[-1]
else:
last_letter = box.text[skip - 1]
last_letter = box.text.encode()[:skip].decode()[-1]
else:
first_letter = last_letter = None
elif isinstance(box, boxes.InlineBox):
Expand Down Expand Up @@ -872,28 +874,16 @@ def split_text_box(context, box, available_width, skip, is_line_start=True):
"""
assert isinstance(box, boxes.TextBox)
font_size = box.style['font_size']
text = box.text[skip:]
text = box.text.encode()[skip:]
if font_size == 0 or not text:
return None, None, False
layout, length, resume_index, width, height, baseline = split_first_line(
text, box.style, context, available_width, box.justification_spacing,
is_line_start=is_line_start)
text.decode(), box.style, context, available_width,
box.justification_spacing, is_line_start=is_line_start)
assert resume_index != 0

# Convert ``length`` and ``resume_at`` from UTF-8 indexes in text
# to Unicode indexes.
# No need to encode what’s after resume_at (if set) or length (if
# resume_at is not set). One code point is one or more bytes, so
# UTF-8 indexes are always greater than or equal to Unicode indexes.
new_text = layout.text
encoded = text.encode()
if resume_index is not None:
between = encoded[length:resume_index].decode()
resume_index = len(encoded[:resume_index].decode())
length = len(encoded[:length].decode())

if length > 0:
box = box.copy_with_text(new_text)
box = box.copy_with_text(layout.text)
box.width = width
box.pango_layout = layout
# "The height of the content area should be based on the font,
Expand All @@ -918,6 +908,7 @@ def split_text_box(context, box, available_width, skip, is_line_start=True):
if resume_index is None:
preserved_line_break = False
else:
between = text[length:resume_index].decode()
preserved_line_break = (
(length != resume_index) and between.strip(' '))
if preserved_line_break:
Expand Down

0 comments on commit f5640de

Please sign in to comment.