Use UTF-8 indices really everywhere

Fix #1973.
Kozea · Sep 28, 2023 · f5640de · f5640de
1 parent 563d775
commit f5640de
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 20 deletions.
diff --git a/tests/layout/test_inline.py b/tests/layout/test_inline.py
@@ -492,6 +492,37 @@ def test_breaking_linebox_regression_15():
     assert pre.width == 4 * 4
 
 
+@assert_no_logs
+def test_breaking_linebox_regression_16():
+    # Regression test for https://github.com/Kozea/WeasyPrint/issues/1973
+    page, = render_pages(
+        '<style>'
+        '  @font-face {src: url(weasyprint.otf); font-family: weasyprint}'
+        '  body {font-family: weasyprint; font-size: 4px}'
+        '  p {float: left}'
+        '</style>'
+        '<p>tést</p>'  # 4 characters, 5 bytes once UTF-8 encoded
+        '<pre>ab©\n'  # each of the first three lines has one 2-byte char
+        'déf\n'
+        'ghïj\n'
+        'klm</pre>')  # last line is ASCII only
+    html, = page.children
+    body, = html.children
+    p, pre = body.children
+    line1, = p.children  # the floated paragraph must fit on a single line
+    assert line1.children[0].text == 'tést'
+    assert p.width == 4 * 4  # 4 characters at font-size 4px
+    line1, line2, line3, line4 = pre.children  # one line box per '\n' row
+    assert line1.children[0].text == 'ab©'  # no char lost or shifted
+    assert line2.children[0].text == 'déf'
+    assert line3.children[0].text == 'ghïj'
+    assert line4.children[0].text == 'klm'
+    assert line1.children[0].width == 4 * 3  # width = character count * 4px
+    assert line2.children[0].width == 4 * 3
+    assert line3.children[0].width == 4 * 4
+    assert line4.children[0].width == 4 * 3
+
+
 @assert_no_logs
 def test_linebox_text():
     page, = render_pages('''

diff --git a/weasyprint/layout/inline.py b/weasyprint/layout/inline.py
@@ -192,13 +192,15 @@ def skip_first_whitespace(box, skip_stack):
     if isinstance(box, boxes.TextBox):
         assert next_skip_stack is None
         white_space = box.style['white_space']
-        length = len(box.text)
-        if index == length:
+        text = box.text.encode()
+        if index == len(text):
             # Starting at the end of the TextBox, no text to see: Continue
             return 'continue'
         if white_space in ('normal', 'nowrap', 'pre-line'):
-            while index < length and box.text[index] == ' ':
+            text = text[index:]
+            while text and text.startswith(b' '):
                 index += 1
+                text = text[1:]
         return {index: None} if index else None
 
     if isinstance(box, (boxes.LineBox, boxes.InlineBox)):
@@ -480,7 +482,7 @@ def split_inline_level(context, box, position_x, max_x, bottom_space,
             if skip is None:
                 last_letter = box.text[-1]
             else:
-                last_letter = box.text[skip - 1]
+                last_letter = box.text.encode()[:skip].decode()[-1]
         else:
             first_letter = last_letter = None
     elif isinstance(box, boxes.InlineBox):
@@ -872,28 +874,16 @@ def split_text_box(context, box, available_width, skip, is_line_start=True):
     """
     assert isinstance(box, boxes.TextBox)
     font_size = box.style['font_size']
-    text = box.text[skip:]
+    text = box.text.encode()[skip:]
     if font_size == 0 or not text:
         return None, None, False
     layout, length, resume_index, width, height, baseline = split_first_line(
-        text, box.style, context, available_width, box.justification_spacing,
-        is_line_start=is_line_start)
+        text.decode(), box.style, context, available_width,
+        box.justification_spacing, is_line_start=is_line_start)
     assert resume_index != 0
 
-    # Convert ``length`` and ``resume_at`` from UTF-8 indexes in text
-    # to Unicode indexes.
-    # No need to encode what’s after resume_at (if set) or length (if
-    # resume_at is not set). One code point is one or more byte, so
-    # UTF-8 indexes are always bigger or equal to Unicode indexes.
-    new_text = layout.text
-    encoded = text.encode()
-    if resume_index is not None:
-        between = encoded[length:resume_index].decode()
-        resume_index = len(encoded[:resume_index].decode())
-    length = len(encoded[:length].decode())
-
     if length > 0:
-        box = box.copy_with_text(new_text)
+        box = box.copy_with_text(layout.text)
         box.width = width
         box.pango_layout = layout
         # "The height of the content area should be based on the font,
@@ -918,6 +908,7 @@ def split_text_box(context, box, available_width, skip, is_line_start=True):
     if resume_index is None:
         preserved_line_break = False
     else:
+        between = text[length:resume_index].decode()
         preserved_line_break = (
             (length != resume_index) and between.strip(' '))
         if preserved_line_break: