Skip to content

Commit

Permalink
Simplify text extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
liZe committed Apr 19, 2022
1 parent 46c10bf commit e2eb182
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 49 deletions.
2 changes: 1 addition & 1 deletion weasyprint/css/targets.py
Expand Up @@ -22,7 +22,7 @@ def __init__(self, state='pending'):

# Required by target-counter and target-counters to access the
# target's .cached_counter_values.
# Needed for target-text via TEXT_CONTENT_EXTRACTORS.
# Needed for target-text via extract_text().
self.target_box = None

# Functions that have to be called to check pending targets.
Expand Down
75 changes: 27 additions & 48 deletions weasyprint/formatting_structure/build.py
Expand Up @@ -427,7 +427,7 @@ def add_text(text):
content_boxes.append(
boxes.InlineReplacedBox.anonymous_from(parent_box, image))
elif type_ == 'content()':
added_text = TEXT_CONTENT_EXTRACTORS[value](parent_box)
added_text = extract_text(value, parent_box)
# Simulate the step of white space processing
# (normally done during the layout)
add_text(added_text.strip())
Expand Down Expand Up @@ -495,7 +495,7 @@ def add_text(text):
target_box = lookup_target.target_box
# TODO: 'before'- and 'after'- content referring missing
# counters are not properly set.
text = TEXT_CONTENT_EXTRACTORS[text_style](target_box)
text = extract_text(text_style, target_box)
# Simulate the step of white space processing
# (normally done during the layout)
add_text(text.strip())
Expand Down Expand Up @@ -1564,50 +1564,29 @@ def box_text(box):
not child.element_tag.endswith('::after') and
not child.element_tag.endswith('::marker') and
isinstance(child, boxes.TextBox))
else:
return ''


def box_text_first_letter(box):
    """Return the first letter of the text of ``box``.

    The result is made of any leading punctuation, the first
    non-punctuation character, and the punctuation directly following it.
    Punctuation means the Unicode general categories ``Ps``, ``Pe``,
    ``Pi``, ``Pf`` and ``Po``. The scan stops at the second
    non-punctuation character.

    Return an empty string when ``box_text(box)`` is empty. When the text
    only contains punctuation, the whole text is returned.

    """
    # TODO: use the same code as in inlines.first_letter_to_box
    punctuation_categories = ('Ps', 'Pe', 'Pi', 'Pf', 'Po')
    character_found = False
    letters = []
    # Iterate over the string directly instead of re-slicing it after each
    # character, which copied the remaining text on every iteration.
    for letter in box_text(box):
        if unicodedata.category(letter) not in punctuation_categories:
            if character_found:
                # Second non-punctuation character: the first letter and
                # its surrounding punctuation are complete.
                break
            character_found = True
        letters.append(letter)
    return ''.join(letters)


def box_text_before(box):
    """Return the text found in the ``::before`` descendants of ``box``.

    Only descendants whose ``element_tag`` ends with ``'::before'`` and
    that are not ``ParentBox`` instances contribute to the result. Return
    an empty string when ``box`` is not a ``ParentBox``.

    """
    if not isinstance(box, boxes.ParentBox):
        return ''

    pieces = []
    for descendant in box.descendants():
        if not descendant.element_tag.endswith('::before'):
            continue
        if isinstance(descendant, boxes.ParentBox):
            continue
        pieces.append(box_text(descendant))
    return ''.join(pieces)


def extract_text(text_part, box):
if text_part in ('text', 'content'):
return box_text(box)
elif text_part in ('before', 'after'):
if isinstance(box, boxes.ParentBox):
return ''.join(
box_text(child) for child in box.descendants()
if child.element_tag.endswith(f'::{text_part}') and
not isinstance(child, boxes.ParentBox))
return ''


def box_text_after(box):
    """Return the text found in the ``::after`` descendants of ``box``.

    Only descendants whose ``element_tag`` ends with ``'::after'`` and
    that are not ``ParentBox`` instances contribute to the result. Return
    an empty string when ``box`` is not a ``ParentBox``.

    """
    if not isinstance(box, boxes.ParentBox):
        return ''

    pieces = []
    for descendant in box.descendants():
        if not descendant.element_tag.endswith('::after'):
            continue
        if isinstance(descendant, boxes.ParentBox):
            continue
        pieces.append(box_text(descendant))
    return ''.join(pieces)


# Map each text part keyword to the function that extracts the
# corresponding text from a box. Each function is called with the box as
# its only argument.
TEXT_CONTENT_EXTRACTORS = {
    'text': box_text,
    'content': box_text,
    'before': box_text_before,
    'after': box_text_after,
    'first-letter': box_text_first_letter,
}
elif text_part == 'first-letter':
# TODO: use the same code as in inlines.first_letter_to_box
character_found = False
first_letter = ''
text = box_text(box)
for letter in text:
category = unicodedata.category(letter)
if category not in ('Ps', 'Pe', 'Pi', 'Pf', 'Po'):
if character_found:
break
character_found = True
first_letter += letter
return first_letter

0 comments on commit e2eb182

Please sign in to comment.