Skip to content

Commit

Permalink
Remove comments at preprocessing
Browse files: browse the repository at this point in the history
  • Loading branch information
Hiromu Hota committed Sep 30, 2020
1 parent b4ec81e commit a55d375
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions src/fonduer/parser/preprocessors/hocr_doc_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Iterator, Optional, Tuple

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
from bs4.element import Comment, NavigableString, Tag

from fonduer.parser.models import Document
from fonduer.parser.preprocessors.doc_preprocessor import DocPreprocessor
Expand Down Expand Up @@ -126,7 +126,9 @@ def get_bbox(node: Tag) -> Tuple[str, ...]:
# Remove linebreaks and excess spaces
# in reverse order b/c removing element from list in loop
for child in reversed(parent.contents):
if isinstance(child, NavigableString):
if isinstance(child, Comment): # remove comments
child.extract()
elif isinstance(child, NavigableString):
if child.strip() == "": # remove if space or linebreak
child.extract()
else:
Expand Down

0 comments on commit a55d375

Please sign in to comment.