Skip to content

Commit

Permalink
improved cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
odelaere committed Aug 11, 2022
1 parent 5804814 commit 719efd1
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions src/Products/MeetingCommunes/Extensions/import-csv-inforius.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,18 +105,26 @@

datetime_format = "%Y-%m-%d %H:%M:%S"

cleaner = Cleaner(tags=['p', 'br', 'ul', 'ol', 'li', 'strong', 'u', 'em'], strip=True)
# HTML sanitizer configuration: whitelist of the tags and per-tag attributes
# passed to Cleaner. NOTE(review): Cleaner looks like bleach's sanitizer, in
# which case strip=True removes disallowed markup instead of escaping it --
# confirm against the file's imports.
# Every tag needs its own comma: Python silently concatenates adjacent string
# literals, so a missing comma between 'img' and 'table' would register the
# single bogus tag 'imgtable' and drop both <img> and <table> from the output.
cleaner = Cleaner(
    tags=[
        'p', 'br', 'ul', 'ol', 'li', 'strong', 'u', 'em', 'sup', 'sub', 'a', 'img',
        'table', 'thead', 'tr', 'th', 'tbody', 'td',
    ],
    attributes={
        'a': ['href', 'alt'],
        'img': ['src', 'alt', 'width', 'height'],
    },
    strip=True,
)

commit_step = 10


def clean_xhtml(html_value):
xhtml = html_value.strip()
if not xhtml.startswith(u"<p"):
xhtml = u"<p>" + xhtml
if not xhtml.endswith(u"</p>"):
xhtml += u"</p>"
xhtml = xhtml.replace(u"\u00A0", u"&nbsp;").strip()
# replace multiple br
xhtml = xhtml.replace(u"\n", u"").strip()
xhtml = re.sub(r'<br.?>((\s|\n)*<br.?>)+', u'</p>\n<p>', xhtml)
xhtml = xhtml.replace(u"&", u"&amp;").strip()
xhtml = xhtml.replace(u"\u00A0", u"&nbsp;").strip()
xhtml = xhtml.replace(u"\u2022", u"*").strip()
xhtml = xhtml.replace(u"\u25E6", u"*").strip()
xhtml = xhtml.replace(u"\u2219", u"*").strip()
Expand Down

0 comments on commit 719efd1

Please sign in to comment.