Skip to content

Commit

Permalink
Pm 3751 anonymize content (#13)
Browse files Browse the repository at this point in the history
* Begin to add xhtml.replace_content function relying on cssselect and lxml

* Added test for xhtml.replace_content
See #PM-3751

* Try to use cache

* Revert "Try to use cache"

This reverts commit 143d928.

* Make lxml.cssselect optional by only importing it in xhtml.replace_content

Co-authored-by: Gauthier Bastien <gauthier.bastien@imio.be>
  • Loading branch information
gbastien and gbastien committed Sep 24, 2021
1 parent bb9411c commit 0933918
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ var/
.coverage
share/
pip-selfcheck.json
pyvenv.cfg
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ plone:=4
bootstrap:
virtualenv -p python$(py) .
bin/pip install -r requirements.txt
./bin/python bootstrap.py --version=2.13.2

buildout:
cp test_plone$(plone).cfg buildout.cfg
Expand Down
4 changes: 2 additions & 2 deletions base.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ extends =
sources.cfg

package-name = imio.helpers
package-extras = [test,pdf]
package-extras = [test,pdf,lxml]

develop = .

Expand Down Expand Up @@ -40,4 +40,4 @@ directory =
${buildout:directory}/src/imio/helpers
flake8-ignore = E123,E124,E501,E126,E127,E128,W391,C901,W503,W504
flake8-extensions =
flake8-isort
flake8-isort
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pip==20.3.4
setuptools==42.0.2
zc.buildout==2.13.2
zc.buildout==2.13.2
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@
'PyPDF2',
'reportlab',
],
'lxml': [
'cssselect',
],
},
entry_points="""
[z3c.autoinclude.plugin]
Expand Down
39 changes: 39 additions & 0 deletions src/imio/helpers/tests/test_xhtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from imio.helpers.xhtml import object_link
from imio.helpers.xhtml import removeBlanks
from imio.helpers.xhtml import removeCssClasses
from imio.helpers.xhtml import replace_content
from imio.helpers.xhtml import separate_images
from imio.helpers.xhtml import storeImagesLocally
from imio.helpers.xhtml import xhtmlContentIsEmpty
Expand Down Expand Up @@ -828,3 +829,41 @@ def test_separate_images(self):
result = separate_images(text)
self.assertEqual(result, '<p><img src="http://plone/nohost/image1.png">\xc2\xa0 \xc2\xa0<br></p>'
'<p><img src="http://plone/nohost/image2.png"></p>')

def test_replace_content(self):
    """Check replace_content on tags bearing the "to-hide" CSS class.

    Three scenarios are covered: blanking the content (default), replacing
    it with a plain text, and replacing it with a text wrapped in a link.
    Sub-elements of a matched tag are expected to be turned into
    empty <span> tags.
    """
    source = (
        '<p>Text <span class="to-hide">hidden</span> some other text.</p>'
        '<p>Text <span class="to-hide">hidden <strong>hidden</strong> hidden</span> some other text.</p>'
        '<p><span class="to-hide">hidden</span> some other text.</p>'
        '<p><span class="to-hide">hidden</span></p>'
        '<p><span class="to-hide">hidden</span></p>'
        '<table><tr><td>Text <span class="to-hide">hidden</span></td>'
        '<td>Text not hidden</td></tr></table>'
    )

    # default behavior: matched tags are emptied
    blanked = (
        '<p>Text <span class="to-hide"></span> some other text.</p>'
        '<p>Text <span class="to-hide"><span></span></span> some other text.</p>'
        '<p><span class="to-hide"></span> some other text.</p>'
        '<p><span class="to-hide"></span></p>'
        '<p><span class="to-hide"></span></p>'
        '<table><tr><td>Text <span class="to-hide"></span></td>'
        '<td>Text not hidden</td></tr></table>'
    )
    self.assertEqual(replace_content(source, css_class="to-hide"), blanked)

    # with new_content: matched tags receive the replacement text
    replaced = (
        '<p>Text <span class="to-hide">replaced</span> some other text.</p>'
        '<p>Text <span class="to-hide">replaced<span></span></span> some other text.</p>'
        '<p><span class="to-hide">replaced</span> some other text.</p>'
        '<p><span class="to-hide">replaced</span></p>'
        '<p><span class="to-hide">replaced</span></p>'
        '<table><tr><td>Text <span class="to-hide">replaced</span></td>'
        '<td>Text not hidden</td></tr></table>'
    )
    self.assertEqual(
        replace_content(source, css_class="to-hide", new_content=u"replaced"),
        replaced)

    # with new_content_link: the replacement text is wrapped in an <a> tag
    # appended after the emptied sub-elements
    source_link = '<p>Text <span class="to-hide">hidden <strong>hidden</strong></span></p>'
    replaced_link = (
        '<p>Text <span class="to-hide"><span></span>'
        '<a href="https://python.org" title="Explanation">replaced</a></span></p>'
    )
    link_infos = {
        "href": "https://python.org",
        "title": u"Explanation"}
    self.assertEqual(
        replace_content(
            source_link,
            css_class="to-hide",
            new_content=u"replaced",
            new_content_link=link_infos),
        replaced_link)
46 changes: 46 additions & 0 deletions src/imio/helpers/xhtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,52 @@ def removeBlanks(xhtmlContent, pretty_print=False):
for x in tree.iterchildren()])


def replace_content(xhtml_content,
                    css_class,
                    new_content=u"",
                    new_content_link=None,
                    pretty_print=False):
    '''Replace the content of every tag using given p_css_class.

    Every tag of p_xhtml_content bearing p_css_class is kept (with its
    attributes) but its content is replaced:
    - its own text is replaced by p_new_content (empty by default);
    - every contained element is turned into an empty <span> (its text and
      tail are blanked, its attributes are left untouched);
    - contained comments/processing instructions are removed;
    - if p_new_content_link is given, p_new_content is not set as text of the
      tag but wrapped in an <a> tag appended to it.  p_new_content_link is a
      dict with a mandatory "href" key (a KeyError is raised if missing) and
      an optional "title" key.

    If p_xhtml_content could not be turned into a lxml tree, it is returned
    unchanged, otherwise the serialized result is returned, p_pretty_print is
    passed to lxml.html.tostring.'''
    tree = _turnToLxmlTree(xhtml_content)
    if not isinstance(tree, lxml.html.HtmlElement):
        return xhtml_content

    new_content = safe_unicode(new_content) or u""

    # imported here and not at module level so the cssselect dependency
    # is only required when this function is actually used
    from lxml.cssselect import CSSSelector
    selector = CSSSelector('.' + css_class)
    for main_elt in selector(tree):
        # blank every contained node, iterate on a copy as the tree
        # is modified while looping
        for elt in list(main_elt.iterdescendants()):
            if isinstance(elt, lxml.html.HtmlElement):
                elt.tag = "span"
                elt.text = u""
                elt.tail = u""
            else:
                # comments and processing instructions can not be renamed
                # and their text must not be kept, remove it (and its tail)
                elt.getparent().remove(elt)

        if new_content_link:
            main_elt.text = u""
            link_elt = lxml.html.Element("a")
            link_elt.attrib["href"] = new_content_link["href"]
            link_elt.attrib["title"] = new_content_link.get("title", u"")
            link_elt.text = new_content
            main_elt.append(link_elt)
        else:
            main_elt.text = new_content

    # only return children of the <special_tag>
    return ''.join([lxml.html.tostring(x,
                                       encoding='ascii',
                                       pretty_print=pretty_print,
                                       method='html')
                    for x in tree.iterchildren()])


def _turnToLxmlTree(xhtmlContent):
if not xhtmlContent or not xhtmlContent.strip():
return xhtmlContent
Expand Down

0 comments on commit 0933918

Please sign in to comment.