Pm 3751 anonymize content (#13)

* Begin to add xhtml.replace_content function relying on cssselect and lxml * Added test for xhtml.replace_content See #PM-3751 * Try to use cache * Revert "Try to use cache" This reverts commit 143d928. * Make lxml.cssselect optional by only importing it in xhtml.replace_content Co-authored-by: Gauthier Bastien <gauthier.bastien@imio.be>
IMIO · Sep 24, 2021 · 0933918 · 0933918
1 parent bb9411c
commit 0933918
Show file tree

Hide file tree

Showing 7 changed files with 93 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -20,3 +20,4 @@ var/
 .coverage
 share/
 pip-selfcheck.json
+pyvenv.cfg
diff --git a/Makefile b/Makefile
@@ -8,7 +8,6 @@ plone:=4
 bootstrap:
 	virtualenv -p python$(py) .
 	bin/pip install -r requirements.txt
-	./bin/python bootstrap.py --version=2.13.2
 
 buildout:
 	cp test_plone$(plone).cfg buildout.cfg

diff --git a/base.cfg b/base.cfg
@@ -4,7 +4,7 @@ extends =
     sources.cfg
 
 package-name = imio.helpers
-package-extras = [test,pdf]
+package-extras = [test,pdf,lxml]
 
 develop = .
 
@@ -40,4 +40,4 @@ directory =
     ${buildout:directory}/src/imio/helpers
 flake8-ignore = E123,E124,E501,E126,E127,E128,W391,C901,W503,W504
 flake8-extensions =
-    flake8-isort
+    flake8-isort
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,3 @@
+pip==20.3.4
 setuptools==42.0.2
-zc.buildout==2.13.2
+zc.buildout==2.13.2
diff --git a/setup.py b/setup.py
@@ -60,6 +60,9 @@
             'PyPDF2',
             'reportlab',
         ],
+        'lxml': [
+            'cssselect',
+        ],
     },
     entry_points="""
     [z3c.autoinclude.plugin]

diff --git a/src/imio/helpers/tests/test_xhtml.py b/src/imio/helpers/tests/test_xhtml.py
@@ -8,6 +8,7 @@
 from imio.helpers.xhtml import object_link
 from imio.helpers.xhtml import removeBlanks
 from imio.helpers.xhtml import removeCssClasses
+from imio.helpers.xhtml import replace_content
 from imio.helpers.xhtml import separate_images
 from imio.helpers.xhtml import storeImagesLocally
 from imio.helpers.xhtml import xhtmlContentIsEmpty
@@ -828,3 +829,41 @@ def test_separate_images(self):
         result = separate_images(text)
         self.assertEqual(result, '<p><img src="http://plone/nohost/image1.png">\xc2\xa0 \xc2\xa0<br></p>'
                          '<p><img src="http://plone/nohost/image2.png"></p>')
+
+    def test_replace_content(self):
+        '''replace_content empties tags having the given CSS class,
+           optionally inserting a replacement text or a link.'''
+        source = ('<p>Text <span class="to-hide">hidden</span> some other text.</p>'
+                  '<p>Text <span class="to-hide">hidden <strong>hidden</strong> hidden</span> some other text.</p>'
+                  '<p><span class="to-hide">hidden</span> some other text.</p>'
+                  '<p><span class="to-hide">hidden</span></p>'
+                  '<p><span class="to-hide">hidden</span></p>'
+                  '<table><tr><td>Text <span class="to-hide">hidden</span></td>'
+                  '<td>Text not hidden</td></tr></table>')
+        blanked = ('<p>Text <span class="to-hide"></span> some other text.</p>'
+                   '<p>Text <span class="to-hide"><span></span></span> some other text.</p>'
+                   '<p><span class="to-hide"></span> some other text.</p>'
+                   '<p><span class="to-hide"></span></p>'
+                   '<p><span class="to-hide"></span></p>'
+                   '<table><tr><td>Text <span class="to-hide"></span></td>'
+                   '<td>Text not hidden</td></tr></table>')
+        replaced = ('<p>Text <span class="to-hide">replaced</span> some other text.</p>'
+                    '<p>Text <span class="to-hide">replaced<span></span></span> some other text.</p>'
+                    '<p><span class="to-hide">replaced</span> some other text.</p>'
+                    '<p><span class="to-hide">replaced</span></p>'
+                    '<p><span class="to-hide">replaced</span></p>'
+                    '<table><tr><td>Text <span class="to-hide">replaced</span></td>'
+                    '<td>Text not hidden</td></tr></table>')
+        # without new_content, matching tags are simply emptied
+        self.assertEqual(replace_content(source, css_class="to-hide"), blanked)
+        self.assertEqual(
+            replace_content(source, css_class="to-hide", new_content=u"replaced"),
+            replaced)
+        # with a link, the replacement text is rendered in an appended <a> tag
+        link_attrs = {"href": "https://python.org", "title": u"Explanation"}
+        source_link = '<p>Text <span class="to-hide">hidden <strong>hidden</strong></span></p>'
+        linked = ('<p>Text <span class="to-hide"><span></span>'
+                  '<a href="https://python.org" title="Explanation">replaced</a></span></p>')
+        self.assertEqual(
+            replace_content(source_link, css_class="to-hide",
+                            new_content=u"replaced", new_content_link=link_attrs), linked)
diff --git a/src/imio/helpers/xhtml.py b/src/imio/helpers/xhtml.py
@@ -87,6 +87,52 @@ def removeBlanks(xhtmlContent, pretty_print=False):
                     for x in tree.iterchildren()])
 
 
+def replace_content(xhtml_content,
+                    css_class,
+                    new_content=u"",
+                    new_content_link=None,
+                    pretty_print=False):
+    '''Blank the content of every tag having CSS class p_css_class.
+       The text of a matching tag is replaced by p_new_content, every tag
+       it contains is turned into an empty <span> (text and tail removed).
+       If p_new_content_link is given (dict with mandatory key "href" and
+       optional key "title"), p_new_content is rendered in a <a> tag
+       appended to the matching tag instead of being set as its text.
+       Returns the serialized HTML, or p_xhtml_content unchanged when
+       _turnToLxmlTree did not return a lxml HtmlElement.'''
+    tree = _turnToLxmlTree(xhtml_content)
+    if not isinstance(tree, lxml.html.HtmlElement):
+        return xhtml_content
+
+    new_content = safe_unicode(new_content) or u""
+
+    # imported here so cssselect is only required when this function is used
+    from lxml.cssselect import CSSSelector
+    for main_elt in CSSSelector('.' + css_class)(tree):
+        # snapshot contained tags before appending the link so the new
+        # <a> tag is not blanked together with the original content
+        for elt in list(main_elt.iterdescendants()):
+            elt.tag = "span"
+            elt.text = u""
+            elt.tail = u""
+        if new_content_link:
+            main_elt.text = u""
+            link = lxml.html.Element("a")
+            link.attrib["href"] = new_content_link["href"]
+            link.attrib["title"] = new_content_link.get("title", u"")
+            link.text = new_content
+            main_elt.append(link)
+        else:
+            main_elt.text = new_content
+
+    # only return children of the <special_tag>
+    return ''.join([lxml.html.tostring(x,
+                                       encoding='ascii',
+                                       pretty_print=pretty_print,
+                                       method='html')
+                    for x in tree.iterchildren()])
+
+
 def _turnToLxmlTree(xhtmlContent):
     if not xhtmlContent or not xhtmlContent.strip():
         return xhtmlContent