New feature: support for {include} syntax. Fixes getpelican#1902.

The new {include} syntax makes it possible to include frequently used text snippets into your content.
Lucas-C · Oct 2, 2019 · 2b6f00d · 2b6f00d
1 parent 047d884
commit 2b6f00d
Show file tree

Hide file tree

Showing 26 changed files with 377 additions and 32 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,7 @@
 .*.swp
 .*.swo
 *.pyc
+.cache/
 .DS_Store
 docs/_build
 docs/fr/_build
@@ -16,3 +17,4 @@ six-*.egg/
 venv
 samples/output
 *.pem
+pip-wheel-metadata/
diff --git a/RELEASE.md b/RELEASE.md
@@ -0,0 +1,3 @@
+Release type: minor
+
+Add support for the ``{include}`` syntax
diff --git a/docs/content.rst b/docs/content.rst
@@ -369,6 +369,45 @@ Linking to authors, categories, index and tags
 You can link to authors, categories, index and tags using the ``{author}name``,
 ``{category}foobar``, ``{index}`` and ``{tag}tagname`` syntax.
 
+Including common text into your content
+---------------------------------------
+
+From Pelican 4.2 onward, you can include common text snippets in your content using
+the ``{include}file.ext`` syntax. You can specify semi-absolute paths starting
+from the ``PATH`` directory, e.g. ``{include}/pages/disclaimer.html``, or use
+relative paths, e.g. ``{include}notice.html``. Relative paths are
+resolved against the location of the file containing the ``{include}``.
+For example, given the following content layout::
+
+    content
+    ├── notice2.html
+    └── pages
+        ├── page1.html
+        └── notice1.html
+
+Then ``page1.html`` may look like::
+
+    <html>
+        <head>
+            <title>PAGE 1</title>
+        </head>
+        <body>
+            This is the content of page 1
+
+            {include}../notice2.html
+        </body>
+    </html>
+
+
+``notice2.html`` looks like::
+
+    {include}pages/notice1.html
+    This is the second warning about relative paths
+
+When using ``{include}``, it is best to exclude the included files via the
+``IGNORE_FILES`` setting. Otherwise, Pelican will try to render them as regular
+content and will most likely fail.
+
 Deprecated internal link syntax
 -------------------------------
 

diff --git a/pelican/contents.py b/pelican/contents.py
@@ -11,7 +11,7 @@
 import pytz
 
 import six
-from six.moves.urllib.parse import urljoin, urlparse, urlunparse
+from six.moves.urllib.parse import unquote, urljoin, urlparse, urlunparse
 
 from pelican import signals
 from pelican.settings import DEFAULT_CONFIG
@@ -359,14 +359,7 @@ def get_static_links(self):
             path = value.path
             if what not in {'static', 'attach'}:
                 continue
-            if path.startswith('/'):
-                path = path[1:]
-            else:
-                # relative to the source path of this content
-                path = self.get_relative_source_path(
-                    os.path.join(self.relative_dir, path)
-                )
-            path = path.replace('%20', ' ')
+            path = relativize_path(self.settings['PATH'], self.relative_dir, path)
             static_links.add(path)
         return static_links
 
@@ -449,24 +442,11 @@ def get_relative_source_path(self, source_path=None):
         """
         if not source_path:
             source_path = self.source_path
-        if source_path is None:
-            return None
-
-        return posixize_path(
-            os.path.relpath(
-                os.path.abspath(os.path.join(
-                    self.settings['PATH'],
-                    source_path)),
-                os.path.abspath(self.settings['PATH'])
-            ))
+        return get_relative_source_path(self.settings['PATH'], source_path)
 
     @property
     def relative_dir(self):
-        return posixize_path(
-            os.path.dirname(
-                os.path.relpath(
-                    os.path.abspath(self.source_path),
-                    os.path.abspath(self.settings['PATH']))))
+        return relative_dir(self.settings['PATH'], self.source_path)
 
     def refresh_metadata_intersite_links(self):
         for key in self.settings['FORMATTED_FIELDS']:
@@ -613,3 +593,106 @@ def _log_reason(reason):
 
         self.override_save_as = new_save_as
         self.override_url = new_url
+
+
+def get_relative_source_path(content_path, source_path):
+    if source_path is None:
+        return None
+
+    return posixize_path(
+        os.path.relpath(
+            os.path.abspath(os.path.join(
+                content_path,
+                source_path)),
+            os.path.abspath(content_path)
+        ))
+
+
+def relativize_path(content_path, relative_dir, path):
+    """
+    Update path depending on whether this is an absolute
+    or relative value.
+    """
+    if path.startswith('/'):
+        path = path[1:]
+    else:
+        path = get_relative_source_path(content_path, os.path.join(relative_dir, path))
+
+    path = path.replace('%20', ' ')
+
+    return path
+
+
+def relative_dir(content_path, path):
+    return posixize_path(
+        os.path.dirname(
+            os.path.relpath(
+                os.path.abspath(path),
+                os.path.abspath(content_path))))
+
+
+def insert_included_content(content, source_path, content_path, exclude_exts=()):
+    """
+        Replace {include}some.file with the
+        contents of this file.
+
+        Perform conversion to HTML 
+    """
+    regex = r"""(?P<indent>[ \t]+)?[{|]include[|}](?P<path>[\w./]+)"""
+    hrefs = re.compile(regex, re.X)
+    processed_paths = []
+    # In Python 3.x we can use the `nonlocal` declaration, in `replacer()`,
+    # to tell Python we mean to assign to the `source_path` variable from
+    # `insert_included_content()`.
+    # In Python 2.x we simply can't assign to `source_path` in `replacer()`.
+    # However, we work around this by not assigning to the variable itself,
+    # but using a mutable container to keep track about the current working
+    # directory while doing the recursion.
+    source_dir = [relative_dir(content_path, source_path)]
+
+    def replacer(m):
+        path, indent = m.group('path'), m.group('indent')
+        path = relativize_path(content_path, source_dir[0], path)
+        path = posixize_path(
+                os.path.abspath(
+                    os.path.join(content_path, path)
+                )
+            )
+
+        if not os.path.isfile(path):
+            logger.warning("Unable to find `%s`, skipping include.", path)
+            return ''.join(('{include}', m.group('path')))
+
+        _, ext = os.path.splitext(path)
+        # remove leading dot
+        ext = ext[1:]
+
+        if ext in exclude_exts:
+            return ''.join(('{include}', m.group('path')))
+
+        with open(path) as content_file:
+            text = content_file.read()
+
+        if indent:
+            text = ''.join(indent + line for line in text.splitlines(keepends=True))
+
+        # recursion stop
+        if path in processed_paths:
+            raise RuntimeError("Circular inclusion detected for '%s'" % path)
+        processed_paths.append(path)
+
+        # if we recurse into another file to perform more includes
+        # _path_replacer needs to know in which directory
+        # it operates otherwise it produces wrong paths
+        source_dir[0] = posixize_path(os.path.dirname(path))
+        current_source_dir = source_dir[0]
+
+        # recursively replace other includes
+        text = hrefs.sub(replacer, text)
+
+        # restore source dir
+        source_dir[0] = current_source_dir
+        return text
+
+    return hrefs.sub(replacer, content)
+
diff --git a/pelican/readers.py b/pelican/readers.py
@@ -5,6 +5,7 @@
 import os
 import re
 from collections import OrderedDict
+from tempfile import NamedTemporaryFile
 
 import docutils
 import docutils.core
@@ -19,7 +20,8 @@
 from pelican import rstdirectives  # NOQA
 from pelican import signals
 from pelican.cache import FileStampDataCacher
-from pelican.contents import Author, Category, Page, Tag
+from pelican.contents import Author, Category, Page, Tag, \
+                             insert_included_content
 from pelican.utils import SafeDatetime, escape_html, get_date, pelican_open, \
     posixize_path
 
@@ -286,9 +288,23 @@ def _get_publisher(self, source_path):
 
     def read(self, source_path):
         """Parses restructured text"""
-        pub = self._get_publisher(source_path)
-        parts = pub.writer.parts
-        content = parts.get('body')
+        with open(source_path) as content_file:
+            content = content_file.read()
+            exclude_exts = set(Readers(self.settings).extensions) - set(self.file_extensions)
+            content = insert_included_content(content, source_path, self.settings['PATH'], exclude_exts)
+            # We have pre-processed the file content, but docutils requires a file as input,
+            # so we use a temporary one:
+            with NamedTemporaryFile('w+') as tmp_file:
+                tmp_file.write(content)
+                tmp_file.seek(0)
+                try:
+                    pub = self._get_publisher(tmp_file.name)
+                    parts = pub.writer.parts
+                    content = parts.get('body')
+                except docutils.ApplicationError as err:
+                    # We fix any potential error message to reference the original file:
+                    err.args = (err.args[0].replace(tmp_file.name, source_path),)
+                    raise err
 
         metadata = self._parse_metadata(pub.document, source_path)
         metadata.setdefault('title', parts.get('title'))
@@ -349,6 +365,8 @@ def read(self, source_path):
         self._source_path = source_path
         self._md = Markdown(**self.settings['MARKDOWN'])
         with pelican_open(source_path) as text:
+            exclude_exts = set(Readers(self.settings).extensions) - set(self.file_extensions)
+            text = insert_included_content(text, source_path, self.settings['PATH'], exclude_exts)
             content = self._md.convert(text)
 
         if hasattr(self._md, 'Meta'):
@@ -500,7 +518,12 @@ def read(self, filename):
         metadata = {}
         for k in parser.metadata:
             metadata[k] = self.process_metadata(k, parser.metadata[k])
-        return parser.body, metadata
+
+        if parser.body:
+            return parser.body, metadata
+        else:
+            # in case we're parsing HTML includes
+            return content, metadata
 
 
 class Readers(FileStampDataCacher):
@@ -596,6 +619,9 @@ def read_file(self, base_path, path, content_class=Page, fmt=None,
         metadata.update(_filter_discardable_metadata(reader_metadata))
 
         if content:
+            # Exclude file extensions already processed by the dedicated readers:
+            exclude_exts = MarkdownReader.file_extensions + RstReader.file_extensions
+            content = insert_included_content(content, path, self.settings['PATH'], exclude_exts)
             # find images with empty alt
             find_empty_alt(content, path)
 

diff --git a/pelican/tests/content/include/html_from_subdir_includer.md b/pelican/tests/content/include/html_from_subdir_includer.md
@@ -0,0 +1,5 @@
+_includes HTML_:
+
+{include}subdir/include_other.html
+
+^Included content above^
diff --git a/pelican/tests/content/include/html_includer.md b/pelican/tests/content/include/html_includer.md
@@ -0,0 +1,5 @@
+_includes HTML_:
+
+{include}included.html
+
+^Included content above^
diff --git a/pelican/tests/content/include/html_includer.rst b/pelican/tests/content/include/html_includer.rst
@@ -0,0 +1,6 @@
+Article including some HTML file
+################################
+
+{include}included.html
+
+^Included content above^
diff --git a/pelican/tests/content/include/html_includer_with_full_path.md b/pelican/tests/content/include/html_includer_with_full_path.md
@@ -0,0 +1,5 @@
+_includes HTML_:
+
+{include}/pelican/tests/content/include/included.html
+
+^Included content above^
diff --git a/pelican/tests/content/include/include_other.html b/pelican/tests/content/include/include_other.html
@@ -0,0 +1 @@
+{include}include_sibling.html
diff --git a/pelican/tests/content/include/include_sibling.html b/pelican/tests/content/include/include_sibling.html
@@ -0,0 +1 @@
+{include}include_other.html
diff --git a/pelican/tests/content/include/included.html b/pelican/tests/content/include/included.html
@@ -0,0 +1 @@
+<span>this content has been included</span>
diff --git a/pelican/tests/content/include/included.md b/pelican/tests/content/include/included.md
@@ -0,0 +1,2 @@
+**this is Markdown**
+Here is a [link](https://docs.getpelican.com).
diff --git a/pelican/tests/content/include/included.py b/pelican/tests/content/include/included.py
@@ -0,0 +1,3 @@
+from __future__ import braces
+import antigravity
+import this
diff --git a/pelican/tests/content/include/included.rst b/pelican/tests/content/include/included.rst
@@ -0,0 +1,2 @@
+**this is reStructuredText**
+Here is a `link <https://docs.getpelican.com>`_.
diff --git a/pelican/tests/content/include/includer_of_md_includer.md b/pelican/tests/content/include/includer_of_md_includer.md
@@ -0,0 +1,5 @@
+START
+
+{include}md_includer.md
+
+END
diff --git a/pelican/tests/content/include/inexisting_file_includer.md b/pelican/tests/content/include/inexisting_file_includer.md
@@ -0,0 +1,5 @@
+_includes HTML_:
+
+{include}inexisting_file.html
+
+^Included content above^
diff --git a/pelican/tests/content/include/md_includer.html b/pelican/tests/content/include/md_includer.html
@@ -0,0 +1,2 @@
+<em>includes Markdown</em>: {include}included.md
+^Included content above^
diff --git a/pelican/tests/content/include/md_includer.md b/pelican/tests/content/include/md_includer.md
@@ -0,0 +1,2 @@
+_inline includes Markdown_: {include}included.md
+^Included content above^
diff --git a/pelican/tests/content/include/py_includer.md b/pelican/tests/content/include/py_includer.md
@@ -0,0 +1,3 @@
+```
+{include}included.py
+```
diff --git a/pelican/tests/content/include/py_includer.rst b/pelican/tests/content/include/py_includer.rst
@@ -0,0 +1,6 @@
+Article with an indented code block
+###################################
+
+.. code-block:: python
+
+    {include}included.py
diff --git a/pelican/tests/content/include/rst_includer.rst b/pelican/tests/content/include/rst_includer.rst
@@ -0,0 +1,5 @@
+Article with an inline included reStructuredText file
+#####################################################
+
+*inline includes reStructuredText*: {include}included.rst
+^Included content above^
diff --git a/pelican/tests/content/include/subdir/include_other.html b/pelican/tests/content/include/subdir/include_other.html
@@ -0,0 +1,2 @@
+this file includes another via absolute path
+{include}/pelican/tests/content/include/subdir/include_parent.html
diff --git a/pelican/tests/content/include/subdir/include_parent.html b/pelican/tests/content/include/subdir/include_parent.html
@@ -0,0 +1,2 @@
+this file includes another in a parent directory
+{include}../included.html
diff --git a/pelican/tests/test_cache.py b/pelican/tests/test_cache.py
@@ -162,8 +162,11 @@ def test_article_object_caching(self):
         - 2012-11-30_md_w_filename_meta#foo-bar.md
         - empty.md
         - empty_with_bom.md
+
+        There are 5 more include* files which are HTML or Markdown snippets
+        and also not valid.
         """
-        self.assertEqual(generator.readers.read_file.call_count, 6)
+        self.assertEqual(generator.readers.read_file.call_count, 11)
 
     @unittest.skipUnless(MagicMock, 'Needs Mock module')
     def test_article_reader_content_caching(self):