New feature: support for {include} syntax. Fixes getpelican#1902.

The new {include} syntax makes it possible to include frequently used text snippets into your content.
Lucas-C · Oct 1, 2019 · c7bfb5c · c7bfb5c
1 parent 047d884
commit c7bfb5c
Show file tree

Hide file tree

Showing 15 changed files with 366 additions and 8 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,7 @@
 .*.swp
 .*.swo
 *.pyc
+.cache/
 .DS_Store
 docs/_build
 docs/fr/_build
@@ -16,3 +17,4 @@ six-*.egg/
 venv
 samples/output
 *.pem
+pip-wheel-metadata/
diff --git a/RELEASE.md b/RELEASE.md
@@ -0,0 +1,3 @@
+Release type: minor
+
+Add support for the ``{include}`` syntax
diff --git a/docs/content.rst b/docs/content.rst
@@ -369,6 +369,45 @@ Linking to authors, categories, index and tags
 You can link to authors, categories, index and tags using the ``{author}name``,
 ``{category}foobar``, ``{index}`` and ``{tag}tagname`` syntax.
 
+Including common text into your content
+---------------------------------------
+
+From Pelican 4.2 onward, you can include common text snippets in your content using
+the ``{include}file.ext`` syntax. You can specify semi-absolute paths starting
+from the ``PATH`` directory, e.g. ``{include}/pages/disclaimer.html``, or use
+relative paths, e.g. ``{include}notice.html``. Relative paths are resolved
+against the directory of the file containing the ``{include}``.
+For example, when you have the following content layout::
+
+    content
+    ├── notice2.html
+    └── pages
+        ├── page1.html
+        └── notice1.html
+
+Then the includes may look like::
+
+    <html>
+        <head>
+            <title>PAGE 1</title>
+        </head>
+        <body>
+            This is the content of page 1
+
+            {include}../notice2.html
+        </body>
+    </html>
+
+
+``notice2.html`` looks like::
+
+    {include}pages/notice1.html
+    This is the second warning about relative paths
+
+When using ``{include}``, it is best to exclude the included files from normal
+processing with the ``IGNORE_FILES`` setting. Otherwise Pelican will try to
+render them as regular content and will most likely fail!
+
 Deprecated internal link syntax
 -------------------------------
 

diff --git a/pelican/contents.py b/pelican/contents.py
@@ -11,7 +11,7 @@
 import pytz
 
 import six
-from six.moves.urllib.parse import urljoin, urlparse, urlunparse
+from six.moves.urllib.parse import unquote, urljoin, urlparse, urlunparse
 
 from pelican import signals
 from pelican.settings import DEFAULT_CONFIG
@@ -36,14 +36,15 @@ class Content(object):
     :param settings: the settings dictionary (optional).
     :param source_path: The location of the source of this content (if any).
     :param context: The shared context between generators.
+    :param readers: readers.Readers() instance used for rendering includes.
 
     """
     @deprecated_attribute(old='filename', new='source_path', since=(3, 2, 0))
     def filename():
         return None
 
     def __init__(self, content, metadata=None, settings=None,
-                 source_path=None, context=None):
+                 source_path=None, context=None, readers=None):
         if metadata is None:
             metadata = {}
         if settings is None:
@@ -148,8 +149,15 @@ def __init__(self, content, metadata=None, settings=None,
         if 'summary' in metadata:
             self._summary = metadata['summary']
 
+        # used for rendering {includes}
+        self._readers = readers
+
         signals.content_object_init.send(self)
 
+    @property
+    def readers(self):
+        """Return the readers object handed to the constructor.
+
+        This is the value of the ``readers`` constructor argument, which
+        defaults to ``None``; it is used for rendering ``{include}`` targets.
+        """
+        return self._readers
+
     def __str__(self):
         return self.source_path or repr(self)
 
@@ -257,6 +265,8 @@ def _link_replacer(self, siteurl, m):
                 siteurl += '/'
 
         # XXX Put this in a different location.
+        if what == 'include':
+            import pdb; pdb.set_trace()
         if what in {'filename', 'static', 'attach'}:
             if path.startswith('/'):
                 path = path[1:]
@@ -334,6 +344,30 @@ def _get_intrasite_link_regex(self):
             \2""".format(intrasite_link_regex)
         return re.compile(regex, re.X)
 
+    def _path_replacer(self, path, relative_dir=None):
+        """
+        Update path depending on whether this is an absolute
+        or relative value.
+        """
+        if not relative_dir:
+            relative_dir = self.relative_dir
+
+        if path.startswith('/'):
+            path = path[1:]
+        else:
+            # relative to the source path of this content
+            path = self.get_relative_source_path(
+                os.path.join(relative_dir, path)
+            )
+
+        if path not in self._context['filenames']:
+            unquoted_path = unquote(path)
+
+            if unquoted_path in self._context['filenames']:
+                path = unquoted_path
+
+        return path.replace('%20', ' ')
+
     def _update_content(self, content, siteurl):
         """Update the content attribute.
 
@@ -348,6 +382,7 @@ def _update_content(self, content, siteurl):
             return content
 
         hrefs = self._get_intrasite_link_regex()
+        import pdb; pdb.set_trace()
         return hrefs.sub(lambda m: self._link_replacer(siteurl, m), content)
 
     def get_static_links(self):
@@ -367,12 +402,74 @@ def get_static_links(self):
                     os.path.join(self.relative_dir, path)
                 )
             path = path.replace('%20', ' ')
             static_links.add(path)
         return static_links
 
     def get_siteurl(self):
         return self._context.get('localsiteurl', '')
 
+    def _update_includes(self, content, source_path=None):
+        """
+            Replace {include}some.file with the
+            contents of this file.
+        """
+        regex = r"""[{|]include[|}](?P<path>[\w./]+)"""
+        hrefs = re.compile(regex, re.X)
+        processed_paths = []
+        # In Python 3.x we can use the `nonlocal` declaration, in `replacer()`,
+        # to tell Python we mean to assign to the `source_path` variable from
+        # `_update_includes()`.
+        # In Python 2.x we simply can't assign to `source_path` in `replacer()`.
+        # However, we work around this by not assigning to the variable itself,
+        # but using a mutable container to keep track about the current working
+        # directory while doing the recursion.
+        source_dir = [source_path]
+
+        def replacer(m):
+            path = m.group('path')
+            path = self._path_replacer(path, source_dir[0])
+            path = posixize_path(
+                    os.path.abspath(
+                        os.path.join(self.settings['PATH'], path)
+                    )
+                )
+
+            if not os.path.isfile(path):
+                logger.warning("Unable to find `%s`, skipping include.", path)
+                return ''.join(('{include}', m.group('path')))
+
+            _, ext = os.path.splitext(path)
+            # remove leading dot
+            ext = ext[1:]
+
+            if ext not in self.readers.reader_classes.keys():
+                logger.warning("Unable to read `%s`, skipping include.", path)
+                return ''.join(('{include}', m.group('path')))
+
+            # recursion stop
+            if path in processed_paths:
+                raise RuntimeError("Circular inclusion detected for '%s'" % path)
+            processed_paths.append(path)
+
+            reader = self.readers.reader_classes[ext](self.settings)
+            text, meta = reader.read(path)
+
+            # if we recurse into another file to perform more includes
+            # self._path_replacer needs to know in which directory
+            # it operates otherwise it produces wrong paths
+            source_dir[0] = posixize_path(os.path.dirname(path))
+            current_source_dir = source_dir[0]
+
+            # recursively replace other includes
+            text = hrefs.sub(replacer, text)
+
+            # restore source dir
+            source_dir[0] = current_source_dir
+            return text
+
+        return hrefs.sub(replacer, content)
+
     @memoized
     def get_content(self, siteurl):
         if hasattr(self, '_get_content'):

diff --git a/pelican/readers.py b/pelican/readers.py
@@ -500,7 +500,12 @@ def read(self, filename):
         metadata = {}
         for k in parser.metadata:
             metadata[k] = self.process_metadata(k, parser.metadata[k])
-        return parser.body, metadata
+
+        if parser.body:
+            return parser.body, metadata
+        else:
+            # in case we're parsing HTML includes
+            return content, metadata
 
 
 class Readers(FileStampDataCacher):
@@ -637,7 +642,7 @@ def typogrify_wrapper(text):
 
         return content_class(content=content, metadata=metadata,
                              settings=self.settings, source_path=path,
-                             context=context)
+                             context=context, readers=self)
 
 
 def find_empty_alt(content, path):

diff --git a/pelican/tests/content/include.md b/pelican/tests/content/include.md
@@ -0,0 +1,2 @@
+**this is Markdown**
+Here is a [link](https://docs.getpelican.com).
diff --git a/pelican/tests/content/include.unknown b/pelican/tests/content/include.unknown
@@ -0,0 +1,2 @@
+**this is Markdown**
+Here is a [link](https://docs.getpelican.com).
diff --git a/pelican/tests/content/include/include3.html b/pelican/tests/content/include/include3.html
@@ -0,0 +1,2 @@
+this file includes another in a different directory
+{include}../include1.html
diff --git a/pelican/tests/content/include/include4.html b/pelican/tests/content/include/include4.html
@@ -0,0 +1,2 @@
+this file includes another via absolute path
+{include}/include1.html
diff --git a/pelican/tests/content/include1.html b/pelican/tests/content/include1.html
@@ -0,0 +1 @@
+<span>this content has been included</span>
diff --git a/pelican/tests/content/include2.html b/pelican/tests/content/include2.html
@@ -0,0 +1,2 @@
+this file includes another
+{include}include1.html
diff --git a/pelican/tests/content/include5.html b/pelican/tests/content/include5.html
@@ -0,0 +1 @@
+{include}include6.html
diff --git a/pelican/tests/content/include6.html b/pelican/tests/content/include6.html
@@ -0,0 +1 @@
+{include}include5.html
diff --git a/pelican/tests/test_cache.py b/pelican/tests/test_cache.py
@@ -162,8 +162,11 @@ def test_article_object_caching(self):
         - 2012-11-30_md_w_filename_meta#foo-bar.md
         - empty.md
         - empty_with_bom.md
+
+        There are 5 more include* files which are HTML or Markdown snippets
+        and also not valid.
         """
-        self.assertEqual(generator.readers.read_file.call_count, 6)
+        self.assertEqual(generator.readers.read_file.call_count, 11)
 
     @unittest.skipUnless(MagicMock, 'Needs Mock module')
     def test_article_reader_content_caching(self):