From 79101186ddfb7d9b6491e4e790fbec857035f97e Mon Sep 17 00:00:00 2001
From: "Mr. Senko" <atodorov@mrsenko.com>
Date: Thu, 26 May 2016 17:36:48 +0300
Subject: [PATCH] New feature: support for {include} syntax. Fixes #1902.

The new {include} syntax makes it possible to include
frequently used text snippets into your content.
---
 docs/changelog.rst                          |   1 +
 docs/content.rst                            |  39 ++++++
 pelican/contents.py                         |  93 +++++++++++--
 pelican/readers.py                          |   7 +-
 pelican/tests/content/include.markdown      |   2 +
 pelican/tests/content/include.unknown       |   2 +
 pelican/tests/content/include/include3.html |   2 +
 pelican/tests/content/include/include4.html |   2 +
 pelican/tests/content/include1.html         |   1 +
 pelican/tests/content/include2.html         |   2 +
 pelican/tests/test_cache.py                 |   5 +-
 pelican/tests/test_contents.py              | 138 ++++++++++++++++++++
 12 files changed, 279 insertions(+), 15 deletions(-)
 create mode 100644 pelican/tests/content/include.markdown
 create mode 100644 pelican/tests/content/include.unknown
 create mode 100644 pelican/tests/content/include/include3.html
 create mode 100644 pelican/tests/content/include/include4.html
 create mode 100644 pelican/tests/content/include1.html
 create mode 100644 pelican/tests/content/include2.html
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 6a4d65a4ec..8fd46f28a0 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -15,6 +15,7 @@ Next release
 * Author slugs can be controlled with greater precision using the
   ``AUTHOR_SUBSTITUTIONS`` setting. Keeping non-alphanum characters is supported
   as well but discouraged.
+* Add support for the ``{include}`` syntax
 
 3.6.3 (2015-08-14)
 ==================
diff --git a/docs/content.rst b/docs/content.rst
index 0fa8992108..ec68b30ade 100644
--- a/docs/content.rst
+++ b/docs/content.rst
@@ -335,6 +335,45 @@ Linking to authors, categories, index and tags
 You can link to authors, categories, index and tags using the ``{author}name``,
 ``{category}foobar``, ``{index}`` and ``{tag}tagname`` syntax.
 
+Including common text into your content
+---------------------------------------
+
+Starting with Pelican 3.6.4, you can include common text snippets in your
+content using the ``{include}file.ext`` syntax. Paths beginning with ``/`` are
+resolved from the ``PATH`` directory, e.g. ``{include}/pages/disclaimer.html``;
+any other path, e.g. ``{include}notice.html``, is resolved relative to the
+directory of the file containing the ``{include}``.
+For example when you have the following content layout::
+
+    content
+    ├── notice2.html
+    └── pages
+        ├── page1.html
+        └── notice1.html
+
+Then the includes may look like::
+
+    <html>
+        <head>
+            <title>PAGE 1</title>
+        </head>
+        <body>
+            This is the content of page 1
+
+            {include}../notice2.html
+        </body>
+    </html>
+
+
+``notice2.html`` looks like::
+
+    {include}pages/notice1.html
+    This is the second warning about relative paths
+
+When using ``{include}``, it is best to exclude the included files with the
+``IGNORE_FILES`` setting. Otherwise Pelican will try to process them as
+regular content, which will most likely fail.
+
 Deprecated internal link syntax
 -------------------------------
 
diff --git a/pelican/contents.py b/pelican/contents.py
index 9b6aa971d0..8d04885483 100644
--- a/pelican/contents.py
+++ b/pelican/contents.py
@@ -150,8 +150,20 @@ def __init__(self, content, metadata=None, settings=None,
         if 'summary' in metadata:
             self._summary = metadata['summary']
 
+        # used for rendering {includes}
+        self._readers = None
+
         signals.content_object_init.send(self)
 
+    @property
+    def readers(self):
+        """Lazily created ``Readers``; the import is local to avoid a cycle."""
+        cached = self._readers
+        if cached is None:
+            from pelican.readers import Readers
+            cached = self._readers = Readers(self.settings)
+        return cached
+
     def __str__(self):
         return self.source_path or repr(self)
 
@@ -187,6 +199,30 @@ def get_url_setting(self, key):
         key = key if self.in_default_lang else 'lang_%s' % key
         return self._expand_settings(key)
 
+    def _path_replacer(self, path, relative_dir=None):
+        """Normalise ``path`` into a key of ``self._context['filenames']``.
+
+        A leading ``/`` is stripped; any other path is joined to
+        ``relative_dir`` (``self.relative_dir`` when that is empty) and
+        passed through ``get_relative_source_path``.  If the result is not
+        a known filename but its ``%20``-unquoted form is, that form wins.
+        """
+        base_dir = relative_dir or self.relative_dir
+
+        if path.startswith('/'):
+            resolved = path[1:]
+        else:
+            joined = os.path.join(base_dir, path)
+            resolved = self.get_relative_source_path(joined)
+
+        known = self._context['filenames']
+        if resolved in known:
+            return resolved
+
+        # fall back to the spelling with ``%20`` decoded to spaces
+        unquoted = resolved.replace('%20', ' ')
+        return unquoted if unquoted in known else resolved
+
     def _update_content(self, content, siteurl):
         """Update the content attribute.
 
@@ -218,19 +254,7 @@ def replacer(m):
 
             # XXX Put this in a different location.
             if what in {'filename', 'attach'}:
-                if path.startswith('/'):
-                    path = path[1:]
-                else:
-                    # relative to the source path of this content
-                    path = self.get_relative_source_path(
-                        os.path.join(self.relative_dir, path)
-                    )
-
-                if path not in self._context['filenames']:
-                    unquoted_path = path.replace('%20', ' ')
-
-                    if unquoted_path in self._context['filenames']:
-                        path = unquoted_path
+                path = self._path_replacer(path)
 
                 linked_content = self._context['filenames'].get(path)
                 if linked_content:
@@ -277,12 +301,55 @@ def replacer(m):
     def get_siteurl(self):
         return self._context.get('localsiteurl', '')
 
+    def _update_includes(self, content, source_path=None, _seen=()):
+        """Expand ``{include}some.file`` markers with the file's content.
+
+        :param content: text that may contain ``{include}path`` markers
+        :param source_path: directory that relative include paths are
+            resolved against; ``self.relative_dir`` is used when empty
+        :param _seen: absolute paths of the files currently being expanded,
+            used to stop circular includes from recursing forever
+        :return: ``content`` with every resolvable include expanded; markers
+            that cannot be resolved are kept and a warning is logged
+        """
+        hrefs = re.compile(r"[{|]include[|}](?P<path>[\w./]+)")
+
+        def replacer(m):
+            marker = '{include}' + m.group('path')
+            path = self._path_replacer(m.group('path'), source_path)
+            path = posixize_path(
+                os.path.abspath(os.path.join(self.settings['PATH'], path)))
+
+            if not os.path.isfile(path):
+                logger.warning("Unable to find `%s`, skipping include.", path)
+                return marker
+            if path in _seen:
+                logger.warning("Circular include of `%s`, skipping.", path)
+                return marker
+
+            # the reader is chosen by extension, without the leading dot
+            ext = os.path.splitext(path)[1][1:]
+            if ext not in self.readers.reader_classes:
+                logger.warning("Unable to read `%s`, skipping include.", path)
+                return marker
+
+            reader = self.readers.reader_classes[ext](self.settings)
+            text, _ = reader.read(path)
+
+            # nested includes are relative to the included file's directory,
+            # so recurse with that directory instead of self.relative_dir
+            source_dir = posixize_path(os.path.dirname(path))
+            return self._update_includes(text, source_dir, _seen + (path,))
+
+        return hrefs.sub(replacer, content)
+
     @memoized
     def get_content(self, siteurl):
         if hasattr(self, '_get_content'):
             content = self._get_content()
         else:
             content = self._content
+        content = self._update_includes(content)
         return self._update_content(content, siteurl)
 
     @property
diff --git a/pelican/readers.py b/pelican/readers.py
index 585a6e7969..b0ec66d728 100644
--- a/pelican/readers.py
+++ b/pelican/readers.py
@@ -424,7 +424,12 @@ def read(self, filename):
         metadata = {}
         for k in parser.metadata:
             metadata[k] = self.process_metadata(k, parser.metadata[k])
-        return parser.body, metadata
+
+        if parser.body:
+            return parser.body, metadata
+        else:
+            # in case we're parsing HTML includes
+            return content, metadata
 
 
 class Readers(FileStampDataCacher):
diff --git a/pelican/tests/content/include.markdown b/pelican/tests/content/include.markdown
new file mode 100644
index 0000000000..9055424a1b
--- /dev/null
+++ b/pelican/tests/content/include.markdown
@@ -0,0 +1,2 @@
+**this is Markdown**
+Here is a [link](http://MrSenko.com).
diff --git a/pelican/tests/content/include.unknown b/pelican/tests/content/include.unknown
new file mode 100644
index 0000000000..9055424a1b
--- /dev/null
+++ b/pelican/tests/content/include.unknown
@@ -0,0 +1,2 @@
+**this is Markdown**
+Here is a [link](http://MrSenko.com).
diff --git a/pelican/tests/content/include/include3.html b/pelican/tests/content/include/include3.html
new file mode 100644
index 0000000000..6933bccea8
--- /dev/null
+++ b/pelican/tests/content/include/include3.html
@@ -0,0 +1,2 @@
+this file includes another in a different directory
+{include}../include1.html
\ No newline at end of file
diff --git a/pelican/tests/content/include/include4.html b/pelican/tests/content/include/include4.html
new file mode 100644
index 0000000000..aa2181bdc6
--- /dev/null
+++ b/pelican/tests/content/include/include4.html
@@ -0,0 +1,2 @@
+this file includes another via absolute path
+{include}/include1.html
\ No newline at end of file
diff --git a/pelican/tests/content/include1.html b/pelican/tests/content/include1.html
new file mode 100644
index 0000000000..b307a825fc
--- /dev/null
+++ b/pelican/tests/content/include1.html
@@ -0,0 +1 @@
+<span>this content has been included</span>
\ No newline at end of file
diff --git a/pelican/tests/content/include2.html b/pelican/tests/content/include2.html
new file mode 100644
index 0000000000..b8c46657fb
--- /dev/null
+++ b/pelican/tests/content/include2.html
@@ -0,0 +1,2 @@
+this file includes another
+{include}include1.html
\ No newline at end of file
diff --git a/pelican/tests/test_cache.py b/pelican/tests/test_cache.py
index 3da3f7897f..8fb085435e 100644
--- a/pelican/tests/test_cache.py
+++ b/pelican/tests/test_cache.py
@@ -60,8 +60,11 @@ def test_article_object_caching(self):
         - article_with_comments.html
         - article_with_null_attributes.html
         - 2012-11-30_md_w_filename_meta#foo-bar.md
+
+        There are 5 more include* files which are HTML or Markdown snippets
+        and also not valid.
         """
-        self.assertEqual(generator.readers.read_file.call_count, 4)
+        self.assertEqual(generator.readers.read_file.call_count, 9)
 
     @unittest.skipUnless(MagicMock, 'Needs Mock module')
     def test_article_reader_content_caching(self):
diff --git a/pelican/tests/test_contents.py b/pelican/tests/test_contents.py
index 2f774a6e0e..25e1d2283b 100644
--- a/pelican/tests/test_contents.py
+++ b/pelican/tests/test_contents.py
@@ -22,6 +22,8 @@
 TEST_CONTENT = str(generate_lorem_ipsum(n=1))
 TEST_SUMMARY = generate_lorem_ipsum(n=1, html=False)
 
+CONTENT_PATH = os.path.join(os.path.dirname(__file__), 'content')
+
 
 class TestPage(LoggedTestCase):
 
@@ -418,6 +420,142 @@ def test_intrasite_link_markdown_spaces(self):
             '<a href="http://notmyidea.org/article-spaces.html">link</a>'
         )
 
+    def test_includes(self):
+        """{include} markers expand relative, absolute and nested paths."""
+        args = self.page_kwargs.copy()
+        args['source_path'] = CONTENT_PATH
+        args['context']['filenames'] = {}
+        settings = get_settings()
+        settings['PATH'] = CONTENT_PATH
+        args['settings'] = settings
+
+        # test inclusion between files of different types
+        # HTML includes Markdown
+        args['content'] = (
+            'HTML includes Markdown '
+            '{include}include.markdown'
+            ' Included content is above'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEqual(
+            content,
+            'HTML includes Markdown '
+            '<p><strong>this is Markdown</strong>\n'
+            'Here is a <a href="http://MrSenko.com">link</a>.</p>'
+            ' Included content is above'
+        )
+
+        # test inclusion between files of different types
+        # where we don't know how to render the included type
+        args['content'] = (
+            'HTML includes Unknown '
+            '{include}include.unknown'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        # we have a warning in this case
+        self.assertLogCountEqual(
+            count=1,
+            msg=r"Unable to read `.*`, skipping include\.",
+            level=logging.WARNING)
+        self.assertEqual(
+            content,
+            'HTML includes Unknown '
+            '{include}include.unknown'
+        )
+
+        # one include via relative path
+        args['content'] = (
+            'There is a simple include here '
+            '{include}include1.html'
+            ' Included content is above'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEqual(
+            content,
+            'There is a simple include here '
+            '<span>this content has been included</span>'
+            ' Included content is above'
+        )
+
+        # two nested includes via relative paths
+        args['content'] = (
+            'There is a simple include here '
+            '{include}include2.html'
+            ' Included content is above'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEqual(
+            content,
+            'There is a simple include here '
+            'this file includes another\n'
+            '<span>this content has been included</span>'
+            ' Included content is above'
+        )
+
+        # include via full path
+        args['content'] = (
+            'There is a simple include here '
+            '{include}/include1.html'
+            ' Included content is above'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEqual(
+            content,
+            'There is a simple include here '
+            '<span>this content has been included</span>'
+            ' Included content is above'
+        )
+
+        # 2nd include is in different directory
+        # include paths are relative to the caller directory
+        args['content'] = (
+            'There is a simple include here '
+            '{include}include/include3.html'
+            ' Included content is above'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEqual(
+            content,
+            'There is a simple include here '
+            'this file includes another in a different directory\n'
+            '<span>this content has been included</span>'
+            ' Included content is above'
+        )
+
+        # 2nd include using absolute path in the included file
+        args['content'] = (
+            'There is a simple include here '
+            '{include}include/include4.html'
+            ' Included content is above'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        self.assertEqual(
+            content,
+            'There is a simple include here '
+            'this file includes another via absolute path\n'
+            '<span>this content has been included</span>'
+            ' Included content is above'
+        )
+
+        # include non-existing file => inclusion is skipped
+        args['content'] = (
+            'There is a simple include here '
+            '{include}missing.html'
+            ' Included content is above'
+        )
+        content = Page(**args).get_content('http://notmyidea.org')
+        # we have a warning in this case
+        self.assertLogCountEqual(
+            count=1,
+            msg=r"Unable to find `.*`, skipping include\.",
+            level=logging.WARNING)
+        self.assertEqual(
+            content,
+            'There is a simple include here '
+            '{include}missing.html'
+            ' Included content is above'
+        )
+
     def test_multiple_authors(self):
         """Test article with multiple authors."""
         args = self.page_kwargs.copy()