feat: add option to convert Jupyter Notebooks to blog posts

GetRD · Nov 4, 2023 · 637ab31 · 637ab31
1 parent ff3981e
commit 637ab31
Show file tree

Hide file tree

Showing 10 changed files with 2,609 additions and 200 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,10 @@ dist/
 site/
 .idea/
 .tox/
-.cache/
+.cache/
+
+# Test data - temp files
+.ipynb_checkpoints
+
+# Trial runs
+output/
diff --git a/README.md b/README.md
@@ -9,7 +9,7 @@
 [![GitHub followers](https://img.shields.io/github/followers/gcushen?label=Follow%20on%20GH&style=for-the-badge)](https://github.com/gcushen)  
 
 
-### 📚 Easily import publications from your reference manager to your Markdown-formatted website or book
+### 📚 Easily import publications and Jupyter notebooks to your Markdown-formatted website or book
 
 ![](.github/media/demo.gif)
 
@@ -86,6 +86,18 @@ After importing publications, we suggest you:
 
 [Learn more in the Wowchemy Docs](https://university.wowchemy.com).
 
+### Import blog posts from Jupyter Notebooks
+
+Suppose your notebooks are in a `notebooks` folder within the website folder. To import them into the `content/post/` folder, run:
+
+    academic import 'notebooks/*.ipynb' content/post/ --verbose
+
+Optional arguments:
+
+* `--overwrite` Overwrite any existing blog posts in the output folder
+* `--verbose` or `-v` Show verbose messages
+* `--help` Help
+
 ## Contribute
 
 Interested in contributing to **open source** and **open science**?
@@ -99,7 +111,8 @@ For local development, clone this repository and use Poetry to install and run t
     git clone https://github.com/wowchemy/bibtex-to-markdown.git
     cd bibtex-to-markdown
     poetry install
-    poetry run academic import tests/data/article.bib output/ --overwrite --compact
+    poetry run academic import tests/data/article.bib output/publication/ --overwrite --compact
+    poetry run academic import 'tests/data/**/*.ipynb' output/post/ --overwrite --verbose
 
 When preparing a contribution, run the following checks and ensure that they all pass:
 

diff --git a/academic/cli.py b/academic/cli.py
@@ -7,6 +7,7 @@
 from argparse import RawTextHelpFormatter
 
 from academic.import_bibtex import import_bibtex
+from academic.import_notebook import import_notebook
 
 # Initialise logger.
 logging.basicConfig(
@@ -35,7 +36,7 @@ def parse_args(args):
 
     # Sub-parser for import command.
     parser_a = subparsers.add_parser("import", help="Import content into your website or book")
-    parser_a.add_argument("input", type=str, help="File path to your BibTeX file")
+    parser_a.add_argument("input", type=str, help="File path to your BibTeX or Jupyter Notebook file")
     parser_a.add_argument("output", type=str, help="Path to import publications to (e.g. `content/publication/`)")
     parser_a.add_argument("--featured", action="store_true", help="Flag publications as featured")
     parser_a.add_argument("--overwrite", action="store_true", help="Overwrite existing publications")
@@ -65,18 +66,26 @@ def parse_args(args):
         if known_args.command:
             if known_args.verbose:
                 # Set logging level to debug if verbose mode activated.
-                logging.getLogger().setLevel(logging.DEBUG)
-
-            # Run command to import bibtex.
-            import_bibtex(
-                known_args.input,
-                pub_dir=known_args.output,
-                featured=known_args.featured,
-                overwrite=known_args.overwrite,
-                normalize=known_args.normalize,
-                compact=known_args.compact,
-                dry_run=known_args.dry_run,
-            )
+                logging.getLogger().setLevel(logging.INFO)
+            if known_args.input.lower().endswith(".bib"):
+                # Run command to import bibtex.
+                import_bibtex(
+                    known_args.input,
+                    pub_dir=known_args.output,
+                    featured=known_args.featured,
+                    overwrite=known_args.overwrite,
+                    normalize=known_args.normalize,
+                    compact=known_args.compact,
+                    dry_run=known_args.dry_run,
+                )
+            elif known_args.input.lower().endswith(".ipynb"):
+                # Run command to import Jupyter notebooks.
+                import_notebook(
+                    known_args.input,
+                    output_dir=known_args.output,
+                    overwrite=known_args.overwrite,
+                    dry_run=known_args.dry_run,
+                )
 
 
 if __name__ == "__main__":

diff --git a/academic/import_bibtex.py b/academic/import_bibtex.py
@@ -92,12 +92,10 @@ def parse_bibtex_entry(
 
     # Prepare YAML front matter for Markdown file.
     if not dry_run:
-        from importlib import resources as impresources
+        from importlib import resources as import_resources
 
         # Load the Markdown template from within the `templates` folder of the `academic` package
-        inp_file = impresources.files(__package__ + ".templates") / "publication.md"
-        with inp_file.open("rt") as f:
-            template = f.read()
+        template = import_resources.read_text(__package__ + ".templates", "publication.md")
 
         with open(markdown_path, "w") as f:
             f.write(template)

diff --git a/academic/import_notebook.py b/academic/import_notebook.py
@@ -0,0 +1,111 @@
+import glob
+import json
+import os
+import re
+from datetime import datetime
+from pathlib import Path
+
+import nbconvert as nbc
+import nbformat as nbf
+import yaml
+from traitlets.config import Config
+
+from academic.jupyter_whitespace_remover import JupyterWhitespaceRemover
+
+
+def _get_slug(text: str):
+    return text.lower().replace(" ", "-")
+
+
+def import_notebook(
+    input_path,
+    output_dir=os.path.join("content", "post"),
+    overwrite=False,
+    dry_run=False,
+):
+    """Import blog posts from Jupyter Notebook files"""
+    from academic.cli import log
+
+    for filename in glob.glob(input_path, recursive=True):
+        if not filename.endswith(".ipynb") or not os.path.basename(filename) != ".ipynb_checkpoints":
+            return
+
+        log.info(f"Found notebook {filename}")
+
+        # Read Notebook
+        nb = nbf.read(open(filename, "r"), as_version=4)
+
+        # Export Markdown
+        nbc_config = Config()
+        nbc_config.MarkdownExporter.preprocessors = [JupyterWhitespaceRemover]
+        exporter = nbc.MarkdownExporter(config=nbc_config)
+        if not dry_run:
+            _export(nb, exporter, output_dir, filename, ".md", overwrite)
+
+
+def _export(nb, exporter, output_dir, filename, extension, overwrite):
+    from academic.cli import log
+
+    # Determine output path for page bundle
+    filename_base = Path(filename).stem
+    slug = _get_slug(filename_base)
+    page_bundle_path = Path(output_dir) / slug
+
+    # Do not overwrite blog post if it already exists
+    if not overwrite and os.path.isdir(page_bundle_path):
+        log.info(f"Skipping creation of {page_bundle_path} as it already exists. " f"To overwrite, add the `--overwrite` argument.")
+        return
+
+    # Create page bundle folder
+    if not os.path.exists(page_bundle_path):
+        os.makedirs(page_bundle_path)
+
+    # Check for front matter variables in notebook metadata
+    if "front_matter" in nb["metadata"]:
+        front_matter_from_file = dict(nb["metadata"]["front_matter"])
+        log.info(f"Found front matter metadata in notebook: {json.dumps(front_matter_from_file)}")
+    else:
+        front_matter_from_file = {}
+
+    # Convert notebook to markdown
+    (body, resources) = exporter.from_notebook_node(nb)
+
+    # Export notebook resources
+    for name, data in resources.get("outputs", {}).items():
+        output_filename = Path(page_bundle_path) / name
+        with open(output_filename, "wb") as image_file:
+            image_file.write(data)
+
+    # Try to find title as top-level heading (h1), falling back to filename
+    search = re.search("^#{1}(.*)", body)
+    if search:
+        title = search.group(1).strip()
+        body = re.sub("^#{1}(.*)", "", body)
+    else:
+        title = filename_base.replace("-", " ").title()
+
+    # Initialise front matter variables
+    date = datetime.now().strftime("%Y-%m-%d")
+    front_matter = {"title": title, "date": date}
+    front_matter.update(front_matter_from_file)
+    log.info(f"Generating page with title: {front_matter['title']}")
+
+    # Unlike the Bibtex converter, we can't easily use Ruamel YAML library here as we need to output to string
+    front_matter_yaml = yaml.safe_dump(front_matter, sort_keys=False, allow_unicode=True)
+    # Strip final newline as our `output` will auto-add newlines below
+    front_matter_yaml = front_matter_yaml.rstrip()
+    # Wrap front matter variables with triple hyphens to represent Markdown front matter
+    output = "\n".join(("---", front_matter_yaml, "---", clean_markdown(body)))
+
+    # Write output file
+    output_filename = os.path.join(page_bundle_path, "index" + extension)
+    with open(output_filename, "w") as text_file:
+        text_file.write(output)
+
+
+def clean_markdown(body: str) -> str:
+    """
+    `nbconvert` creates too much whitespace and newlines.
+    Try to tidy up the output by removing multiple new lines.
+    """
+    return re.sub(r"\n+(?=\n)", "\n", body)
diff --git a/academic/jupyter_whitespace_remover.py b/academic/jupyter_whitespace_remover.py
@@ -0,0 +1,29 @@
+from nbconvert.preprocessors import Preprocessor
+
+
+class JupyterWhitespaceRemover(Preprocessor):
+    """
+    Try to clean up a Jupyter notebook by:
+     - removing blank code cells
+     - removing unnecessary whitespace
+    """
+
+    def preprocess(self, nb, resources):
+        """
+        Remove blank `code` cells
+        """
+        for index, cell in enumerate(nb.cells):
+            if cell.cell_type == "code" and not cell.source:
+                nb.cells.pop(index)
+            else:
+                nb.cells[index], resources = self.preprocess_cell(cell, resources, index)
+        return nb, resources
+
+    def preprocess_cell(self, cell, resources, cell_index):
+        """
+        Remove extraneous whitespace from code cells' source code
+        """
+        if cell.cell_type == "code":
+            cell.source = cell.source.strip()
+
+        return cell, resources