In [1]:
import glob
import re
import os

### Get environment variables

In [2]:
# Locations come from the process environment; a variable that is not set
# yields None here.
LATEX_PATH = os.environ.get('LATEX_PATH')          # directory convertTex() cd's into
IMG_PATH = os.environ.get('IMG_PATH')              # root searched by cpImages()
MD_OUTPUT_PATH = os.environ.get('MD_OUTPUT_PATH')  # folder receiving the generated .md files
JB_OUTPUT_PATH = os.environ.get('JB_OUTPUT_PATH')

# LaTeX entry file, passed to pandoc relative to LATEX_PATH.
LATEX_ENTRY = 'intel4coro.tex'

# The single markdown file pandoc writes before it is split into chapters.
MAIN_MD_PATH = MD_OUTPUT_PATH + "/__main__.md"

### Convert `intel4coroTextbook/intel4coro.tex` to `jupyterbook/content/__main__.md`
1. Clean up previously converted markdown files.
2. Find and copy all the images from the LaTeX project to the markdown folder.
3. Convert .tex files to .md files using **pandoc 3.1.2**. (Note: there are two versions of `pandoc` installed: 2.19, which is installed by JupyterLab, and 3.1.2 (`/bin/pandoc`), which should be used here.)

In [3]:
def runCMD(cmd):
    """Echo a shell command, run it through the system shell and report failure.

    Parameters
    ----------
    cmd : str
        Command line handed verbatim to ``os.system``; shell variables such as
        ``$MD_OUTPUT_PATH`` are expanded by the shell, not by Python.

    Returns
    -------
    int
        The raw status returned by ``os.system`` (0 means success).
    """
    print("Shell Command: ", cmd)
    status = os.system(cmd)
    if status != 0:
        # Make failures visible: a silently failed step would let the
        # following steps run against missing or stale files.
        print("Shell Command failed with status {}: {}".format(status, cmd))
    return status

def convertTex():
    """Convert LATEX_ENTRY to markdown at MAIN_MD_PATH with /bin/pandoc.

    The command first changes into LATEX_PATH, so LATEX_ENTRY is resolved
    relative to that directory.
    """
    template = "cd {} && /bin/pandoc -f latex -t markdown {} -o {}"
    command = template.format(LATEX_PATH, LATEX_ENTRY, MAIN_MD_PATH)
    runCMD(command)
    
def clearMD():
    """Delete the markdown output directory and create it again, empty.

    $MD_OUTPUT_PATH is expanded by the shell from the environment.
    """
    wipe_and_recreate = 'rm -rf $MD_OUTPUT_PATH && mkdir $MD_OUTPUT_PATH'
    runCMD(wipe_and_recreate)

def cpImages():
    """Copy matching files found under $IMG_PATH into $MD_OUTPUT_PATH.

    Every match is copied directly into the output directory, so the source
    sub-directory structure is not preserved. The glob ``*.*g`` selects any
    file name that contains a dot and ends in "g" (png, jpg, jpeg, svg, ...).
    """
    # Raw string: the trailing "\;" must reach the shell verbatim to terminate
    # find's -exec clause, and "\;" is not a valid escape in a normal literal.
    runCMD(r'find $IMG_PATH -type f -name "*.*g" -exec cp {} $MD_OUTPUT_PATH \;')

In [4]:
# Order matters: clearMD() recreates the output directory that cpImages()
# and convertTex() then write into.
clearMD()
cpImages()
convertTex()

Shell Command:  rm -rf $MD_OUTPUT_PATH && mkdir $MD_OUTPUT_PATH
Shell Command:  find $IMG_PATH -type f -name "*.*g" -exec cp {} $MD_OUTPUT_PATH \;
Shell Command:  cd /data/intel4coroTextbook && /bin/pandoc -f latex -t markdown intel4coro.tex -o /data/jupyterbook/content/__main__.md




### Filter out unrecognized code in `__main__.md`
Found unconverted code:
1. `{#XXX:XXX}` at the end of titles
2. `{reference-type="XXX" reference="XXX"}` at the end of reference links

In [5]:
def remove_unrecognized_strings(input_file):
    """Strip leftover pandoc attribute blocks from a markdown file, in place.

    Two kinds of text are removed:
    1. ``{#label}`` identifiers (found at the end of titles);
    2. ``{reference-type="ref|autoref" reference="..."}`` blocks (found at the
       end of reference links), where the two attributes may be separated by
       spaces, tabs or line breaks.

    The number of occurrences of each pattern in the original text is printed
    before removal.

    Parameters
    ----------
    input_file : str
        Path of the markdown file to rewrite.
    """
    # pandoc writes UTF-8, so do not depend on the locale's default encoding.
    with open(input_file, 'r', encoding='utf-8') as file:
        markdown_text = file.read()

    # Define the pattern for matching the custom strings
    patterns = [
        r'\{#[^\}]+\}',
        # Inside [...] a "|" is a literal character, not alternation, so the
        # whitespace set is listed without separators.
        r'\{reference-type="(ref|autoref)"[\n\t\r ]+reference="[^\}]+\}'
    ]

    counts = [len(re.findall(pattern, markdown_text)) for pattern in patterns]

    print('Unrecognized strings: ', counts)
    # Use regular expression to remove the custom strings
    for pattern in patterns:
        markdown_text = re.sub(pattern, '', markdown_text)

    with open(input_file, 'w', encoding='utf-8') as file:
        file.write(markdown_text)

In [6]:
# Clean the pandoc output in place; the printed list holds the number of
# matches for each of the two patterns, in order.
remove_unrecognized_strings(MAIN_MD_PATH)

Unrecognized strings:  [167, 72]


### Split `__main__.md` into `{chapters}.md`
Split `__main__.md` into chapters (e.g., `1_Introduction.md`).

Chapters with three or more subchapters are further divided into one file per subchapter (e.g., `1_1_Running_example_making_popcorn.md`).

In [7]:
def escape_title(title):
    """Turn a markdown heading line into a string usable in a file name.

    '#' characters are dropped, surrounding whitespace is trimmed, spaces
    become underscores, and every remaining non-word character is removed.
    """
    without_hashes = title.replace('#', '')
    underscored = without_hashes.strip().replace(' ', '_')
    return re.sub(r'\W+', '', underscored)

def split_text_regex(text, regex):
    """Partition ``text`` into segments that each begin at a match of ``regex``.

    Parameters
    ----------
    text : str
        The text to partition.
    regex : str
        Pattern whose matches mark the start of each segment.

    Returns
    -------
    list of (str, int, int)
        One ``(matched_text, start, end)`` tuple per match, in order.
        ``text[start:end]`` is the whole segment: ``end`` is the start of the
        next match, or ``len(text)`` for the last segment. Text before the
        first match belongs to no segment. Returns ``[]`` when nothing matches.
    """
    found = list(re.finditer(regex, text))
    starts = [match.start() for match in found]
    # Half-open ends, so slicing with [start:end] keeps the segment's final
    # character instead of cutting it off.
    ends = starts[1:] + [len(text)]
    return [(match.group(), begin, end) for match, begin, end in zip(found, starts, ends)]

def split_to_chapters(input_file, output_directory):
    """Write one markdown file per top-level (``# ``) heading of ``input_file``.

    Each file is named ``<index>_<escaped title>.md``, where the index counts
    headings from 0, and contains the heading plus everything up to the next
    top-level heading, stripped of surrounding whitespace. Text before the
    first heading is not written anywhere.

    Parameters
    ----------
    input_file : str
        Path of the markdown file to split.
    output_directory : str
        Directory the chapter files are written to.

    Returns
    -------
    list of str
        The chapter file names (without directory), in document order.
    """
    # pandoc writes UTF-8, so do not depend on the locale's default encoding.
    with open(input_file, 'r', encoding='utf-8') as file:
        markdown_text = file.read()

    filenames = []
    headings = split_text_regex(markdown_text, r'(\n|^)# \S+(.*)')
    for idx, (heading, start, end) in enumerate(headings):
        filename = "{}_{}.md".format(idx, escape_title(heading))
        filepath = os.path.join(output_directory, filename)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(markdown_text[start:end].strip())
        filenames.append(filename)
    return filenames

def split_to_subchapter(input_file, output_directory):
    """Split a chapter file into one file per second-level (``## ``) heading.

    Chapters with fewer than three subchapters are left untouched. Otherwise
    each subchapter is written to ``<chapter>_<n>_<escaped title>.md`` (``n``
    counted from 1) and the chapter file itself is truncated to the text that
    precedes its first subchapter heading.

    Parameters
    ----------
    input_file : str
        File name of the chapter inside ``output_directory``; the part before
        the first underscore is used as the chapter number.
    output_directory : str
        Directory that holds the chapter file and receives the subchapter
        files.

    Returns
    -------
    list of str
        The subchapter file names in document order, or ``[]`` when the
        chapter was not split.
    """
    # Resolve the chapter through the directory argument rather than a global,
    # so the function reads from the same place split_to_chapters() wrote to.
    chapter_path = os.path.join(output_directory, input_file)
    with open(chapter_path, 'r', encoding='utf-8') as file:
        markdown_text = file.read()

    subchapters = split_text_regex(markdown_text, r'(\n|^)## \S+(.*)')
    if len(subchapters) < 3:
        return []

    chapter_num = input_file.split('_')[0]
    filenames = []
    for idx, (heading, start, end) in enumerate(subchapters, start=1):
        filename = "{}_{}_{}.md".format(chapter_num, idx, escape_title(heading))
        filepath = os.path.join(output_directory, filename)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(markdown_text[start:end].strip())
        filenames.append(filename)

    # Keep only the chapter introduction in the chapter file itself.
    with open(chapter_path, 'w', encoding='utf-8') as file:
        file.write(markdown_text[:subchapters[0][1]])
    return filenames


In [None]:
# Split the cleaned main file into one markdown file per top-level heading.
chapters_md = split_to_chapters(MAIN_MD_PATH, MD_OUTPUT_PATH)
# Map every chapter file except the first to its list of subchapter files
# (an empty list when the chapter was not split).
# NOTE(review): the first chapter is skipped via [1:] and so never reaches
# generate_toc() -- presumably the TOC template already covers it; confirm.
subchapters_md = {x: split_to_subchapter(x, MD_OUTPUT_PATH) for x in chapters_md[1:]}

### Generate `_toc.yml`

In [10]:
def generate_toc(template_path='/data/jupyterbook/_toc-template.yml',
                 output_path='/data/jupyterbook/_toc.yml',
                 subchapters=None):
    """Write the table-of-contents file from a template plus the chapter list.

    The stripped template text is extended with one ``- file: content/<name>``
    entry per chapter; a chapter that has subchapter files also gets a
    ``sections:`` list with one entry per subchapter file.

    Parameters
    ----------
    template_path : str
        YAML template the generated entries are appended to.
    output_path : str
        Destination of the generated table of contents.
    subchapters : dict, optional
        Maps each chapter file name to its list of subchapter file names.
        Defaults to the notebook-level ``subchapters_md``.

    Returns
    -------
    bool
        Always ``True``.
    """
    if subchapters is None:
        subchapters = subchapters_md

    with open(template_path, 'r', encoding='utf-8') as file:
        toc_text = file.read().strip()

    for chapter_file, section_files in subchapters.items():
        toc_text += "\n  - file: content/" + chapter_file
        if len(section_files) > 0:
            toc_text += "\n    sections:"
            toc_text += "".join(["\n    - file: content/" + x for x in section_files])

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(toc_text)
    return True

In [11]:
# Requires subchapters_md from the chapter-splitting cell to be defined.
generate_toc()

True

### Others

In [11]:
# Copy the textbook's BibTeX database into the Jupyter Book folder as references.bib.
# NOTE(review): both paths are hard-coded under /data rather than derived
# from the environment variables used elsewhere -- confirm this is intended.
!cp /data/intel4coroTextbook/intel4coro.bib /data/jupyterbook/references.bib

### Build Jupyter book
Open: http://127.0.0.1:8888/files/html/intro.html

In [12]:
print('~~~~~~~~~~ Building Jupyter Book ~~~~~~~~~~')
# Remove the previous build output before rebuilding.
!rm -rf $JB_OUTPUT_PATH
# Build the book from the jupyterbook folder under $WORKSPACE.
!jupyter-book build ${WORKSPACE}/jupyterbook/
# Copy everything in the markdown output folder into the built site's content folder.
# NOTE(review): presumably needed so the copied images resolve from the HTML pages -- confirm.
!cp $MD_OUTPUT_PATH/* $JB_OUTPUT_PATH/html/content
# Replace /data/html with the fresh build (the location the "Open:" link above serves).
!rm -rf /data/html && mv $JB_OUTPUT_PATH/html /data/

~~~~~~~~~~ Building Jupyter Book ~~~~~~~~~~
[32m[1mRunning Jupyter-Book v0.15.1[0m
[34m[1mSource Folder: [0m/data/jupyterbook
[34m[1mConfig Path: [0m/data/jupyterbook/_config.yml
[34m[1mOutput Path: [0m/data/jupyterbook/_build/html
[sphinxcontrib-bibtex] Beware that docutils versions 0.18 and 0.19 (you are running 0.18.1) are known to generate invalid html for citations. If this issue affects you, please use docutils<0.18 (or >=0.20 once released) instead. For more details, see https://sourceforge.net/p/docutils/patches/195/
[01mRunning Sphinx v5.0.2[39;49;00m
[01mmaking output directory... [39;49;00mdone
[etoc] Changing master_doc to 'intro'
checking bibtex cache... out of date
parsing bibtex file /data/jupyterbook/references.bib... parsed 4340 entries
[01mmyst v0.18.1:[39;49;00m MdParserConfig(commonmark_only=False, gfm_only=False, enable_extensions=['colon_fence', 'dollarmath', 'linkify', 'substitution', 'tasklist'], disable_syntax=[], all_links_external=False, url