# tensorflow/g3doc/md2ipynb.py

import re
import os
import sys
import shutil
import subprocess

from HTMLParser import HTMLParser
		
# Matches a Markdown inline link whose target ends in '.md', optionally
# followed by a '#fragment', e.g. ``[text](path/page.md#section)``.
# Everything before the '.md' is captured in the named group ``prefix`` and
# everything after it (optional fragment plus the closing paren) in the named
# group ``suffix``, so a substitution can swap only the extension.
# The pattern is written in re.VERBOSE mode: whitespace and the '#...'
# comments inside the literal are ignored by the regex engine.
# NOTE(review): the pattern has no '^'/'$' anchors, so re.MULTILINE appears
# to have no effect here.
MD_LINKS=re.compile(r'''
    (?P<prefix>     
        \[          #find a pair of square brackets 
            [^\]]*?\]    #that do not contain a closing square bracket
        \(          #followed by parens 
            [^\)]*?      #that do not contain a closing paren
    )\.md                 #with a '.md' extension
    (?P<suffix>
        (\#[^\)]*?)? #and optional frgment
        \)           #and closing paren   
    )''',
    re.VERBOSE | re.MULTILINE)

# DANGER: HTML cannot be parsed reliably with a regular expression; there are
# a huge number of ways this pattern can go wrong.
# [issue on notedown]()  -- TODO: the link target is empty; add the issue URL.
#
# Matches a single HTML tag that contains 'href=' followed by a value with
# '.md' in it. The text of the tag up to the '.md' is captured in the named
# group ``prefix`` and the remainder of the tag (through the closing '>') in
# the named group ``suffix``, so a substitution can swap only the extension.
# The pattern is written in re.VERBOSE mode: whitespace and the '#...'
# comments inside the literal are ignored by the regex engine.
HTML_LINKS=re.compile(r'''   
    (?P<prefix>     
        <[^<>]*?href=[^\s<>]*?  #containing a literal 'href=' and link text
    )\.md                     #followed my a .md extension
    (?P<suffix>
        [^<>]*?>                  #and the remainder of the tag
    )                      
    ''',
    re.VERBOSE | re.MULTILINE)

def convert(filepath):
    """Convert one Markdown file to a notebook written next to it.

    Runs the external ``notedown`` command on *filepath*, rewrites every
    '.md' link found by MD_LINKS and HTML_LINKS so that it points at the
    corresponding '.ipynb' file, and writes the result to the same path
    with the extension replaced by '.ipynb'.

    Args:
        filepath: Path of the Markdown file to convert.

    Raises:
        OSError: If the ``notedown`` executable cannot be started.
        subprocess.CalledProcessError: If ``notedown`` exits with a
            non-zero status.
    """
    print(filepath)

    output = subprocess.check_output(['notedown', filepath])

    # On Python 3 check_output returns bytes, which the str-compiled link
    # patterns cannot be applied to. Decode in that case only, so Python 2
    # (where the output is already ``str``) behaves exactly as before.
    # NOTE(review): assumes notedown emits UTF-8 (the usual notebook JSON
    # encoding) -- confirm.
    needs_decoding = not isinstance(output, str)
    ipynb = output.decode('utf-8') if needs_decoding else output

    # Raw strings: '\g' is not a valid string escape, so the group
    # references must not be written in a plain string literal.
    replacement = r'\g<prefix>.ipynb\g<suffix>'
    ipynb = MD_LINKS.sub(replacement, ipynb)
    ipynb = HTML_LINKS.sub(replacement, ipynb)

    outfilepath = os.path.splitext(filepath)[0] + ".ipynb"
    if needs_decoding:
        # Write back the same encoding that was decoded above instead of
        # depending on the platform's default text encoding.
        with open(outfilepath, 'wb') as out:
            out.write(ipynb.encode('utf-8'))
    else:
        with open(outfilepath, 'w') as out:
            out.write(ipynb)

    
def main(root = os.path.split(__file__)[0]):
    """Convert every '.md' file found under *root* by calling convert().

    Args:
        root: Directory to search recursively. A leading '~' is expanded.
            Defaults to the directory component of this script's
            ``__file__``. An empty string is treated as the current
            directory.
    """
    # The default evaluates to '' when ``__file__`` has no directory part
    # (e.g. the script is launched by bare file name from its own folder).
    # os.walk('') silently yields nothing, so fall back to the current
    # directory instead of converting no files at all.
    root = os.path.expanduser(root) or os.curdir

    for (dirpath, _, filenames) in os.walk(root):
        for filename in filenames:
            if filename.endswith('.md'):
                convert(os.path.join(dirpath, filename))
                
# Script entry point: forward the command-line arguments (if any) to main().
# With no arguments main() uses its default root; main() accepts a single
# parameter, so passing more than one argument raises TypeError.
if __name__ == "__main__":
    main(*sys.argv[1:])