In [1]:
import json
import os
import posixpath
from io import StringIO

import yaml
from markdown import Markdown

In [2]:
def unmark_element(element, stream=None):
    """Collect the text content of an element tree, dropping all markup.

    Walks ``element`` depth-first and appends, in document order, the
    element's own ``text``, the text of every descendant, and finally the
    element's ``tail``.

    Args:
        element: Node exposing ``text``, ``tail`` and iteration over its
            children (the ElementTree element interface).
        stream: Writable buffer with ``write``/``getvalue``; a fresh
            ``StringIO`` is created when omitted. Recursive calls pass the
            same buffer down so the whole tree lands in one place.

    Returns:
        The full contents of the buffer after this element was appended.
    """
    buffer = StringIO() if stream is None else stream

    leading_text = element.text
    if leading_text:
        buffer.write(leading_text)

    # Children append into the shared buffer; their return value is not needed.
    for child in element:
        unmark_element(child, buffer)

    trailing_text = element.tail
    if trailing_text:
        buffer.write(trailing_text)

    return buffer.getvalue()


# Patching Markdown: register `unmark_element` as an extra serializer under the
# output-format name "plain", so a converter created with output_format="plain"
# emits the document's text content (see unmark_element) instead of HTML.
Markdown.output_formats["plain"] = unmark_element
# Shared converter instance used by `unmark`. It is built after the line above
# so that the "plain" format name is already registered when it is looked up.
__md = Markdown(output_format="plain")
# NOTE(review): presumably required because the plain serializer produces no
# wrapper tags for Markdown to strip from its result - confirm against the
# installed Python-Markdown version.
__md.stripTopLevelTags = False


def unmark(text):
    """Convert Markdown source to plain text with the shared "plain" converter.

    Args:
        text: Markdown source of a single document.

    Returns:
        The text produced by the "plain" output format, with every literal
        ``&nbsp;`` entity replaced by a newline.
    """
    # The converter instance is reused for every document. Python-Markdown keeps
    # per-document state on the instance (reference-link definitions, stashed
    # raw HTML), so it has to be reset between documents; otherwise state from
    # earlier files carries over into later ones and keeps accumulating.
    __md.reset()
    return __md.convert(text).replace('&nbsp;', '\n')


In [3]:
import re

def delete_html_tags(text):
    """Replace HTML-like tags with spaces and collapse runs of spaces.

    Every shortest ``<...>`` span that sits on a single line is replaced by
    one space. The result is then split on the space character only, empty
    pieces are dropped, and the rest is re-joined with single spaces, so
    leading/trailing spaces disappear while newlines are kept as they are.

    Args:
        text: String that may contain HTML tags.

    Returns:
        The string without the matched tags and without repeated spaces.
    """
    without_tags = re.sub(r'<.*?>', ' ', text)
    pieces = filter(None, without_tags.split(' '))
    return ' '.join(pieces)

def clear_text(text):
    """Reduce a Markdown document to plain text without blank lines.

    The text is passed through ``unmark`` (Markdown to plain text), then
    through ``delete_html_tags`` (tag removal and space collapsing), and
    finally every empty line is dropped.

    Args:
        text: Markdown source of a document.

    Returns:
        The cleaned text, lines separated by single newlines.
    """
    stripped = delete_html_tags(unmark(text))
    return '\n'.join(line for line in stripped.split('\n') if line)

In [15]:
# Load the MkDocs configuration of the sfcc docs set and pretty-print it.
# The file is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding (assumes the config is UTF-8 encoded).
with open('../markdown_files/sfcc_docs/mkdocs.yml', 'r', encoding='utf-8') as f:
    try:
        c4c_struct = yaml.safe_load(f)
    except yaml.YAMLError as e:
        print(e)
        # Re-raise: swallowing the error would leave `c4c_struct` undefined on a
        # fresh kernel (NameError below) or silently stale on a re-run.
        raise
print(json.dumps(c4c_struct, indent=4))

{
    "site_name": "Revenue Grid Knowledge Base",
    "plugins": [
        "macros",
        {
            "search": {
                "separator": "[\\s\\-,:!=\\[\\]()\"/]+|(?!\\b)(?=[A-Z][a-z])|\\.(?!\\d)|&[lg]t;"
            }
        }
    ],
    "site_dir": "helpcenter/",
    "site_url": "https://docs.revenuegrid.com/",
    "docs_dir": "src",
    "nav": [
        {
            "<b>About</b>": [
                {
                    "Salesforce integration Knowledge Base": "index.md"
                },
                {
                    "Supported Salesforce editions": "ri/fast/articles/Supported-Salesforce-Editions.md"
                },
                {
                    "Overcoming Firewall issues": "articles/Overcoming-Firewall-Issues.md"
                },
                {
                    "Grant login access": "ri/fast/articles/grant-account-login-access.md"
                }
            ]
        },
        {
            "<b>Productivity & Activity Capture</b>": [


In [14]:
def traverse_nav(nav, path=None):
    """Flatten a nested MkDocs-style ``nav`` structure into page entries.

    Args:
        nav: A string (page target), a dict mapping titles to either a page
            target or a nested structure, or a list of such items. Any other
            type yields nothing.
        path: List of section titles leading to ``nav``; defaults to an
            empty list.

    Yields:
        ``(path, title, target)`` tuples. Bare string entries get the title
        ``'Index'``. Dict keys are passed through ``delete_html_tags`` before
        being used as a title or appended to the path of nested entries.
    """
    breadcrumbs = [] if path is None else path

    if isinstance(nav, str):
        # A bare target without a title of its own.
        yield breadcrumbs, 'Index', nav
    elif isinstance(nav, list):
        for entry in nav:
            yield from traverse_nav(entry, breadcrumbs)
    elif isinstance(nav, dict):
        for raw_title, target in nav.items():
            title = delete_html_tags(raw_title)
            if isinstance(target, str):
                yield breadcrumbs, title, target
            else:
                # Nested section: descend with the cleaned title appended.
                yield from traverse_nav(target, breadcrumbs + [title])

# traverse_nav is a generator function: a bare call only creates a lazy
# generator and never walks the nav. Materialise it and show the first few
# (path, title, target) entries as the cell output.
list(traverse_nav(c4c_struct['nav']))[:10]

In [12]:
# Quick check that the article path is referenced in the nav: str.find returns
# the character offset of the first match inside the nav's string repr, or -1
# when the path does not occur.
str(c4c_struct['nav']).find('ri/fast/articles/AddIn-vs-Sync-Functions.md')

448

In [None]:
# Read one sample article into `text` and show its raw Markdown. The file is
# opened as UTF-8 explicitly so the result does not depend on the platform's
# default encoding (assumes the docs are UTF-8 encoded).
with open('../markdown_files/sfcc_docs/ri/fast/articles/AddIn-vs-Sync-Functions.md', 'r', encoding='utf-8') as f:
    text = f.read()
print(text)

In [None]:
# Show the sample article after Markdown and HTML-tag stripping; `text` is the
# raw Markdown read in the previous cell.
print(clear_text(text))

In [None]:
# Exploratory listing: print every directory below ../markdown_files/ together
# with the sub-directory names and file names it contains.
for walk_entry in os.walk('../markdown_files/'):
    dirpath, dirnames, filenames = walk_entry
    print(dirpath, dirnames, filenames)

In [None]:
# Convert every Markdown article of every docs set under ../markdown_files/
# into a JSON record under ../basic_preprocessing/, mirroring the directory
# layout of the source tree.
for base_docs in os.listdir('../markdown_files/'):
    base_path = os.path.join(os.pardir, 'markdown_files', base_docs)
    mkdocs_path = os.path.join(base_path, 'mkdocs.yml')
    # os.listdir also returns stray entries (.DS_Store, checkpoint folders, ...);
    # only directories that carry an mkdocs.yml are treated as docs sets.
    if not os.path.isfile(mkdocs_path):
        print(f'skipping {base_path}: no mkdocs.yml')
        continue
    with open(mkdocs_path, 'r', encoding='utf-8') as f:
        mkdocs = yaml.safe_load(f)
    base_url = mkdocs['site_url']

    for dirpath, dirnames, filenames in os.walk(base_path):
        # Directory components relative to the docs-set root. Derived with
        # relpath instead of slicing a fixed number of path components.
        rel_dir = os.path.relpath(dirpath, base_path)
        rel_parts = [] if rel_dir == os.curdir else rel_dir.split(os.path.sep)

        for fname in filenames:
            if not fname.endswith('.md'):
                continue
            # Strip only the trailing extension; str.replace would also rewrite
            # any '.md' occurring in the middle of the file name.
            stem = fname[:-len('.md')]

            with open(os.path.join(dirpath, fname), 'r', encoding='utf-8') as f:
                text = f.read()

            clean_text = clear_text(text)
            # An empty article would otherwise divide by zero.
            reduction = (100 - int(len(clean_text) / len(text) * 100)) if text else 0
            print(f'{len(text) = }, {len(clean_text) = }, {reduction}% improvement')

            # URLs always use forward slashes, so join with posixpath rather
            # than os.path (which produces backslashes on Windows).
            web_url = posixpath.join(base_url, *rel_parts)
            # NOTE(review): startswith('index') also matches names such as
            # 'indexing.md'; confirm whether only 'index.md' should map to the
            # directory URL.
            if not fname.startswith('index'):
                web_url = posixpath.join(web_url, stem)
            print(web_url)

            # Mirror the source layout under ../basic_preprocessing/ without
            # relying on a '/'-separated substring replace.
            new_path = os.path.join(os.pardir, 'basic_preprocessing', base_docs, *rel_parts)
            new_full_filepath = os.path.join(new_path, stem + '.json')
            os.makedirs(new_path, exist_ok=True)

            with open(new_full_filepath, 'w', encoding='utf-8') as f:
                json.dump({
                    'site_name': mkdocs['site_name'],
                    'short_site_name': 'C4C' if 'c4c' in base_path else 'Sfcc',
                    # Path of the source article relative to its docs-set root.
                    'doc_dir': os.path.join(*rel_parts, fname),
                    'doc_url': web_url,
                    'title': None,
                    'full_content': text,
                    'text_content': clean_text,
                    }, f, indent=4)