In [9]:
from bs4 import BeautifulSoup
import requests
import itertools as it
from tidylib import tidy_document
import tidylib
import re

# Override the tidylib defaults by rebinding tidylib.BASE_OPTIONS to a new dict.
# The "wrap" option is left commented out below because, per the original
# author's note, enabling it results in a bug.
tidylib.BASE_OPTIONS = {
    "indent": 1,           # Pretty; not too much of a performance hit
    "tidy-mark": 0,        # No tidy meta tag in output
#     "wrap": 0,             # No wrapping
    "alt-text": "",        # Help ensure validation
    "doctype": 'strict',   # Little sense in transitional for tool-generated markup...
    "force-output": 1,     # May not get what you expect but you will get something
    # Some unclosed <p> tags exist. We don't want to drop them, just close them.
    # NOTE(review): this key is spelled with underscores while the other keys
    # use hyphens -- confirm that tidylib treats both spellings the same.
    'drop_empty_paras': False 
    }

In [10]:
def parse_link(url, timeout=30):
    """Download `url`, clean the markup with tidylib and parse it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to `requests.get` as its `timeout`. Without a
            timeout a stalled server would block the crawl indefinitely.

    Returns:
        A BeautifulSoup object built with the 'html.parser' backend from the
        tidied document.

    Raises:
        Whatever `requests.get` raises, including its timeout error when the
        server does not answer within `timeout` seconds.
    """
    html_doc = requests.get(url, timeout=timeout).text
    # tidy_document returns a sequence; its first item is the cleaned document.
    tidy_doc = tidy_document(html_doc, options={'drop_empty_paras': False, 'force_output': True})[0]
    return BeautifulSoup(tidy_doc, 'html.parser')

def consolidate_content(content):
    """Gather the items of `content` into one freshly parsed container.

    A new BeautifulSoup object is parsed from "<div></div>" and every item of
    `content` is appended, in order, to that top-level object (the append is
    done on the parsed object itself, not on the `<div>` element).

    Returns:
        The BeautifulSoup object holding the appended items.
    """
    wrapper = BeautifulSoup("<div></div>", 'html.parser')
    for node in content:
        wrapper.append(node)
    return wrapper

def get_content(clhs_soup):
    """Extract the run of elements that follows the first <hr> of the body.

    Walks what `clhs_soup.body.hr.find_next_siblings()` returns and keeps the
    leading items, stopping at the first one whose string form is exactly
    '<hr/>'. The kept items are handed to `consolidate_content`.

    Returns:
        Whatever `consolidate_content` returns for the kept items.
    """
    kept = []
    for sibling in clhs_soup.body.hr.find_next_siblings():
        if str(sibling) == '<hr/>':
            # The next horizontal rule marks the end of the content section.
            break
        kept.append(sibling)
    return consolidate_content(kept)

def get_h2s(content):
    """Return the items of `content` that are <h2> headings wrapping a link.

    An item qualifies when its `name` is 'h2' and its `a` attribute is truthy.
    The `a` attribute is only looked at for items named 'h2'.
    """
    linked_headings = []
    for node in content:
        if node.name != 'h2':
            continue
        if node.a:
            linked_headings.append(node)
    return linked_headings

def get_base_url(url):
    """Return `url` up to and including its last "/".

    Args:
        url: The address to trim, e.g. "http://host/dir/page.htm".

    Returns:
        The leading part of `url` ending in "/" ("http://host/dir/" for the
        example above), or an empty string when `url` contains no "/".
    """
    # Raw string: "/" needs no escaping in a regular expression, and a
    # backslash-slash inside a normal string literal is an invalid escape.
    match = re.search(r".*/", url)
    return match.group(0) if match else ""

def build_link_from_h2(h2, base_url):
    """Build the address of the page an <h2> heading links to.

    Args:
        h2: A heading whose `a` child carries an 'href' entry.
        base_url: Prefix (ending in "/") that the href is appended to.

    Returns:
        `base_url` followed by the heading link's href.
    """
    # Use the base_url argument rather than any notebook-level variable so the
    # function works on a fresh kernel and for any page, not just one URL.
    return base_url + h2.a['href']

def write_soup_tmp(soup):
    """Write the string form of `soup` to "clhs.html" in the working directory.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open("clhs.html", "w") as out_file:
        markup = str(soup)
        out_file.write(markup)


In [11]:
def replace_with_sub_content(h2, base_url):
    """Fetch the page a heading links to and inline its own sub-pages.

    The page behind `h2` is downloaded and its content section extracted.
    Every linked <h2> found in that content is then replaced, recursively, by
    the content of the page it points to.

    Args:
        h2: Heading whose link identifies the page to fetch.
        base_url: Prefix used to turn heading hrefs into full addresses.

    Returns:
        The extracted content of the linked page, with nested headings
        already replaced.
    """
    sub_page = parse_link(build_link_from_h2(h2, base_url))
    content = get_content(sub_page)
    for nested_heading in get_h2s(content):
        nested_heading.replace_with(replace_with_sub_content(nested_heading, base_url))
    return content

def insert_sub_chapters(url):
    """Build one document from the page at `url` plus all its linked sub-pages.

    The page is fetched, its content section extracted, and every linked <h2>
    in that section is replaced by the content of the page it points to
    (see `replace_with_sub_content`). The assembled content is then appended
    to the page's first <hr> element.

    Args:
        url: Address of the top-level page.

    Returns:
        The parsed top-level page with the assembled content attached.
    """
    base_url = get_base_url(url)
    page = parse_link(url)
    merged = get_content(page)
    for heading in get_h2s(merged):
        heading.replace_with(replace_with_sub_content(heading, base_url))
    page.hr.append(merged)
    return page

def fix_links(soup, base_url):
    """Prefix relative link targets in `soup` with `base_url`, in place.

    An <a> element is left untouched when it has no href (for example a named
    anchor), when its href is empty, when the href is already an absolute
    http:// or https:// address, or when it starts with '../'.

    Args:
        soup: Parsed document providing `find_all('a')`.
        base_url: Prefix (ending in "/") prepended to each relative href.

    Returns:
        None; the matching elements' 'href' entries are rewritten.
    """
    # Absolute addresses and parent-relative paths must not get the prefix.
    untouched_prefixes = ('http://', 'https://', '../')
    for anchor in soup.find_all('a'):
        # .get() avoids a KeyError on anchors that carry no href at all.
        href = anchor.get('href')
        if not href or href.startswith(untouched_prefixes):
            continue
        anchor['href'] = base_url + href

def write_and_cleanup_clhs(soup, filepath):
    """Write the string form of `soup` to `filepath` with "\\r\\n" normalised.

    Every "\\r\\n" in the rendered markup is turned into "\\n" before writing;
    leftover carriage returns otherwise spoil the formatting of code examples.

    Args:
        soup: Object whose `str()` is the markup to write.
        filepath: Destination path, opened in text write mode.
    """
    with open(filepath, "w") as out_file:
        normalised = str(soup).replace('\r\n', '\n')
        out_file.write(normalised)

In [None]:
# Driver cell: build a single-file copy of one HyperSpec page and the pages
# its headings link to.
url = "http://www.lispworks.com/documentation/HyperSpec/Body/06_a.htm"
# Fetch the page and splice in the content of every linked <h2> sub-page.
soup = insert_sub_chapters(url)
# Rewrite link targets against the page's base URL (modifies `soup` in place).
fix_links(soup, get_base_url(url))
# Save the result as clhs.html in the working directory.
write_and_cleanup_clhs(soup, "clhs.html")
# Bare expression: the notebook shows the resulting soup as the cell output.
soup