In [1]:
import itertools as it
import re
from urllib.parse import urljoin, urlsplit

import requests
import tidylib
from bs4 import BeautifulSoup
from tidylib import tidy_document

# Replace the option dictionary exposed as tidylib.BASE_OPTIONS.
# NOTE(review): presumably these become the defaults for every tidy_document
# call made below -- confirm against the installed tidylib version.
tidylib.BASE_OPTIONS = {
    "indent": 1,           # Pretty; not too much of a performance hit
    "tidy-mark": 0,        # No tidy meta tag in output
#     "wrap": 0,             # No wrapping
    "alt-text": "",        # Help ensure validation
    "doctype": 'strict',   # Little sense in transitional for tool-generated markup...
    "force-output": 1,     # May not get what you expect but you will get something
    # NOTE(review): this key is spelled with underscores while every key above
    # uses hyphens -- confirm tidylib treats both spellings the same.
    'drop_empty_paras': False
    }

In [2]:
def parse_link(url, timeout=30):
    """Download a page, clean its markup with HTML Tidy and parse it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection blocks the crawl forever.

    Returns:
        A ``BeautifulSoup`` tree built with ``html.parser`` from the tidied
        document.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when ``timeout`` expires.
    """
    html_doc = requests.get(url, timeout=timeout).text
    # Only the first element of tidy_document's result (the cleaned markup) is used.
    tidy_doc = tidy_document(html_doc, options={'drop_empty_paras': False, 'force_output': True})[0]
    return BeautifulSoup(tidy_doc, 'html.parser')

def consolidate_content(content):
    """Collect the given tags into one fresh BeautifulSoup container.

    The container is parsed from ``<div></div>`` and every item of
    ``content`` is appended to it, in iteration order.

    Args:
        content: Iterable of tags to gather.

    Returns:
        The ``BeautifulSoup`` container holding the appended tags.
    """
    container = BeautifulSoup("<div></div>", 'html.parser')
    for element in content:
        container.append(element)
    return container

def get_content(clhs_soup):
    """Extract the tags between the first ``<hr>`` of the body and the next one.

    Starting from the siblings that follow ``clhs_soup.body.hr``, tags are
    kept until one whose string form is exactly ``<hr/>`` is met; that tag
    and everything after it are dropped.

    Args:
        clhs_soup: Parsed page with a ``body`` that contains an ``hr`` tag.

    Returns:
        The kept tags wrapped by ``consolidate_content``.
    """
    following = list(clhs_soup.body.hr.find_next_siblings())
    kept = []
    for sibling in following:
        if str(sibling) == '<hr/>':
            break
        kept.append(sibling)
    return consolidate_content(kept)

def get_h2s(content):
    """Return the ``h2`` items of ``content`` that contain a link.

    Args:
        content: Iterable of tag-like objects exposing ``name`` and ``a``.

    Returns:
        List of the items named ``'h2'`` whose ``a`` attribute is truthy,
        in their original order.
    """
    linked_headings = []
    for node in content:
        if node.name != 'h2':
            continue
        if node.a:
            linked_headings.append(node)
    return linked_headings

def get_base_url(url):
    """Return ``url`` truncated after its last ``/``.

    Args:
        url: Address containing at least one ``/``.

    Returns:
        Everything up to and including the final slash, e.g. the directory
        part of a page URL.

    Raises:
        IndexError: If ``url`` contains no ``/`` at all.
    """
    # Raw string: "\/" in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.findall(r".*/", url)[0]

def build_link_from_h2(h2, base_url):
    """Build the absolute URL that the link inside ``h2`` points to.

    Args:
        h2: Heading tag whose first ``<a>`` carries an ``href`` attribute.
        base_url: URL the href is resolved against; either the directory
            URL (ending in ``/``) or the URL of the page containing ``h2``.

    Returns:
        The href resolved against ``base_url``.
    """
    # Resolve against the base_url argument; the previous version read the
    # notebook-level ``url`` variable and silently ignored this parameter.
    return urljoin(base_url, h2.a['href'])

def write_soup_tmp(soup):
    """Dump the string form of ``soup`` to ``clhs.html`` in the working directory.

    Any existing file of that name is overwritten.

    Args:
        soup: Object whose ``str()`` is written out.
    """
    with open("clhs.html", "w") as out_file:
        markup = str(soup)
        out_file.write(markup)


In [4]:
def replace_with_sub_content(h2, base_url):
    """Fetch the page linked from ``h2`` and return its content, fully expanded.

    The linked page is downloaded and reduced to its content section. Each
    linked ``h2`` found in that section is itself replaced, recursively, by
    the content of the page it links to.

    Args:
        h2: Heading tag containing the link to follow.
        base_url: Base used to turn the heading's href into a full URL.

    Returns:
        The content container of the linked page with nested headings
        already substituted.
    """
    sub_page = parse_link(build_link_from_h2(h2, base_url))
    sub_content = get_content(sub_page)
    for nested_heading in get_h2s(sub_content):
        expanded = replace_with_sub_content(nested_heading, base_url)
        nested_heading.replace_with(expanded)
    return sub_content

# Entry page of the crawl. base_url is the same address cut after its last
# slash and is used to resolve the links found in headings.
url = "http://www.lispworks.com/documentation/HyperSpec/Body/06_a.htm"
base_url = get_base_url(url)
soup = parse_link(url)
content = get_content(soup)
h2s = get_h2s(content)

# Swap each linked <h2> for the (recursively expanded) content of the page
# it links to. This downloads every linked page, so it needs network access.
for h2 in h2s:
    h2.replace_with(replace_with_sub_content(h2, base_url))
    
# NOTE(review): this appends the merged content as a child of an <hr> tag of
# the original page -- confirm nesting inside <hr> is intended rather than
# placing the content after it.
soup.hr.append(content)

# write_soup_tmp(soup)
with open("clhs.html", "w") as f:
    # Collapse CRLF line endings to LF; the carriage returns otherwise result
    # in bad formatting for code examples.
    f.write(str(soup).replace('\r\n', '\n'))
soup

<!-- Common Lisp HyperSpec (TM), version 7.0 generated by Kent M. Pitman on Mon, 11-Apr-2005 2:31am EDT -->
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">

<html>
<head>
<title>
      CLHS: Section 6.1
    </title>
<link href="../Data/clhs.css" rel="stylesheet" type="text/css"/>
<meta content="Kent M. Pitman" http-equiv="Author"/>
<meta content="LispWorks Ltd." http-equiv="Organization"/>
<link href="../Front/index.htm" rel="TOP"/>
<link href="../Front/Help.htm#Legal" rel="COPYRIGHT"/>
<link href="../Front/Help.htm#Disclaimer" rel="DISCLAIMER"/>
<link href="06_.htm" rel="PREV"/>
<link href="06_.htm" rel="UP"/>
<link href="06_aa.htm" rel="NEXT"/>
</head>
<body>
<h1>
<a href="http://www.lispworks.com/" rev="MADE"><img align="bottom" alt="[LISPWORKS]" height="65" src="../Graphics/LWSmall.gif" width="80"/></a><a href="../Front/index.htm" rel="TOP"><img align="bottom" alt="[Common Lisp HyperSpec (TM)]" height="65" src="../Graphics/CLHS_Sm.gif" width="237"/></a> <a href="06_.htm" rel="PR

In [13]:
def fix_links(soup, base_url):
    """Prefix page-relative link targets in ``soup`` with ``base_url``, in place.

    Anchors are left untouched when they have no (or an empty) ``href``,
    when the href starts with ``../``, or when it is already absolute
    (carries a URL scheme such as ``http:``/``https:`` or starts with ``//``).

    Args:
        soup: Parsed document whose ``<a>`` tags are rewritten.
        base_url: Prefix put in front of every relative href; expected to
            end with ``/``.

    Returns:
        None. ``soup`` is modified directly.
    """
    for anchor in soup.find_all('a'):
        # .get avoids a KeyError on anchors that carry no href attribute
        # (for example named anchors).
        href = anchor.get('href')
        if not href:
            continue
        # Any scheme marks an absolute URL, not only the 'http://www.' prefix.
        if href.startswith('../') or href.startswith('//') or urlsplit(href).scheme:
            continue
        anchor['href'] = base_url + href

fix_links(soup, base_url)

# Sanity check: list every link target that is not parent-relative ('../').
# Absolute URLs are included on purpose so the rewritten links can be inspected.
# a.get('href') keeps anchors without an href attribute from raising KeyError.
[a['href'] for a in soup.find_all('a')
 if a.get('href') and not a['href'].startswith('../')]

['http://www.lispworks.com/',
 'http://www.lispworks.com/documentation/HyperSpec/Body/06_.htm',
 'http://www.lispworks.com/documentation/HyperSpec/Body/06_.htm',
 'http://www.lispworks.com/documentation/HyperSpec/Body/06_aa.htm',
 'http://www.lispworks.com/documentation/HyperSpec/Body/m_loop.htm#loop',
 'http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_m.htm#macro',
 'http://www.lispworks.com/documentation/HyperSpec/Body/m_loop.htm#loop',
 'http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_f.htm#form',
 'http://www.lispworks.com/documentation/HyperSpec/Body/m_loop.htm#loop',
 'http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_f.htm#form',
 'http://www.lispworks.com/documentation/HyperSpec/Body/m_loop.htm#loop',
 'http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_f.htm#form',
 'http://www.lispworks.com/documentation/HyperSpec/Body/m_loop.htm#loop',
 'http://www.lispworks.com/documentation/HyperSpec/Body/26_glo_f.htm#form',
 'http://www.lispwo

In [162]:
# Inspect the text of the first <pre> block in the merged document; the
# displayed repr makes the line-ending characters visible.
soup.find_all('pre')[0].text

"\r\n;; Collect every name and the kids in one list by using \r\n;; COLLECT and APPEND.\r\n (loop for name in '(fred sue alice joe june)\r\n       for kids in '((bob ken) () () (kris sunshine) ())\r\n       collect name\r\n       append kids)\r\n=>  (FRED BOB KEN SUE ALICE JOE KRIS SUNSHINE JUNE)\r\n"

In [166]:
# Write the merged document to clhs.html, overwriting any earlier export.
with open("clhs.html", "w") as f:
    # Collapse CRLF line endings to LF; the carriage returns otherwise result
    # in bad formatting for code examples.
    f.write(str(soup).replace('\r\n', '\n'))