In [51]:
from bs4 import BeautifulSoup
import requests
import itertools as it
from tidylib import tidy_document
import tidylib
import re

# Replace tidylib's module-level BASE_OPTIONS with the settings used in this notebook.
# NOTE(review): presumably these act as the defaults that tidy_document combines with
# its per-call `options` argument -- confirm against the installed pytidylib version.
tidylib.BASE_OPTIONS = {
    "indent": 1,           # Pretty; not too much of a performance hit
    "tidy-mark": 0,        # No tidy meta tag in output
#     "wrap": 0,             # No wrapping
    "alt-text": "",        # Help ensure validation
    "doctype": 'strict',   # Little sense in transitional for tool-generated markup...
    "force-output": 1,     # May not get what you expect but you will get something
    # NOTE(review): spelled with underscores, unlike the hyphenated keys above --
    # confirm tidylib treats both spellings the same way.
    'drop_empty_paras': False
    }

In [52]:
# Page under study. `url` stays a notebook-level name because later cells
# derive the base URL for relative links from it.
# url = "http://www.lispworks.com/documentation/HyperSpec/Body/06_a.htm"
url = "http://www.lispworks.com/documentation/HyperSpec/Body/06_ac.htm"
# Download the raw markup as text.
html_doc = requests.get(url).text
# Run the markup through HTML Tidy; only the first element of the result
# (the cleaned document) is kept.
tidy_doc = tidy_document(html_doc, options={'drop_empty_paras': False, 'force_output': True})[0]
# Parse the tidied markup with the 'html.parser' backend.
soup = BeautifulSoup(tidy_doc, 'html.parser')

# Display the parsed document.
soup

<!-- Common Lisp HyperSpec (TM), version 7.0 generated by Kent M. Pitman on Mon, 11-Apr-2005 2:31am EDT -->
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">

<html>
<head>
<title>
      CLHS: Section 6.1.3
    </title>
<link href="../Data/clhs.css" rel="stylesheet" type="text/css"/>
<meta content="Kent M. Pitman" http-equiv="Author"/>
<meta content="LispWorks Ltd." http-equiv="Organization"/>
<link href="../Front/index.htm" rel="TOP"/>
<link href="../Front/Help.htm#Legal" rel="COPYRIGHT"/>
<link href="../Front/Help.htm#Disclaimer" rel="DISCLAIMER"/>
<link href="06_abba.htm" rel="PREV"/>
<link href="06_a.htm" rel="UP"/>
<link href="06_aca.htm" rel="NEXT"/>
</head>
<body>
<h1>
<a href="http://www.lispworks.com/" rev="MADE"><img align="bottom" alt="[LISPWORKS]" height="65" src="../Graphics/LWSmall.gif" width="80"/></a><a href="../Front/index.htm" rel="TOP"><img align="bottom" alt="[Common Lisp HyperSpec (TM)]" height="65" src="../Graphics/CLHS_Sm.gif" width="237"/></a> <a href="06_abba.

In [53]:
def parse_link(url, timeout=30):
    """Download a page, clean its markup with HTML Tidy, and parse it.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on an unresponsive host.

    Returns
    -------
    BeautifulSoup
        The tidied document parsed with the ``'html.parser'`` backend.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of tidying and parsing an error page.
    response.raise_for_status()
    # Only the first element of tidy_document's result (the cleaned markup) is used.
    tidy_doc = tidy_document(
        response.text,
        options={'drop_empty_paras': False, 'force_output': True},
    )[0]
    return BeautifulSoup(tidy_doc, 'html.parser')

# Parse the section page once; later cells read from and modify this tree.
clhs_soup = parse_link("http://www.lispworks.com/documentation/HyperSpec/Body/06_ac.htm")
clhs_soup

<!-- Common Lisp HyperSpec (TM), version 7.0 generated by Kent M. Pitman on Mon, 11-Apr-2005 2:31am EDT -->
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">

<html>
<head>
<title>
      CLHS: Section 6.1.3
    </title>
<link href="../Data/clhs.css" rel="stylesheet" type="text/css"/>
<meta content="Kent M. Pitman" http-equiv="Author"/>
<meta content="LispWorks Ltd." http-equiv="Organization"/>
<link href="../Front/index.htm" rel="TOP"/>
<link href="../Front/Help.htm#Legal" rel="COPYRIGHT"/>
<link href="../Front/Help.htm#Disclaimer" rel="DISCLAIMER"/>
<link href="06_abba.htm" rel="PREV"/>
<link href="06_a.htm" rel="UP"/>
<link href="06_aca.htm" rel="NEXT"/>
</head>
<body>
<h1>
<a href="http://www.lispworks.com/" rev="MADE"><img align="bottom" alt="[LISPWORKS]" height="65" src="../Graphics/LWSmall.gif" width="80"/></a><a href="../Front/index.htm" rel="TOP"><img align="bottom" alt="[Common Lisp HyperSpec (TM)]" height="65" src="../Graphics/CLHS_Sm.gif" width="237"/></a> <a href="06_abba.

In [54]:
def get_content(clhs_soup):
    """Return the tags that sit between the first two horizontal rules of a page.

    Starting from the first ``<hr>`` inside ``<body>``, collects the sibling
    tags that follow it and stops at the next ``<hr>``.

    Parameters
    ----------
    clhs_soup
        Parsed document exposing ``.body.hr.find_next_siblings()``.

    Returns
    -------
    list
        The sibling tags after the first ``<hr>``, up to but not including
        the next ``<hr>``. Empty if nothing follows the first rule.

    Raises
    ------
    AttributeError
        Presumably raised when the document has no ``<body>`` or no ``<hr>``,
        since the attribute chain would then hit a missing element.
    """
    following = clhs_soup.body.hr.find_next_siblings()
    # Compare tag names rather than serialised markup: this also stops at an
    # <hr> that carries attributes and avoids stringifying every subtree.
    return list(it.takewhile(lambda tag: tag.name != 'hr', following))

# Main body of the section page: the tags following the first <hr> in <body>,
# up to the next horizontal rule.
content = get_content(clhs_soup)
content

[<h2>
       6.1.3 Value Accumulation Clauses
     </h2>, <p>
       The constructs <tt>collect</tt>, <tt>collecting</tt>,
       <tt>append</tt>, <tt>appending</tt>, <tt>nconc</tt>,
       <tt>nconcing</tt>, <tt>count</tt>, <tt>counting</tt>,
       <tt>maximize</tt>, <tt>maximizing</tt>, <tt>minimize</tt>,
       <tt>minimizing</tt>, <tt>sum</tt>, and <tt>summing</tt>,
       allow values to be accumulated in a <a href="m_loop.htm#loop" rel="DEFINITION"><b>loop</b></a>.
     </p>, <p>
       The constructs <tt>collect</tt>, <tt>collecting</tt>,
       <tt>append</tt>, <tt>appending</tt>, <tt>nconc</tt>, and
       <tt>nconcing</tt>, designate clauses that accumulate values
       in <a href="26_glo_l.htm#list" rel="DEFINITION"><i>lists</i></a> and return them. The
       constructs <tt>count</tt>, <tt>counting</tt>,
       <tt>maximize</tt>, <tt>maximizing</tt>, <tt>minimize</tt>,
       <tt>minimizing</tt>, <tt>sum</tt>, and <tt>summing</tt>
       designate clauses that accumulate 

In [55]:
# Child-section headings: the <h2> elements in the body that contain a link.
h2s = [heading for heading in content if heading.name == 'h2' and heading.a]
h2s

[<h2>
 <a href="06_aca.htm" rel="CHILD">6.1.3.1 Examples of COLLECT
       clause</a>
 </h2>, <h2>
 <a href="06_acb.htm" rel="CHILD">6.1.3.2 Examples of APPEND
       and NCONC clauses</a>
 </h2>, <h2>
 <a href="06_acc.htm" rel="CHILD">6.1.3.3 Examples of COUNT
       clause</a>
 </h2>, <h2>
 <a href="06_acd.htm" rel="CHILD">6.1.3.4 Examples of MAXIMIZE
       and MINIMIZE clauses</a>
 </h2>, <h2>
 <a href="06_ace.htm" rel="CHILD">6.1.3.5 Examples of SUM
       clause</a>
 </h2>]

In [56]:
def get_base_url(url):
    """Return ``url`` truncated just after its last ``/``.

    Parameters
    ----------
    url : str
        Address such as ``"http://host/dir/page.htm"``.

    Returns
    -------
    str
        Everything up to and including the final slash, e.g.
        ``"http://host/dir/"``.

    Raises
    ------
    IndexError
        If ``url`` contains no ``/`` at all (the pattern finds no match).
    """
    # Raw string: "\/" in a normal literal is an invalid escape sequence.
    # The greedy ".*" makes the match run up to the last slash.
    return re.findall(r".*/", url)[0]

# def extract_link(a_tag, base_url):
#     return base_url + a_tag.a['href']

# def extract_full_link(a_tag):
#     return extract_link(a_tag, base_url)

def build_link_from_h2(h2, base_url):
    """Build the absolute URL that a heading's link points to.

    Parameters
    ----------
    h2
        Heading element whose ``.a`` child carries an ``href`` entry.
    base_url : str
        Prefix to put in front of the link's ``href``; expected to end
        with ``/`` so that simple concatenation yields a valid URL.

    Returns
    -------
    str
        ``base_url`` followed by the link's ``href`` value.
    """
    # Use the base passed by the caller; the previous version ignored it and
    # recomputed the base from a global `url`, tying the function to one page.
    return base_url + h2.a['href']

# Directory part of the page URL, used to resolve the relative hrefs in the headings.
base_url = get_base_url(url)
# Check the result for the first child-section heading.
build_link_from_h2(h2s[0], base_url)

'http://www.lispworks.com/documentation/HyperSpec/Body/06_aca.htm'

In [57]:
# Fetch the page behind the first child-section heading and extract its body,
# so that it can be filled in under that heading.
base_url = get_base_url(url)
link = build_link_from_h2(h2s[0], base_url)
# Download, tidy and parse the child page.
new_soup = parse_link(link)
# Keep only the tags between the child page's horizontal rules.
new_content = get_content(new_soup)
# Display the extracted tags.
new_content


[<h2>
       6.1.3.1 Examples of COLLECT clause
     </h2>, <p></p>, <pre>
 ;; Collect all the symbols in a list.
  (loop for i in '(bird 3 4 turtle (1 . 4) horse cat)
        when (symbolp i) collect i)
 =&gt;  (BIRD TURTLE HORSE CAT)
  
 ;; Collect and return odd numbers.
  (loop for i from 1 to 10
        if (oddp i) collect i)
 =&gt;  (1 3 5 7 9)
  
 ;; Collect items into local variable, but don't return them.
  (loop for i in '(a b c d) by #'cddr
        collect i into my-list
        finally (print my-list))
 &gt;&gt;  (A C) 
 =&gt;  NIL
 </pre>, <p></p>]

In [58]:
# The heading tag used as the insertion anchor in the next cell.
h2s[0]

<h2>
<a href="06_aca.htm" rel="CHILD">6.1.3.1 Examples of COLLECT
      clause</a>
</h2>

In [61]:
# Splice the fetched child-section body into the parent page, directly below
# its heading.
# new_content[2:] skips the first two items (in the output shown earlier these
# are the child page's own <h2> and an empty <p>).
# insert_after() places each tag immediately after the anchor, so inserting in
# forward order would leave the tags reversed; iterate backwards to keep the
# original document order.
# NOTE(review): the inserted tags come from new_soup's tree -- confirm that
# new_soup is not needed intact afterwards.
for tag in reversed(new_content[2:]):
    h2s[0].insert_after(tag)

clhs_soup

<!-- Common Lisp HyperSpec (TM), version 7.0 generated by Kent M. Pitman on Mon, 11-Apr-2005 2:31am EDT -->
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">

<html>
<head>
<title>
      CLHS: Section 6.1.3
    </title>
<link href="../Data/clhs.css" rel="stylesheet" type="text/css"/>
<meta content="Kent M. Pitman" http-equiv="Author"/>
<meta content="LispWorks Ltd." http-equiv="Organization"/>
<link href="../Front/index.htm" rel="TOP"/>
<link href="../Front/Help.htm#Legal" rel="COPYRIGHT"/>
<link href="../Front/Help.htm#Disclaimer" rel="DISCLAIMER"/>
<link href="06_abba.htm" rel="PREV"/>
<link href="06_a.htm" rel="UP"/>
<link href="06_aca.htm" rel="NEXT"/>
</head>
<body>
<h1>
<a href="http://www.lispworks.com/" rev="MADE"><img align="bottom" alt="[LISPWORKS]" height="65" src="../Graphics/LWSmall.gif" width="80"/></a><a href="../Front/index.htm" rel="TOP"><img align="bottom" alt="[Common Lisp HyperSpec (TM)]" height="65" src="../Graphics/CLHS_Sm.gif" width="237"/></a> <a href="06_abba.

In [60]:
# Save the current state of the parent page to disk as HTML.
# NOTE(review): open() is called without an explicit encoding, so the platform
# default applies -- confirm that is intended.
with open("clhs.html", "w") as out_file:
    out_file.write(str(clhs_soup))