In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
soup


<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [4]:
soup.title

<title>The Dormouse's story</title>

In [5]:
soup.title.name

'title'

In [6]:
soup.title.string

"The Dormouse's story"

In [7]:
soup.title.parent.name

'head'

In [8]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [9]:
soup.p['class']

['title']

In [10]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [11]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [12]:
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [13]:
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [15]:
soup.get_text() # extract all of the text content from the document

"\nThe Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"

In [16]:
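# Parsers that Beautiful Soup can use:
# html.parser - built into the standard library; moderate speed; tolerant of malformed markup
# lxml        - fast; tolerant of malformed markup
# lxml-xml / xml - fast; the only parser listed here that supports XML
# html5lib    - best error tolerance; parses the document the way a browser does and produces HTML5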

In [19]:
from bs4 import BeautifulSoup

# Name the parser explicitly. Without it Beautiful Soup picks whichever parser
# happens to be installed (and emits GuessedAtParserWarning), so the same cell
# can produce different trees on different machines. lxml wraps the bare text
# in <body><p>, which is the result shown below.
soup = BeautifulSoup("<html>data</html>", "lxml")
soup

<html><body><p>data</p></body></html>

In [20]:
# 将复杂HTML文档转换成一个复杂的树形结构,每个节点都是Python对象
# 所有对象可以归纳为4种: Tag , NavigableString , BeautifulSoup , Comment 

In [31]:
# Tag: a Tag object corresponds to an element in the original markup.
# The parser is named explicitly so the result does not depend on which
# parsers happen to be installed.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'lxml')
tag = soup.b
type(tag)
# name: every tag has a name, and assigning to it renames the element
tag.name
tag.name = "blockquote"
tag
# Attributes: read one with tag[...], or all of them with tag.attrs
tag['class']
tag.attrs
# Attributes are added, changed and removed exactly like dictionary entries
tag['class'] = 'verybold'
tag['id'] = 1
tag
del tag['class']
del tag['id']
tag
# tag['class'] would raise KeyError now that the attribute is gone;
# .get() returns None instead
print(tag.get('class'))

None


In [32]:
# Multi-valued attributes
# The most common one is class (a tag may carry several CSS classes).
# Others include rel, rev, accept-charset, headers and accesskey.
# Beautiful Soup returns the value of a multi-valued attribute as a list.
# The parser is named explicitly so the cell is reproducible.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'lxml')
css_soup.p['class']

['body', 'strikeout']

In [33]:
# An attribute whose value merely looks like several values, but which is not
# defined as multi-valued, is returned by Beautiful Soup as a single string.
# The parser is named explicitly so the cell is reproducible.
id_soup = BeautifulSoup('<p id="my id"></p>', 'lxml')
id_soup.p['id']

'my id'

In [34]:
# The parser is named explicitly so the cell is reproducible.
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'lxml')
rel_soup.a['rel']
# ['index']
# A list assigned to a multi-valued attribute is written back out as one
# space-separated value
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)

<p>Back to the <a rel="index contents">homepage</a></p>


In [35]:
# When the document is parsed as XML, tags have no multi-valued attributes,
# so class comes back as one plain string
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']

'body strikeout'

In [41]:
# NavigableString
# Text usually sits inside a tag; Beautiful Soup wraps that text in the NavigableString class.
# NOTE: `tag` is the variable assigned in the earlier Tag cell, so that cell must have been run first.
tag.string
type(tag.string)

bs4.element.NavigableString

In [42]:
# A NavigableString can be converted to a plain string with str() (unicode() was the Python 2 spelling and does not exist in Python 3)

In [43]:
# The string inside a tag cannot be edited in place, but it can be swapped for another string with replace_with()
tag.string.replace_with("No longer bold")
tag

<blockquote>No longer bold</blockquote>

In [44]:
# A BeautifulSoup object represents the document as a whole; most of the time it can be treated like a Tag
soup.name

'[document]'

In [45]:
# Comments and other special strings
# The parser is named explicitly so the cell is reproducible.
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup, "lxml")
comment = soup.b.string
type(comment)
# <class 'bs4.element.Comment'>

bs4.element.Comment

In [46]:
# A Comment object is a special kind of NavigableString
comment

'Hey, buddy. Want to buy a used parser?'

In [47]:
print(soup.b.prettify())

<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>


In [48]:
from bs4 import CData
cdata = CData("A CDATA block")
comment.replace_with(cdata)
print(soup.b.prettify())

<b>
 <![CDATA[A CDATA block]]>
</b>


In [49]:
# 遍历文档树
html_doc = """
<html><head><title>The Dormouse's story</title></head>
    <body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [50]:
# Child nodes
# A Tag may contain several strings or other Tags; all of these are the Tag's children

In [51]:
soup.head

<head><title>The Dormouse's story</title></head>

In [52]:
soup.title

<title>The Dormouse's story</title>

In [53]:
soup.body.b

<b>The Dormouse's story</b>

In [54]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [55]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [57]:
# .contents and .children
# A tag's .contents attribute returns the tag's direct children as a list
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [58]:
head_tag.contents

[<title>The Dormouse's story</title>]

In [68]:
title_tag = head_tag.contents[0]
title_tag

<title>The Dormouse's story</title>

In [60]:
title_tag.contents

["The Dormouse's story"]

In [64]:
len(soup.contents)
print(soup.contents[0].name)

None


In [66]:
# Strings have no .contents attribute, because a string cannot have children
text = title_tag.contents[0]
# text.contents
# AttributeError: 'NavigableString' object has no attribute 'contents'

In [69]:
# A tag's .children generator lets you loop over the tag's direct children
for child in title_tag.children:
    print(child)

The Dormouse's story


In [70]:
# Descendants: .descendants also yields the children of children
# (here the <title> tag first, then the text inside it)
for child in head_tag.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story


In [71]:
len(list(soup.children))
# not displayed (only the last expression of a cell is shown); likely 2 for
# this document -- the leading '\n' string plus <html> -- verify by running it alone
len(list(soup.descendants))
# 27

27

In [72]:
# When a tag has exactly one child and that child is a NavigableString, .string returns that child
title_tag.string

"The Dormouse's story"

In [74]:
head_tag.contents
head_tag.string

"The Dormouse's story"

In [75]:
# <html> has more than one child, so .string cannot single one out and is None.
# Print the attribute itself, not its name as a quoted literal.
print(soup.html.string)

None


In [76]:
# When a tag contains more than one string, iterate over them with .strings
for string in soup.strings:
    print(repr(string))

'\n'
"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'


In [77]:
# The strings printed above include many whitespace-only entries;
# .stripped_strings skips those and strips surrounding whitespace from the rest
for string in soup.stripped_strings:
    print(repr(string))

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


In [78]:
# Parent node: .parent is the element that directly contains this one
title_tag = soup.title
title_tag
# <title>The Dormouse's story</title>
title_tag.parent
# <head><title>The Dormouse's story</title></head>

<head><title>The Dormouse's story</title></head>

In [79]:
title_tag.string.parent

<title>The Dormouse's story</title>

In [81]:
html_tag = soup.html
type(html_tag.parent)

bs4.BeautifulSoup

In [82]:
print(soup.parent)

None


In [84]:
# An element's .parents attribute walks up through all of its ancestors,
# starting with the closest one
link = soup.a
link
for parent in link.parents:
    # Print None itself for a missing ancestor, otherwise the ancestor's tag name
    print(parent if parent is None else parent.name)

p
body
html
[document]


In [85]:
# Sibling nodes: nodes that share the same parent.
# The markup is well-formed (<b> and <c> are both direct children of <a>) and
# the parser is named explicitly so the printed tree is reproducible.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", "lxml")
print(sibling_soup.prettify())

<html>
 <body>
  <a>
   <b>
    text1
   </b>
   <c>
    text2
   </c>
  </a>
 </body>
</html>


In [86]:
sibling_soup.b.next_sibling

<c>text2</c>

In [87]:
sibling_soup.c.previous_sibling

<b>text1</b>

In [90]:
link = soup.a
link
link.next_sibling
link.next_sibling.next_sibling

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [92]:
# .next_siblings and .previous_siblings iterate over the siblings that follow / precede a node
for sibling in soup.a.next_siblings:
    print(repr(sibling))
    
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'
' and\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
',\n'
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
'Once upon a time there were three little sisters; and their names were\n'


In [93]:
last_a_tag = soup.find("a", id="link3")
last_a_tag
last_a_tag.next_element

'Tillie'

In [94]:
last_a_tag.previous_element
# u' and\n'
last_a_tag.previous_element.next_element
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [95]:
for element in last_a_tag.next_elements:
    print(repr(element))

'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
<p class="story">...</p>
'...'
'\n'


In [96]:
# 搜索文档树
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [97]:
soup.find_all('b')

[<b>The Dormouse's story</b>]

In [98]:
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

body
b


In [99]:
soup.find_all(["a", "b"])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [100]:
for tag in soup.find_all(True):
    print(tag.name)

html
head
title
body
p
b
p
a
a
a
p


In [101]:
# Signature: find_all(name, attrs, recursive, string, **kwargs)
# The first positional argument is the tag name to search for
soup.find_all("title")

[<title>The Dormouse's story</title>]

In [102]:
soup.find_all("p", "title")

[<p class="title"><b>The Dormouse's story</b></p>]

In [103]:
soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [104]:
soup.find_all(id="link2")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [105]:
import re
soup.find(string=re.compile("sisters"))

'Once upon a time there were three little sisters; and their names were\n'

In [107]:
soup.find_all(id=True)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [108]:
soup.find_all(href=re.compile("elsie"), id='link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [111]:
# The parser is named explicitly so the cell is reproducible.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'lxml')
# HTML5 data-* attribute names contain a hyphen, so they cannot be written as
# Python keyword arguments:
#     data_soup.find_all(data-foo="value")   # SyntaxError
# Pass them through the attrs dictionary instead.
data_soup.find_all(attrs={"data-foo": "value"})

[<div data-foo="value">foo!</div>]

In [112]:
soup.find_all(class_=re.compile("itl"))
# [<p class="title"><b>The Dormouse's story</b></p>]
def has_six_characters(css_class):
    """Return True when css_class is not None and has a length of exactly 6.

    Used as a class_ filter for find_all(); a None value is answered with
    False instead of being passed to len().
    """
    if css_class is None:
        return False
    return len(css_class) == 6
soup.find_all(class_=has_six_characters)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [113]:
# The parser is named explicitly so the cell is reproducible.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'lxml')
# Searching for either one of the tag's CSS classes matches the tag
css_soup.find_all("p", class_="strikeout")
# [<p class="body strikeout"></p>]

css_soup.find_all("p", class_="body")

[<p class="body strikeout"></p>]

In [114]:
css_soup.find_all("p", class_="body strikeout")

[<p class="body strikeout"></p>]

In [115]:
soup.find_all("a", attrs={"class": "sister"})

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [117]:
# string= searches the document's text nodes instead of its tags
soup.find_all(string="Elsie")
# ['Elsie']

soup.find_all(string=["Tillie", "Elsie", "Lacie"])
# ['Elsie', 'Lacie', 'Tillie']

soup.find_all(string=re.compile("Dormouse"))
# ["The Dormouse's story", "The Dormouse's story"]

def is_the_only_string_within_a_tag(s):
    """Return whether s compares equal to the .string of its own parent.

    Used as a string= filter for find_all(): s is the candidate text node and
    s.parent.string is what the parent reports as its single string.
    """
    parent_string = s.parent.string
    return s == parent_string

soup.find_all(string=is_the_only_string_within_a_tag)
# [u"The Dormouse's story", u"The Dormouse's story", u'Elsie', u'Lacie', u'Tillie', u'...']

["The Dormouse's story",
 "The Dormouse's story",
 'Elsie',
 'Lacie',
 'Tillie',
 '...']

In [118]:
soup.find_all("a", string="Elsie")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [119]:
soup.find_all("a", limit=2)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [120]:
soup.html.find_all("title")
# [<title>The Dormouse's story</title>]

soup.html.find_all("title", recursive=False)
# []

[]

In [121]:
soup.find_all("a")
soup("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [122]:
# find

In [123]:
soup.find_all('title', limit=1)
# [<title>The Dormouse's story</title>]

soup.find('title')
# <title>The Dormouse's story</title>

<title>The Dormouse's story</title>

In [124]:
print(soup.find("nosuchtag"))

None


In [125]:
soup.head.title
# <title>The Dormouse's story</title>

soup.find("head").find("title")
# <title>The Dormouse's story</title>

<title>The Dormouse's story</title>

In [126]:
# find_parents() 和 find_parent()

In [128]:
a_string = soup.find(string="Lacie")
a_string
# u'Lacie'

a_string.find_parents("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

a_string.find_parent("p")
# <p class="story">Once upon a time there were three little sisters; and their names were
#  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
#  and they lived at the bottom of a well.</p>

#a_string.find_parents("p", class="title")
# []

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

In [129]:
# find_next_siblings() and find_next_sibling()
first_link = soup.a
first_link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

first_link.find_next_siblings("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_next_sibling("p")
# <p class="story">...</p>

<p class="story">...</p>

In [130]:
# find_previous_siblings() 和 find_previous_sibling()
last_link = soup.find("a", id="link3")
last_link
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

last_link.find_previous_siblings("a")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_previous_sibling("p")
# <p class="title"><b>The Dormouse's story</b></p>

<p class="title"><b>The Dormouse's story</b></p>

In [131]:
# find_all_next() 和 find_next()
first_link = soup.a
first_link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

first_link.find_all_next(string=True)
# [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
#  u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n']

first_link.find_next("p")
# <p class="story">...</p>

<p class="story">...</p>

In [132]:
# find_all_previous() 和 find_previous()
first_link = soup.a
first_link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

first_link.find_all_previous("p")
# [<p class="story">Once upon a time there were three little sisters; ...</p>,
#  <p class="title"><b>The Dormouse's story</b></p>]

first_link.find_previous("title")
# <title>The Dormouse's story</title>

<title>The Dormouse's story</title>

In [135]:
# CSS selectors
soup.select("title")
# [<title>The Dormouse's story</title>]

# A pseudo-class needs a colon after the tag name ("p nth-of-type(1)" lacks it):
#soup.select("p:nth-of-type(3)")
# [<p class="story">...</p>]   (expected per the Beautiful Soup docs -- verify by running)

[<title>The Dormouse's story</title>]