# 代码例子

In [3]:
# Build the two BeautifulSoup objects shared by the rest of the notebook.
from bs4 import BeautifulSoup

# The "three sisters" sample page that every later cell inspects.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</body>
</html>
"""

# Full sample document, parsed with the 'html.parser' backend.
soup = BeautifulSoup(html_doc, features='html.parser')

# Head-only document, parsed with the 'lxml' backend (lxml must be installed).
soup1 = BeautifulSoup(
    "<html><head><title>The Dormouse's story</title></head>\n</html>",
    features='lxml',
)

# 对象种类 

## Tag 类 

<class 'bs4.element.Tag'\>

tag有很多方法和属性，其中重要的属性：name,attributes

tag.name 获取名字

tag\['属性名'\] 获取某种属性

tag.attrs 获取所有属性（以字典形式返回），因为一个tag可能有很多个属性

tag的属性可以添加，删除，修改，操作和字典一样

In [14]:
# Inspect the basic Tag API on <head> and on the first <a>.
head_tag = soup.head
first_link = soup.a
print(type(head_tag))       # class of a tag object
print(head_tag.name)        # the tag's name
print(first_link['class'])  # one attribute, looked up like a dict key
print(first_link.attrs)     # every attribute of the tag

<class 'bs4.element.Tag'>
head
['sister']
{'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}


In [13]:
# Attributes of a tag can be overwritten, deleted and re-added like dict entries.
first_p = soup.p
print(first_p['class'])      # value before the edit
first_p['class'] = 'top'     # overwrite the attribute
print(first_p.attrs)
del first_p['class']         # remove the attribute
# NOTE(review): the value is put back as the plain string 'title'; confirm
# whether a list (['title']) was intended, so that re-running the cell prints
# the same thing as the first run.
first_p['class'] = 'title'   # add the attribute back
print(first_p['class'])

title
{'class': 'top'}
title


## NavigableString类
可遍历的字符串

<class 'bs4.element.NavigableString'\>

字符串常被包含在tag内。BeautifulSoup 用 NavigableString 类来包装tag中的字符串

tag.string 获取tag中的字符串

tag.string.replace_with('新字符串') 方法：tag中包含的字符串不能被编辑，但是可以被替换成其他的字符串

In [17]:
# Text inside a tag is exposed through .string and is swapped out with
# replace_with().
original_title = "The Dormouse's story"
print(type(soup.head.string))
print(soup.head.string)
print(soup.title.string)
soup.title.string.replace_with('I have changed this title')
print(soup.title.string)
soup.title.string.replace_with(original_title)  # put the original text back
print(soup.title.string)

<class 'bs4.element.NavigableString'>
The Dormouse's story
The Dormouse's story
I have changed this title
The Dormouse's story


## BeautifulSoup 类
表示的是一个文档的全部内容。大部分时候，可以当作tag对象

因为 BeautifulSoup 对象并不是真正的HTML或XML的tag,所以它没有name和attribute属性.但有时查看它的 .name 属性是很方便的,所以 BeautifulSoup 对象包含了一个值为 “\[document\]” 的特殊属性 .name

In [19]:
# Print the .name of the BeautifulSoup object itself.
print(soup.name)

[document]


## Comment 类
注释及特殊字符串

<class 'bs4.element.Comment'\>

Comment 对象是一个特殊类型的 NavigableString 对象

In [30]:
# An HTML comment inside a tag is reached through that tag's .string.
markup = '<b><!--Hey,budy. Want to buy a used parser?--></b>'
soupc = BeautifulSoup(markup, features='html.parser')
b_tag = soupc.b
comment = b_tag.string
print(type(comment))
print(comment)
# When rendered as part of the HTML, the comment is shown with its special
# <!-- ... --> formatting.
print(b_tag.prettify())

<class 'bs4.element.Comment'>
Hey,budy. Want to buy a used parser?
<b>
 <!--Hey,budy. Want to buy a used parser?-->
</b>


# 遍历文档树

## 子节点

一个Tag可能包含多个字符串或其它的Tag,这些都是这个Tag的子节点.Beautiful Soup提供了许多操作和遍历子节点的属性.

注意: Beautiful Soup中字符串节点不支持这些属性,因为字符串没有子节点

### .标签名 

获取某个tag标签，点取属性方式只能获得当前名字的第一个tag

In [36]:
# Reach tags by name with attribute access, then fetch all <a> tags at once.
for found in (
    soup.head,
    soup.title,
    soup.body.b,         # first <b> inside <body>
    soup.a,              # attribute access only returns the first tag of that name
    soup.find_all('a'),  # every <a> tag
):
    print(found)

<head><title>The Dormouse's story</title></head>
<title>The Dormouse's story</title>
<b>The Dormouse's story</b>
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


### .contents 和 .children（生成器）

tag.contents 属性可以将tag的子节点以列表的方式输出

字符串没有 .contents 属性,因为字符串没有子节点

通过tag的 .children 生成器,可以对tag的子节点进行循环

In [45]:
# Print the .contents of <head>, of <body>, and of the BeautifulSoup object.
# The BeautifulSoup object has children of its own: <html> is one of them.
for node in (soup.head, soup.body, soup):
    print(node.contents)

[<title>The Dormouse's story</title>]
['\n', <p class="title"><b>The Dormouse's story</b></p>, '\n', <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, '\n', <p class="story">...</p>, '\n']
['\n', <html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</h

In [47]:
# Iterate the children of <body> through .children ...
for child in soup.body.children:
    print(child)

# ... and again through .contents.
for child in soup.body.contents:
    print(child)



<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>




<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>




### .descendants 子孙节点

.contents 和 .children 属性仅包含tag的直接子节点.例如,<head\>标签只有一个直接子节点<title\>,但是<title\>标签也包含一个子节点:字符串 “The Dormouse’s story”,这种情况下字符串 “The Dormouse’s story”也属于<head\>标签的子孙节点.

.descendants 属性可以对所有tag的子孙节点进行递归循环

In [50]:
# Show <head>'s .contents, then every node yielded by its .descendants.
print(soup.head.contents)
for node in soup.head.descendants:
    print(node)

[<title>The Dormouse's story</title>]
<title>The Dormouse's story</title>
The Dormouse's story


### .string

如果tag只有一个 NavigableString 类型子节点,那么这个tag可以使用 .string 得到子节点

如果一个tag仅有一个子节点,那么这个tag也可以使用 .string 方法,输出结果与当前唯一子节点的 .string 结果相同

如果tag包含了多个子节点,tag就无法确定 .string 方法应该调用哪个子节点的内容, .string 的输出结果是 None

In [55]:
# <b> has exactly one NavigableString child, so .string returns that child.
bold_tag = soup.b
print(bold_tag)
print(bold_tag.string)

<b>The Dormouse's story</b>
The Dormouse's story


In [54]:
# <head> has a single child (<title>), so .string can be used on it as well:
# the result is the same as the .string of that only child.
print(soup.head)
print(soup.head.string)

<head><title>The Dormouse's story</title></head>
The Dormouse's story


In [75]:
html_tag = soup1.html
print(html_tag)
# <html> holds more than one child here, so .string cannot decide which
# child to use and the result is None.
print(html_tag.string)

<html><head><title>The Dormouse's story</title></head>
</html>
None


### .strings（生成器）和 stripped_strings

如果tag中包含多个字符串,可以使用 .strings 来循环获取

输出的字符串中可能包含了很多空格或空行,使用 .stripped_strings 可以去除多余空白内容，全部是空格的行会被忽略掉,段首和段末的空白会被删除

In [59]:
html_tag = soup.html
print(type(html_tag))
print(type(html_tag.strings))

# A tag that contains several strings: loop over them with .strings.
for text in html_tag.strings:
    print(text)
print('--------------------------------')

# .stripped_strings removes the extra whitespace and blank entries.
for text in html_tag.stripped_strings:
    print(text)

<class 'bs4.element.Tag'>
<class 'generator'>
The Dormouse's story




The Dormouse's story


Once upon a time there were three little sisters; and their names were

Elsie
,

Lacie
 and

Tillie
;
and they lived at the bottom of a well.


...




--------------------------------
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
,
Lacie
and
Tillie
;
and they lived at the bottom of a well.
...


## 父节点

每个tag或字符串都有父节点:被包含在某个tag中

### .parent 
通过 .parent 属性来获取某个元素的父节点.

In [67]:
title_tag = soup.title
print(title_tag)
print(title_tag.parent)         # <head> is the parent of <title>
print(title_tag.string)         # the title text has a parent too: the <title> tag
print(title_tag.string.parent)
print(type(soup.html.parent))   # the parent of the top-level <html> is the BeautifulSoup object
print(soup.parent)              # the BeautifulSoup object's own .parent is None

<title>The Dormouse's story</title>
<head><title>The Dormouse's story</title></head>
The Dormouse's story
<title>The Dormouse's story</title>
<class 'bs4.BeautifulSoup'>
None


### .parents (生成器)
通过元素的 .parents 属性可以递归得到元素的所有父辈节点

In [72]:
first_link = soup.a
print(first_link)
# Walk from the first <a> up to the root, printing each ancestor's name
# followed by the ancestor itself.
for ancestor in first_link.parents:
    print(ancestor.name)
    print(ancestor)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
p
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
body
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
html
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="s

## 兄弟节点

兄弟节点是在同一层：他们是同一个元素的子节点.一段文档以标准格式输出时,兄弟节点有相同的缩进级别.在代码中也可以使用这种关系.

### .next_sibling 和 .previous_sibling

使用 .next_sibling 和 .previous_sibling 属性来查询兄弟节点

In [77]:
# <b> and <c> are both direct children of <a>, with nothing between them,
# so each one is the other's immediate sibling.
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'html.parser')
print(sibling_soup.prettify())
print(sibling_soup.b.next_sibling)      # the node right after <b>
print(sibling_soup.c.previous_sibling)  # the node right before <c>

<a>
 <b>
  text1
 </b>
 <c>
  text2
 </c>
</a>
<c>text2</c>
<b>text1</b>


In [85]:
# In a real document the .next_sibling / .previous_sibling of a tag is usually
# a string or empty, because commas and newlines sit between the tags.
text_after_link = soup.a.next_sibling
print(text_after_link)               # the comma (and newline) after the first <a>
print(text_after_link.next_sibling)  # the node after that text

,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>


### .next_siblings 和 .previous_siblings (生成器)
通过 .next_siblings 和 .previous_siblings 属性可以对当前节点的兄弟节点迭代输出

In [91]:
first_link = soup.a
print(type(first_link.next_siblings))

# Everything that follows the first <a> at the same level.
for sibling in first_link.next_siblings:
    print(sibling)
print('-------------------------')

# Everything that precedes the tag with id="link3" at the same level.
third_link = soup.find(id='link3')
print(third_link)
for sibling in third_link.previous_siblings:
    print(sibling)

<class 'generator'>
,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
 and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
;
and they lived at the bottom of a well.
-------------------------
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
 and

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
,

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Once upon a time there were three little sisters; and their names were



## 回退和前进

### .next_element 和 .previous_element
.next_element 属性指向解析过程中下一个被解析的对象(字符串或tag),结果可能与 .next_sibling 相同,但通常是不一样的.

这是“爱丽丝”文档中最后一个<a\>标签,它的 .next_sibling 结果是一个字符串,也就是被这个<a\>标签打断的那句话的剩余部分;而它的 .next_element 则是<a\>标签内部的字符串 “Tillie”:

In [96]:
# Look up the <a> tag with id="link3" and compare its two "next" attributes.
last_a = soup.find(name='a', id='link3')
print(last_a.next_sibling)
print(last_a.next_element)

;
and they lived at the bottom of a well.
Tillie


### .next_elements 和 .previous_elements
通过 .next_elements 和 .previous_elements 的迭代器就可以向前或向后访问文档的解析内容,就好像文档正在被解析一样:

In [97]:
# Print every node yielded by last_a.next_elements, in order.
for element in last_a.next_elements:
    print(element)

Tillie
;
and they lived at the bottom of a well.


<p class="story">...</p>
...








# 搜索文档树

## 过滤器
介绍 find_all() 方法前,先介绍一下过滤器的类型,这些过滤器贯穿整个搜索的API.过滤器可以被用在tag的name中,节点的属性中,字符串中或他们的混合中.

### 字符串

最简单的过滤器是字符串.在搜索方法中传入一个字符串参数,Beautiful Soup会查找与字符串完整匹配的内容

In [98]:
print(soup.find_all('b')) # Find every <b> tag in the document.

[<b>The Dormouse's story</b>]


### 正则表达式
如果传入正则表达式作为参数,Beautiful Soup会通过正则表达式的 search() 来匹配内容,因此要匹配“以某个字符开头”的标签名时需要在模式前加上 ^ .

In [102]:
import re

# Find every tag whose name starts with "b"; in this document that means
# <body> and <b>.  The leading ^ anchors the pattern to the start of the
# name, so a name that merely contains a "b" is not matched.
for tag in soup.find_all(re.compile('^b')):
    print(tag.name)

body
b


### 列表
如果传入列表参数,Beautiful Soup会将与列表中任一元素匹配的内容返回.

In [104]:
print(soup.find_all(['a','b'])) # Find every <a> tag and every <b> tag in the document.

[<b>The Dormouse's story</b>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


### True
True 可以匹配任何值

In [106]:
# True matches every tag in the document; string nodes are not returned.
all_tags = soup.find_all(True)
print(all_tags)
for tag in all_tags:
    print(tag.name)

[<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>, <head><title>The Dormouse's story</title></head>, <title>The Dormouse's story</title>, <body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</

### 方法
如果没有合适过滤器,那么还可以定义一个方法,方法只接受一个元素参数,如果这个方法返回 True 表示当前元素匹配并且被找到,如果不是则返回 False

## find_all

In [None]:
# TODO: add find_all() examples here.  This cell previously held only a bare
# word, which raises NameError when the notebook is run top to bottom.