In [1]:
from lxml import etree

### 直接读取文本文件进行解析

In [3]:
# Read the HTML document from disk using lxml's HTML parser.
parser = etree.HTMLParser()
html = etree.parse('./Xpath.html', parser)
# When the markup is already held in a string, etree.HTML(text) can be used instead.
# tostring() yields bytes here, so decode before printing to get readable text.
result = etree.tostring(html)
print(result.decode(encoding='utf-8'))

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><div>&#13;
    <ul>&#13;
         <li class="item-0"><a href="link1.html">first item</a></li>&#13;
         <li class="item-1"><a href="link2.html">second item</a></li>&#13;
         <li class="item-inactive"><a href="link3.html">third item</a></li>&#13;
         <li class="item-1"><a href="link4.html">fourth item</a></li>&#13;
         <li class="item-0"><a href="link5.html">fifth item</a>&#13;
     </li></ul>&#13;
 </div></body></html>


### 一般会用  `//`  开头的 XPath 规则来选取所有符合要求的节点

In [6]:
from lxml import etree

# `//*` matches every element in the document, regardless of tag or depth.
parser = etree.HTMLParser()
html = etree.parse('./Xpath.html', parser)
all_elements_query = '//*'
result = html.xpath(all_elements_query)
print(result)

[<Element html at 0x20d4836a340>, <Element body at 0x20d48357a00>, <Element div at 0x20d48351cc0>, <Element ul at 0x20d48437ec0>, <Element li at 0x20d48437cc0>, <Element a at 0x20d4846b980>, <Element li at 0x20d4846b700>, <Element a at 0x20d4846b5c0>, <Element li at 0x20d4846b240>, <Element a at 0x20d48437f40>, <Element li at 0x20d4846bb00>, <Element a at 0x20d4846bac0>, <Element li at 0x20d4846ba80>, <Element a at 0x20d4846b540>]


### 通过 ` / ` 或 ` // ` 即可查找元素的子节点或子孙节点

In [4]:
# `//ul//a` finds every <a> that sits anywhere below a <ul>, not only direct children.
parser = etree.HTMLParser()
html = etree.parse('./Xpath.html', parser)
links_under_ul_query = '//ul//a'
result = html.xpath(links_under_ul_query)
print(result)

[<Element a at 0x20d48357c00>, <Element a at 0x20d48357b00>, <Element a at 0x20d4835a900>, <Element a at 0x20d4835a580>, <Element a at 0x20d4835a680>]


### 通过 `..` 或 `parent::` 即可查找父节点

In [7]:
# Locate the <a> pointing at link4.html, step up to its parent with `..`,
# and read that parent's class attribute.
parser = etree.HTMLParser()
html = etree.parse('./Xpath.html', parser)
parent_class_query = '//a[@href="link4.html"]/../@class'
# The explicit axis form is an alternative: '//a[@href="link4.html"]/parent::*/@class'
result = html.xpath(parent_class_query)
print(result)

['item-1']


### 可以用 `@` 符号进行属性过滤

In [8]:
# A predicate on `@class` keeps only the <li> elements whose class is exactly "item-0".
parser = etree.HTMLParser()
html = etree.parse('./Xpath.html', parser)
item0_query = '//li[@class="item-0"]'
result = html.xpath(item0_query)
print(result)

[<Element li at 0x20d48471380>, <Element li at 0x20d484713c0>]


### 用 XPath 中的 `text()` 节点测试获取节点中的文本

In [11]:
# `text()` returns the text nodes of the <a> children of each matching <li>,
# so the result is a list of strings rather than elements.
parser = etree.HTMLParser()
html = etree.parse('./Xpath.html', parser)
item0_link_text_query = '//li[@class="item-0"]/a/text()'
result = html.xpath(item0_link_text_query)
print(result)

['first item', 'fifth item']


### 属性获取

In [None]:
# `@href` at the end of the path selects the attribute value itself
# for every <a> that is a direct child of an <li>.
parser = etree.HTMLParser()
html = etree.parse('./Xpath.html', parser)
href_query = '//li/a/@href'
result = html.xpath(href_query)
print(result)

### 属性多值匹配

In [12]:
# The class attribute holds two values ("li li-first"), so an exact
# `@class="li"` comparison would not match; contains() tests for a substring.
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
class_contains_query = '//li[contains(@class, "li")]/a/text()'
result = html.xpath(class_contains_query)
print(result)

['first item']


### 多属性匹配

In [13]:
# Two conditions joined with `and` inside one predicate: the class must
# contain "li" and the name attribute must equal "item".
text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
class_and_name_query = '//li[contains(@class, "li") and @name="item"]/a/text()'
result = html.xpath(class_and_name_query)
print(result)

['first item']


### 按序选择
* 第一次选择时，我们选取了第一个 li 节点，中括号中传入数字 1 即可。注意，这里和 Python 代码中的下标不同，XPath 的序号是以 1 开头的，不是以 0 开头。
* 第二次选择时，我们选取了最后一个 li 节点，中括号中传入 last() 即可，返回的便是最后一个 li 节点。
* 第三次选择时，我们选取了位置小于 3 的 li 节点，也就是位置序号为 1 和 2 的节点，得到的结果就是前两个 li 节点。
* 第四次选择时，我们选取了倒数第三个 li 节点，中括号中传入 last()-2 即可。因为 last() 是最后一个，所以 last()-2 就是倒数第三个。

In [14]:
# Run each positional predicate in turn against the same parsed document.
html = etree.parse('./Xpath.html', etree.HTMLParser())
positional_queries = (
    '//li[1]/a/text()',             # first <li> (XPath positions start at 1)
    '//li[last()]/a/text()',        # last <li>
    '//li[position()<3]/a/text()',  # <li> at positions 1 and 2
    '//li[last()-2]/a/text()',      # third <li> counting from the end
)
for query in positional_queries:
    result = html.xpath(query)
    print(result)

['first item']
['fifth item']
['first item', 'second item']
['third item']


### 节点轴选择

In [15]:
# Evaluate one query per XPath axis, all starting from the first <li>.
html = etree.parse('./Xpath.html', etree.HTMLParser())
axis_queries = (
    '//li[1]/ancestor::*',                   # every ancestor element
    '//li[1]/ancestor::div',                 # only the <div> ancestors
    '//li[1]/attribute::*',                  # all attribute values of the <li>
    '//li[1]/child::a[@href="link1.html"]',  # direct <a> child with this href
    '//li[1]/descendant::span',              # <span> descendants (none in this file)
    '//li[1]/following::*[2]',               # second element after the <li> in document order
    '//li[1]/following-sibling::*',          # all later siblings of the <li>
)
for query in axis_queries:
    result = html.xpath(query)
    print(result)

[<Element html at 0x20d4835e3c0>, <Element body at 0x20d48471980>, <Element div at 0x20d4835a680>, <Element ul at 0x20d4835a900>]
[<Element div at 0x20d4835a680>]
['item-0']
[<Element a at 0x20d48471980>]
[]
[<Element a at 0x20d4835a680>]
[<Element li at 0x20d4835a5c0>, <Element li at 0x20d4835a900>, <Element li at 0x20d4835a700>, <Element li at 0x20d4835a580>]
