In [None]:
from lxml import etree

In [29]:
# Sample HTML fragment used by the parsing examples: a <div> wrapping a <ul>
# with five <li> entries, each holding one <a> link.
# Note that the last <li> has no closing </li> tag.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''

In [None]:
# Parse the fragment held in `text` and serialize the resulting tree again.
# The parser completes missing HTML tags while building the tree, so the
# printed document is the repaired markup rather than the raw input.
html = etree.HTML(text)
result = etree.tostring(html)
# The serialized output is decoded as UTF-8 before printing.
print(result.decode('utf-8'))

In [None]:
# Parse markup from a file on disk with an explicit HTML parser, then
# serialize the tree and print it.
# NOTE(review): ./test.html is not created anywhere in this notebook; it
# presumably holds the same markup as `text` -- confirm before a fresh run.
html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))

In [None]:
# Select every element in the document with '//*'. This includes the
# top-level <html> and <body> elements as well as <div>, <ul>, <li>, ...
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//*')
print(result)

In [None]:
# Select every <li> element, wherever it sits in the document.
# Show the complete result list first, then just its first entry.
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li')
print(result, result[0], sep='\n')

In [None]:
# Select every <a> element that is a direct child of an <li> element.
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li/a')
print(result)

In [13]:
# Select every <a> element nested below a <ul> element.
html = etree.parse('./test.html', etree.HTMLParser())
# '//ul/a' returns nothing here because the <a> elements are not direct
# children of <ul>; '//ul//a' also reaches grandchildren and deeper levels.
result = html.xpath('//ul//a')
print(result)

[<Element a at 0x103caf488>, <Element a at 0x103caf9c8>, <Element a at 0x103cafc48>, <Element a at 0x103cafec8>, <Element a at 0x103cafc08>]


In [14]:
# Find <a href="link4.html">, step up to its parent node with '..' and
# read that parent's class attribute.
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/../@class')
print(result)

['item-1']


In [15]:
# Same lookup as the '..' form, written with the explicit parent:: axis:
# the class attribute of the parent of <a href="link4.html">.
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
print(result)

['item-1']


In [16]:
# Filter elements by attribute value: keep only the <li> elements whose
# class attribute equals "item-0".
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]')
print(result)

[<Element li at 0x103caf788>, <Element li at 0x103caf0c8>]


In [18]:
# text() directly under the matching <li> elements yields only a
# newline-plus-indentation string: the visible text sits inside the nested
# <a> elements, not directly inside <li>.
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]/text()')
print(result)

['\n     ']


In [19]:
# Step into the <a> child before calling text() to get the link texts.
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]/a/text()')
print(result)

['first item', 'fifth item']


In [20]:
# '//text()' collects the text nodes at any depth below the matching <li>
# elements, so the result holds both the link texts and the whitespace
# string returned by the direct text() query.
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]//text()')
print(result)

['first item', 'fifth item', '\n     ']


In [21]:
# Read attribute values with '@name': the href of every <a> that is a
# direct child of an <li>.
html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//li/a/@href')
print(result)

['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']


In [25]:
# Matching an attribute that carries several values (class="li li-first").
text = '''
         <li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
class_queries = (
    # Comparing the whole attribute against "li" finds nothing -> []
    '//li[@class="li"]/a/text()',
    # contains() accepts the multi-valued attribute -> ['first item']
    '//li[contains(@class, "li")]/a/text()',
)
for query in class_queries:
    result = html.xpath(query)
    print(result)

[]
['first item']


In [28]:
# Combine several attribute tests in one predicate with `and`: the class
# attribute must contain "li" and the name attribute must equal "item".
text = '''
         <li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
print(result)

['first item']


In [36]:
# Select <li> elements by their position in the list.
# The five-item sample is defined inside this cell so the queries do not
# depend on which cell last assigned `text`: the cell directly above leaves
# a one-item fragment in `text`, which would give different results on a
# top-to-bottom run.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)
# [1]: the first <li> (XPath positions start at 1, not 0)
result = html.xpath('//li[1]/a/text()')
print(result)
# [last()]: the final <li>
result = html.xpath('//li[last()]/a/text()')
print(result)
# [position() < 3]: the first two <li> elements
result = html.xpath('//li[position() < 3]/a/text()')
print(result)
# [last() - 2]: the third <li> counted from the end
result = html.xpath('//li[last() - 2]/a/text()')
print(result)

['first item']
['fifth item']
['first item', 'second item']
['third item']


In [45]:
# Five-item sample list; the first link wraps its text in a <span> so the
# descendant axis has something to find.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
# XPath axes, each evaluated starting from the first <li>.
html = etree.HTML(text)
axis_queries = (
    # ancestor::* -> every ancestor element of the <li>
    '//li[1]/ancestor::*',
    # ancestor::div -> only the ancestors that are <div> elements
    '//li[1]/ancestor::div',
    # attribute::* -> the values of all attributes on the <li>
    '//li[1]/attribute::*',
    # child::a[...] -> direct <a> children whose href is "link1.html"
    '//li[1]/child::a[@href="link1.html"]',
    # descendant::span -> <span> elements at any depth below the <li>
    '//li[1]/descendant::span',
    # following::*[2] -> the second element that comes after the <li>
    '//li[1]/following::*[2]',
    # following-sibling::* -> every later sibling on the same level
    '//li[1]/following-sibling::*',
)
for query in axis_queries:
    result = html.xpath(query)
    print(result)

[<Element html at 0x103da3508>, <Element body at 0x103d271c8>, <Element div at 0x103d98c88>, <Element ul at 0x103d98148>]
[<Element div at 0x103d98c88>]
['item-0']
[<Element a at 0x103da3848>]
[<Element span at 0x103d98148>]
[<Element a at 0x103da3848>]
[<Element li at 0x103d98148>, <Element li at 0x103d98b88>, <Element li at 0x103d98908>, <Element li at 0x103d989c8>]
