# HTMLParser & XML

An HTMLParser instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are encountered.

In [6]:
from html.parser import HTMLParser

In [8]:
#creating a subclass and overide the Handler method
class myHTMLParser(HTMLParser):
    def handle_starttag(self,tag,attrs):
        print("found an start tag:",tag)
    def handle_endtag(self,tag):
        print("found an end tag:",tag)
    def handle_startendtag(self,tag,attrs):
        print("found a empty tag:",tag)
# instantiate the parser and fed it some HTML
parser = myHTMLParser()
parser.feed("<html><head><title>HTML Parser - I</title></head>"
            +"<body><h1>HackerRank</h1><br /></body></html>")


found an start tag: html
found an start tag: head
found an start tag: title
found an end tag: title
found an end tag: head
found an start tag: body
found an start tag: h1
found an end tag: h1
found a empty tag: br
found an end tag: body
found an end tag: html


1   handle_starttag(tag, attrs)

This method is called to handle the start tag of an element. (For example: <div class='marks'>) 
The tag argument is the name of the tag converted to lowercase. 
The attrs argument is a list of (name, value) pairs containing the attributes found inside the tag’s <> brackets. 


2   handle_endtag(tag)

This method is called to handle the end tag of an element. (For example: </div>) 
The tag argument is the name of the tag converted to lowercase. 


3   handle_startendtag(tag,attrs)

This method is called to handle the empty tag of an element. (For example: <br />) 
The tag argument is the name of the tag converted to lowercase. 
The attrs argument is a list of (name, value) pairs containing the attributes found inside the tag’s <> brackets.

4   handle_comment(data) 
This method is called when a comment is encountered (e.g. <!--comment-->). 
The data argument is the content inside the comment tag:

5   handle_data(data) 
This method is called to process arbitrary data (e.g. text nodes and the content of <script>...</script> and <style>...</style>). 
The data argument is the text content of HTML.

In [17]:
#creating a subclass and overide the Handler method
class myHTMLParser(HTMLParser):
    def handle_starttag(self,tag,attrs):
        print("Start :",tag)

    def handle_endtag(self,tag):
        print("End  :",tag)
    
    def handle_startendtag(self,tag,attrs):
        print("Empty :",tag)
    
    def handle_comment(self,cmt):
        print("Comments :" , cmt)
    
    def handle_data(self,data):
        print("Data :",data)
parser = myHTMLParser()
parser.feed("<html><head><title>HTML Parser - I</title></head>"+
"<body data-modal-target class='1'><h1>HackerRank</h1><br /></body></html>")


Start : html
Start : head
Start : title
Data : HTML Parser - I
End  : title
End  : head
Start : body
->  data-modal-target  >  None
->  class  >  1
Start : h1
Data : HackerRank
End  : h1
Empty : br
End  : body
End  : html


In [41]:
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    def handle_comment(self,cmt):
        number_of_line = len(cmt.split('\n'))
        if number_of_line>1:
            print('>>> Multi-line Comment')
        else:
            print('>>> Single-line Comment')
        if cmt.strip():
            print(cmt)    
    def handle_data(self,data):
        if data!='\n':
            print(">>> Data ")
            print(data)  
html = "<!--[if IE 9]>IE9-specific content\n<![endif]-->\n<div> Welcome to HackerRank</div>\n<!--[if IE 9]>IE9-specific content<![endif]-->"       
parser = MyHTMLParser()
parser.feed(html)
parser.close()

>>> Multi-line Comment
[if IE 9]>IE9-specific content
<![endif]
>>> Data 
 Welcome to HackerRank
>>> Single-line Comment
[if IE 9]>IE9-specific content<![endif]


In [45]:
# print all attribute and values and tags
class myHTMLParser(HTMLParser):
    def handle_starttag(self,tag,attrs):
        print(tag)
        for i,j in attrs:
            print('->',i,'>',j)

    def handle_startendtag(self,tag,attrs):
        print(tag)
        for i,j in attrs:
            print('->',i,'>',j)
html = """
<head>
<title>HTML</title>
</head>
<object type="application/x-flash" 
  data="your-file.swf" 
  width="0" height="0">
  <!-- <param name="movie" value="your-file.swf" /> -->
  <param name="quality" value="high"/>
</object>
"""
parser = myHTMLParser()
parser.feed(html)

head
title
object
-> type > application/x-flash
-> data > your-file.swf
-> width > 0
-> height > 0
param
-> name > quality
-> value > high


# XML

In [109]:
import xml.etree.ElementTree as etree
tree = etree.parse("F:\\android\\Projects\\Calculator\\app\\src\\main\\res\\layout\\activity_main2.xml")
root = tree.getroot()
list(root)
for child in root:
    print(child)

<Element 'TextView' at 0x00952CF0>
<Element 'TextView' at 0x009528A0>
<Element 'TableRow' at 0x00952C00>
<Element 'TableRow' at 0x00952E70>
<Element 'TableRow' at 0x0095A090>
<Element 'TableRow' at 0x0095A270>
<Element 'TableRow' at 0x0095A450>


In [64]:
root.attrib  # root attribute
root[0].attrib.keys()

dict_keys(['{http://schemas.android.com/apk/res/android}background', '{http://schemas.android.com/apk/res/android}layout_height', '{http://schemas.android.com/apk/res/android}id', '{http://schemas.android.com/apk/res/android}layout_width', '{http://schemas.android.com/apk/res/android}textAlignment', '{http://schemas.android.com/apk/res/android}textColor', '{http://schemas.android.com/apk/res/android}textSize', '{http://schemas.android.com/apk/res/android}textStyle'])

In [141]:
import xml.etree.ElementTree as etree
xml = """
<feed xml:lang='en'>
  <title>HackerRank</title>
  <subtitle lang='en'>Programming challenges</subtitle>
  <link rel='alternate' type='text/html' href='http://hank.com/'/>
  <updated>2013-12-25T12:00:00</updated>
  <entry>
  	<author gender='male'>Harsh</author>
    <question type='hard'>XML 1</question>
    <description type='text'>This is related to XML parsing</description>
  </entry>
</feed>"""
tree = etree.ElementTree(etree.fromstring(xml))
root = tree.getroot()
print(sum([len(elem.items()) for elem in tree.iter()]))

8


In [152]:
maxdepth = 0
def depth(elem, level):
    global maxdepth
    for i in elem:
        if len(i.getchildren())==0:
            print(i.getchildren())
            pass
        else :
            maxdepth += 1
            print(i.getchildren())
            depth(i.getchildren())
print(maxdepth)
xml = """
<feed xml:lang='en'>
    <title>HackerRank</title>
    <subtitle lang='en'>Programming challenges</subtitle>
    <link rel='alternate' type='text/html' href='http://hackerrank.com/'/>
    <updated>2013-12-25T12:00:00</updated>
</feed>
"""
tree = etree.ElementTree(etree.fromstring(xml))
depth(tree.getroot() , -1)
print(maxdepth)

[<Element 'title' at 0x00AE2DE0>, <Element 'subtitle' at 0x00AE2EA0>, <Element 'link' at 0x00ADE780>, <Element 'updated' at 0x00ADE4E0>]


TypeError: depth() missing 1 required positional argument: 'level'