In [33]:
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
# Parse the XML file into an ElementTree object.
# 'data.xml' is a relative path, so it is resolved against the notebook's working directory.
tree = ET.parse('data.xml')

In [10]:
# Pretty-print the XML file with minidom (alternatively, just open data.xml in a browser).
# The blank lines in the printed result are the file's own whitespace-only text nodes:
# toprettyxml() re-emits them in addition to the indentation it adds itself.

import xml.dom.minidom

# For XML held in a string rather than a file, use xml.dom.minidom.parseString(xml_string)
dom = xml.dom.minidom.parse('data.xml')
pretty_xml = dom.toprettyxml()
print(pretty_xml)

<?xml version="1.0" ?>
<data>
	
   
	<country name="Liechtenstein">
		
       
		<rank>1</rank>
		
       
		<year>2008</year>
		
       
		<gdppc>141100</gdppc>
		
       
		<neighbor name="Austria" direction="E"/>
		
       
		<neighbor name="Switzerland" direction="W"/>
		
   
	</country>
	
   
	<country name="Singapore">
		
       
		<rank>4</rank>
		
       
		<year>2011</year>
		
       
		<gdppc>59900</gdppc>
		
       
		<neighbor name="Malaysia" direction="N"/>
		
   
	</country>
	
   
	<country name="Panama">
		
       
		<rank>68</rank>
		
       
		<year>2011</year>
		
       
		<gdppc>13600</gdppc>
		
       
		<neighbor name="Costa Rica" direction="W"/>
		
       
		<neighbor name="Colombia" direction="E"/>
		
   
	</country>
	

</data>



In [3]:
# Show the parsed object and its class, one per line:
# ET.parse() hands back an ElementTree wrapper rather than an Element.
print(tree, type(tree), sep='\n')

<xml.etree.ElementTree.ElementTree object at 0x000001FC5DED9A30>
<class 'xml.etree.ElementTree.ElementTree'>


In [4]:
# Fetch the document's root element (the outermost tag) from the tree.
root = tree.getroot()
# Bare last expression: the notebook displays the element's repr.
root

<Element 'data' at 0x000001FC5DEC4860>

In [6]:
# Print, one per line: the root's tag name, its attribute dict
# (no attributes within the root tag, so this is empty), and its number of direct children.
for root_detail in (root.tag, root.attrib, len(root)):
    print(root_detail)

data
{}
3


In [18]:
# First child of the root (the first <country> element)
country1 = root[0]
print(country1)

# First child of that child (the country's <rank> element)
rank = country1[0]
print(rank)
print(root[0][0])  # alternative: the same element via chained indexing

# What is the tag of the grandchild
print(rank.tag)

# What is the text inside this grandchild
print(rank.text)

# What are the attributes of the last element?
# Index -1 always addresses the last child, however many children the country has
# (a hard-coded 4 only works for a country with exactly five children).
print(country1[-1].attrib)

<Element 'country' at 0x000001FC5DEC48B0>
<Element 'rank' at 0x000001FC5DEC4900>
<Element 'rank' at 0x000001FC5DEC4900>
rank
1
{'name': 'Switzerland', 'direction': 'W'}


In [19]:
# Visit every direct child of the root whose tag is 'country'
for country in root.findall('country'):
    # <rank> is a child element of the country; its value is the element's text
    rank_element = country.find('rank')
    # 'name' is an attribute on the <country> tag itself
    country_name = country.get('name')
    print(country_name, rank_element.text)

Liechtenstein 1
Singapore 4
Panama 68


In [22]:
# iter() searches the whole subtree below the root (not just direct children),
# so it reaches the <neighbor> elements nested inside each <country>.
neighbor_elements = root.iter('neighbor')
for neighbor_element in neighbor_elements:
    print(neighbor_element.attrib)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


In [30]:
# XPath expressions, each evaluated relative to the root element; results are printed in this order.
xpath_queries = [
    # "." is the context node itself, so this yields a list holding only the root
    ".",
    # All 'neighbor' grand-children of the root, i.e. children of its 'country' children
    "./country/neighbor",
    # Parents of a 'year' element, kept only when they have name='Singapore'
    ".//year/..[@name='Singapore']",
    # 'year' elements that are children of elements with name='Singapore'
    ".//*[@name='Singapore']/year",
    # [2] counts among 'neighbor' siblings only: each parent's second 'neighbor',
    # regardless of where it sits among all of the parent's children
    ".//neighbor[2]",
]

for xpath in xpath_queries:
    print(root.findall(xpath))

[<Element 'data' at 0x000001FC5DEC4860>]
[<Element 'neighbor' at 0x000001FC5DEC49F0>, <Element 'neighbor' at 0x000001FC5DEC4A40>, <Element 'neighbor' at 0x000001FC5DEC4BD0>, <Element 'neighbor' at 0x000001FC5DEC4D60>, <Element 'neighbor' at 0x000001FC5DEC4DB0>]
[<Element 'country' at 0x000001FC5DEC4A90>]
[<Element 'year' at 0x000001FC5DEC4B30>]
[<Element 'neighbor' at 0x000001FC5DEC4A40>, <Element 'neighbor' at 0x000001FC5DEC4DB0>]


In [34]:
# My method

# Build one record (dict) per <country>. Children are looked up by tag, so the
# result does not depend on the order of the child elements, and no module-level
# name such as `rank` is rebound to a list along the way.
country_records = [
    {
        'Name': country.get('name'),        # attribute on the <country> tag
        'Rank': country.find('rank').text,  # text of the child elements
        'Year': country.find('year').text,
        'GDPPC': country.find('gdppc').text,
    }
    for country in root
]

# Element.text is always a string, so cast the numeric columns explicitly;
# `columns` fixes the column order and names even if there are no records.
df = pd.DataFrame(
    country_records, columns=['Name', 'Rank', 'Year', 'GDPPC']
).astype({'Rank': 'int64', 'Year': 'int64', 'GDPPC': 'int64'})
df

Unnamed: 0,Name,Rank,Year,GDPPC
0,Liechtenstein,1,2008,141100
1,Singapore,4,2011,59900
2,Panama,68,2011,13600


In [35]:
# LHL solution

import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse('data.xml')  # Load from file
root = tree.getroot()

# Column name -> list of values, one entry appended per <country>
my_dict = {'name': [],
           'rank': [],
           'year': [],
           'gdppc': []}


for country in root:
    # The country's name is an attribute on the <country> tag itself
    my_dict['name'].append(country.attrib['name'])

    # Look the child elements up by tag instead of by position (country[0],
    # country[1], ...), so a reordered or additional child element cannot
    # silently end up in the wrong column.
    for field in ('rank', 'year', 'gdppc'):
        my_dict[field].append(country.find(field).text)

# Element text is always a string; cast the numeric columns so they can be
# sorted and used in arithmetic as numbers.
df = pd.DataFrame(my_dict).astype({'rank': 'int64', 'year': 'int64', 'gdppc': 'int64'})
df

Unnamed: 0,name,rank,year,gdppc
0,Liechtenstein,1,2008,141100
1,Singapore,4,2011,59900
2,Panama,68,2011,13600


Note: because every child of the root is a `country` element, `for country in root:` is equivalent to `for country in root.findall('country'):` in this case.