# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [9]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [10]:
# Parse the XML file used by the examples below into an ElementTree object.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [17]:
# print names of all countries
# Every direct child of the root is visited and the text of its <name>
# child is printed, one per line.
top_level_nodes = document_tree.getroot()
for node in top_level_nodes:
    name_node = node.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [20]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    # Header line for the country, e.g. "* Albania:".
    print('* ' + element.find('name').text + ':')
    # Element.iter() walks all descendant <city> nodes; it replaces
    # getiterator(), which was removed in Python 3.9.
    city_names = [city.find('name').text for city in element.iter('city')]
    # A country with no cities prints an empty line.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
# Parse the XML file used by the exercise into an ElementTree object.
document = ET.parse( './data/mondial_database.xml' )

In [22]:
# Keep a reference to the root element of the parsed exercise document.
root = document.getroot()

In [66]:
# Part 1 10 countries with the lowest infant mortality rates

# Two parallel lists, exposed through `data` under the keys 'country' and 'im'.
country_names = []
mortality_texts = []
data = {'country': country_names, 'im': mortality_texts}

# Walk the top-level <country> elements, recording each name together with
# the raw infant mortality text ('' when the country has no such child).
for country_node in document.iterfind('country'):
    country_names.append(country_node.find('name').text)
    mortality_node = country_node.find('infant_mortality')
    mortality_texts.append('' if mortality_node is None else mortality_node.text)

In [48]:
import numpy as np
import pandas as pd

In [77]:
# put the country and infant mortality data into a pandas dataframe
data_pd = pd.DataFrame(data)

# deal with missing data: '' entries become NaN so the column can be numeric.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data_pd = data_pd.replace('', np.nan)
data_pd['im'] = data_pd['im'].astype(float)  # convert infant mortality data to float

# sort the dataframe by infant mortality rate, drop countries without a
# value, and show the ten lowest
data_pd.sort_values('im', ascending=True).dropna().head(10)

Unnamed: 0,country,im
38,Monaco,1.81
98,Japan,2.13
117,Bermuda,2.48
36,Norway,2.48
106,Singapore,2.53
37,Sweden,2.6
10,Czech Republic,2.63
78,Hong Kong,2.73
79,Macao,3.13
44,Iceland,3.15


In [None]:
# Part 2 10 cities with the largest population

In [86]:
# Collect one (city, population) row per <city> element and build the
# DataFrame once at the end. Growing a DataFrame row by row with
# df.loc[len(df)] copies data on every insert and does not keep the
# population column as integers.
rows = []

for country in document.findall('country'):
    for city in country.iter('city'):
        city_name = city.find('name').text
        pop_list = city.findall('population')
        # The last <population> entry is taken to be the most recent one;
        # cities without any population data are recorded with 0.
        city_pop = int(pop_list[-1].text) if pop_list else 0
        rows.append((city_name, city_pop))

# Define DataFrame
df = pd.DataFrame(rows, columns=["city", "population"])

# Ten most populous cities
df.sort_values(by='population', ascending=False).head(10)
    

Unnamed: 0,city,population
1341,Shanghai,22315474.0
771,Istanbul,13710512.0
1527,Mumbai,12442373.0
479,Moskva,11979529.0
1340,Beijing,11716620.0
2810,São Paulo,11152344.0
1342,Tianjin,11090314.0
1064,Guangzhou,11071424.0
1582,Delhi,11034555.0
1067,Shenzhen,10358381.0


In [99]:
# Part 3 10 ethnic groups with the largest overall populations
#        (sum of best/latest estimates over all countries)

data = {'egroup': [], 'population': []}

# extract ethnic group and population from xml
for country in document.findall('country'):
    # Look the country's population up before touching the ethnic groups so
    # the result does not depend on the order of the child tags.
    # note: the last population tag is used because it is taken to be the
    # most recent
    population_nodes = country.findall('population')
    population = int(population_nodes[-1].text) if population_nodes else 0

    for node in country.findall('ethnicgroup'):
        egroup = node.text
        # A group without a percentage attribute counts as 0 and is skipped
        # by the check below.
        percentage = float(node.get('percentage', 0))

        if egroup and percentage and population:  # only add egroups with population values
            data['egroup'].append(egroup)
            # calculate ethnic group population from percentage of total overall population
            data['population'].append(int(population * percentage / 100.))

# create a pandas dataframe from the data
df = pd.DataFrame(data)

# group ethnic group from all countries, sum them to find top 10 ethnic groups and total populations
df.groupby('egroup').sum().sort_values(by='population', ascending=False).head(10)


Unnamed: 0_level_0,population
egroup,Unnamed: 1_level_1
Han Chinese,1245058800
Indo-Aryan,871815583
European,494872201
African,318325104
Dravidian,302713744
Mestizo,157734349
Bengali,146776916
Russian,131856989
Japanese,126534212
Malay,121993548


In [116]:
# Part 4 name and country of a) longest river, b) largest lake and c) airport at highest elevation

# Load the full database file.
document = ET.parse( './data/mondial_database.xml' )

river_countries = None
river_name = None
river_length = 0

# Scan every <length> child of every top-level <river> and keep the largest.
for r_node in document.iterfind('river'):
    for length in r_node.iterfind('length'):
        # An empty <length/> tag has no text; float(None) would raise.
        if length.text is None:
            continue
        candidate = float(length.text)
        if river_length < candidate:
            river_length = candidate
            # The 'country' attribute is printed as-is (e.g. "CO BR PE").
            river_countries = r_node.attrib['country']
            river_name = r_node.findtext('name')

print('Longest river: ', river_name)
print('Length: ', river_length)
print('Countries: ', river_countries)



Longest river:  Amazonas
Length:  6448.0
Countries:  CO BR PE


In [117]:
lake_countries = None
lake_name = None
lake_area = 0

# Scan every <area> child of every top-level <lake> and keep the largest.
for l_node in document.iterfind('lake'):
    for area in l_node.iterfind('area'):
        # An empty <area/> tag has no text; float(None) would raise.
        if area.text is None:
            continue
        candidate = float(area.text)
        if lake_area < candidate:
            lake_area = candidate
            # The 'country' attribute is printed as-is (e.g. "R AZ KAZ IR TM").
            lake_countries = l_node.attrib['country']
            lake_name = l_node.findtext('name')

print('Largest lake: ', lake_name)
print('Area: ', lake_area)
print('Countries: ', lake_countries)


Largest lake:  Caspian Sea
Area:  386400.0
Countries:  R AZ KAZ IR TM


In [124]:
airport_country = None
airport_name = None
airport_elevation = 0

# Visit each <elevation> child of each top-level <airport>; whenever one
# exceeds the running maximum, remember the airport's elevation, country
# attribute and name.
for airport in document.iterfind('airport'):
    for elev_node in airport.iterfind('elevation'):
        if elev_node.text is None:
            continue  # elevation tag without text: nothing to compare
        height = float(elev_node.text)
        if height > airport_elevation:
            airport_elevation = height
            airport_country = airport.attrib['country']
            airport_name = airport.findtext('name')

print('Highest airport: ', airport_name)
print('Elevation: ', airport_elevation)
print('Country: ', airport_country)

Highest airport:  El Alto Intl
Elevation:  4063.0
Country:  BOL
