# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

In [2]:
import operator

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [4]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':')
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [6]:
# Parse data
document = ET.parse( './data/mondial_database.xml' )

#### 1. 10 countries with the lowest infant mortality rates

In [7]:
# Set up container list to hold values
container = {}

In [8]:
# Fill container with corresponding values of 'country' and 'infant_mortality'
for element in document.iterfind('country'):
    if element.find('infant_mortality') is None:
        pass
    else:
        country_name = element.find('name').text
        inf_mort = element.find('infant_mortality').text
        container[country_name] = float(inf_mort)

In [9]:
#container

In [10]:
# Sorts container according to 'infant_mortality' by making it a a dict first
sorted_dict = sorted(container.items(), key=operator.itemgetter(1))
#sorted_dict

In [11]:
# Print first 10 entries which is the 10 countries with the least infant mortality rates and the corresponding rates
print(sorted_dict[:10])

[('Monaco', 1.81), ('Japan', 2.13), ('Norway', 2.48), ('Bermuda', 2.48), ('Singapore', 2.53), ('Sweden', 2.6), ('Czech Republic', 2.63), ('Hong Kong', 2.73), ('Macao', 3.13), ('Iceland', 3.15)]


#### 2. 10 cities with the largest population

In [12]:
# Set up container list to hold values
container2 = {}

In [13]:
# Fill container with corresponding values of 'city' and 'population'
for element in document.iterfind('country'):
    for subelement in document.getiterator('city'):
        if subelement.find("population[@year='2011']") is None:
            pass
        else:
            city_name = subelement.find('name').text
            pop = subelement.find("population[@year='2011']").text
            container2[city_name] = float(pop)

In [14]:
#container2

In [15]:
# Sorts container according to 'population' of a city by making it a a dict first
sorted_dict2 = sorted(container2.items(), key=operator.itemgetter(1), reverse=True)
#sorted_dict2

In [16]:
# Print first 10 entries which is the biggest populated 10 cities along with their corresponding population
print(sorted_dict2[:10])

[('Mumbai', 12442373.0), ('Delhi', 11034555.0), ('Bangalore', 8443675.0), ('Tehran', 8154051.0), ('Dhaka', 7423137.0), ('Hyderabad', 6731790.0), ('Ahmadabad', 5577940.0), ('Luanda', 5000000.0), ('Chennai', 4646732.0), ('Sydney', 4605992.0)]


#### 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [17]:
# Set up container list to hold values
container3 = {}

In [18]:
# Fill container with corresponding values of 'country', ethnicgroup' and ethnic pop using arithmatic
for element in document.iterfind('country'):
        if element.find('./ethnicgroup[1][@percentage]') is None:
            pass
        else:
            country_name = element.find('name').text
            best_pop = int(element.find('./population[last()]').text)
            ethnic = element.find('./ethnicgroup[1]')
            ethnic_name = ethnic.text
            ethnic_frac = float(ethnic.get('percentage'))/100   # turn percentage into fraction
            #print(country_name, ethnic_name)
            #print(best_pop * ethnic_frac)
            container3[country_name, ethnic_name] = best_pop * ethnic_frac;   # multiply fraction of ethnicgroup by pop

In [19]:
#container3

In [20]:
# Sorts container according to 'population' of an 'ethnicgroup' by making it a a dict first
sorted_dict3 = sorted(container3.items(), key=operator.itemgetter(1), reverse=True)
#sorted_dict3

In [21]:
# Print first 10 entries which is the biggest populated 10 ethnicgroups along with their corresponding population
print(sorted_dict3[:10])

[(('China', 'Han Chinese'), 1245058800.0), (('India', 'Dravidian'), 302713744.25), (('United States', 'European'), 254958101.97759998), (('Nigeria', 'African'), 162651570.84), (('Bangladesh', 'Bengali'), 146776916.72), (('Japan', 'Japanese'), 126534212.00000001), (('Russia', 'Russian'), 114646210.938), (('Indonesia', 'Javanese'), 113456006.10000001), (('Brazil', 'European'), 108886717.794), (('Vietnam', 'Viet/Kinh'), 76078375.3)]


#### 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

##### a) longest river

In [22]:
# Set up container list to hold values
container4 = {}

In [23]:
# Fill container with corresponding values of river name, country name and length
for element in document.iterfind('river'):
    river_id = element.get('id')
    #print(river_id)
    river_name = element.find('name').text
    #print(river_name)
    country_name = element.get('country')
    #print(country_name)
    length = element.find('./length')
    if length is None:
        pass
    else:
        #print(float(length.text))
        container4[river_id, river_name, country_name] = float(length.text)

In [24]:
#container4

In [25]:
# Sorts container according to 'length' of a 'river' by making it a a dict first
sorted_dict4 = sorted(container4.items(), key=operator.itemgetter(1), reverse=True)
#sorted_dict4

In [26]:
# Print first entry which is the longest river
print(sorted_dict4[:1])

[(('river-Amazonas', 'Amazonas', 'CO BR PE'), 6448.0)]


##### b) largest lake

In [27]:
# Set up container list to hold values
container5 = {}

In [28]:
# Fill container with corresponding values of lake name, country name and area
for element in document.iterfind('lake'):
    lake_id = element.get('id')
    #print(lake_id)
    lake_name = element.find('name').text
    #print(lake_name)
    country_name = element.get('country')
    #print(country_name)
    area = element.find('./area')
    if area is None:
        pass
    else:
        #print(float(area.text))
        container5[lake_id, lake_name, country_name] = float(area.text)

In [29]:
#container5

In [30]:
# Sorts container according to 'area' of a 'lake' by making it a a dict first
sorted_dict5 = sorted(container5.items(), key=operator.itemgetter(1), reverse=True)
#sorted_dict5

In [31]:
# Print first entry which is the largest lake
print(sorted_dict5[:1])

[(('lake-KaspischesMeer', 'Caspian Sea', 'R AZ KAZ IR TM'), 386400.0)]


##### c) airport at highest elevation

In [32]:
# Set up container list to hold values
container6 = {}

In [33]:
# Fill container with corresponding values of airport, country name and elevation
for element in document.iterfind('airport'):
    airport_id = element.get('iatacode')
    #print(airport_id)
    airport_name = element.find('name').text
    #print(airport_name)
    country_name = element.get('country')
    #print(country_name)
    elevation = element.find('./elevation').text
    if elevation is None:
        pass
    else:
        #print(float(elevation.text))
        container6[airport_id, airport_name, country_name] = float(elevation)

In [34]:
#container6

In [35]:
# Sorts container according to 'elevation' of an 'airport' by making it a a dict first
sorted_dict6 = sorted(container6.items(), key=operator.itemgetter(1), reverse=True)
#sorted_dict6

In [36]:
# Print first entry which is the highest elevation airport
print(sorted_dict6[:1])

[(('LPB', 'El Alto Intl', 'BOL'), 4063.0)]
