# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [321]:
# Parse the example XML file into an ElementTree; the cells below walk it.
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [322]:
# print names of all countries (one per child of the root element)
for child in document_tree.getroot():
    # A parenthesised single-argument print produces the same output on
    # Python 2 and is also valid on Python 3.
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [323]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    # Element.iter() is the supported spelling of getiterator(), which is
    # deprecated and no longer exists on Python 3.9+.
    city_names = [city.find('name').text for city in element.iter('city')]
    # Build the whole line first and print it once: this avoids both the
    # Python-2-only trailing-comma print and the repeated string appends.
    print('* ' + element.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

## 1. 10 countries with the lowest infant mortality rates

In [324]:
# Open in binary mode so the XML parser decodes the bytes itself, using the
# encoding declared in the document instead of the platform default.
with open( './data/mondial_database.xml', 'rb') as xml_file:
    document = ET.parse( xml_file )

In [325]:
inf_mort_dict = {}

# map each country name to its infant mortality rate ('N/A' when unknown)
for element in document.iterfind('country'):
    country = element.find('name').text
    rate_node = element.find('infant_mortality')
    try:
        inf_mortality = float(rate_node.text)
    except (AttributeError, TypeError, ValueError):
        # AttributeError: the country has no <infant_mortality> element;
        # TypeError: the element is empty; ValueError: text is not a number.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        inf_mortality = 'N/A'
    inf_mort_dict[country] = inf_mortality

In [326]:
import operator

# Countries without a rate are stored as the string 'N/A'. Ordering strings
# against floats only works by accident on Python 2 and raises TypeError on
# Python 3, so rank the numeric entries only.
rated_countries = [entry for entry in inf_mort_dict.items()
                   if isinstance(entry[1], float)]

# sort ascending by rate and show the ten lowest
sorted_inf_mort = sorted(rated_countries, key=operator.itemgetter(1))
sorted_inf_mort[0:10]

[('Monaco', 1.81),
 ('Japan', 2.13),
 ('Bermuda', 2.48),
 ('Norway', 2.48),
 ('Singapore', 2.53),
 ('Sweden', 2.6),
 ('Czech Republic', 2.63),
 ('Hong Kong', 2.73),
 ('Macao', 3.13),
 ('Iceland', 3.15)]

# 2a. 10 cities with the largest population

In [327]:
city_pop_dict = {}

# map each city name to its most recent population figure
for element in document.iterfind('country'):

    # Element.iter() is the supported spelling of the deprecated getiterator()
    for subelement in element.iter('city'):
        # get_recent_population() compares against the module-level `year`,
        # so it has to be reset before each city is processed
        year = 0
        city = subelement.find('name').text

        for sub in subelement.iter('population'):
            int_pop = get_recent_population(sub)

            # The helper returns None for an entry that is not newer than one
            # already seen; keep the figure recorded so far in that case
            # instead of overwriting it with None.
            if int_pop is not None:
                city_pop_dict[city] = int_pop
    


In [328]:
import operator

# rank the (city, population) pairs by population, largest first,
# and display the top ten
sorted_city_pop_dict = list(city_pop_dict.items())
sorted_city_pop_dict.sort(key=operator.itemgetter(1), reverse=True)
sorted_city_pop_dict[:10]

[('Shanghai', 22315474),
 ('Istanbul', 13710512),
 ('Mumbai', 12442373),
 ('Moskva', 11979529),
 ('Beijing', 11716620),
 (u'S\xe3o Paulo', 11152344),
 ('Tianjin', 11090314),
 ('Guangzhou', 11071424),
 ('Delhi', 11034555),
 ('Shenzhen', 10358381)]

## 2b. 10 countries with the largest population
I misread the question at first and ranked countries instead of cities. I'll leave the result here anyway.

In [347]:
pop_dict = {}

# map each country name to the largest population figure found under it
for element in document.iterfind('country'):
    country = element.find('name').text

    # Element.iter() is the supported spelling of the deprecated
    # getiterator(); it visits every <population> element nested anywhere
    # below the country.
    figures = [int(node.text) for node in element.iter('population')]
    largest = max(figures) if figures else 0

    # countries without any positive figure are left out of the dict
    if largest > 0:
        pop_dict[country] = largest

In [348]:
# rank the (country, population) pairs by population, largest first,
# and display the top ten
sorted_pop_dict = sorted(
    pop_dict.items(),
    key=operator.itemgetter(1),
    reverse=True,
)
sorted_pop_dict[:10]

[('China', 1360720000),
 ('India', 1210854977),
 ('United States', 318857056),
 ('Indonesia', 252124458),
 ('Brazil', 202768562),
 ('Pakistan', 173149306),
 ('Nigeria', 164294516),
 ('Bangladesh', 149772364),
 ('Russia', 148178487),
 ('Japan', 128057352)]

## 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [331]:
from lxml import etree
# Parse the full data set again, this time with lxml, into a separate tree.
tree = etree.parse('./data/mondial_database.xml')

In [332]:
# This problem only needs country-level population values, so detach every
# <city> subtree and then every <province> subtree from the lxml tree.
for query in ('//city', '//province'):
    for elem in tree.xpath(query):
        parent = elem.getparent()
        parent.remove(elem)

In [333]:
from collections import defaultdict
ethnicgroup_dict = defaultdict(list)

for element in tree.iterfind('country'):

    # get_recent_population() tracks the newest year seen so far in the
    # module-level `year`, so reset it for every country
    year = 0

    # Start from "unknown" so a country without population entries never
    # inherits the figure computed for the previous country.
    latest_population = None

    # find the latest/best population estimate for this country
    for subelement in element.iter('population'):
        candidate = get_recent_population(subelement)
        # None means this entry is not newer than one already seen
        if candidate is not None:
            latest_population = candidate

    # without a population figure the group sizes cannot be estimated
    if latest_population is None:
        continue

    # find each ethnic group under this country
    for subelement in element.iter('ethnicgroup'):

        # the group name is the dict key
        ethgroup = subelement.text

        # share of the country's population belonging to this group
        ethpercent_num = float(subelement.attrib['percentage'])

        # head count for the group in this country, derived from the share
        percent_of_pop = int(round(latest_population * (ethpercent_num / 100)))

        # collect the per-country estimates in a list per group
        ethnicgroup_dict[ethgroup].append(percent_of_pop)

In [334]:
# total the per-country estimates collected for every ethnic group
summed_ethgroup = {
    group: sum(estimates) for group, estimates in ethnicgroup_dict.items()
}

In [335]:
# rank the (group, total) pairs by total, largest first
sorted_ethgroup_dict = list(summed_ethgroup.items())
sorted_ethgroup_dict.sort(key=operator.itemgetter(1), reverse=True)

# answer to the question: the ten largest groups
sorted_ethgroup_dict[:10]

[('Han Chinese', 1245058800),
 ('Indo-Aryan', 871815583),
 ('European', 494872221),
 ('African', 318325122),
 ('Dravidian', 302713744),
 ('Mestizo', 157734355),
 ('Bengali', 146776917),
 ('Russian', 131856994),
 ('Japanese', 126534212),
 ('Malay', 121993550)]

# 4. Name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [336]:
# Open in binary mode so the XML parser decodes the bytes itself, using the
# encoding declared in the document instead of the platform default.
with open( './data/mondial_database.xml', 'rb') as xml_file:
    doc = ET.parse( xml_file )

## 4.a longest river

In [337]:
# map every country's name to the code held in its car_code attribute
country_abbrev_dict = {}

for element in doc.iterfind('country'):
    code = element.attrib['car_code']
    country_abbrev_dict[element.find('name').text] = code

In [338]:
# initiate list to hold one dict per river
river_list = []

for element in doc.iterfind('river'):
    # pull out the river name
    river_name = element.find('name').text

    # A river without a usable length cannot be ranked, so skip it. Silently
    # ignoring the failure would record the previous river's length here.
    length_node = element.find('length')
    if length_node is None or length_node.text is None:
        continue
    try:
        river_length_int = float(length_node.text)
    except ValueError:
        continue
    # keep whole-number lengths as ints; fractional ones stay floats
    if river_length_int.is_integer():
        river_length_int = int(river_length_int)

    # pull out the country abbreviation from the river's source, if present
    source = element.find('source')
    country_abbrev = source.get('country') if source is not None else None

    # append a dict with this river's values to the list of all rivers
    river_list.append({
        'name': river_name,
        'length': river_length_int,
        'country': country_abbrev,
    })


In [339]:
# sort list of dicts based on length of river
sort_on = 'length'

sorted_river = [(dict_[sort_on], dict_) for dict_ in river_list]

# Highest value first, ordering on the length only. Without a key, two rivers
# of equal length fall back to comparing the dicts themselves, which raises
# TypeError on Python 3.
sorted_river.sort(key=lambda pair: pair[0], reverse=True)

# show only the first entry
# from country_abbrev_dict 'PE' is Peru
result = [dict_ for (key, dict_) in sorted_river]
result[0:1]


[{'country': 'PE', 'length': 6448, 'name': 'Amazonas'}]

## 4.b largest lake

In [340]:
# Open in binary mode so the XML parser decodes the bytes itself, using the
# encoding declared in the document instead of the platform default.
with open( './data/mondial_database.xml', 'rb') as xml_file:
    doc = ET.parse( xml_file )

In [341]:
# initiate list to hold one dict per lake
lake_list = []

for element in doc.iterfind('lake'):
    # pull out the lake name
    lake_name = element.find('name').text

    # A lake without a usable area cannot be ranked, so skip it. Silently
    # ignoring the failure would record the previous lake's area here.
    area_node = element.find('area')
    if area_node is None or area_node.text is None:
        continue
    try:
        # parse as float first: int() rejects fractional areas such as "0.5"
        lake_area_int = float(area_node.text)
    except ValueError:
        continue
    # keep whole-number areas as ints; fractional ones stay floats
    if lake_area_int.is_integer():
        lake_area_int = int(lake_area_int)

    # pull out the country abbreviation; None when the lake has no <located>
    # element or that element has no country attribute
    located = element.find('located')
    lake_country_abbrev = located.get('country') if located is not None else None

    # append a dict with this lake's values to the list of all lakes
    lake_list.append({
        'name': lake_name,
        'area': lake_area_int,
        'country': lake_country_abbrev,
    })


In [342]:
# sort list of dicts based on area of lake
sort_on = 'area'

sorted_lakes = [(dict_[sort_on], dict_) for dict_ in lake_list]

# Highest value first, ordering on the area only. Without a key, two lakes
# of equal area fall back to comparing the dicts themselves, which raises
# TypeError on Python 3.
sorted_lakes.sort(key=lambda pair: pair[0], reverse=True)

# show only the first entry
# from country_abbrev_dict 'R' is Russia
result = [dict_ for (key, dict_) in sorted_lakes]
result[0:1]

[{'area': 386400, 'country': 'R', 'name': 'Caspian Sea'}]

## 4c. Airport at highest elevation

In [343]:
# Open in binary mode so the XML parser decodes the bytes itself, using the
# encoding declared in the document instead of the platform default.
with open( './data/mondial_database.xml', 'rb') as xml_file:
    doc = ET.parse( xml_file )

In [344]:
# initiate list to hold one dict per airport
airport_list = []

for element in doc.iterfind('airport'):
    # pull out the airport name
    airport_name = element.find('name').text

    # An airport without a usable elevation cannot be ranked, so skip it.
    # Silently ignoring the failure would record the previous airport's
    # elevation here.
    elevation_node = element.find('elevation')
    if elevation_node is None or elevation_node.text is None:
        continue
    try:
        airport_elevation_int = float(elevation_node.text)
    except ValueError:
        continue
    # keep whole-number elevations as ints; fractional ones stay floats
    if airport_elevation_int.is_integer():
        airport_elevation_int = int(airport_elevation_int)

    # pull out the country abbreviation; None when the attribute is absent
    located = element.get('country')

    # append a dict with this airport's values to the list of all airports
    airport_list.append({
        'name': airport_name,
        'elevation': airport_elevation_int,
        'country': located,
    })

In [349]:
# sort list of dicts based on elevation of airport
sort_on = 'elevation'

sorted_airports = [(dict_[sort_on], dict_) for dict_ in airport_list]

# Highest value first, ordering on the elevation only. Without a key, two
# airports at the same elevation fall back to comparing the dicts themselves,
# which raises TypeError on Python 3.
sorted_airports.sort(key=lambda pair: pair[0], reverse=True)

# show only the first entry
# from country_abbrev_dict 'BOL' is Bolivia
result = [dict_ for (key, dict_) in sorted_airports]
result[0:1]

[{'country': 'BOL', 'elevation': 4063, 'name': 'El Alto Intl'}]

## HELPER FUNCTION

In [346]:
def get_recent_population(sub):
    '''
    Return the population stored in one population subelement, but only
    when that subelement is newer than anything seen so far.

    The "newest year so far" lives in the module-level variable ``year``,
    which the caller must reset (e.g. to 0) before each new group of
    subelements. When ``sub``'s ``year`` attribute is later than ``year``,
    ``year`` is updated and the subelement's text is returned as an int.
    Otherwise ``year`` is left alone and None is returned.
    '''
    global year

    # the year attribute is text, so convert it before comparing
    candidate_year = int(sub.attrib['year'])

    # not newer than what has already been seen: nothing to report
    if not candidate_year > year:
        return None

    # remember the newer year first, then hand back its population
    year = candidate_year
    return int(sub.text)