# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [16]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [54]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [7]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ': ',end='')
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [8]:
document = ET.parse( './data/mondial_database.xml' )

## 1. 10 countries with the lowest infant mortality rates

In [56]:
# Build Country Dataframe From XML

#First Build a list of dictionaries from XML
#Note: Not all countries list infant_mortality.  So in this case, don't import the country into dataframe
rows_list = []

for child in document.iterfind('country'):
    dict1 = {}
    dict1['name'] = child.find('name').text
    node = child.find('infant_mortality')
    if node == None:
        continue
    else:
        dict1['infant_mortality'] = child.find('infant_mortality').text
    rows_list.append(dict1)
    
#Then create the dataframe and sort by infant_mortality
countries = pd.DataFrame(rows_list, columns=('name','infant_mortality'))
countries

Unnamed: 0,name,infant_mortality
0,Albania,13.19
1,Greece,4.78
2,Macedonia,7.9
3,Serbia,6.16
4,Andorra,3.69
5,France,3.31
6,Spain,3.33
7,Austria,4.16
8,Czech Republic,2.63
9,Germany,3.46


In [57]:
#List 10 countries with lowest infant mortality
countries = countries.sort_values(by='infant_mortality')
countries.head(10)

Unnamed: 0,name,infant_mortality
36,Monaco,1.81
28,Romania,10.16
142,Fiji,10.2
63,Brunei,10.48
124,Grenada,10.5
221,Mauritius,10.59
116,Panama,10.7
227,Seychelles,10.77
94,United Arab Emirates,10.92
105,Barbados,10.93


## 2. 10 cities with the largest population

In [61]:
# Build City Dataframe From XML

#First Build a list of dictionaries from XML
#Iterate through each country, the iterate through each city 
rows_list = []

for child in document.iterfind('country'):
    country_name = child.find('name').text
    for child2 in child.iterfind('city'):
        dict1 = {}
        dict1['country'] = country_name
        dict1['city'] = child2.find('name').text
        #The XML can contain several population listings, we will only use the latest one
        maxpopyear = 0
        maxpop = 0
        for child3 in child2.iterfind('population'):
            popyear = int(child3.attrib['year'])
            pop = int(child3.text)
            if popyear > maxpopyear:
                maxpopyear = popyear
                maxpop = pop
        dict1['population'] = maxpop
        dict1['populationyear'] = maxpopyear
        rows_list.append(dict1)
    
#Then create the dataframe
cities = pd.DataFrame(rows_list, columns=('country','city','population','populationyear'))
cities

Unnamed: 0,country,city,population,populationyear
0,Albania,Tirana,418495,2011
1,Albania,Shkodër,77075,2011
2,Albania,Durrës,113249,2011
3,Albania,Vlorë,79513,2011
4,Albania,Elbasan,78703,2011
5,Albania,Korçë,51152,2011
6,Macedonia,Skopje,514967,2011
7,Macedonia,Kumanovo,107745,2011
8,Serbia,Beograd,1639121,2011
9,Serbia,Novi Sad,335701,2011


In [62]:
#List the 10 cities with the largest population
cities = cities.sort_values(by='population', ascending=False)
cities.head(10)

Unnamed: 0,country,city,population,populationyear
176,South Korea,Seoul,9708483,2010
164,Egypt,Al Qahirah,8471859,2006
80,Thailand,Bangkok,7506700,1999
128,Hong Kong,Hong Kong,7055071,2009
92,Vietnam,Ho Chi Minh,5968384,2009
212,Singapore,Singapore,5076700,2010
163,Egypt,Al Iskandariyah,4123869,2006
216,Taiwan,New Taipei,3939305,2012
177,South Korea,Busan,3403135,2010
107,North Korea,Pyongyang,3255288,2008


# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [63]:
# Build Ethnic Groups Dataframe From XML

#First Build a list of dictionaries from XML
#Iterate through each country, find latest population count, then iterate through each ethnicgroup and add to list
rows_list = []
for child in document.iterfind('country'):
    country_name = child.find('name').text
    #The XML can contain several population listings, we will only use the latest one
    maxpopyear = 0
    maxpop = 0
    for child2 in child.iterfind('population'):
        popyear = int(child2.attrib['year'])
        pop = int(child2.text)
        if popyear > maxpopyear:
            maxpopyear = popyear
            maxpop = pop
        
    for child3 in child.iterfind('ethnicgroup'):
        dict1 = {}
        dict1['country'] = country_name
        dict1['totalpopulation'] = maxpop
        dict1['populationyear'] = maxpopyear
        dict1['ethnicgroup'] = child3.text
        dict1['percentage'] = float(child3.attrib['percentage'])
        rows_list.append(dict1)
    
#Then create the dataframe
ethnicgroups = pd.DataFrame(rows_list, columns=('country','ethnicgroup','percentage','totalpopulation','populationyear'))
ethnicgroups

Unnamed: 0,country,ethnicgroup,percentage,totalpopulation,populationyear
0,Albania,Albanian,95.00,2800138,2011
1,Albania,Greek,3.00,2800138,2011
2,Greece,Greek,93.00,10816286,2011
3,Macedonia,Macedonian,64.20,2059794,2011
4,Macedonia,Albanian,25.20,2059794,2011
5,Macedonia,Turkish,3.90,2059794,2011
6,Macedonia,Gypsy,2.70,2059794,2011
7,Macedonia,Serb,1.80,2059794,2011
8,Serbia,Serb,82.90,7120666,2011
9,Serbia,Montenegrin,0.90,7120666,2011


In [64]:
#Add a column to dataset which calculates percentage of total population
ethnicgroups['ethnicgrouppopulation'] = (ethnicgroups['percentage'] / 100) * ethnicgroups['totalpopulation']
ethnicgroups.ethnicgrouppopulation = ethnicgroups.ethnicgrouppopulation.astype(int)
ethnicgroups

Unnamed: 0,country,ethnicgroup,percentage,totalpopulation,populationyear,ethnicgrouppopulation
0,Albania,Albanian,95.00,2800138,2011,2660131
1,Albania,Greek,3.00,2800138,2011,84004
2,Greece,Greek,93.00,10816286,2011,10059145
3,Macedonia,Macedonian,64.20,2059794,2011,1322387
4,Macedonia,Albanian,25.20,2059794,2011,519068
5,Macedonia,Turkish,3.90,2059794,2011,80331
6,Macedonia,Gypsy,2.70,2059794,2011,55614
7,Macedonia,Serb,1.80,2059794,2011,37076
8,Serbia,Serb,82.90,7120666,2011,5903032
9,Serbia,Montenegrin,0.90,7120666,2011,64085


In [65]:
egtotals = ethnicgroups.groupby('ethnicgroup').ethnicgrouppopulation.sum()
egtotals.sort_values(ascending=False).head(10)

ethnicgroup
Han Chinese    1245058800
Indo-Aryan      871815583
European        494872201
African         318325104
Dravidian       302713744
Mestizo         157734349
Bengali         146776916
Russian         131856989
Japanese        126534212
Malay           121993548
Name: ethnicgrouppopulation, dtype: int32

## 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [88]:
#First we build dataset of all the countries so we can search by country id
rows_list = []
for child in document.iterfind('country'):
    dict1 = {}
    dict1['countryid'] = child.attrib['car_code']
    dict1['countryname'] = child.find('name').text
    rows_list.append(dict1)
    
countryids = pd.DataFrame(rows_list, columns=('countryid','countryname'))
#countryids = countryids.set_index('id')
countryids
    

Unnamed: 0,countryid,countryname
0,AL,Albania
1,GR,Greece
2,MK,Macedonia
3,SRB,Serbia
4,MNE,Montenegro
5,KOS,Kosovo
6,AND,Andorra
7,F,France
8,E,Spain
9,A,Austria


In [95]:
#Get the Longest River
rows_list = []
for child in document.iterfind('river'):
    rivername = child.find('name').text
    node = child.find('length')
    if node == None:
        continue
    else:
        length = float(node.text)
    #multiple countries possible
    for child2 in child.iterfind('located'):
        countryid = child2.attrib['country']
        #countryname = str(countryids.loc[countryid])
        dict1 = {}
        dict1['name'] = rivername
        dict1['length'] = length
        dict1['countryid'] = countryid
        rows_list.append(dict1)
        
rivers = pd.DataFrame(rows_list, columns=('name','length','countryid'))
rivers = rivers.merge(countryids)
rivers.sort_values(by='length',ascending=False).head(3)

    

Unnamed: 0,name,length,countryid,countryname
258,Amazonas,6448.0,PE,Peru
250,Amazonas,6448.0,BR,Brazil
247,Amazonas,6448.0,CO,Colombia


In [100]:
#Get the Largest Lake
rows_list = []
for child in document.iterfind('lake'):
    lakename = child.find('name').text
    node = child.find('area')
    if node == None:
        continue
    else:
        area = float(node.text)
    #multiple countries possible
    for child2 in child.iterfind('located'):
        countryid = child2.attrib['country']
        #countryname = str(countryids.loc[countryid])
        dict1 = {}
        dict1['name'] = lakename
        dict1['area'] = area
        dict1['countryid'] = countryid
        rows_list.append(dict1)
        
lakes = pd.DataFrame(rows_list, columns=('name','area','countryid'))
lakes = lakes.merge(countryids)
lakes.sort_values(by='area',ascending=False).head(4)


Unnamed: 0,name,area,countryid,countryname
67,Caspian Sea,386400.0,TM,Turkmenistan
64,Caspian Sea,386400.0,KAZ,Kazakhstan
63,Caspian Sea,386400.0,IR,Iran
50,Caspian Sea,386400.0,R,Russia


In [105]:
#Get the Airport with the Highest Elevation
rows_list = []
for child in document.iterfind('airport'):
    dict1 = {}
    dict1['name'] = child.find('name').text
    dict1['countryid'] = child.attrib['country']
    node = child.find('elevation')
    if node == None:
        continue
    elif node.text == None:
        continue
    else:
        dict1['elevation'] = float(node.text)
    rows_list.append(dict1)
        
airports = pd.DataFrame(rows_list, columns=('name','elevation','countryid'))
airports = airports.merge(countryids)
airports.sort_values(by='elevation',ascending=False).head(1)

Unnamed: 0,name,elevation,countryid,countryname
80,El Alto Intl,4063.0,BOL,Bolivia
