# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
# Parse the XML file used by the two example cells that follow.
document_tree = ET.parse('data/mondial_database_less.xml')

In [3]:
# Print the <name> text of every direct child of the root, one per line.
for country in document_tree.getroot():
    name_node = country.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# Print each country's name, then a comma-separated list of its cities.
for element in document_tree.iterfind('country'):
    # Heading line for the country. (A trailing comma after print() is a
    # Python 2 idiom and has no effect in Python 3, so it is not used.)
    print('* ' + element.find('name').text + ':')
    # Element.getiterator() was removed in Python 3.9; Element.iter() walks
    # the same set of descendant <city> elements.
    city_names = [city.find('name').text for city in element.iter('city')]
    # join() yields an empty line when a country has no cities, matching the
    # previous "build with += then slice off the separator" behaviour.
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
# Parse the Mondial database file that the exercise cells query.
document = ET.parse('./data/mondial_database.xml')

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Following the ElementTree reference: get the root element and show its tag.
root = document.getroot()
root.tag

'mondial'

In [8]:
# Attribute dictionary of the root element.
root.attrib

{}

In [9]:
# Show the tag of every direct child of the root element.
for node in root:
    print(node.tag)
# Second child of the first top-level element; shown as the cell's result.
root[0][1]

country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country


<Element 'population' at 0x000000D887A8DB38>

In [10]:
# 1. 10 countries with the lowest infant mortality rates
# Maps country name -> infant mortality rate (as float).
infant_mort = {}

for country in document.findall('country'):
    rate_node = country.find('infant_mortality')
    if rate_node is None:
        # No <infant_mortality> element for this country: leave it out.
        continue
    infant_mortality = float(rate_node.text)

    name_node = country.find('name')
    if name_node is None:
        continue
    infant_mort[name_node.text] = infant_mortality

# One row per country, indexed by name, with a single 'Rate' column.
infant_mort_df = pd.DataFrame.from_dict(
    infant_mort, orient='index'
).rename(columns={0: 'Rate'})
infant_mort_df.sort_values('Rate').head(10)

Unnamed: 0,Rate
Monaco,1.81
Japan,2.13
Bermuda,2.48
Norway,2.48
Singapore,2.53
Sweden,2.6
Czech Republic,2.63
Hong Kong,2.73
Macao,3.13
Iceland,3.15


In [11]:
# 2. 10 cities with the largest population

# One (city name, population) pair per <city> element. A list is used instead
# of a dict keyed on the name so that different cities sharing a name stay as
# separate rows rather than silently overwriting one another.
pop = []

for city in document.iter('city'):
    figures = city.findall('population')
    name_node = city.find('name')
    if not figures or name_node is None:
        # Skip cities that have no population figure or no name.
        continue
    # The last <population> child is used as the city's figure
    # (assumes entries are listed oldest-to-newest -- TODO confirm).
    pop.append((name_node.text, int(figures[-1].text)))

# Same layout as before: city names as the index, one 'Population' column.
pop_df = pd.DataFrame(
    {'Population': [count for _, count in pop]},
    index=[city_name for city_name, _ in pop],
)
pop_df.sort_values('Population', ascending=False).head(10)

Unnamed: 0,Population
Shanghai,22315474
Istanbul,13710512
Mumbai,12442373
Moskva,11979529
Beijing,11716620
São Paulo,11152344
Tianjin,11090314
Guangzhou,11071424
Delhi,11034555
Shenzhen,10358381


In [12]:
# 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

# Rows of [country name, ethnic group name, percentage of that country].
egpop = []
egpop_labels = ['Country', 'Ethnicity', 'Percentage']

for country in document.findall('country'):
    name = country.find('name').text

    for eg in country.iterfind('ethnicgroup'):
        # Element.get() returns None when the attribute is absent, so a group
        # without a percentage is skipped explicitly. The former handlers for
        # AttributeError/SyntaxError could never fire for these statements,
        # while a missing attribute would have raised an uncaught KeyError.
        share = eg.get('percentage')
        if share is None:
            continue
        egpop.append([name, eg.text, float(share)])

egpop_df = pd.DataFrame(egpop, columns=egpop_labels)
egpop_df.head()

Unnamed: 0,Country,Ethnicity,Percentage
0,Albania,Albanian,95.0
1,Albania,Greek,3.0
2,Greece,Greek,93.0
3,Macedonia,Macedonian,64.2
4,Macedonia,Albanian,25.2


In [13]:
# First collect one population figure per country, then attach it to every
# ethnic-group row of that country.
# Maps country name -> value of the last listed <population> element.
pop_ctry = {}

for country in document.iter('country'):
    figures = country.findall('population')
    if not figures:
        # No <population> element at all: leave this country out.
        continue
    population = figures[-1].text

    name_node = country.find('name')
    if name_node is None:
        continue
    pop_ctry[name_node.text] = int(population)

# Turn the mapping into a two-column frame: Country, Population.
pop_ctry_df = pd.DataFrame.from_dict(
    pop_ctry, orient='index'
).reset_index(drop=False)
pop_ctry_df.columns = ['Country', 'Population']

# merge() defaults to an inner join, so ethnic-group rows whose country has
# no entry in pop_ctry_df are dropped here.
egpop_df = egpop_df.merge(pop_ctry_df[['Country', 'Population']], on='Country')
egpop_df.head()

Unnamed: 0,Country,Ethnicity,Percentage,Population
0,Albania,Albanian,95.0,2800138
1,Albania,Greek,3.0,2800138
2,Greece,Greek,93.0,10816286
3,Macedonia,Macedonian,64.2,2059794
4,Macedonia,Albanian,25.2,2059794


In [14]:
# Head-count of each ethnic group inside its country:
# (percentage / 100) * country population.
e = egpop_df
e['Pop_Eg'] = (e['Percentage'] / 100) * e['Population']
e.head()

Unnamed: 0,Country,Ethnicity,Percentage,Population,Pop_Eg
0,Albania,Albanian,95.0,2800138,2660131.0
1,Albania,Greek,3.0,2800138,84004.14
2,Greece,Greek,93.0,10816286,10059150.0
3,Macedonia,Macedonian,64.2,2059794,1322388.0
4,Macedonia,Albanian,25.2,2059794,519068.1


In [15]:
# Total each ethnic group across all countries and show the ten largest.
e = egpop_df
# numeric_only=True keeps the text column 'Country' out of the aggregation;
# without it, pandas >= 2.0 also "sums" (concatenates) the country names.
# Only the Pop_Eg total is meaningful: the summed Percentage and Population
# columns are by-products of aggregating every numeric column.
(
    e.groupby('Ethnicity')
    .sum(numeric_only=True)
    .sort_values('Pop_Eg', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Percentage,Population,Pop_Eg
Ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Han Chinese,91.5,1360720000,1245059000.0
Indo-Aryan,72.0,1210854977,871815600.0
European,970.82,1157295639,494872200.0
African,1868.55,975352746,318325100.0
Dravidian,25.0,1210854977,302713700.0
Mestizo,870.7,279743964,157734400.0
Bengali,98.0,149772364,146776900.0
Russian,224.1,322438406,131857000.0
Japanese,99.4,127298000,126534200.0
Malay,242.3,377500275,121993600.0


In [49]:
# 4a. Name and country of the longest river

# Column labels are kept exactly as spelled, including 'lenght'.
long_river_labels = ['name', 'country_code', 'lenght']
# Rows of [river name, country code, length].
long_river = []

for river in document.iterfind('river'):
    length_node = river.find('length')
    if length_node is None:
        # Rivers without a <length> element cannot be ranked.
        continue
    length = float(length_node.text)

    name_node = river.find('name')
    if name_node is None:
        continue

    # NOTE(review): find() returns only the first <located> child, so each
    # river is recorded with a single country code, and rivers without any
    # <located> child are left out -- confirm this is the intended choice.
    located_node = river.find('located')
    if located_node is None:
        continue

    long_river.append([name_node.text, located_node.attrib['country'], length])

long_river_df = pd.DataFrame(long_river, columns=long_river_labels)
long_river_df.sort_values('lenght', ascending=False).head(1)


Unnamed: 0,name,country_code,lenght
164,Amazonas,CO,6448.0


In [48]:
# Lookup table from a country's 'car_code' attribute to its name.
ctrycode = {}
for country in document.findall('country'):
    # Check both pieces explicitly: a country lacking either the car_code
    # attribute or a <name> child is skipped instead of raising.
    country_code = country.get('car_code')
    name_node = country.find('name')
    if country_code is None or name_node is None:
        continue
    ctrycode[country_code] = name_node.text

# Two-column frame: Country_Code, Country.
ctrycode_df = pd.DataFrame.from_dict(ctrycode, orient='index')
ctrycode_df.reset_index(drop=False, inplace=True)
ctrycode_df.columns = ['Country_Code', 'Country']
# Resolve the country code reported for the longest river.
ctrycode_df[ctrycode_df.Country_Code == "CO"]

Unnamed: 0,Country_Code,Country
141,CO,Colombia


In [65]:
# 4b. Name and country of the largest lake

# Rows of [lake name, country code, area].
larg_lake = []
larg_lake_labels = ['name', 'country_code', 'area']

for lake in document.iterfind('lake'):
    try:
        area = float(lake.find('area').text)
        name = lake.find('name').text
        # Only the first <located> child is consulted for the country code.
        ctry = lake.find('located').attrib['country']
    except AttributeError:
        # find() returned None: <area>, <name> or <located> is missing.
        continue
    larg_lake.append([name, ctry, area])

larg_lake_df = pd.DataFrame(larg_lake, columns=larg_lake_labels)
# One-row frame holding the lake with the greatest area.
a = larg_lake_df.sort_values('area', ascending=False).head(1)
# Read the code positionally; a hard-coded row label only works for one
# particular ordering of the input data.
b = ctrycode_df[ctrycode_df.Country_Code == a['country_code'].iloc[0]]
print(a['name'], b['Country'])

52    Caspian Sea
Name: name, dtype: object 23    Russia
Name: Country, dtype: object


In [88]:
# 4c. Name and country of the airport at the highest elevation

# Rows of [airport name, country code, elevation (still text at this point)].
high_arprt = []
high_arprt_labels = ['name', 'country_code', 'elevation']

for arprt in document.iterfind('airport'):
    try:
        elev = arprt.find('elevation').text
        name = arprt.find('name').text
        ctry = arprt.attrib['country']
    except AttributeError:
        # find() returned None: <elevation> or <name> is missing.
        continue
    high_arprt.append([name, ctry, elev])

high_arprt_df = pd.DataFrame(high_arprt, columns=high_arprt_labels)
# Elevations are read as text; errors='coerce' turns empty or non-numeric
# entries into NaN so they cannot win the ranking.
high_arprt_df['elevation'] = pd.to_numeric(
    high_arprt_df['elevation'], errors='coerce'
)
# One-row frame holding the highest airport.
highest_arprt = high_arprt_df.sort_values('elevation', ascending=False).head(1)
# Look the country up from this cell's own result. Indexing `a` (the one-row
# lake result) with label 80 raises KeyError, so the code is taken
# positionally from the airport row instead.
b = ctrycode_df[
    ctrycode_df.Country_Code == highest_arprt['country_code'].iloc[0]
]
# Displayed as the cell result; `b` holds the matching country row.
highest_arprt

Unnamed: 0,name,country_code,elevation
80,El Alto Intl,BOL,4063.0
