# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [2]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse( './data/mondial_database.xml' )

## Question 1:

In [6]:
# Set blank dictionary
country_dict = {}

#Create loop to find country name and infant mortality rate
for element in document.iterfind('country'):
    country = element.find('name')
    inf_mor = element.find('infant_mortality')
    
    if (country) != None:
        if (inf_mor)!= None:
            country_dict[country.text] = float(inf_mor.text)
        else:
            pass

#Create a dataframe from dictionary        
country_df = pd.DataFrame.from_dict(country_dict, orient='index')

#Set column name to Infant Mortality Rate
country_df.columns = ['Infant_Mortality_Rate']

#Get 10 lowest infant mortality rates by sorting values
country_df.sort_values(by='Infant_Mortality_Rate', ascending=True).head(10)

Unnamed: 0,Infant_Mortality_Rate
Monaco,1.81
Japan,2.13
Bermuda,2.48
Norway,2.48
Singapore,2.53
Sweden,2.6
Czech Republic,2.63
Hong Kong,2.73
Macao,3.13
Iceland,3.15


## Question 2:

In [7]:
document_root = document.getroot()

In [15]:
#Create empty list to store country, city, year, and population
city_pop_list = []

#Create loop to find country name, city, year, and population
for country in document_root.getiterator('country'):
    country_name = country.find('name').text

#Create loop for all cities within a country
    for city in country.iter('city'):
        city_name = city.find('name').text
        
        if city.findall('population') is None:
            continue;
#Some countries have multiple population values. We only want the most recent recorded value,
#so create a loop that finds the max value for year and store that value

        year = 0
        population = 0
        
        for pop in city.findall('population'):
            if int(pop.attrib['year']) > year:
                year = int(pop.attrib['year'])
                population = int(pop.text)

#Append our elements to the empty list            
        city_pop_list.append([country_name, city_name, int(pop.attrib['year']), int(pop.text)])
        
#Create data frame from list
df_city_pop = pd.DataFrame.from_records(city_pop_list)

#Set column names
df_city_pop.columns=['Country','City','Year', 'Population']
            
#Get 10 most populous cities by sorting values
df_city_pop.sort_values(by='Population', ascending=False).head(10)

Unnamed: 0,Country,City,Year,Population
1341,China,Shanghai,2010,22315474
771,Turkey,Istanbul,2012,13710512
1527,India,Mumbai,2011,12442373
479,Russia,Moskva,2013,11979529
1340,China,Beijing,2010,11716620
2810,Brazil,São Paulo,2010,11152344
1342,China,Tianjin,2010,11090314
1064,China,Guangzhou,2010,11071424
1582,India,Delhi,2011,11034555
1067,China,Shenzhen,2010,10358381


## Question 3: 

10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [24]:
#Create empty list to store country, population, year, ethnic groups and percentage of ethnic group
ethnic_grp_list = []

#Create a loop to find country name, population, year, ethnic groups and percentage of ethnic group
for country in document.iterfind('country'):
    country_name = country.find('name').text
    
#Similar to what was done in the above problem, population is stored as the most recent year
    year = 0
    population = 0
    for pop in country.findall('population'):
        if int(pop.attrib['year']) > year:
            population = int(pop.text)
            year = int(pop.attrib['year'])

#Create a loop to find all ethnic groups within a country and store the percentage as a float            
    for ethgrp in country.findall('ethnicgroup'):
        percent = float(ethgrp.attrib['percentage']) / 100

#Append our elements to empty list - for ethnic population total population was multiplied by ethnic percentage        
    ethnic_grp_list.append([country_name, int(pop.text), int(pop.attrib['year']), ethgrp.text, percent, (int(pop.text)*percent)])

#Create a data frame from list
df_ethnic_grp = pd.DataFrame.from_records(ethnic_grp_list)

#Set column names
df_ethnic_grp.columns = ['Country', 'Population', 'Year', 'Ethnic Group', 'Percent', 'Ethnic Pop']

#Select and group by ethnic group and ethnic population
df_ethnic_grp = df_ethnic_grp[['Ethnic Group', 'Ethnic Pop']].groupby(['Ethnic Group']).sum()

#Change float format to get rid of scientific notation
pd.set_option('display.float_format', lambda x: '%.1f' % x)

#Get 10 largest ethnic groups by sorting values
df_ethnic_grp.sort_values('Ethnic Pop', ascending=False).head(10)   

Unnamed: 0_level_0,Ethnic Pop
Ethnic Group,Unnamed: 1_level_1
Han Chinese,1245058800.0
African,261447756.0
Japanese,181309297.7
Bengali,146776916.7
Malay,108725812.3
Eastern Hamitic,82830376.5
Arab-Berber,80604497.2
European,73515284.0
Thai,49486244.2
Mediterranean Nordic,46815916.0


## Question 4:

In [10]:
#Start with dictionary to store country names and codes since the rivers, lakes, and airports use country codes
ctry_dict={}
for ctry_element in document.iterfind('country'):
    ctry_dict[ctry_element.attrib['car_code']]=ctry_element.find('name').text

### Rivers

In [11]:
#Create empty list to store river name and length, as well as, country code and name
river_list = []

#Create loop to find river name, river length, country code, and country name
for element in document.iterfind('river'):
    river = element.find('name').text
    rlen_element = element.find('length')
    if rlen_element != None:
        river_length = rlen_element.text
        
#Rivers pass through multiple countries and are stored together. Split country codes and retrieve country name
        for river_cc in element.attrib['country'].split():
            river_country = ctry_dict[river_cc]
            
#Append our elements to empty list
            river_list.append([river, float(river_length), river_country])

#Create a data frame from list
river_df = pd.DataFrame.from_records(river_list)

#Set column names
river_df.columns=['River', 'River Length', 'Country']

#Sort by river length to find longest river
river_df.sort_values(by='River Length', ascending=False).head(10)

Unnamed: 0,River,River Length,Country
300,Amazonas,6448.0,Peru
298,Amazonas,6448.0,Colombia
299,Amazonas,6448.0,Brazil
240,Jangtse,6380.0,China
239,Hwangho,4845.0,China
215,Lena,4400.0,Russia
358,Zaire,4374.0,Zaire
357,Zaire,4374.0,Congo
245,Mekong,4350.0,Vietnam
243,Mekong,4350.0,Thailand


### Lakes

In [12]:
#The same process was followed as seen above with the river data
lakes_list = []

for element in document.iterfind('lake'):
    lake = element.find('name').text
    area_element = element.find('area')
    if area_element != None:
        area = area_element.text
        for lake_cc in element.attrib['country'].split():
            lake_country = ctry_dict[lake_cc]
            lakes_list.append([lake, float(area), lake_country])
        
lakes_df = pd.DataFrame.from_records(lakes_list)

lakes_df.columns=['Lake', 'Area', 'Country']
        
lakes_df.sort_values(by='Area', ascending=False).head(10)

Unnamed: 0,Lake,Area,Country
68,Caspian Sea,386400.0,Russia
69,Caspian Sea,386400.0,Azerbaijan
70,Caspian Sea,386400.0,Kazakhstan
71,Caspian Sea,386400.0,Iran
72,Caspian Sea,386400.0,Turkmenistan
151,Lake Superior,82103.0,United States
150,Lake Superior,82103.0,Canada
109,Lake Victoria,68870.0,Uganda
107,Lake Victoria,68870.0,Tanzania
108,Lake Victoria,68870.0,Kenya


### Airports

In [13]:
#The same process was followed as seen above with the river and lakes data
airport_list = []

for element in document.iterfind('airport'):
    airport = element.find('name').text
    
#Since airports are only in one country there is no need to use split
    air_cc = element.attrib['country']
    country = ctry_dict[air_cc]
    el_element = element.find('elevation')
    if el_element != None:
        elevation = el_element.text
        
#Some elevations have no text - discard those then elevation can be appended as a float
        if elevation != None:
            airport_list.append([airport, float(elevation), country])
            
airport_df = pd.DataFrame.from_records(airport_list)

airport_df.columns=['Airport Name', 'Elevation', 'Country']

In [14]:
airport_df.sort_values(by='Elevation', ascending=False).head(10)

Unnamed: 0,Airport Name,Elevation,Country
80,El Alto Intl,4063.0,Bolivia
212,Lhasa-Gonggar,4005.0,China
230,Yushu Batang,3963.0,China
787,Juliaca,3827.0,Peru
789,Teniente Alejandro Velasco Astete Intl,3311.0,Peru
82,Juana Azurduy De Padilla,2905.0,Bolivia
308,Mariscal Sucre Intl,2813.0,Ecuador
779,Coronel Fap Alfredo Mendivil Duarte,2719.0,Peru
781,Mayor General FAP Armando Revoredo Iglesias Ai...,2677.0,Peru
666,Licenciado Adolfo Lopez Mateos Intl,2581.0,Mexico
