# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [3]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [122]:
# Parse the reduced Mondial sample file into an ElementTree.
sample_path = './data/mondial_database_less.xml'
document_tree = ET.parse(sample_path)

In [123]:
# List every child of the root element (one per country) by its <name> text.
root = document_tree.getroot()
for country in root:
    name_node = country.find('name')
    print(name_node.text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [124]:
# Print each country followed by a comma-separated list of its cities.
for country in document_tree.iterfind('country'):
    print('* ' + country.find('name').text + ':')
    # Element.iter() walks all descendants, so cities nested below other
    # elements are found as well as direct children. It replaces
    # getiterator(), which was removed in Python 3.9.
    city_names = [city.find('name').text for city in country.iter('city')]
    print(', '.join(city_names))

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [125]:
# Parse the full Mondial database; the exercise cells query `document`.
full_database_path = './data/mondial_database.xml'
document = ET.parse(full_database_path)

In [126]:
# 1. 10 countries with the lowest infant mortality rates

# Collect (rate, country) pairs instead of a dict keyed by rate, so that
# countries sharing the same rate are all kept and the builtin `dict` is
# not shadowed.
infant_mortality = []
for country in document.iterfind('country'):
    rate = country.find('infant_mortality')
    # Skip countries with no <infant_mortality> element or an empty one.
    if rate is None or not (rate.text or '').strip():
        continue
    infant_mortality.append((float(rate.text), country.find('name').text))

print ("10 countries with the lowest infant mortality rates are:")
# Sort on the rate only; the sort is stable, so ties keep document order.
for _, country_name in sorted(infant_mortality, key=lambda pair: pair[0])[:10]:
    print(country_name)
    

    
        
        
   

10 countries with the lowest infant mortality rates are:
Monaco
Japan
Bermuda
Singapore
Sweden
Czech Republic
Hong Kong
Macao
Iceland
Italy


In [127]:
#2. 10 cities with the largest population
# First attempt: the path 'country/city' only matches <city> elements that are
# direct children of a <country>, and findtext() returns the text of the first
# <population> child of each city.
# NOTE(review): not verified whether that first entry is the latest census.

# (population, name) pairs rather than a dict keyed by population, so cities
# with equal populations do not overwrite each other.
city_populations = []
for city in document.findall('country/city'):
    city_name = city.findtext('name')
    population = city.findtext('population')
    # Skip cities missing either a name or a population figure.
    if city_name is not None and population:
        city_populations.append((int(population), city_name))

print ("10 cities with the largest populations are:")
# Sort on the population only; ties keep document order (stable sort).
ranked_cities = sorted(city_populations, key=lambda pair: pair[0], reverse=True)
for _, city_name in ranked_cities[:10]:
    print(city_name)


10 cities with the largest populations are:
Seoul
Hong Kong
Al Qahirah
Bangkok
Ho Chi Minh
Busan
New Taipei
Hanoi
Al Iskandariyah
Taipei


In [178]:
#2. 10 cities with the largest population
# Second attempt: visits every <city> anywhere below a country and uses the
# last <population> entry found for that city.
# NOTE(review): presumably the last entry is the latest census - confirm
# against the data.

# (population, name) pairs rather than a dict keyed by population, so cities
# with equal populations do not overwrite each other.
city_populations = []
for country in document.iterfind('country'):
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for city in country.iter('city'):
        city_name = city.find('name').text

        latest_population = None
        for population_node in city.iter('population'):
            latest_population = population_node.text

        # Skip cities that have no population figure at all.
        if latest_population:
            city_populations.append((int(latest_population), city_name))

print ("10 cities with the largest populations are:")
# Sort on the population only; ties keep document order (stable sort).
ranked_cities = sorted(city_populations, key=lambda pair: pair[0], reverse=True)
for _, city_name in ranked_cities[:10]:
    print(city_name)


10 cities with the largest populations are:
Shanghai
Istanbul
Mumbai
Moskva
Beijing
São Paulo
Tianjin
Guangzhou
Delhi
Shenzhen


In [241]:
#3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
ethnic_totals = {}  # ethnic group name -> estimated head count summed over countries

for country in document.iterfind('country'):
    # findall() returns the direct <population> children in document order;
    # the last one is used as the country's population.
    # NOTE(review): assumes entries are listed oldest to newest - confirm.
    populations = country.findall('population')
    if not populations or not (populations[-1].text or '').strip():
        # Without a population the percentages cannot be turned into head
        # counts, and a previous country's figure must not be reused.
        continue
    population = int(populations[-1].text)

    for group in country.findall('ethnicgroup'):
        percentage = float(group.attrib['percentage'])
        group_population = percentage * population / 100
        ethnic_totals[group.text] = ethnic_totals.get(group.text, 0.0) + group_population

# Rank group names by their accumulated totals, largest first.
ranked_groups = sorted(ethnic_totals, key=ethnic_totals.get, reverse=True)
for group_name in ranked_groups[:10]:
    print(group_name, ": ",ethnic_totals[group_name] )


Han Chinese :  1245058800.0
Indo-Aryan :  871815583.44
European :  494872219.71959996
African :  318325120.369
Dravidian :  302713744.25
Mestizo :  157734354.93699998
Bengali :  146776916.72
Russian :  131856996.077
Japanese :  126534212.0
Malay :  121993550.374


In [384]:
#4. name and country of a) longest river, b) largest lake and c) airport at highest elevation
def _largest_feature(tree, feature_tag, measure_tag, convert=float):
    """Return (name, country) of the `feature_tag` element with the largest measure.

    tree        -- object providing iterfind(), e.g. the parsed ElementTree
    feature_tag -- tag of the elements to scan ('river', 'lake', 'airport')
    measure_tag -- child tag holding the number to maximise ('length', ...)
    convert     -- callable turning the measure text into a number

    Elements whose measure child is missing or empty are skipped. The country
    is the element's raw 'country' attribute (e.g. 'CO BR PE'). On ties the
    first element in document order wins.

    Raises ValueError if no element has a usable measure.
    """
    best = None  # (value, name, country) of the largest element seen so far
    for feature in tree.iterfind(feature_tag):
        measure = feature.find(measure_tag)
        if measure is None or not (measure.text or '').strip():
            continue
        value = convert(measure.text)
        if best is None or value > best[0]:
            best = (value, feature.find('name').text, feature.attrib['country'])
    if best is None:
        raise ValueError('no <%s> element has a <%s> value' % (feature_tag, measure_tag))
    return best[1], best[2]


longestRiver, riverCountries = _largest_feature(document, 'river', 'length')
print (longestRiver, " is the longest River, located in ", riverCountries)
largestLake, lakeCountries = _largest_feature(document, 'lake', 'area')
print(largestLake, " is the largest Lake, located in ", lakeCountries)
highestAirport, airportCountry = _largest_feature(document, 'airport', 'elevation', convert=int)
print (highestAirport, " has the highest elevation, it is located in ", airportCountry)
    
    
    
    
    
 

Amazonas  is the longest River, located in  CO BR PE
Caspian Sea  is the largest Lake, located in  R AZ KAZ IR TM
El Alto Intl  has the highest elevation, it is located in  BOL
