# XML exercise

Using data from [**mondial database**](https://drive.google.com/file/d/14lFT4nWHgwN36ij4XZh6OUuup-K9qLgR/view?usp=sharing) find the answers to following questions:

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [68]:
import xml.etree.ElementTree as ET

import numpy as np

import pandas as pd

In [11]:
mondial = ET.parse('/Users/mitchellpalmer/Projects/Lighthouse Lab Projects/JSON_XML/Data/mondial.xml')

In [12]:
print(type(mondial))

<class 'xml.etree.ElementTree.ElementTree'>


In [None]:
roots = mondial.getroot()
roots

<Element 'mondial' at 0x106c9b6f0>

In [14]:
print(roots.tag)
print(roots.attrib)
print(len(roots))

mondial
{}
3403


In [95]:
country_data = []

for country in roots.findall('country'):
    country_name = country.find('name')
    infant_mortality_value = country.find('infant_mortality')
        
    infant_mortality = (
        infant_mortality_value.text
        if infant_mortality_value is not None
        else np.nan )
    country_data.append([country_name.text,infant_mortality])

question_1 = pd.DataFrame(data=country_data,columns=['country_name','infant_mortality_rate'])



In [94]:
question_1.sort_values(by='infant_mortality_rate').head(10)

Unnamed: 0,country_name,infant_mortality_rate
38,Monaco,1.81
30,Romania,10.16
153,Fiji,10.2
69,Brunei,10.48
132,Grenada,10.5
237,Mauritius,10.59
124,Panama,10.7
243,[],10.77
102,United Arab Emirates,10.92
113,Barbados,10.93


## 2. 10 cities with the largest population

In [114]:
city_populations = []

for country in roots.findall('country'):
    country_name = country.find('name')

    for city in country.findall('city'):
        city_name = (city.find('name')
            if city_name is not None
            else np.nan)
        population = (city.find('population')
            if population is not None
            else np.nan)
        
city_populations.append([city_name.text,population])

In [115]:
city_populations

[['Victoria', <Element 'population' at 0x1243a61b0>]]

In [116]:
city_populations = []

for country in roots.findall('country'):
    country_name = country.find('name').text if country.find('name') is not None else np.nan

    for city in country.findall('city'):
        city_name_elem = city.find('name')
        city_name = city_name_elem.text if city_name_elem is not None else np.nan

        # Extract all population entries under city
        population_entries = city.findall('population')
        latest_population = np.nan
        latest_year = -1

        for pop in population_entries:
            year = pop.attrib.get('year')
            value = pop.text

            if year and value:
                year = int(year)
                value = float(value)

                if year > latest_year:
                    latest_year = year
                    latest_population = value

        city_populations.append([city_name, latest_population])


In [117]:
city_populations

[['Tirana', 418495.0],
 ['Shkodër', 77075.0],
 ['Durrës', 113249.0],
 ['Vlorë', 79513.0],
 ['Elbasan', 78703.0],
 ['Korçë', 51152.0],
 ['Skopje', 514967.0],
 ['Kumanovo', 107745.0],
 ['Beograd', 1639121.0],
 ['Novi Sad', 335701.0],
 ['Niš', 257867.0],
 ['Podgorica', 150977.0],
 ['Prishtine', 198214.0],
 ['Andorra la Vella', 22256.0],
 ['Vaduz', 5207.0],
 ['Ljubljana', 282994.0],
 ['Maribor', 111374.0],
 ['Rīga', 696618.0],
 ['Vilnius', 538747.0],
 ['Kaunas', 307498.0],
 ['Klaipeda', 158891.0],
 ['Luxembourg', 99852.0],
 ['Zagreb', 686568.0],
 ['Split', 165893.0],
 ['Rijeka', 127498.0],
 ['Osijek', 83496.0],
 ['Zadar', 70674.0],
 ['Sofia', 1270284.0],
 ['Plovdiv', 331796.0],
 ['Varna', 330486.0],
 ['Burgas', 197301.0],
 ['Ruse', 146609.0],
 ['Stara Zagora', 136363.0],
 ['Tallinn', 399340.0],
 ['Tartu', 103284.0],
 ['Tórshavn', 13130.0],
 ['Monaco', 975.0],
 ['Gibraltar', nan],
 ['Saint Peter Port', nan],
 ['Vatican City', 842.0],
 ['Ceuta', 82376.0],
 ['Melilla', 78476.0],
 ['Reykjavik'

## 3. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [140]:
airports = []

for airport in roots.findall('airport'):
    name_element = airport.find('name')
    elevation_element = airport.find('elevation')

    airport_name = name_element.text if name_element is not None else np.nan
    elevation = float(elevation_element.text) if elevation_element is not None else np.nan

    airports.append([airport_name,elevation])

In [143]:
airports_df = pd.DataFrame(data= airports,columns=['airport_name','elevation'])

In [145]:
highest_airport = airports_df.sort_values(by='elevation', ascending=False).head(1)

highest_airport

Unnamed: 0,airport_name,elevation
81,El Alto Intl,4063.0
