# XML exercise

Using data from [**mondial database**](https://drive.google.com/file/d/14lFT4nWHgwN36ij4XZh6OUuup-K9qLgR/view?usp=sharing) find the answers to following questions:

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse('mondial.xml')  # Load from file
root = tree.getroot()

# 1

In [15]:
infant_mortality = {'country': [],
                   'infant_m_rate': []}

for country in root.findall('country'):
    try:
        infant_mortality['infant_m_rate'].append(country.find('infant_mortality').text)
        infant_mortality['country'].append(country.find('name').text)
    except AttributeError:
        pass
    
df = pd.DataFrame(infant_mortality)
df

Unnamed: 0,country,infant_m_rate
0,Albania,13.19
1,Greece,4.78
2,North Macedonia,7.9
3,Serbia,6.16
4,Andorra,3.69
...,...,...
223,Swaziland,54.82
224,Reunion,7.5
225,Saint Helena,17.63
226,Sao Tome and Principe,49.16


In [17]:
df['infant_m_rate'] = pd.to_numeric(df['infant_m_rate'])

In [21]:
df.sort_values('infant_m_rate').head(10)

Unnamed: 0,country,infant_m_rate
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
6,Spain,2.7
72,Hong Kong,2.73
73,Macao,3.13


# 2

In [104]:
city_pop = {'city': [],
            'year': [],
            'pop': []}

for city in root.findall("./country/city"):
    for census in city.findall('population'):
        try:
            city_pop['year'].append(census.get('year'))
            city_pop['pop'].append(census.text)
            city_pop['city'].append(city.find('name').text)
        except AttributeError:
            pass
    
df = pd.DataFrame(city_pop)
typed_df = df.astype({'pop': 'int64'})
sorted_df = typed_df.sort_values(['city','year'])
sorted_df

Unnamed: 0,city,year,pop
884,Abomey-Calavi,1992,126507
885,Abomey-Calavi,2002,307745
511,Abu Dhabi,1989,363432
512,Abu Dhabi,1995,398695
513,Abu Dhabi,2003,552000
...,...,...,...
65,Zagreb,2001,691724
66,Zagreb,2011,686568
383,Zarqa,2004,395227
534,al Hudaydah,1994,298452


In [105]:
# drop duplicates where mulitiple population counts / city. 
dropped_d = sorted_df.drop_duplicates(subset='city', keep="last")
dropped_d

Unnamed: 0,city,year,pop
885,Abomey-Calavi,2002,307745
514,Abu Dhabi,2015,1202756
766,Adamstown,2013,56
531,Aden,2004,570551
137,Akureyri,2015,17195
...,...,...,...
489,Yongin,2015,982590
78,Zadar,2011,70674
66,Zagreb,2011,686568
383,Zarqa,2004,395227


In [106]:
dropped_d.sort_values('pop', ascending=False).head(10)

Unnamed: 0,city,year,pop
398,Seoul,2015,9805506
373,Al Qahirah,2006,8471859
208,Bangkok,2010,8305218
282,Hong Kong,2009,7055071
545,Singapore,2010,5076700
370,Al Iskandariyah,2006,4123869
557,New Taipei,2012,3939305
403,Busan,2015,3440484
229,Pyongyang,2008,3255288
955,Nairobi,2009,3133518


# 3a
name and country of a) longest river

In [79]:
# River
rivers = {'country': [],
         'name': [],
         'length': []}

for river in root.findall('river'):
    try:
        rivers['length'].append(river.find('length').text)
        rivers['country'].append(river.get('country'))
        rivers['name'].append(river.find('name').text)
    except AttributeError as e:
        print(e)
    

df = pd.DataFrame(rivers)
df

'NoneType' object has no attribute 'text'


Unnamed: 0,country,name,length
0,IS,Thjorsa,230
1,IS,Jökulsa a Fjöllum,206
2,GB,Thames,346
3,GB,Severn,354
4,GB,Trent,298
...,...,...,...
433,AUS,Murrumbidgee River,1579
434,AUS,Eucumbene River,83
435,AUS,Snowy River,403
436,NZ,Waikato River,425


In [81]:
typed_df = df.astype({'length': 'float64'})
sorted_df = typed_df.sort_values('length', ascending=False)
sorted_df.head(1)

Unnamed: 0,country,name,length
214,CN,Yangtze,6380.0


## 3b
name and country of  b) largest lake 

In [82]:
# lake
lakes = {'country': [],
         'name': [],
         'area': []}

for lake in root.findall('lake'):
    try:
        lakes['area'].append(lake.find('area').text)
        lakes['country'].append(lake.get('country'))
        lakes['name'].append(lake.find('name').text)
    except AttributeError as e:
        print(e)
    

df = pd.DataFrame(lakes)
df

Unnamed: 0,country,name,area
0,SF,Inarijärvi,1040
1,SF,Oulujärvi,928
2,SF,Saimaa,4370
3,SF,Päijänne,1118
4,N,Mjoesa-See,368
...,...,...,...
184,AUS,Lake Eucumbene,145
185,AUS,Lake Jindabyne,30
186,AUS,Lake Hume,202
187,NZ,Lake Taupo,622


In [85]:
typed_df = df.astype({'area': 'float64'})
sorted_df = typed_df.sort_values('area', ascending=False)
sorted_df.head(1)

Unnamed: 0,country,name,area
59,R AZ KAZ IR TM,Caspian Sea,386400.0


# 3 c
name and country of c) airport at highest elevation

In [94]:
# lake
airports = {'country': [],
         'name': [],
         'elevation': []}

for airport in root.findall('airport'):
    try:
        airports['elevation'].append(airport.find('elevation').text)
        airports['country'].append(airport.get('country'))
        airports['name'].append(airport.find('name').text)
    except AttributeError as e:
        print(e)
    

airport_df = pd.DataFrame(airports)
airport_df

'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute

Unnamed: 0,country,name,elevation
0,AFG,Herat,977
1,AFG,Kabul Intl,1792
2,AL,Tirana Rinas,38
3,DZ,Cheikh Larbi Tebessi,811
4,DZ,Batna Airport,822
...,...,...,...
1287,Z,Livingstone,1007
1288,Z,Ndola,1270
1289,Z,Lusaka Intl,1152
1290,ZW,J M Nkomo Intl,1329


In [103]:
typed_df = airport_df.astype({'elevation': 'float64'})
sorted_df = typed_df.sort_values('elevation', ascending=False)
sorted_df.head(1)

Unnamed: 0,country,name,elevation
81,BOL,El Alto Intl,4063.0
