# XML exercise

Using data from [**mondial database**](https://drive.google.com/file/d/14lFT4nWHgwN36ij4XZh6OUuup-K9qLgR/view?usp=sharing) find the answers to following questions:

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [1255]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

In [1256]:
# parsed xml tree and extracted root
tree = ET.parse('data/mondial.xml')
root = tree.getroot()

### 10 countries with the lowest infact mortality rates

In [1257]:
root[0].attrib

{'car_code': 'AL',
 'area': '28750',
 'capital': 'cty-Albania-Tirane',
 'memberships': 'org-BSEC org-CEI org-CD org-SELEC org-CE org-EAPC org-EBRD org-EITI org-FAO org-IPU org-IAEA org-IBRD org-ICC org-ICAO org-ICCt org-Interpol org-IDA org-IFRCS org-IFC org-IFAD org-ILO org-IMO org-IMF org-IOC org-IOM org-ISO org-OIF org-ITU org-ITUC org-IDB org-MIGA org-NATO org-OSCE org-OPCW org-OAS org-OIC org-PCA org-UN org-UNCTAD org-UNESCO org-UNIDO org-UPU org-WCO org-WFTU org-WHO org-WIPO org-WMO org-UNWTO org-WTO'}

In [1258]:
country_count = 0
for country in root.findall('country'):
    country_count += 1
    
print(country_count, 'countries')

244 countries


In [1259]:
city_count = 0
for country in root.findall('country'):
    city_count += len(country.findall('city'))

print(city_count, 'cities')

401 cities


In [1260]:
mortality_dict = {'name': [],
           'infant_mortality': []}

for country in root.findall('country'):
    
    if country.find('infant_mortality') != None:
        name = country.find('name').text
        infant_mortality = country.find('infant_mortality').text
        mortality_dict['name'].append(name)
        mortality_dict['infant_mortality'].append(infant_mortality)

In [1261]:
pd.DataFrame(mortality_dict).sort_values(by="infant_mortality", ascending=False).head(10)

Unnamed: 0,name,infant_mortality
197,Central African Republic,92.86
214,Guinea-Bissau,90.92
198,Chad,90.3
159,Argentina,9.96
66,Thailand,9.86
58,Bahrain,9.68
123,Greenland,9.42
188,Botswana,9.38
129,Sint Maarten,9.05
99,Sri Lanka,9.02


### 10 cities with the largest population

In [1262]:
population_dict = {'name': [],
                   'population': []}

for country in root.findall('country'):
    pop = []
    name = country.find('name').text
    
    for population in country.findall("population[@measured='census']"):
        
        if population.text != None:
            pop.append(population.text)
        else:
            pop.append(country.find("population").text)
    
    population_dict['name'].append(name)
    # convert population list from string to integer before adding to dict
    # get the largest population in the list
    population_dict['population'].append(max(map(int, pop), default=None))

# create DataFrame and fill any missing values with 0
population_data = pd.DataFrame(population_dict).sort_values(by='population', ascending=False).fillna(0)
# suppress scientific notation on population column
population_data['population'] = population_data['population'].apply(int)

population_data.head(10)

Unnamed: 0,name,population
55,China,1339724852
67,India,1210854977
120,United States,308745538
88,Indonesia,237641326
57,Pakistan,207776954
176,Brazil,190732694
65,Bangladesh,149772364
23,Russia,142856536
202,Nigeria,140431790
98,Japan,128057352


### Name and country of:

In [1263]:
features_dict = {'country': [],
                 'country_id': [],
                 'city': [],
                 'city_id': [],
                 'river_name': [],
                 'lake_name': [],
                 'elevation': []}

# iterate for each country
for country in root.findall('country'):
    country_name = country.find('name').text
    country_no = country.get('car_code')
    
    # then iterate for each city
    for city in country.findall('city'):
        
        city_name = city.find('name').text
        city_no = city.get('id')
        city_elevation = np.nan if city.find('elevation') is None else float(city.find('elevation').text)
        
        lake_name = np.nan
        river_name = np.nan
        
        for lake in city.findall("located_at[@watertype='lake']"):
            
            if len(lake):
                lake_name = np.nan
            else:
                lake_name = lake.get('lake')
            
            features_dict['country'].append(country_name)
            features_dict['country_id'].append(country_no)
            features_dict['city'].append(city_name)
            features_dict['city_id'].append(city_no)
            features_dict['elevation'].append(city_elevation)
            features_dict['lake_name'].append(lake_name)
            features_dict['river_name'].append(river_name)
            
        for river in city.findall("located_at[@watertype='river']"):
            
            if len(river):
                river_name = np.nan
            else:
                river_name = river.get('river')
                
            features_dict['country'].append(country_name)
            features_dict['country_id'].append(country_no)
            features_dict['city'].append(city_name)
            features_dict['city_id'].append(city_no)
            features_dict['elevation'].append(city_elevation)
            features_dict['lake_name'].append(lake_name)
            features_dict['river_name'].append(river_name)
        
        if type(lake_name) != str and type(river_name) != str:
            
            features_dict['country'].append(country_name)
            features_dict['country_id'].append(country_no)
            features_dict['city'].append(city_name)
            features_dict['city_id'].append(city_no)
            features_dict['elevation'].append(city_elevation)
            features_dict['lake_name'].append(lake_name)
            features_dict['river_name'].append(river_name)
            

# create DataFrame
city_features = pd.DataFrame(features_dict)

In [1264]:
city_features

Unnamed: 0,country,country_id,city,city_id,river_name,lake_name,elevation
0,Albania,AL,Tirana,cty-Albania-Tirane,,,110.0
1,Albania,AL,Shkodër,stadt-Shkoder-AL-AL,,lake-Skutarisee,13.0
2,Albania,AL,Durrës,stadt-Durres-AL-AL,,,40.0
3,Albania,AL,Vlorë,stadt-Vlore-AL-AL,,,25.0
4,Albania,AL,Elbasan,stadt-Elbasan-AL-AL,,,150.0
...,...,...,...,...,...,...,...
402,Reunion,REUN,Saint-Denis,city-Saint-Denis-REUN-REUN,,,25.0
403,Reunion,REUN,Saint-Paul,cty-Saint-Paul-REUN-REUN,,,3.0
404,Saint Helena,HELX,Jamestown,city-Jamestown-HELX-HELX,,,25.0
405,Sao Tome and Principe,STP,São Tomé,cty-Sao-Tome-and-Principe-Sao-Tome,,,137.0


In [1265]:
print(len(city_features[~city_features['elevation'].apply(np.isnan)]), 'elevation')
print(len(city_features[city_features['lake_name'].apply(lambda x: isinstance(x, str))]), 'lakes')
print(len(city_features[city_features['river_name'].apply(lambda x: isinstance(x, str))]), 'rivers')

263 elevation
10 lakes
52 rivers


In [1266]:
city_features.sort_values(by='elevation', ascending=False).head(10)

Unnamed: 0,country,country_id,city,city_id,river_name,lake_name,elevation
314,Ecuador,EC,Quito,cty-Ecuador-Quito,,,2850.0
316,Ecuador,EC,Cuenca,city-EC-Cuenca,,,2560.0
73,Bhutan,BHT,Thimphu,cty-Bhutan-Thimphu,,,2334.0
376,Eritrea,ER,Asmara,cty-Eritrea-Asmara,,,2326.0
191,Yemen,YE,Sanaa,cty-Yemen-Sanaa,,,2250.0
225,Guatemala,GCA,Chichicastenango,cty-Guatemala-g,,,1965.0
383,Kenya,EAK,Nakuru,cty-Kenya-5,,,1850.0
60,Afghanistan,AFG,Kabul,cty-Afghanistan-Kabul,,,1791.0
361,Burundi,BI,Gitega,cty-Burundi-Gitega,river-Luvironza,,1725.0
381,Kenya,EAK,Nairobi,cty-Kenya-Nairobi,,,1661.0


In [1267]:
city_features.sort_values(by='river_name').head(10)

Unnamed: 0,country,country_id,city,city_id,river_name,lake_name,elevation
22,Luxembourg,L,Luxembourg,cty-Luxembourg-Luxembourg,river-Alzette,,230.0
370,South Sudan,SSD,Juba,cty-SouthSudan-Juba,river-Bahr_el-Djebel,,550.0
332,Algeria,DZ,Ech Chelif,cty-Algeria-22,river-Chelif,,114.0
331,Algeria,DZ,Mostaganem,cty-Algeria-12,river-Chelif,,104.0
57,Moldova,MD,Tiraspol,cty-Moldova-Tiraspol,river-Dnister,,
32,Bulgaria,BG,Ruse,cty-Bulgaria-5,river-Donau,,45.0
8,Serbia,SRB,Beograd,city-Belgrade-SRB-SRB,river-Donau,,117.0
10,Serbia,SRB,Novi Sad,city-NoviSad-SRB-SRB,river-Donau,,72.0
17,Slovenia,SLO,Maribor,cty-Slovenia-Maribor,river-Drau,,275.0
26,Croatia,HR,Osijek,cty-Croatia-4,river-Drau,,94.0


In [1268]:
city_features.sort_values(by='lake_name').head(10)

Unnamed: 0,country,country_id,city,city_id,river_name,lake_name,elevation
69,Azerbaijan,AZ,Baku,cty-Azerbaijan-Baku,,lake-KaspischesMeer,-28.0
70,Azerbaijan,AZ,Ganja,cty-Azerbaijan-Ganja,,lake-KaspischesMeer,408.0
71,Azerbaijan,AZ,Sumgayit,cty-Azerbaijan-Sumgaitu,,lake-KaspischesMeer,26.0
229,Nicaragua,NIC,Managua,cty-Nicaragua-Managua,,lake-Managua,83.0
235,Nicaragua,NIC,Tipitapa,cty-Nicaragua-8,,lake-Managua,50.0
233,Nicaragua,NIC,Granada,cty-Nicaragua-6,,lake-Nicaragua_See,
1,Albania,AL,Shkodër,stadt-Shkoder-AL-AL,,lake-Skutarisee,13.0
82,Thailand,THA,Songkhla,cty-Thailand-4,,lake-Songkhla,11.0
360,Burundi,BI,Bujumbura,cty-Burundi-Bujumbura,,lake-Tanganjikasee,774.0
382,Kenya,EAK,Kisumu,cty-Kenya-4,,lake-Victoriasee,1131.0


## 1. Longest river

In [1269]:
river_dict = {'name': [],
              'id': [],
              'source': [],
              'length': []}

for river in root.findall('river'):
    river_dict['name'].append(river.find('name').text)
    river_dict['id'].append(river.get('id'))
    river_dict['source'].append(river.find('source').get('country'))
    river_dict['length'].append(np.nan if river.find('length') is None else float(river.find('length').text))

In [1270]:
rivers = pd.DataFrame(river_dict)
rivers

Unnamed: 0,name,id,source,length
0,Thjorsa,river-Thjorsa,IS,230.0
1,Jökulsa a Fjöllum,river-Joekulsa_a_Fjoellum,IS,206.0
2,Thames,river-Themse,GB,346.0
3,Severn,river-Severn,GB,354.0
4,Trent,river-Trent,GB,298.0
...,...,...,...,...
434,Murrumbidgee River,river-MurrumbidgeeRiver,AUS,1579.0
435,Eucumbene River,river-Eucumbene,AUS,83.0
436,Snowy River,river-SnowyRiver,AUS,403.0
437,Waikato River,river-WaikatoRiver,NZ,425.0


In [1271]:
pd.merge(city_features, rivers, left_on='river_name', right_on='id').groupby(by='river_name').max().sort_values(by='length', ascending=False).head(10)

Unnamed: 0_level_0,country,country_id,city,city_id,lake_name,elevation,name,id,source,length
river_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
river-Mekong,Laos,LAO,Vientiane,cty-Laos-Vientiane,,174.0,Mekong,river-Mekong,CN,4350.0
river-Nil,Egypt,ET,Suhaj,cty-Egypt-Cairo,,194.0,Nile,river-Nil,SUD,3090.0
river-Donau,Serbia,SRB,Ruse,cty-Bulgaria-5,,117.0,Donau,river-Donau,D,2845.0
river-Parana,Paraguay,PY,Ciudad del Este,cty-Paraguay-CiudadEste,,34.0,Parana,river-Parana,BR,2640.0
river-Paraguay,Paraguay,PY,Asunción,cty-Paraguay-Asuncion,,43.0,Paraguay,river-Paraguay,BR,2549.0
river-Uruguay,Uruguay,ROU,Salto,cty-Uruguay-Salto,,48.0,Uruguay,river-Uruguay,BR,1790.0
river-Schari,Chad,TCH,N'Djamena,cty-Chad-NDjamena,,298.0,Schari,river-Schari,RCA,1400.0
river-Kura,Georgia,GE,Tbilisi,cty-Georgia-1,,380.0,Kura,river-Kura,TR,1364.0
river-Dnister,Moldova,MD,Tiraspol,cty-Moldova-Tiraspol,,,Dnister,river-Dnister,UA,1352.0
river-Rhein,Liechtenstein,FL,Vaduz,cty-Liechtenstein-Vaduz,,455.0,Rhein,river-Rhein,CH,1324.0


## 2. Largest lake

In [1272]:
list(root.find('lake'))

[<Element 'name' at 0x29909a400>,
 <Element 'located' at 0x29909a450>,
 <Element 'to' at 0x29909a4a0>,
 <Element 'area' at 0x29909a4f0>,
 <Element 'latitude' at 0x29909a540>,
 <Element 'longitude' at 0x29909a590>,
 <Element 'elevation' at 0x29909a5e0>,
 <Element 'depth' at 0x29909a630>]

In [1273]:
lake_dict = {'name': [],
             'location_id': [],
             'id': [],
             'area': []}

for lake in root.findall('lake'):
    lake_dict['name'].append(lake.find('name').text)
    lake_dict['location_id'].append(lake.get('country'))
    lake_dict['id'].append(lake.get('id'))
    lake_dict['area'].append(float(lake.find('area').text))

In [1274]:
lakes = pd.DataFrame(lake_dict)

In [1275]:
pd.merge(city_features, lakes, left_on='lake_name', right_on='id').groupby(by='lake_name').max().sort_values(by='area', ascending=False)

Unnamed: 0_level_0,country,country_id,city,city_id,river_name,elevation,name,location_id,id,area
lake_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lake-KaspischesMeer,Azerbaijan,AZ,Sumgayit,cty-Azerbaijan-Sumgaitu,,408.0,Caspian Sea,R AZ KAZ IR TM,lake-KaspischesMeer,386400.0
lake-Victoriasee,Kenya,EAK,Kisumu,cty-Kenya-4,,1131.0,Lake Victoria,EAT EAK EAU,lake-Victoriasee,68870.0
lake-Tanganjikasee,Burundi,BI,Bujumbura,cty-Burundi-Bujumbura,,774.0,Lake Tanganjika,ZRE Z BI EAT,lake-Tanganjikasee,32893.0
lake-Nicaragua_See,Nicaragua,NIC,Granada,cty-Nicaragua-6,,,Lake Nicaragua,NIC CR,lake-Nicaragua_See,8157.0
lake-Songkhla,Thailand,THA,Songkhla,cty-Thailand-4,,11.0,Songkhla Lake,THA,lake-Songkhla,1040.0
lake-Managua,Nicaragua,NIC,Tipitapa,cty-Nicaragua-Managua,,83.0,Lake Managua,NIC,lake-Managua,1035.0
lake-Skutarisee,Albania,AL,Shkodër,stadt-Shkoder-AL-AL,,13.0,Lake Skutari,AL MNE,lake-Skutarisee,368.0


## 3. Airport at Highest Elevation

In [1276]:
list(root.find('airport'))

[<Element 'name' at 0x298cc8270>,
 <Element 'latitude' at 0x298cc8400>,
 <Element 'longitude' at 0x298cc83b0>,
 <Element 'elevation' at 0x298cc8360>,
 <Element 'gmtOffset' at 0x298cc8310>]

In [1285]:
airport_dict = {'airport_name': [],
                'city': [],
                'elevation': []}

for airport in root.findall('airport'):
    airport_dict['airport_name'].append(airport.find('name').text)
    airport_dict['city'].append(airport.get('city'))
    
    if airport.find('elevation') is None:
        airport_dict['elevation'].append(np.nan)
    else:
        airport_dict['elevation'].append(float(airport.find('elevation').text))

In [1286]:
for key, value in airport_dict.items():
    print(key, len(value))

airport_name 1317
city 1317
elevation 1317


In [1287]:
airports = pd.DataFrame(airport_dict)
airports

Unnamed: 0,airport_name,city,elevation
0,Herat,cty-Afghanistan-2,977.0
1,Kabul Intl,cty-Afghanistan-Kabul,1792.0
2,Tirana Rinas,cty-Albania-Tirane,38.0
3,Cheikh Larbi Tebessi,cty-Algeria-14,811.0
4,Batna Airport,cty-Algeria-6,822.0
...,...,...,...
1312,Livingstone,cty-Zambia-Livingstone,1007.0
1313,Ndola,cty-Zambia-3,1270.0
1314,Lusaka Intl,cty-Zambia-2,1152.0
1315,J M Nkomo Intl,cty-Zimbabwe-2,1329.0


In [1289]:
pd.merge(city_features, airports, left_on='city_id', right_on='city').groupby(by='city_id').max().sort_values(by='elevation_y', ascending=False).head(10)

Unnamed: 0_level_0,country,country_id,city_x,river_name,lake_name,elevation_x,airport_name,city_y,elevation_y
city_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cty-Ecuador-Quito,Ecuador,EC,Quito,,,2850.0,Mariscal Sucre Intl,cty-Ecuador-Quito,2813.0
city-EC-Cuenca,Ecuador,EC,Cuenca,,,2560.0,Mariscal Lamar,city-EC-Cuenca,2532.0
cty-Eritrea-Asmara,Eritrea,ER,Asmara,,,2326.0,Asmara Intl,cty-Eritrea-Asmara,2336.0
cty-Bhutan-Thimphu,Bhutan,BHT,Thimphu,,,2334.0,Paro,cty-Bhutan-Thimphu,2235.0
cty-Yemen-Sanaa,Yemen,YE,Sanaa,,,2250.0,Sanaa Intl,cty-Yemen-Sanaa,2200.0
cty-Kenya-16,Kenya,EAK,Eldoret,,,,Eldoret Intl,cty-Kenya-16,2116.0
cty-Afghanistan-Kabul,Afghanistan,AFG,Kabul,,,1791.0,Kabul Intl,cty-Afghanistan-Kabul,1792.0
cty-Namibia-Windhoek,Namibia,NAM,Windhoek,,,,Windhoek Hosea Kutako International Airport,cty-Namibia-Windhoek,1720.0
cty-Kenya-Nairobi,Kenya,EAK,Nairobi,,,1661.0,Nairobi Wilson,cty-Kenya-Nairobi,1691.0
cty-Lesotho-Maseru,Lesotho,LS,Maseru,,,1600.0,Moshoeshoe I Intl,cty-Lesotho-Maseru,1630.0
