# XML exercise

Using data from [**mondial database**](https://drive.google.com/file/d/14lFT4nWHgwN36ij4XZh6OUuup-K9qLgR/view?usp=sharing) find the answers to following questions:

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [62]:
import pandas as pd
import xml.etree.ElementTree as ET
tree=ET.parse('mondial.xml')
root=tree.getroot()

### 1. 10 countries with the lowest infant mortality rates

In [61]:
infant_mortality = []  # Initialize the list where will keep dictionaries with countries and mortality rates

# Iterate through each XML element
for element in root:
    country = element.find('name').text
    
    # there are some countries with missing mortality rates , "try" block is used to set value to None 
    try:
        mortality = element.find('infant_mortality').text
    except AttributeError:
        mortality = None
    
    infant_mortality.append({'Country': country, 'Mortality': mortality})

# Create a DataFrame from the infant_mortality list
df = pd.DataFrame(infant_mortality)

# Drop rows with missing values in the 'Mortality' column
df = df.dropna(subset=['Mortality'])

# Convert the 'Mortality' column data type to float
df['Mortality'] = df['Mortality'].astype(float)

# Sort the DataFrame by 'Mortality' column and display the top 10 rows
top_10_lowest_mortality = df.sort_values(by="Mortality").head(10)

print(top_10_lowest_mortality)

            Country  Mortality
38           Monaco       1.81
98            Japan       2.13
117         Bermuda       2.48
36           Norway       2.48
106       Singapore       2.53
37           Sweden       2.60
10   Czech Republic       2.63
8             Spain       2.70
78        Hong Kong       2.73
79            Macao       3.13


### 2. 10 cities with the largest population

In [157]:
population_by_cities=[]  # Initialize the list where will keep population by cities and year

for city_element in root.iterfind('country/city'):
    for ind in range(len(city_element)):
        if city_element[ind].tag=='population':
            population_by_cities.append([city_element[0].text,city_element[ind].attrib['year'],city_element[ind].text])
            
# Create a DataFrame from the 'population_by_cities' list
df = pd.DataFrame(population_by_cities, columns=["city", "year", "population"])

# keep only the last year of population counting (removing old ones)
df_latest_year=df.sort_values(by='year',ascending=False).drop_duplicates(subset='city')

# converting 'population' data type from string to integer
df_latest_year['population']=pd.to_numeric(df_latest_year['population'])

# sort descending by populations and leave the first 10 rows
df_10_largest_cities=df_latest_year.sort_values(by='population',ascending=False).head(10)

print(df_10_largest_cities)

                city  year  population
398            Seoul  2015     9805506
373       Al Qahirah  2006     8471859
208          Bangkok  2010     8305218
282        Hong Kong  2009     7055071
545        Singapore  2010     5076700
370  Al Iskandariyah  2006     4123869
557       New Taipei  2012     3939305
403            Busan  2015     3440484
229        Pyongyang  2008     3255288
955          Nairobi  2009     3133518


### 3a. Name and country of a longest river

In [214]:
Rivers_list=[] # Initialize the list where will keep rivers names, lenghts and country codes

for element in root.iterfind('river'):
        river_name=element.find('name').text
        river_country_estuary=element.find('estuary').attrib['country']
        
        # was getting error for river length, solve with try block
        try :
            river_lenght=element.find('length').text
        except:
            river_lenght=None
            
        Rivers_list.append([river_name,river_lenght,river_country_estuary])

# Create a DataFrame from the 'Rivers_list'
df=pd.DataFrame(Rivers_list,columns=["River", "Lenght", "Country(ies) Code"])

# Convert 'Lenght' from text to numeric
df['Lenght']=pd.to_numeric(df['Lenght'])

# Sort descending by the lenght and keep the first one
df_sorted=df.sort_values(by="Lenght",ascending=False).head(1)
df_sorted

#lake,area, located country="SF"

Unnamed: 0,River,Lenght,Country(ies) Code
214,Yangtze,6380.0,CN


### 3b. Name and country of a largest lake

In [222]:
Lakes_list=[] # Initialize the list where will keep lakes names, area and country codes

for element in root.iterfind('lake'):
        lake_name=element.find('name').text
        
        # was getting error for lake country , solve with try block
        try:
            lake_country=element.find('located').attrib['country']
        except:
            lake_country=None
        lake_area=element.find('area').text
            
        Lakes_list.append([lake_name,lake_area,lake_country])

# Create a DataFrame from the 'Lakes_list'
df=pd.DataFrame(Lakes_list,columns=["Lake", "Area", "Country(ies) Code"])

# Convert 'Lenght' from text to numeric
df['Area']=pd.to_numeric(df['Area'])

# Sort descending by the area and keep the first one
df_sorted=df.sort_values(by="Area",ascending=False).head(1)
df_sorted

Unnamed: 0,Lake,Area,Country(ies) Code
59,Caspian Sea,386400.0,R


### 3c. Name and country of a largest airport at highest elevation

In [240]:
Airports_list=[] # Initialize the list where will keep airport names, elevation and country codes

for element in root.iterfind('airport'):
        airport_name=element.find('name').text
        airport_country=element.attrib['country']

        try:
            airport_elevation=element.find('elevation').text
        except:
            airport_elevation=None
            
        Airports_list.append([airport_name,airport_elevation,airport_country])

# Create a DataFrame from the 'Airports_list'
df=pd.DataFrame(Airports_list,columns=["Airport", "Elevation", "Country(ies) Code"])

# Convert 'Elevation' from text to numeric
df['Elevation']=pd.to_numeric(df['Elevation'])

# Sort descending by the Elevation and keep the first one
df_sorted=df.sort_values(by="Elevation",ascending=False).head(1)
df_sorted

Unnamed: 0,Airport,Elevation,Country(ies) Code
81,El Alto Intl,4063.0,BOL
