In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# The long float values in many of these columns push pandas into scientific notation,
# so cap the displayed precision at 3 decimal places for now. Remove this if it gets in the way.
pd.options.display.float_format = '{:.3f}'.format
raw_df = pd.read_csv('nutritional-data.csv')

raw_df.shape

(89010, 61)

In [3]:
# Reshape wide -> long: every column that is not one of the four identifier columns
# (one column per year) is folded into a 'Year' / 'Amount' pair.
# Many more rows, but a much cleaner table to filter on.
data_long = pd.melt(
    raw_df,
    id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
    var_name='Year',
    value_name='Amount',
)

data_long.shape

(5073570, 6)

In [4]:
# Spot-check the reshaped frame. The fixed random_state (arbitrary value) keeps the
# sampled rows identical every time the notebook is re-run from a fresh kernel.
data_long.sample(15, random_state=42)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Amount
1108280,Greece,GRC,"Improved water source, urban (% of urban popul...",SH.H2O.SAFE.UR.ZS,1972,
643828,Belize,BLZ,"Age population, age 23, female, interpolated",SP.POP.AG23.FE.IN,1967,762.0
1292517,Isle of Man,IMY,"Life expectancy at birth, male (years)",SP.DYN.LE00.MA.IN,1974,
1471031,Israel,ISR,"Primary completion rate, total (% of relevant ...",SE.PRM.CMPT.ZS,1976,
4122379,Channel Islands,CHI,"School enrollment, tertiary, female (% gross)",SE.TER.ENRR.FE,2006,


In [5]:
# There's still a lot of "NaN" in the Amount column, so drop those rows.
# dropna returns a new frame; data_long itself is left untouched.
clean_long_df = data_long.dropna(subset=['Amount'])

# Fixed random_state (arbitrary value) so the preview is reproducible across re-runs.
clean_long_df.sample(25, random_state=42)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Amount
4353101,Thailand,THA,"Population ages 35-39, male (% of male populat...",SP.POP.3539.MA.5Y,2008,8.519
1696184,High income,HIC,Male population 30-34,SP.POP.3034.MA,1979,34291103.0
1646182,Hungary,HUN,"Mortality rate, infant (per 1,000 live births)",SP.DYN.IMRT.IN,1978,27.1
4291315,"Bahamas, The",BHS,Out-of-pocket health expenditure (% of total e...,SH.XPD.OOPC.TO.ZS,2008,29.183
53874,Luxembourg,LUX,"Age population, age 21, female, interpolated",SP.POP.AG21.FE.IN,1960,2062.0
91253,East Asia & Pacific (IDA & IBRD countries),TEA,Male population 75-79,SP.POP.7579.MA,1961,1903869.0
3044810,Azerbaijan,AZE,"Mortality rate, adult, female (per 1,000 femal...",SP.DYN.AMRT.FE,1994,116.301
52475,Liberia,LBR,"Age population, age 11, male, interpolated",SP.POP.AG11.MA.IN,1960,12369.0
2482041,Sweden,SWE,Female population 55-59,SP.POP.5559.FE,1987,217040.0
375230,Bahrain,BHR,"Population ages 00-14, total",SP.POP.0014.TO,1964,83861.0


In [6]:
# Figure out which categories we're working with: collect the distinct
# indicator names left after the NaN drop and put them in alphabetical order.
unique_categories = list(clean_long_df['Indicator Name'].unique())
unique_categories.sort()
unique_categories

['% of females ages 15-49 having comprehensive correct knowledge about HIV (2 prevent ways and reject 3 misconceptions)',
 '% of males ages 15-49 having comprehensive correct knowledge about HIV (2 prevent ways and reject 3 misconceptions)',
 'AIDS estimated deaths (UNAIDS estimates)',
 'ARI treatment (% of children under 5 taken to a health provider)',
 'Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Adults (ages 15+) and children (0-14 years) living with HIV',
 'Adults (ages 15+) and children (ages 0-14) newly infected with HIV',
 'Adults (ages 15+) living with HIV',
 'Adults (ages 15+) newly infected with HIV',
 'Age at first marriage, female',
 'Age at first marriage, male',
 'Age dependency ratio (% of working-age population)',
 'Age dependency ratio, old',
 'Age dependency ratio, young',
 'Age population, age 0, female, interpolated',
 'Age population, age 0, male, interpolated',
 'Age population, age 01, female, interpolated',
 'Age population, age 01, male, i

In [16]:
# Male adult mortality rates: rank every entry that has a value for one indicator/year
# and show both ends of the ranking.
indicator_name = "Mortality rate, adult, male (per 1,000 male adults)"
# 'Year' values are compared as strings here (the filter matches on "2014", not 2014).
year = "2014"
# Size of each end of the ranking; used for the slices AND the printed headers,
# so the labels can never drift from what is actually shown.
n_countries = 10

# Keep only the rows for the chosen indicator in the chosen year.
male_filtered_df = clean_long_df[
    (clean_long_df['Indicator Name'] == indicator_name)
    & (clean_long_df['Year'] == year)
]

# Put them in order by Amount, highest first.
male_sorted_df = male_filtered_df.sort_values(by='Amount', ascending=False)

# With a descending sort, head() is the top of the ranking and tail() the bottom.
# Note the bottom group is therefore still listed from higher to lower Amount.
male_top_10_countries = male_sorted_df.head(n_countries)
male_bottom_10_countries = male_sorted_df.tail(n_countries)

# Output. Headers are built from indicator_name / n_countries rather than a
# second hard-coded copy of the text, so changing the indicator relabels the output too.
display_columns = ['Country Name', 'Amount', 'Year']
print(f"TOP {n_countries}: {indicator_name}:")
print(male_top_10_countries[display_columns])
print(f"BOTTOM {n_countries}: {indicator_name}:")
print(male_bottom_10_countries[display_columns])

TOP 10: Mortality rate, adult, male (per 1,000 male adults):
                     Country Name  Amount  Year
4858821                   Lesotho 580.548  2014
4885041                 Swaziland 575.768  2014
4881591              South Africa 464.093  2014
4867101                Mozambique 425.115  2014
4837086             Cote d'Ivoire 423.848  2014
4833636  Central African Republic 422.681  2014
4895391                  Zimbabwe 413.047  2014
4879176              Sierra Leone 406.904  2014
4833981                      Chad 387.931  2014
4870551                   Nigeria 378.566  2014
BOTTOM 10: Mortality rate, adult, male (per 1,000 male adults):
                 Country Name  Amount  Year
4825701               Bahrain  75.454  2014
4860891      Macao SAR, China  75.240  2014
4879521             Singapore  71.395  2014
4858476               Lebanon  71.358  2014
4838466                Cyprus  70.387  2014
4863306                 Malta  70.033  2014
4871241                Norway  69.472  

In [6]:
# Distinct indicator names present in the cleaned data, in alphabetical order.
unique_categories = list(clean_long_df['Indicator Name'].unique())
unique_categories.sort()
unique_categories

['% of females ages 15-49 having comprehensive correct knowledge about HIV (2 prevent ways and reject 3 misconceptions)',
 '% of males ages 15-49 having comprehensive correct knowledge about HIV (2 prevent ways and reject 3 misconceptions)',
 'AIDS estimated deaths (UNAIDS estimates)',
 'ARI treatment (% of children under 5 taken to a health provider)',
 'Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Adults (ages 15+) and children (0-14 years) living with HIV',
 'Adults (ages 15+) and children (ages 0-14) newly infected with HIV',
 'Adults (ages 15+) living with HIV',
 'Adults (ages 15+) newly infected with HIV',
 'Age at first marriage, female',
 'Age at first marriage, male',
 'Age dependency ratio (% of working-age population)',
 'Age dependency ratio, old',
 'Age dependency ratio, young',
 'Age population, age 0, female, interpolated',
 'Age population, age 0, male, interpolated',
 'Age population, age 01, female, interpolated',
 'Age population, age 01, male, i