In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
# Many values in this dataset are long floats, which end up displayed in scientific
# notation. Show floats with three decimal places for now; this is a display setting
# only and can be removed if it becomes a problem.
pd.options.display.float_format = '{:.3f}'.format

# Load the raw source table.
raw_df = pd.read_csv('nutritional-data.csv')

# (rows, columns) of the raw table.
raw_df.shape

(89010, 61)

In [28]:
# Reshape from wide to long so there is no longer a separate column for each year:
# every non-identifier column is stacked into a ('Year', 'Amount') pair.
# The row count grows drastically, but the data becomes much cleaner to work with.
data_long = pd.melt(
    raw_df,
    id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
    var_name='Year',
    value_name='Amount',
)

# (rows, columns) after reshaping.
data_long.shape

(5073570, 6)

In [29]:
# Spot-check 15 randomly chosen rows of the long-format table.
data_long.sample(n=15)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Amount
1097684,"Congo, Dem. Rep.",ZAR,"Population ages 45-49, female (% of female pop...",SP.POP.4549.FE.5Y,1972,3.812
2470518,Peru,PER,"School enrollment, tertiary (% gross)",SE.TER.ENRR,1987,26.466
955281,Oman,OMN,Share of women employed in the nonagricultural...,SL.EMP.INSV.FE.ZS,1970,
4030758,Cabo Verde,CPV,"Health expenditure, public (% of GDP)",SH.XPD.PUBL.ZS,2005,3.65
317824,Kyrgyz Republic,KGZ,"Completeness of birth registration, rural (%)",SP.REG.BRTH.RU.ZS,1963,
1099307,Cuba,CUB,"Improved sanitation facilities, urban (% of ur...",SH.STA.ACSN.UR,1972,
1071710,Europe & Central Asia (IDA & IBRD countries),TEC,"Improved water source, urban (% of urban popul...",SH.H2O.SAFE.UR.ZS,1972,
2578072,Uruguay,URY,"Population ages 25-29, male (% of male populat...",SP.POP.2529.MA.5Y,1988,7.39
297058,Costa Rica,CRI,"Age population, age 0, male, interpolated",SP.POP.AG00.MA.IN,1963,30568.0
1680987,Sweden,SWE,"Life expectancy at birth, male (years)",SP.DYN.LE00.MA.IN,1978,72.43


In [31]:
# Many rows still have no value ("NaN") in the Amount column.
# Keep only the rows where Amount is present; data_long itself is left untouched.
clean_long_df = data_long.dropna(subset=['Amount'], how='any')

# Spot-check 15 random rows of the cleaned table.
clean_long_df.sample(n=15)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,Amount
1779217,"Yemen, Rep.",YEM,"Age population, age 20, female, interpolated",SP.POP.AG20.FE.IN,1979,67540.0
4709412,Timor-Leste,TMP,Male population 20-24,SP.POP.2024.MA,2012,52700.0
779721,Poland,POL,"Age population, age 04, male, interpolated",SP.POP.AG04.MA.IN,1968,271573.0
683747,Mozambique,MOZ,Rural population,SP.RUR.TOTL,1967,8128566.0
4611531,Serbia,SRB,"Population ages10-14, male (% of male population)",SP.POP.1014.MA.5Y,2011,6.498
392257,Ethiopia,ETH,Urban population growth (annual %),SP.URB.GROW,1964,5.779
3633342,Singapore,SGP,"Life expectancy at birth, male (years)",SP.DYN.LE00.MA.IN,2000,76.0
3825524,Vietnam,VNM,Male population 30-34,SP.POP.3034.MA,2002,3173352.0
4831970,Cabo Verde,CPV,"Population ages 60-64, female (% of female pop...",SP.POP.6064.FE.5Y,2014,2.127
4182418,West Bank and Gaza,WBG,Tuberculosis case detection rate (all forms),SH.TBS.DTEC.ZS,2006,18.0


In [33]:
# Figure out which categories (indicator names) we're working with:
# collect the distinct names, then sort them alphabetically in place.
unique_categories = list(clean_long_df['Indicator Name'].unique())
unique_categories.sort()
unique_categories

['% of females ages 15-49 having comprehensive correct knowledge about HIV (2 prevent ways and reject 3 misconceptions)',
 '% of males ages 15-49 having comprehensive correct knowledge about HIV (2 prevent ways and reject 3 misconceptions)',
 'AIDS estimated deaths (UNAIDS estimates)',
 'ARI treatment (% of children under 5 taken to a health provider)',
 'Adolescent fertility rate (births per 1,000 women ages 15-19)',
 'Adults (ages 15+) and children (0-14 years) living with HIV',
 'Adults (ages 15+) and children (ages 0-14) newly infected with HIV',
 'Adults (ages 15+) living with HIV',
 'Adults (ages 15+) newly infected with HIV',
 'Age at first marriage, female',
 'Age at first marriage, male',
 'Age dependency ratio (% of working-age population)',
 'Age dependency ratio, old',
 'Age dependency ratio, young',
 'Age population, age 0, female, interpolated',
 'Age population, age 0, male, interpolated',
 'Age population, age 01, female, interpolated',
 'Age population, age 01, male, i

In [40]:
# Filter the cleaned long-format data down to one specific indicator.
indicator_name = "Mortality rate, adult, male (per 1,000 male adults)"
indicator_data = clean_long_df[clean_long_df['Indicator Name']
                               == indicator_name]

# Display the unique years available for this indicator
unique_years = indicator_data['Year'].unique()
print(f"Unique years available for {indicator_name}:")
print(unique_years)

# Display a sample of the data for this indicator to inspect the years and values.
# DataFrame.sample raises ValueError when asked for more rows than exist, so cap the
# request at the number of available rows. This keeps the cell working when the
# indicator has fewer than 10 observations (or the name matches nothing at all).
print(f"Sample data for {indicator_name}:")
print(indicator_data.sample(min(10, len(indicator_data))))

Unique years available for Mortality rate, adult, male (per 1,000 male adults):
['1960' '1961' '1962' '1963' '1964' '1965' '1966' '1967' '1968' '1969'
 '1970' '1971' '1972' '1973' '1974' '1975' '1976' '1977' '1978' '1979'
 '1980' '1981' '1982' '1983' '1984' '1985' '1986' '1987' '1988' '1989'
 '1990' '1991' '1992' '1993' '1994' '1995' '1996' '1997' '1998' '1999'
 '2000' '2001' '2002' '2003' '2004' '2005' '2006' '2007' '2008' '2009'
 '2010' '2011' '2012' '2013' '2014']
Sample data for Mortality rate, adult, male (per 1,000 male adults):
                                         Country Name Country Code  \
54006                                      Luxembourg          LUX   
3431556                                      Kiribati          KIR   
2953731                           Antigua and Barbuda          ATG   
4777746                                       Morocco          MAR   
4873656                                          Peru          PER   
4039791                              Eg