In [8]:
# Import libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
from numpy import NaN
from matplotlib import pyplot as plt


In [20]:
# Load the raw 2017 World Bank food & nutrition table.
# The file is decoded as ISO-8859-1 (Latin-1) rather than pandas' default.
df_nutrition = pd.read_csv(
    filepath_or_buffer='../data/World_Bank/food_nutrition_2017.csv',
    encoding="ISO-8859-1",
)
# Print column names, non-null counts and dtypes of the freshly loaded frame.
df_nutrition.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 9 columns):
 #   Column                                                                            Non-Null Count  Dtype 
---  ------                                                                            --------------  ----- 
 0   Country Code                                                                      186 non-null    object
 1   Country Name                                                                      186 non-null    object
 2   Percent of the population who cannot afford sufficient calories [CoCA_headcount]  186 non-null    object
 3   Percent of the population who cannot afford nutrient adequacy [CoNA_headcount]    186 non-null    object
 4   Percent of the population who cannot afford a healthy diet [CoHD_headcount]       186 non-null    object
 5   Millions of people who cannot afford sufficient calories [CoCA_unafford_n]        186 non-null    object
 6   Millions of

In [21]:
# Rename columns
# Map the verbose source headers to short snake_case names:
#   *_pct   -> percent of the population who cannot afford the item
#   *_mills -> millions of people who cannot afford the item
rename_nutrition_cols = {
    'Country Code': 'country_code',
    'Country Name': 'country_name',
    'Percent of the population who cannot afford sufficient calories [CoCA_headcount]': 'calories_pct',
    'Percent of the population who cannot afford nutrient adequacy [CoNA_headcount]': 'nutrients_pct',
    'Percent of the population who cannot afford a healthy diet [CoHD_headcount]': 'diet_pct',
    'Millions of people who cannot afford sufficient calories [CoCA_unafford_n]': 'calories_mills',
    'Millions of people who cannot afford nutrient adequacy [CoNA_unafford_n]': 'nutrients_mills',
    'Millions of people who cannot afford a healthy diet [CoHD_unafford_n]': 'diet_mills',
    'Population [Pop]': 'population',
}
# Rebind the name to the renamed frame instead of mutating with `inplace=True`:
# the assignment makes the data flow explicit and keeps the call chainable.
# Keys that are absent from the frame are ignored by `rename` (default
# errors='ignore'), so re-running this cell after the rename is harmless.
df_nutrition = df_nutrition.rename(columns=rename_nutrition_cols)
# Confirm the new column names and their dtypes.
df_nutrition.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   country_code     186 non-null    object
 1   country_name     186 non-null    object
 2   calories_pct     186 non-null    object
 3   nutrients_pct    186 non-null    object
 4   diet_pct         186 non-null    object
 5   calories_mills   186 non-null    object
 6   nutrients_mills  186 non-null    object
 7   diet_mills       186 non-null    object
 8   population       186 non-null    object
dtypes: object(9)
memory usage: 13.2+ KB


In [22]:
# Replace the '..' placeholder strings with NaN.
# float("nan") is used instead of numpy's `NaN` alias: that alias was removed
# in NumPy 2.0, and the built-in float NaN is the same value, so this cell no
# longer depends on `from numpy import NaN`.
df_nutrition_clean = df_nutrition.replace('..', float("nan"))
# NOTE(review): info() reports every column as object dtype; this replacement
# alone may not turn the numeric columns into numeric dtypes. Confirm whether
# downstream code needs an explicit conversion (e.g. pd.to_numeric).
df_nutrition_clean.head()

Unnamed: 0,country_code,country_name,calories_pct,nutrients_pct,diet_pct,calories_mills,nutrients_mills,diet_mills,population
0,ABW,Aruba,,,,,,,105361.0
1,AGO,Angola,57.2,87.1,92.9,17.0,26.0,27.7,29816769.0
2,AIA,Anguilla,,,,,,,
3,ALB,Albania,0.0,13.0,37.8,0.0,0.4,1.1,2873457.0
4,ARE,United Arab Emirates,0.0,0.0,0.0,0.0,0.0,0.0,9487206.0


In [23]:
# Persist the cleaned table alongside the raw file.
# index=False keeps the DataFrame's row index out of the written CSV.
df_nutrition_clean.to_csv(
    path_or_buf='../data/World_Bank/food_nutrition_2017_clean.csv',
    index=False,
)

In [19]:
# Data dictionary for the cleaned nutrition table: each short column name is
# paired (by position) with the original, descriptive source header.
nutrition_cols_ref = {
    'column_name': [
        'country_code',
        'country_name',
        'calories_pct',
        'nutrients_pct',
        'diet_pct',
        'calories_mills',
        'nutrients_mills',
        'diet_mills',
        'population'
    ],
    'column_description': [
        'Country Code',
        'Country Name',
        'Percent of the population who cannot afford sufficient calories [CoCA_headcount]',
        'Percent of the population who cannot afford nutrient adequacy [CoNA_headcount]',
        'Percent of the population who cannot afford a healthy diet [CoHD_headcount]',
        'Millions of people who cannot afford sufficient calories [CoCA_unafford_n]',
        'Millions of people who cannot afford nutrient adequacy [CoNA_unafford_n]',
        # Fixed: previously read "healthy life", which did not match the
        # source header "... a healthy diet [CoHD_unafford_n]".
        'Millions of people who cannot afford a healthy diet [CoHD_unafford_n]',
        'Population [Pop]'
    ],
}
# The default index is already 0..n-1, so no explicit `index=` is needed.
# pandas raises ValueError if the two lists ever differ in length.
df_nutrition_clean_ref = pd.DataFrame(nutrition_cols_ref)
# Write the reference table next to the cleaned data, without the row index.
df_nutrition_clean_ref.to_csv('../data/World_Bank/food_nutrition_2017_clean_ref.csv', index=False)
df_nutrition_clean_ref

Unnamed: 0,column_name,column_description
0,country_code,Country Code
1,country_name,Country Name
2,calories_pct,Percent of the population who cannot afford su...
3,nutrients_pct,Percent of the population who cannot afford nu...
4,diet_pct,Percent of the population who cannot afford a ...
5,calories_mills,Millions of people who cannot afford sufficien...
6,nutrients_mills,Millions of people who cannot afford nutrient ...
7,diet_mills,Millions of people who cannot afford a healthy...
8,population,Population [Pop]
