In [121]:
import yaml
import pandas as pd
import re

## Data acquisition

The data used in this project is obtained from two different data sources. The BMI data is obtained from https://www.ncdrisc.org/data-downloads-adiposity.html and downloaded in the csv format. This data contains out of all of the mean BMI's (with 95% confidence intervals) per country per sex, ranging from the year 1975 to 2016. In addition, the dataset contains the prevalence of certain BMI ranges (with 95% confidence intervals). For example, a person is classified as obese if the BMI is higher or equal to 30.

The affordability of a healthy diet is obtained from https://databank.worldbank.org/source/food-prices-for-nutrition (select all under country, select 'Affordability of a healthy diet: ratio of cost to food expenditures' under series, select 2017 under year) and downloaded in the csv format. In this dataset the affordability of a healthy diet is defined as the ratio of the cost of a healthy diet to food expenditures. This ratio is given by country and by sex in the year 2017.

Keep in mind that the BMI dataset is from the year 2017 and the healthy diet affordability dataset is from 2016. At this point in time the global BMI data is only available up to 2016 and the healthy diet data from 2017 until 2020. In this project the 2017 and 2016 will be compared. The mean BMI and heatlhy diet affordability are unlikely to change a lot from year to year and therefor are compared. However, when 2017 BMI data is available it is better to use that instead of the 2016 data.

In [122]:
# load the datafiles using a config files
with open("./config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)

bmi = config['bmi']
healthy_diet_affordability = config['food_affordability']

## Data exploration

### BMI

In [123]:
df_bmi

Unnamed: 0,Country/Region/World,ISO,Sex,Year,Mean BMI,Mean BMI lower 95% uncertainty interval,Mean BMI upper 95% uncertainty interval,Prevalence of BMI>=30 kg/m² (obesity),Prevalence of BMI>=30 kg/m² lower 95% uncertainty interval,Prevalence of BMI>=30 kg/m² upper 95% uncertainty interval,...,Prevalence of BMI 25 kg/m² to <30 kg/m² upper 95% uncertainty interval,Prevalence of BMI 30 kg/m² to <35 kg/m²,Prevalence of BMI 30 kg/m² to <35 kg/m² lower 95% uncertainty interval,Prevalence of BMI 30 kg/m² to <35 kg/m² upper 95% uncertainty interval,Prevalence of BMI 35 kg/m² to <40 kg/m²,Prevalence of BMI 35 kg/m² to <40 kg/m² lower 95% uncertainty interval,Prevalence of BMI 35 kg/m² to <40 kg/m² upper 95% uncertainty interval,Prevalence of BMI >=40 kg/m²(morbid obesity),Prevalence of BMI >=40 kg/m² lower 95% uncertainty interval,Prevalence of BMI >=40 kg/m² upper 95% uncertainty interval
41,Afghanistan,AFG,Men,2016,22.682456,20.157475,25.241857,0.033603,0.013884,0.066334,...,0.242503,0.030290,0.011207,0.062681,0.002271,0.000310,0.007487,0.001043,0.000074,0.004265
83,Albania,ALB,Men,2016,27.174471,25.975170,28.338256,0.223735,0.153334,0.300834,...,0.515957,0.178699,0.113401,0.252200,0.037684,0.013616,0.076984,0.007352,0.001180,0.021953
125,Algeria,DZA,Men,2016,24.865386,23.487321,26.220294,0.206662,0.141854,0.279979,...,0.463570,0.163822,0.102610,0.234048,0.031750,0.011066,0.065164,0.011090,0.002289,0.029802
167,American Samoa,ASM,Men,2016,33.066721,31.338678,34.662447,0.587546,0.502606,0.666355,...,0.370043,0.264868,0.191211,0.340031,0.183871,0.109862,0.263723,0.138807,0.067651,0.223666
209,Andorra,AND,Men,2016,27.478395,24.988831,30.001977,0.267498,0.186223,0.354723,...,0.532478,0.198934,0.125035,0.280761,0.052701,0.017627,0.109639,0.015864,0.002652,0.046309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16631,Venezuela,VEN,Women,2016,26.939841,26.148956,27.733570,0.297526,0.234430,0.364287,...,0.421753,0.196603,0.139026,0.262398,0.075728,0.041070,0.120149,0.025195,0.009495,0.048975
16673,Viet Nam,VNM,Women,2016,21.940594,21.329260,22.564460,0.027065,0.015304,0.044264,...,0.237665,0.024053,0.012707,0.041163,0.002742,0.000968,0.005795,0.000270,0.000051,0.000786
16715,Yemen,YEM,Women,2016,24.197709,23.356827,25.050170,0.229724,0.167397,0.295367,...,0.385797,0.160376,0.103330,0.223892,0.049255,0.023383,0.085374,0.020092,0.006776,0.043427
16757,Zambia,ZMB,Women,2016,23.833982,22.990421,24.689639,0.129887,0.088520,0.178320,...,0.300492,0.091848,0.055134,0.137947,0.027307,0.011942,0.050831,0.010731,0.003255,0.024589


In [152]:
df_bmi = pd.read_csv(bmi, encoding='latin-1')  # file uses latin-1 encoding
df_bmi = df_bmi[df_bmi.Year == 2016]  # we only want the 2016 data
df_bmi = df_bmi.drop(columns='Year')  # remove the year column, we only have 2016 left so redundant
df_bmi.head()

Unnamed: 0,Country/Region/World,ISO,Sex,Mean BMI,Mean BMI lower 95% uncertainty interval,Mean BMI upper 95% uncertainty interval,Prevalence of BMI>=30 kg/m² (obesity),Prevalence of BMI>=30 kg/m² lower 95% uncertainty interval,Prevalence of BMI>=30 kg/m² upper 95% uncertainty interval,Prevalence of BMI>=35 kg/m² (severe obesity),...,Prevalence of BMI 25 kg/m² to <30 kg/m² upper 95% uncertainty interval,Prevalence of BMI 30 kg/m² to <35 kg/m²,Prevalence of BMI 30 kg/m² to <35 kg/m² lower 95% uncertainty interval,Prevalence of BMI 30 kg/m² to <35 kg/m² upper 95% uncertainty interval,Prevalence of BMI 35 kg/m² to <40 kg/m²,Prevalence of BMI 35 kg/m² to <40 kg/m² lower 95% uncertainty interval,Prevalence of BMI 35 kg/m² to <40 kg/m² upper 95% uncertainty interval,Prevalence of BMI >=40 kg/m²(morbid obesity),Prevalence of BMI >=40 kg/m² lower 95% uncertainty interval,Prevalence of BMI >=40 kg/m² upper 95% uncertainty interval
41,Afghanistan,AFG,Men,22.682456,20.157475,25.241857,0.033603,0.013884,0.066334,0.003314,...,0.242503,0.03029,0.011207,0.062681,0.002271,0.00031,0.007487,0.001043,7.4e-05,0.004265
83,Albania,ALB,Men,27.174471,25.97517,28.338256,0.223735,0.153334,0.300834,0.045036,...,0.515957,0.178699,0.113401,0.2522,0.037684,0.013616,0.076984,0.007352,0.00118,0.021953
125,Algeria,DZA,Men,24.865386,23.487321,26.220294,0.206662,0.141854,0.279979,0.04284,...,0.46357,0.163822,0.10261,0.234048,0.03175,0.011066,0.065164,0.01109,0.002289,0.029802
167,American Samoa,ASM,Men,33.066721,31.338678,34.662447,0.587546,0.502606,0.666355,0.322678,...,0.370043,0.264868,0.191211,0.340031,0.183871,0.109862,0.263723,0.138807,0.067651,0.223666
209,Andorra,AND,Men,27.478395,24.988831,30.001977,0.267498,0.186223,0.354723,0.068565,...,0.532478,0.198934,0.125035,0.280761,0.052701,0.017627,0.109639,0.015864,0.002652,0.046309


Some column names are pretty long and make it inconvenient to read. The next code chunk shortens it to make it more readible.

In [153]:
# dict for patterns and replacements for the column names
replace_dict = {'Prevalence': 'Prev', 'kg/m²': '', 'lower 95% uncertainty interval': 'lower', 'upper 95% uncertainty interval': 'upper'}
for pattern, replacement in replace_dict.items():
    df_bmi = df_bmi.rename(columns=lambda column: re.sub(pattern, replacement, column))
    

In [154]:
df_bmi.head()

Unnamed: 0,Country/Region/World,ISO,Sex,Mean BMI,Mean BMI lower,Mean BMI upper,Prev of BMI>=30 (obesity),Prev of BMI>=30 lower,Prev of BMI>=30 upper,Prev of BMI>=35 (severe obesity),...,Prev of BMI 25 to <30 upper,Prev of BMI 30 to <35,Prev of BMI 30 to <35 lower,Prev of BMI 30 to <35 upper,Prev of BMI 35 to <40,Prev of BMI 35 to <40 lower,Prev of BMI 35 to <40 upper,Prev of BMI >=40 (morbid obesity),Prev of BMI >=40 lower,Prev of BMI >=40 upper
41,Afghanistan,AFG,Men,22.682456,20.157475,25.241857,0.033603,0.013884,0.066334,0.003314,...,0.242503,0.03029,0.011207,0.062681,0.002271,0.00031,0.007487,0.001043,7.4e-05,0.004265
83,Albania,ALB,Men,27.174471,25.97517,28.338256,0.223735,0.153334,0.300834,0.045036,...,0.515957,0.178699,0.113401,0.2522,0.037684,0.013616,0.076984,0.007352,0.00118,0.021953
125,Algeria,DZA,Men,24.865386,23.487321,26.220294,0.206662,0.141854,0.279979,0.04284,...,0.46357,0.163822,0.10261,0.234048,0.03175,0.011066,0.065164,0.01109,0.002289,0.029802
167,American Samoa,ASM,Men,33.066721,31.338678,34.662447,0.587546,0.502606,0.666355,0.322678,...,0.370043,0.264868,0.191211,0.340031,0.183871,0.109862,0.263723,0.138807,0.067651,0.223666
209,Andorra,AND,Men,27.478395,24.988831,30.001977,0.267498,0.186223,0.354723,0.068565,...,0.532478,0.198934,0.125035,0.280761,0.052701,0.017627,0.109639,0.015864,0.002652,0.046309


In [126]:
df_bmi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 41 to 16799
Data columns (total 34 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   Country/Region/World                                                      400 non-null    object 
 1   ISO                                                                       400 non-null    object 
 2   Sex                                                                       400 non-null    object 
 3   Year                                                                      400 non-null    int64  
 4   Mean BMI                                                                  400 non-null    float64
 5   Mean BMI lower 95% uncertainty interval                                   400 non-null    float64
 6   Mean BMI upper 95% uncertainty interval                        

As can be seen there are 400 entries in total, since every country contains data about both sexes the dataset contains 200 countries. Fortunately, the dataset contains no missing values.

In [142]:
print(df_bmi['Country/Region/World'].nunique())  # confirming that there are indeed 200 countries

200


In [143]:
df_bmi['Mean BMI'].describe()


count    400.000000
mean      26.122793
std        2.674200
min       20.110440
25%       24.153777
50%       26.349274
75%       27.636261
max       35.148865
Name: Mean BMI, dtype: float64

### Affordability of a healthy diet

In [144]:
df_healthy_diet = pd.read_csv(healthy_diet_affordability, encoding='latin-1', skipfooter=5, engine='python')  # skip last lines, does not contain data
df_healthy_diet.drop(columns=['Classification Name', 'Classification Code', 'Time', 'Time Code'], inplace=True)  # irrelevant columns
df_healthy_diet.rename(columns={'Affordability of a healthy diet: ratio of cost to food expenditures [CoHD_fexp]': 'Affordability of a healthy diet'}, inplace=True)

In [145]:
df_healthy_diet

Unnamed: 0,Country Name,Country Code,Affordability of a healthy diet
0,Albania,ALB,0.425
1,Algeria,DZA,0.605
2,Angola,AGO,0.972
3,Anguilla,AIA,0.577
4,Antigua and Barbuda,ATG,0.767
...,...,...,...
181,Vietnam,VNM,1.052
182,West Bank and Gaza,PSE,0.845
183,World,WLD,..
184,Zambia,ZMB,1.821


In [163]:
codes_diet = df_healthy_diet['Country Code']
codes_bmi = df_bmi.ISO

print(set(codes_diet))
print(len(codes_bmi))
print(len(set(codes_diet) ^ set(codes_bmi)))

print(set(codes_bmi) ^ set(codes_diet))

df_merged = df_healthy_diet.merge(right=df_bmi, 
                                  left_on='Country Code',
                                  right_on='ISO',
                                  how='inner')

{'MLI', 'KHM', 'HIC', 'MDV', 'URY', 'COM', 'QAT', 'MNG', 'CUW', 'BTN', 'DNK', 'BOL', 'SVK', 'BON', 'ALB', 'EST', 'AUT', 'BIH', 'KNA', 'GRD', 'AGO', 'GHA', 'SLE', 'IRN', 'BHR', 'NZL', 'POL', 'ATG', 'AIA', 'HND', 'MDA', 'KAZ', 'MEA', 'HUN', 'KEN', 'LTU', 'TCA', 'MDG', 'BLZ', 'ISL', 'ETH', 'ECS', 'PRY', 'MLT', 'ARG', 'CAN', 'VCT', 'MOZ', 'TWN', 'ARE', 'BDI', 'IND', 'JPN', 'IDN', 'MUS', 'HTI', 'DZA', 'ISR', 'UMC', 'TCD', 'MRT', 'NLD', 'KGZ', 'MWI', 'GRC', 'JOR', 'MMR', 'SWZ', 'HKG', 'MKD', 'BEN', 'PHL', 'DEU', 'NER', 'MSR', 'LIC', 'EAS', 'CRI', 'GBR', 'ROU', 'IRQ', 'GNB', 'SEN', 'ZWE', 'SAS', 'FIN', 'NPL', 'ESP', 'SVN', 'JAM', 'TUR', 'RUS', 'COL', 'BRA', 'USA', 'PER', 'SAU', 'SYC', 'MYS', 'BGR', 'KWT', 'CYP', 'DMA', 'ABW', 'MAR', 'TZA', 'LSO', 'AUS', 'BMU', 'GAB', 'MEX', 'NIC', 'AZE', 'LCN', 'ZAF', 'PSE', 'SLV', 'BWA', 'BEL', 'ECU', 'CMR', 'NGA', 'FRA', 'LKA', 'BRB', 'CYM', 'ARM', 'PAN', 'PRT', 'TGO', 'MNE', 'GIN', 'WLD', 'OMN', 'DJI', 'CIV', 'KOR', 'GUY', 'THA', 'DOM', 'TJK', 'BGD', 'SWE'

In [157]:
df_merged['Country Name'].nunique()

165