In [87]:
import pandas as pd
import numpy as np

### Get the health data to analyze the relation between food and health

1. Get life-expectancy (lifespan) data for each country from the WHO website.

In [88]:
# Raw WHO lifespan export; the next cell filters it by 'Indicator' and 'Dim1'.
data = pd.read_csv('data/data_health/RAW_lifespan.csv')

In [89]:
# Keep only the "life expectancy at birth" rows for both sexes combined, drop
# the columns that were only needed for filtering, and rename the value column.
#
# Written as one non-mutating chain: calling drop()/rename() with inplace=True
# on a boolean-filtered slice makes pandas emit SettingWithCopyWarning.
#
# NOTE(review): rows are not filtered by 'Period'; the displayed index
# (0, 12, 24, ...) suggests one row per location remains -- confirm.
is_life_expectancy = data['Indicator'] == 'Life expectancy at birth (years)'
is_both_sexes = data['Dim1'] == 'Both sexes'
data_lifespan = (
    data.loc[is_life_expectancy & is_both_sexes]
    .drop(columns=['Indicator', 'Period', 'Dim1'])
    .rename(columns={'First Tooltip': 'Life span'})
)
data_lifespan

Unnamed: 0,Location,Life span
0,Afghanistan,62.69
12,Albania,76.37
24,Algeria,76.36
36,Angola,62.63
48,Antigua and Barbuda,75.00
...,...,...
2148,Venezuela (Bolivarian Republic of),74.05
2160,Viet Nam,76.34
2172,Yemen,65.31
2184,Zambia,62.33


2. Get some common food-related health indices ('overweight', 'blood_pressure', 'blood_glucose' and 'cholesterol') from data downloaded from the WHO website. The data show the percentage of a country's total population affected by each of these conditions.

In [90]:
# Load the overweight table and give its value column a descriptive name.
data_overweight = pd.read_csv('data/data_health/data_overweight.csv').rename(
    columns={'First Tooltip': 'Overweight(%)'}
)

# NOTE: this rebinds `data` (until now the raw lifespan CSV) to the merged
# health table that the following cells keep extending.
# Inner join (the pandas default): locations absent from either side are dropped.
data = data_lifespan.merge(
    data_overweight[['Location', 'Overweight(%)']],
    on='Location',
    how='inner',
)

In [91]:
# Blood-glucose table: rename the value column, then inner-join it onto `data`
# by location (locations missing from either side are dropped).
data_sub = pd.read_csv('data/data_health/data_blood_glucose.csv').rename(
    columns={'First Tooltip': 'High Blood Glucose(%)'}
)
data = data.merge(
    data_sub[['Location', 'High Blood Glucose(%)']],
    on='Location',
    how='inner',
)

In [92]:
# Blood-pressure table: rename the value column, then inner-join it onto `data`
# by location (locations missing from either side are dropped).
data_sub = pd.read_csv('data/data_health/data_blood_pressure.csv').rename(
    columns={'First Tooltip': 'High Blood Pressure(%)'}
)
data = data.merge(
    data_sub[['Location', 'High Blood Pressure(%)']],
    on='Location',
    how='inner',
)

In [93]:
# Cholesterol table: rename the value column, then inner-join it onto `data`
# by location (locations missing from either side are dropped).
data_sub = pd.read_csv('data/data_health/data_cholesterol.csv').rename(
    columns={'First Tooltip': 'High Cholesterol(%)'}
)
data = data.merge(
    data_sub[['Location', 'High Cholesterol(%)']],
    on='Location',
    how='inner',
)

3. Get NCD data (non-communicable diseases, including malignant neoplasms, diabetes mellitus, cardiovascular diseases and chronic obstructive pulmonary disease) for each country. NCD death probability is the probability that a person will die from a non-communicable disease.

In [94]:
# NCD death-probability table: rename the value column, then inner-join it onto
# `data` by location (locations missing from either side are dropped).
data_sub = pd.read_csv('data/data_health/data_NCD_death.csv').rename(
    columns={'First Tooltip': 'NCD death probability(%)'}
)
data = data.merge(
    data_sub[['Location', 'NCD death probability(%)']],
    on='Location',
    how='inner',
)

4. Get the death rate due to specific disease types in each country. We could only find the number of deaths for 2016, so we also get each country's 2016 population in order to compute the death rate.

In [95]:
# Per-location, per-disease death table. Only three columns are kept:
# the location, the disease name ('Dim2') and the value ('First Tooltip').
disease_columns = ['Location', 'Dim2', 'First Tooltip']
data_disease = pd.read_csv('data/data_health/data_disease_death.csv')
data_disease = data_disease.loc[:, disease_columns]
data_disease

Unnamed: 0,Location,Dim2,First Tooltip
0,Afghanistan,Malignant neoplasms,19965
1,Afghanistan,Diabetes mellitus,7056
2,Afghanistan,Cardiovascular diseases,51244
3,Afghanistan,Chronic obstructive pulmonary disease,6715
4,Albania,Malignant neoplasms,5138
...,...,...,...
731,Zambia,Chronic obstructive pulmonary disease,1910
732,Zimbabwe,Malignant neoplasms,8872
733,Zimbabwe,Diabetes mellitus,3590
734,Zimbabwe,Cardiovascular diseases,14184


In [96]:
# Population table restricted to the 2016 rows, keeping only the join key
# ('Location') and the total population column ('PopTotal').
data_pop = pd.read_csv('data/data_health/data_population.csv')
data_pop = data_pop.loc[data_pop['Time'] == 2016, ['Location', 'PopTotal']]

In [97]:
# Attach the 2016 population to every disease row (inner join on location) and
# derive the death rate as a percentage of the population.
# NOTE(review): the trailing "/ 10" presumably converts to percent under the
# assumption that 'PopTotal' is expressed in thousands
# (deaths / (PopTotal * 1000) * 100) -- confirm against the population source.
data_disease = data_disease.merge(data_pop, on='Location', how='inner').assign(
    **{'disease(%)': lambda df: df['First Tooltip'].div(df['PopTotal']).div(10)}
)

In [98]:
# Add one "<disease> death(%)" column to `data` per disease of interest.
diseases = [
    'Malignant neoplasms',
    'Diabetes mellitus',
    'Cardiovascular diseases',
    'Chronic obstructive pulmonary disease',
]
for d in diseases:
    death_col = d + ' death(%)'
    # rename() returns a new frame here; the previous inplace=True rename on a
    # .loc slice of data_disease made pandas emit SettingWithCopyWarning.
    data_sub = data_disease.loc[data_disease['Dim2'] == d].rename(
        columns={'disease(%)': death_col}
    )
    # Inner join (the pandas default): locations without a row for this
    # disease are dropped from `data`.
    data = pd.merge(data, data_sub[['Location', death_col]], on='Location')

# Shorten the two longest generated column names.
data = data.rename(
    columns={
        'Cardiovascular diseases death(%)': 'Cardiovascular death(%)',
        'Chronic obstructive pulmonary disease death(%)': 'Chronic obstructive pulmonary death(%)',
    }
)

5. Get the GDP per capita of each country, because we think it is a very important factor for some health indices. For example, life span in wealthy countries is usually higher than in poorer ones (this can be seen in the correlation table below). So we decide to group countries by GDP-per-capita level to get a more reliable food–health relationship within the different groups.

In [99]:
# Load GDP per capita and keep only the country name and the 2016 value.
#
# skiprows=1 discards the first line of the file; header=1 then takes the
# column names from the second remaining row (pandas ignores blank lines by
# default). Presumably this steps over a metadata preamble in the downloaded
# file -- confirm against the CSV.
#
# The selection and rename are chained so that rename() returns a new frame;
# the previous inplace=True rename on a column-subset slice made pandas emit
# SettingWithCopyWarning.
data_gdp = (
    pd.read_csv('data/GDP_per_capita.csv', header=1, skiprows=1)
    [['Country Name', '2016']]
    .rename(columns={'Country Name': 'Location', '2016': 'GDP per capita($)'})
)

In [100]:
# NOTE(review): no visible cell writes 'data/health.csv'; presumably it is the
# merged `data` table saved with its index (hence the 'Unnamed: 0' column
# removed below) -- confirm, otherwise Restart & Run All depends on a stale file.
data_health = pd.read_csv('data/health.csv')

# Inner join with GDP on location, then discard the saved index column.
data_health_wealth = (
    data_health
    .merge(data_gdp, on='Location')
    .drop(columns='Unnamed: 0')
)

In [101]:
# Columns whose values carry a bracketed suffix after the number.
# NOTE(review): the values presumably look like "<estimate> [<low>-<high>]";
# only the part before the first '[' is kept -- confirm against health.csv.
col = ['Overweight(%)', 'High Blood Glucose(%)', 'High Blood Pressure(%)', 'High Cholesterol(%)']
for c in col:
    # Cast to str first so the cell is idempotent: on a re-run the column is
    # already float and the .str accessor would otherwise raise AttributeError.
    data_health_wealth[c] = (
        data_health_wealth[c]
        .astype(str)
        .str.split('[')
        .str[0]
        .astype('float')
    )
data_health_wealth

Unnamed: 0,Location,Life span,Overweight(%),High Blood Glucose(%),High Blood Pressure(%),High Cholesterol(%),NCD death probability(%),Malignant neoplasms death(%),Diabetes mellitus death(%),Cardiovascular death(%),Chronic obstructive pulmonary death(%),GDP per capita($)
0,Afghanistan,62.69,23.0,11.9,30.6,21.9,29.8,0.056425,0.019942,0.144826,0.018978,547.228110
1,Albania,76.37,57.7,7.4,29.0,45.3,17.0,0.178006,0.005820,0.517248,0.030730,4124.108543
2,Algeria,76.36,62.0,12.4,25.1,39.4,14.2,0.062060,0.020690,0.170581,0.014710,3948.811897
3,Angola,62.63,27.5,8.2,29.7,30.9,16.5,0.029314,0.008248,0.078152,0.012489,3506.072885
4,Antigua and Barbuda,75.00,48.0,11.5,23.4,43.0,22.6,0.141769,0.056073,0.224291,0.029623,15494.305472
...,...,...,...,...,...,...,...,...,...,...,...,...
151,Uruguay,77.06,62.9,9.5,20.7,42.3,16.7,0.251596,0.027277,0.269644,0.078005,15387.144030
152,Uzbekistan,72.33,48.2,10.6,25.6,26.8,24.5,0.044037,0.018173,0.314604,0.011303,2567.799418
153,Vanuatu,71.98,57.1,15.9,24.2,37.6,23.3,0.071858,0.028384,0.158088,0.031258,2830.965284
154,Zambia,62.33,27.8,6.6,27.1,27.7,17.9,0.049036,0.009014,0.076286,0.011672,1280.578447


In [102]:
# Persist the combined health + GDP table. The row index is written as the
# first column (index=True is the pandas default, made explicit here).
data_health_wealth.to_csv('./data/health_wealth.csv', index=True)

In [103]:
# Pairwise Pearson correlation between the numeric columns.
# The string 'Location' column is excluded explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises.
data_health_wealth.select_dtypes(include='number').corr()

Unnamed: 0,Life span,Overweight(%),High Blood Glucose(%),High Blood Pressure(%),High Cholesterol(%),NCD death probability(%),Malignant neoplasms death(%),Diabetes mellitus death(%),Cardiovascular death(%),Chronic obstructive pulmonary death(%),GDP per capita($)
Life span,1.0,0.644024,0.007472,-0.684095,0.82228,-0.69551,0.658405,0.060981,0.309859,0.448686,0.648762
Overweight(%),0.644024,1.0,0.440353,-0.509246,0.646648,-0.309185,0.423001,0.164485,0.313552,0.139346,0.377287
High Blood Glucose(%),0.007472,0.440353,1.0,-0.071915,0.014533,0.319658,-0.33051,0.334904,-0.132621,-0.248565,-0.229695
High Blood Pressure(%),-0.684095,-0.509246,-0.071915,1.0,-0.54803,0.596002,-0.244812,-0.093056,0.150414,-0.30892,-0.557749
High Cholesterol(%),0.82228,0.646648,0.014533,-0.54803,1.0,-0.550841,0.692615,0.087813,0.373625,0.434463,0.740194
NCD death probability(%),-0.69551,-0.309185,0.319658,0.596002,-0.550841,1.0,-0.392777,0.177982,0.122294,-0.263336,-0.652555
Malignant neoplasms death(%),0.658405,0.423001,-0.33051,-0.244812,0.692615,-0.392777,1.0,0.025395,0.704076,0.637186,0.496742
Diabetes mellitus death(%),0.060981,0.164485,0.334904,-0.093056,0.087813,0.177982,0.025395,1.0,0.031192,0.197331,-0.092283
Cardiovascular death(%),0.309859,0.313552,-0.132621,0.150414,0.373625,0.122294,0.704076,0.031192,1.0,0.333768,0.048795
Chronic obstructive pulmonary death(%),0.448686,0.139346,-0.248565,-0.30892,0.434463,-0.263336,0.637186,0.197331,0.333768,1.0,0.350574
