In [4]:
import pandas as pd
import numpy as np
import pickle
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

### Get the health data to analyze the relation between food and health

1. Get lifespan data in different countries from WHO website

In [139]:
data= pd.read_csv('data/data_health/RAW_lifespan.csv')

In [140]:
data_lifespan = data[data['Indicator']=='Life expectancy at birth (years)']
data_lifespan = data_lifespan[data_lifespan['Dim1'] == 'Both sexes']
data_lifespan.drop(columns=['Indicator','Period','Dim1'],inplace=True)
data_lifespan.rename(columns={'First Tooltip':'Life span'},inplace=True)
data_lifespan.Location.unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo', 'Costa Rica', 'Côte d’Ivoire',
       'Croatia', 'Cuba', 'Cyprus', 'Czechia',
       "Democratic People's Republic of Korea",
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia',
       'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia',
       'Germany', 'Ghana', 'Gree

2. Get some food-related common health indices which includes 'overweight','blood_pressure','blood_glucose','cholesterol' from the data download from WHO website. The data shows the percentage of total population in a country who suffer from these common diseases.

In [141]:
data_overweight= pd.read_csv('data/data_health/data_overweight.csv')
data_overweight.rename(columns={'First Tooltip':'Overweight(%)'},inplace=True)
data=pd.merge(data_lifespan,data_overweight[['Location','Overweight(%)']],on='Location',how='left')

In [142]:
data_sub= pd.read_csv('data/data_health/data_blood_glucose.csv')
data_sub.rename(columns={'First Tooltip':'High Blood Glucose(%)'},inplace=True)
data=pd.merge(data,data_sub[['Location','High Blood Glucose(%)']],on='Location',how='left')

In [143]:
data_sub= pd.read_csv('data/data_health/data_blood_pressure.csv')
data_sub.rename(columns={'First Tooltip':'High Blood Pressure(%)'},inplace=True)
data=pd.merge(data,data_sub[['Location','High Blood Pressure(%)']],on='Location',how='left')

In [144]:
data_sub= pd.read_csv('data/data_health/data_cholesterol.csv')
data_sub.rename(columns={'First Tooltip':'High Cholesterol(%)'},inplace=True)
data=pd.merge(data,data_sub[['Location','High Cholesterol(%)']],on='Location',how='left')

3. Get some NCD data (non-communicable diseases, including Malignant neoplasms, Diabetes mellitus, Cardiovascular diseases, Chronic obstructive pulmonary disease) in different countries. NCD death probability means the probability of a person will die due to non communicable disease.

In [145]:
data_sub= pd.read_csv('data/data_health/data_NCD_death.csv')
data_sub.rename(columns={'First Tooltip':'NCD death probability(%)'},inplace=True)
data=pd.merge(data,data_sub[['Location','NCD death probability(%)']],on='Location',how='left')

4. Get the death rate due to some specific disease types in a country. We only find the number of death in 2016 so we also get the population in different countries in 2016 in order to get death rate.

In [146]:
data_disease= pd.read_csv('data/data_health/data_disease_death.csv')
data_disease = data_disease[['Location','Dim2','First Tooltip']]
data_disease

Unnamed: 0,Location,Dim2,First Tooltip
0,Afghanistan,Malignant neoplasms,19965
1,Afghanistan,Diabetes mellitus,7056
2,Afghanistan,Cardiovascular diseases,51244
3,Afghanistan,Chronic obstructive pulmonary disease,6715
4,Albania,Malignant neoplasms,5138
...,...,...,...
731,Zambia,Chronic obstructive pulmonary disease,1910
732,Zimbabwe,Malignant neoplasms,8872
733,Zimbabwe,Diabetes mellitus,3590
734,Zimbabwe,Cardiovascular diseases,14184


In [147]:
data_pop= pd.read_csv('data/data_health/data_population.csv')
data_pop = data_pop[data_pop['Time']==2016][['Location','PopTotal']]

In [148]:
data_disease= pd.merge(data_disease,data_pop,on='Location')
data_disease['disease(%)']=data_disease['First Tooltip']/data_disease['PopTotal']/10

In [149]:
diseases = ['Malignant neoplasms','Diabetes mellitus','Cardiovascular diseases','Chronic obstructive pulmonary disease']
for d in diseases:
    data_sub=data_disease.loc[data_disease['Dim2']==d]
    data_sub.rename(columns={'disease(%)':d+' death(%)'},inplace=True)
    data=pd.merge(data,data_sub[['Location',d+' death(%)']],on='Location',how='left')
data.rename(columns={'Cardiovascular diseases death(%)':'Cardiovascular death(%)','Chronic obstructive pulmonary disease death(%)':'Chronic obstructive pulmonary death(%)'},inplace=True)

In [150]:
data.Location.unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo', 'Costa Rica', 'Côte d’Ivoire',
       'Croatia', 'Cuba', 'Cyprus', 'Czechia',
       "Democratic People's Republic of Korea",
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia',
       'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia',
       'Germany', 'Ghana', 'Gree

In [151]:
miss =['America','Iran','Korea','Lao','Lybia','Morocca','Russia','UK','Venezuela','Vietnam']

In [152]:
mapping = ['United States of America','Iran (Islamic Republic of)','Republic of Korea',"Lao People's Democratic Republic",
           'Libya','Morocco','Russian Federation','United Kingdom of Great Britain and Northern Ireland','Venezuela (Bolivarian Republic of)','Viet Nam']

In [153]:
data.replace(mapping,miss,inplace=True)

5. Get GDP per capita of different countries because we think it is a very important factor for some health indices. For example the life span in wealthy country is usually higher than poorer one(it can be showed in the correlation table below). So we decide to group countries by GDP per capita level and get more reliable food and health relationship in different groupes. 

In [154]:
data_gdp=pd.read_csv('data/GDP_per_capita.csv',header=1,skiprows=1)
data_gdp=data_gdp[['Country Name','2016']]
data_gdp.rename(columns={'Country Name':'Location','2016':'GDP per capita($)'},inplace=True)

In [155]:
data_gdp.Location.unique()

array(['Aruba', 'Afghanistan', 'Angola', 'Albania', 'Andorra',
       'Arab World', 'United Arab Emirates', 'Argentina', 'Armenia',
       'American Samoa', 'Antigua and Barbuda', 'Australia', 'Austria',
       'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso',
       'Bangladesh', 'Bulgaria', 'Bahrain', 'Bahamas, The',
       'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bermuda',
       'Bolivia', 'Brazil', 'Barbados', 'Brunei Darussalam', 'Bhutan',
       'Botswana', 'Central African Republic', 'Canada',
       'Central Europe and the Baltics', 'Switzerland', 'Channel Islands',
       'Chile', 'China', "Cote d'Ivoire", 'Cameroon', 'Congo, Dem. Rep.',
       'Congo, Rep.', 'Colombia', 'Comoros', 'Cabo Verde', 'Costa Rica',
       'Caribbean small states', 'Cuba', 'Curacao', 'Cayman Islands',
       'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Dominica',
       'Denmark', 'Dominican Republic', 'Algeria',
       'East Asia & Pacific (excluding high income)',
       '

In [186]:
miss =['America','Iran','Korea','Lao','Lybia','Morocca','UK','Russia','Venezuela','Egypt']

In [187]:
mapping= ['United States','Iran, Islamic Rep.','Korea, Rep.','Lao PDR','Libya','Morocco','United Kingdom',
          'Russian Federation','Venezuela, RB','Egypt, Arab Rep.']

In [188]:
data_gdp.replace(mapping,miss,inplace=True)

In [189]:
data_health_wealth = pd.merge(data,data_gdp,on='Location',how='left')

In [190]:
data_health_wealth[data_health_wealth['GDP per capita($)'].isnull()==True]['Location']

10                                       Bahamas
19              Bolivia (Plurinational State of)
37                                         Congo
39                                 Côte d’Ivoire
43                                       Czechia
44         Democratic People's Republic of Korea
45              Democratic Republic of the Congo
53                                       Eritrea
61                                        Gambia
89                                    Kyrgyzstan
107             Micronesia (Federated States of)
132                          Republic of Moldova
136                                  Saint Lucia
137             Saint Vincent and the Grenadines
146                                     Slovakia
155                           Sudan (until 2011)
159                         Syrian Arab Republic
162    The former Yugoslav Republic of Macedonia
174                  United Republic of Tanzania
179                                    Venezuela
181                 

In [191]:
col = ['Overweight(%)','High Blood Glucose(%)','High Blood Pressure(%)','High Cholesterol(%)']
for c in col:
    data_health_wealth[c]=data_health_wealth[c].str.split('[',expand=True)[0]
data_health_wealth=data_health_wealth.dropna()
data_health_wealth[['Overweight(%)','High Blood Glucose(%)','High Blood Pressure(%)','High Cholesterol(%)']]=data_health_wealth[['Overweight(%)','High Blood Glucose(%)','High Blood Pressure(%)','High Cholesterol(%)']].astype(float)

In [192]:
data_health_wealth.to_csv('./data/health.csv')

In [193]:
data_health_wealth.corr()

Unnamed: 0,Life span,Overweight(%),High Blood Glucose(%),High Blood Pressure(%),High Cholesterol(%),NCD death probability(%),Malignant neoplasms death(%),Diabetes mellitus death(%),Cardiovascular death(%),Chronic obstructive pulmonary death(%),GDP per capita($)
Life span,1.0,0.623106,0.003391,-0.684329,0.820454,-0.696923,0.655501,0.05216,0.299957,0.441168,0.645046
Overweight(%),0.623106,1.0,0.43971,-0.467132,0.638855,-0.284802,0.413819,0.16474,0.315791,0.148007,0.376791
High Blood Glucose(%),0.003391,0.43971,1.0,-0.048936,0.010224,0.331728,-0.334154,0.332607,-0.126479,-0.249638,-0.23236
High Blood Pressure(%),-0.684329,-0.467132,-0.048936,1.0,-0.538085,0.599088,-0.247872,-0.08132,0.156109,-0.316542,-0.566573
High Cholesterol(%),0.820454,0.638855,0.010224,-0.538085,1.0,-0.543696,0.688532,0.084984,0.370173,0.434435,0.732776
NCD death probability(%),-0.696923,-0.284802,0.331728,0.599088,-0.543696,1.0,-0.392803,0.176351,0.127715,-0.256409,-0.645479
Malignant neoplasms death(%),0.655501,0.413819,-0.334154,-0.247872,0.688532,-0.392803,1.0,0.024634,0.699023,0.636893,0.502417
Diabetes mellitus death(%),0.05216,0.16474,0.332607,-0.08132,0.084984,0.176351,0.024634,1.0,0.031913,0.191814,-0.088439
Cardiovascular death(%),0.299957,0.315791,-0.126479,0.156109,0.370173,0.127715,0.699023,0.031913,1.0,0.329942,0.048339
Chronic obstructive pulmonary death(%),0.441168,0.148007,-0.249638,-0.316542,0.434435,-0.256409,0.636893,0.191814,0.329942,1.0,0.371563


In [194]:
data_health_wealth.Location

0              Afghanistan
1                  Albania
2                  Algeria
3                   Angola
4      Antigua and Barbuda
              ...         
177             Uzbekistan
178                Vanuatu
180                Vietnam
182                 Zambia
183               Zimbabwe
Name: Location, Length: 160, dtype: object

In [20]:
data_health = pd.read_csv('data/health.csv')

1. Get relationship between seasoning and health.

In [235]:
recipe_seasoning = pd.read_pickle('./data/recipe_seasoning.pkl',compression = 'gzip')

In [236]:
recipe_seasoning['seasoning'].unique()

array(['spice', 'honey', 'butter', 'oil', 'salt', 'pepper', 'cheese',
       'sugar', 'clove', 'cinnamon', 'mustard', 'vanilla', 'olive',
       'garlic', 'ginger', 'chili', 'cilantro', 'onion', 'cumin',
       'celery', 'herb', 'coriander', 'thyme', 'oregano', 'basil',
       'parsley', 'cayenne', 'sage', 'nutmeg', 'mayonnaise', 'dill',
       'sesame', 'paprika', 'bay', 'syrup', 'cajun', 'allspice',
       'rosemary', 'cardamom', 'curry', 'anise', 'mace', 'tarragon',
       'wasabi'], dtype=object)

Change 'all spice' to 'spice'

In [237]:
recipe_seasoning['seasoning'].replace('allspice','spice',inplace=True)
len(recipe_seasoning['seasoning'].unique().tolist())

43

In [238]:
countall = recipe_seasoning.groupby('country')['seasoning'].count().reset_index(name='countall')
countall

Unnamed: 0,country,countall
0,America,234589
1,Angola,63
2,Argentina,829
3,Austria,749
4,Belgium,802
...,...,...
59,Thailand,7217
60,Turkey,1539
61,UK,3108
62,Venezuela,255


Get the percentage of each spice percentage

In [239]:
count_spice=recipe_seasoning.groupby(['country','seasoning'])['seasoning'].count().reset_index(name='count')

In [240]:
count = pd.merge(count_spice, countall, on='country')
count['count_percent'] = count['count']/count['countall']
count

Unnamed: 0,country,seasoning,count,countall,count_percent
0,America,anise,81,234589,0.000345
1,America,basil,2147,234589,0.009152
2,America,bay,1766,234589,0.007528
3,America,butter,14662,234589,0.062501
4,America,cajun,492,234589,0.002097
...,...,...,...,...,...
2155,Vietnam,spice,16,2015,0.007940
2156,Vietnam,sugar,248,2015,0.123077
2157,Vietnam,syrup,5,2015,0.002481
2158,Vietnam,vanilla,4,2015,0.001985


In [241]:
seasonings = recipe_seasoning['seasoning'].unique().tolist()

In [243]:
data=countall.country

In [244]:
for s in seasonings:
    data_sub=count.loc[count['seasoning']==s]
    data_sub.rename(columns={'count_percent':s},inplace=True)
    data=pd.merge(data,data_sub[['country',s]],on='country',how='left')
data.fillna(value=0)

Unnamed: 0,country,spice,honey,butter,oil,salt,pepper,cheese,sugar,clove,...,bay,syrup,cajun,rosemary,cardamom,curry,anise,mace,tarragon,wasabi
0,America,0.005013,0.008530,0.062501,0.062731,0.119358,0.133263,0.073414,0.076751,0.034396,...,0.007528,0.006927,0.002097,0.003824,0.000703,0.001969,0.000345,0.000345,0.001458,0.000149
1,Angola,0.000000,0.000000,0.031746,0.126984,0.142857,0.126984,0.031746,0.047619,0.111111,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.015873,0.000000,0.000000,0.000000,0.000000
2,Argentina,0.000000,0.002413,0.030157,0.107358,0.118215,0.176116,0.025332,0.026538,0.061520,...,0.014475,0.002413,0.001206,0.003619,0.000000,0.001206,0.000000,0.000000,0.000000,0.000000
3,Austria,0.008011,0.000000,0.122830,0.056075,0.148198,0.092123,0.032043,0.161549,0.034713,...,0.012016,0.005340,0.000000,0.001335,0.001335,0.001335,0.001335,0.001335,0.001335,0.000000
4,Belgium,0.006234,0.006234,0.119701,0.059850,0.142145,0.113466,0.033666,0.102244,0.044888,...,0.019950,0.002494,0.000000,0.006234,0.000000,0.001247,0.000000,0.001247,0.003741,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,Thailand,0.001386,0.008868,0.028267,0.109879,0.054732,0.118470,0.002494,0.090758,0.055702,...,0.001524,0.001801,0.000000,0.000000,0.001524,0.045171,0.001247,0.000277,0.000139,0.000139
60,Turkey,0.009097,0.009747,0.036387,0.109812,0.134503,0.135153,0.027940,0.045484,0.054581,...,0.002599,0.005198,0.000000,0.001300,0.005198,0.000650,0.000650,0.000000,0.000000,0.000000
61,UK,0.010940,0.013835,0.157336,0.032819,0.146396,0.097812,0.037645,0.161519,0.020592,...,0.007400,0.009009,0.000000,0.006113,0.000965,0.002252,0.000000,0.004826,0.001609,0.000000
62,Venezuela,0.011765,0.000000,0.035294,0.090196,0.137255,0.160784,0.031373,0.105882,0.062745,...,0.003922,0.003922,0.000000,0.000000,0.000000,0.003922,0.003922,0.000000,0.000000,0.000000


To see the relationship between seasoning and spice by plotting correlation matrix

In [245]:
data_health.rename(columns={'Location':'country'},inplace=True)
data_all = pd.merge(data,data_health,on='country')
cor = data_all.corr()
cor=cor.iloc[:43,44:]
cor

Unnamed: 0,Life span,Overweight(%),High Blood Glucose(%),High Blood Pressure(%),High Cholesterol(%),NCD death probability(%),Malignant neoplasms death(%),Diabetes mellitus death(%),Cardiovascular death(%),Chronic obstructive pulmonary death(%),GDP per capita($)
spice,-0.453031,0.057281,0.393109,0.313532,-0.415591,0.272801,-0.485704,-0.209062,-0.257206,-0.513586,-0.209639
honey,-0.040877,-0.112564,0.126547,-0.130751,-0.202891,-0.05331,-0.179844,-0.39168,-0.074937,-0.285432,0.030352
butter,0.393813,0.363446,-0.493725,-0.196249,0.599812,-0.356379,0.565326,-0.087559,0.307892,0.295999,0.721086
oil,-0.228288,-0.325116,0.335048,0.075779,-0.398732,0.184755,-0.346725,-0.078979,-0.305,-0.116142,-0.503496
salt,0.187782,0.467576,-0.117164,-0.028088,0.398504,-0.174363,0.278538,0.048407,0.270382,0.137491,0.340313
pepper,0.100501,0.158677,-0.071302,-0.064136,-0.061112,-0.118868,-0.002084,0.086005,-0.028194,0.11757,-0.137628
cheese,0.388384,0.361961,-0.337363,-0.252967,0.453164,-0.313012,0.405164,0.145827,0.243896,0.302457,0.465
sugar,0.244846,-0.170273,-0.501867,-0.282423,0.385976,-0.187745,0.341132,-0.091116,0.016935,0.183568,0.482871
clove,-0.620197,-0.424257,0.345386,0.48283,-0.616158,0.465764,-0.563937,-0.02634,-0.213916,-0.392646,-0.59951
cinnamon,-0.417245,0.021366,0.140323,0.337262,-0.217669,0.1189,-0.304061,-0.053339,-0.172091,-0.377121,-0.048245


The matrix is so big so we can take a look of the order of each column. 

In [246]:
col_names=data_health.drop(columns=['Unnamed: 0','country']).columns.tolist()

In [247]:
i=0
for c in col_names:
    print(cor.reindex(cor[c].abs().sort_values(ascending=False).index).iloc[:3,i])
    i+=1

clove      -0.620197
cardamom   -0.612215
wasabi      0.600710
Name: Life span, dtype: float64
ginger    -0.702706
parsley    0.643926
chili     -0.610575
Name: Overweight(%), dtype: float64
cumin        0.684970
coriander    0.513830
vanilla     -0.505793
Name: High Blood Glucose(%), dtype: float64
clove        0.482830
coriander    0.481932
cardamom     0.474713
Name: High Blood Pressure(%), dtype: float64
clove     -0.616158
cayenne   -0.612810
butter     0.599812
Name: High Cholesterol(%), dtype: float64
wasabi      -0.589414
clove        0.465764
coriander    0.465531
Name: NCD death probability(%), dtype: float64
wasabi       0.748250
cumin       -0.615031
coriander   -0.584676
Name: Malignant neoplasms death(%), dtype: float64
honey        -0.391680
sage         -0.380014
mayonnaise   -0.342271
Name: Diabetes mellitus death(%), dtype: float64
dill      0.694640
wasabi    0.411862
curry    -0.360371
Name: Cardiovascular death(%), dtype: float64
wasabi      0.668140
cardamom   -0.

We can see some interesting conclusion from the correlation matrix. For example, spice reduces the life span. But the data is not enough and has high correlation with GDP so later we will sort these data by GDP to have more resonable conclusion.

In [248]:
recipe_seasoning.groupby(['seasoning','country'])['name'].count()

seasoning  country    
anise      America        81
           Austria         1
           Brazil          1
           Canada          7
           Caribbean       6
                          ..
wasabi     Japan          73
           Mexico          1
           New Zealand     1
           Thailand        1
           Vietnam         1
Name: name, Length: 2160, dtype: int64

We can see some seasonings does not distributed in different countries like wasabi. So their relationship is only influenced by a single country statistics. We need to get more data or treat these as outliers.

2. Let's see the relationship between the nutrition in common food and health

In [11]:
recipe_cooking = pd.read_pickle('./recipe_cooking_e.pkl')

In [12]:
method_list = [
    'bake',
    'barbecue',
    'blanch',
    'boil',
    'braise',
    'broil',
    'deglaze',
    'ferment',
    'fry',
    'griddle',
    'grill',
    'marinate',
    'parboil',
    'poach',
    'roast',
    'saute',
    'scramble',
    'simmer',
    'smoke',
    'sous-vide',
    'steam',
    'stew',
    'toast']
recipe_cooking=recipe_cooking[recipe_cooking['method'].isin(method_list)]

In [13]:
countall = recipe_cooking.groupby('country')['method'].count().reset_index(name='countall')
countall

Unnamed: 0,country,countall
0,America,51861
1,Angola,17
2,Argentina,157
3,Australia,2825
4,Austria,200
...,...,...
61,Thailand,1566
62,Turkey,356
63,UK,984
64,Venezuela,52


In [14]:
count_cooking=recipe_cooking.groupby(['country','method'])['method'].count().reset_index(name='count')

In [15]:
count = pd.merge(count_cooking, countall, on='country')
count['count_percent'] = count['count']/count['countall']
count

Unnamed: 0,country,method,count,countall,count_percent
0,America,bake,17583,51861,0.339041
1,America,barbecue,439,51861,0.008465
2,America,blanch,102,51861,0.001967
3,America,boil,9178,51861,0.176973
4,America,braise,40,51861,0.000771
...,...,...,...,...,...
1023,Vietnam,simmer,92,480,0.191667
1024,Vietnam,smoke,5,480,0.010417
1025,Vietnam,steam,12,480,0.025000
1026,Vietnam,stew,1,480,0.002083


In [16]:
cookings = recipe_cooking['method'].unique().tolist()
len(cookings)

22

In [17]:
data=countall['country']
for c in cookings:
    data_sub=count.loc[count['method']==c]
    data_sub.rename(columns={'count_percent':c},inplace=True)
    data=pd.merge(data,data_sub[['country',c]],on='country',how='left')
data.fillna(value=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


Unnamed: 0,country,bake,boil,marinate,toast,simmer,fry,saute,deglaze,grill,...,roast,barbecue,griddle,broil,stew,blanch,braise,poach,ferment,parboil
0,America,0.339041,0.176973,0.021808,0.020015,0.159927,0.053084,0.075529,0.004030,0.049613,...,0.029348,0.008465,0.007790,0.020728,0.005226,0.001967,0.000771,0.001446,0.000174,0.000559
1,Angola,0.000000,0.352941,0.117647,0.058824,0.235294,0.058824,0.058824,0.000000,0.117647,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,Argentina,0.127389,0.140127,0.101911,0.019108,0.114650,0.012739,0.070064,0.012739,0.222930,...,0.057325,0.044586,0.012739,0.025478,0.025478,0.006369,0.000000,0.000000,0.000000,0.000000
3,Australia,0.300885,0.186549,0.034336,0.031150,0.139469,0.081416,0.038938,0.004956,0.077876,...,0.030088,0.025841,0.002478,0.005664,0.001770,0.005310,0.000708,0.002832,0.000354,0.001062
4,Austria,0.330000,0.210000,0.015000,0.015000,0.180000,0.095000,0.070000,0.020000,0.005000,...,0.010000,0.000000,0.000000,0.000000,0.005000,0.010000,0.000000,0.005000,0.005000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Thailand,0.042146,0.240741,0.052363,0.008301,0.232439,0.189017,0.066411,0.003193,0.070881,...,0.017880,0.009579,0.000639,0.018519,0.002554,0.004470,0.000639,0.001277,0.000000,0.001277
62,Turkey,0.157303,0.250000,0.033708,0.022472,0.207865,0.084270,0.098315,0.000000,0.053371,...,0.022472,0.011236,0.005618,0.028090,0.011236,0.005618,0.000000,0.000000,0.002809,0.000000
63,UK,0.416667,0.175813,0.004065,0.041667,0.131098,0.074187,0.016260,0.001016,0.038618,...,0.019309,0.001016,0.014228,0.012195,0.012195,0.001016,0.000000,0.005081,0.000000,0.000000
64,Venezuela,0.153846,0.192308,0.000000,0.000000,0.230769,0.115385,0.076923,0.000000,0.076923,...,0.019231,0.019231,0.057692,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [21]:
data_health.rename(columns={'Location':'country'},inplace=True)
data_all = pd.merge(data,data_health,on='country')
cor = data_all.corr()
cor=cor.iloc[:22,23:]
cor

Unnamed: 0,Life span,Overweight(%),High Blood Glucose(%),High Blood Pressure(%),High Cholesterol(%),NCD death probability(%),Malignant neoplasms death(%),Diabetes mellitus death(%),Cardiovascular death(%),Chronic obstructive pulmonary death(%),GDP per capita($)
bake,0.576565,0.573742,-0.448422,-0.437617,0.690533,-0.518097,0.566912,0.052495,0.173929,0.314574,0.778878
boil,-0.247193,0.005889,0.329822,0.266903,-0.16613,0.090873,-0.201911,-0.275729,-0.020669,-0.340568,-0.187716
marinate,-0.26111,-0.435376,0.08757,0.093872,-0.434724,0.232873,-0.325876,-0.151612,-0.307823,-0.229008,-0.398815
toast,-0.059531,0.017711,0.082702,0.011415,-0.093333,-0.139555,-0.151445,-0.23873,0.016679,-0.119248,0.073515
simmer,-0.32574,-0.128484,0.206586,0.404648,-0.364721,0.239583,-0.279038,0.195116,0.128158,-0.191029,-0.410033
fry,-0.443731,-0.55574,0.172775,0.306083,-0.355285,0.503227,-0.345995,-0.13192,-0.139744,-0.216222,-0.379717
saute,-0.129425,0.066024,0.062515,0.278311,-0.197079,0.267757,0.002868,0.180146,0.234901,0.036598,-0.379542
deglaze,0.16151,0.255981,-0.145892,-0.022292,0.232917,-0.175312,0.125143,-0.117665,0.002367,-0.062518,0.306348
grill,-0.211824,-0.318632,0.048732,0.039851,-0.405461,0.01211,-0.323083,-0.07254,-0.39212,-0.127593,-0.336616
smoke,-0.185748,-0.344575,0.173182,0.028868,-0.328178,0.176234,-0.32842,-0.185572,-0.216846,-0.169738,-0.180303


In [22]:
col_names=data_health.drop(columns=['Unnamed: 0','country']).columns.tolist()

In [23]:
i=0
for c in col_names:
    print(cor.reindex(cor[c].abs().sort_values(ascending=False).index).iloc[:3,i])
    i+=1

griddle   -0.706032
stew      -0.631747
bake       0.576565
Name: Life span, dtype: float64
bake     0.573742
fry     -0.555740
steam   -0.512899
Name: Overweight(%), dtype: float64
bake      -0.448422
parboil    0.330547
boil       0.329822
Name: High Blood Glucose(%), dtype: float64
stew       0.578961
parboil    0.492964
griddle    0.464319
Name: High Blood Pressure(%), dtype: float64
bake       0.690533
griddle   -0.497009
stew      -0.459127
Name: High Cholesterol(%), dtype: float64
parboil    0.529414
bake      -0.518097
fry        0.503227
Name: NCD death probability(%), dtype: float64
bake       0.566912
griddle   -0.429359
stew      -0.395640
Name: Malignant neoplasms death(%), dtype: float64
boil      -0.275729
toast     -0.238730
ferment   -0.221484
Name: Diabetes mellitus death(%), dtype: float64
roast       0.402760
grill      -0.392120
marinate   -0.307823
Name: Cardiovascular death(%), dtype: float64
ferment   -0.491343
poach     -0.340903
boil      -0.340568
Name: Chron

We can see bake is more healthy cooking method while stew and gridle reduce life span

The data is not very convincing we will find more ways for example, analyzing the nutrition index to see if it is related to health adn disease.