In [44]:
# Import libraries

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
from numpy import NaN
from matplotlib import pyplot as plt
from kaggle_olympic_games_medals import KaggleOlympicGamesMedals

In [45]:
# Instantiate the olympic games medal class
# and load the data
data_dir = '../data/kaggle/olympic-games-medals'
ogm = KaggleOlympicGamesMedals(data_dir)


Data Loaded


In [46]:
# Get the medals dataframe with standardized country names
df_medals = ogm.get_medals_by_std_country_name()
df_medals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20124 entries, 0 to 21696
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   discipline_title       20124 non-null  object
 1   slug_game              20124 non-null  object
 2   event_title            20124 non-null  object
 3   event_gender           20124 non-null  object
 4   medal_type             20124 non-null  object
 5   participant_type       20124 non-null  object
 6   country_name           20124 non-null  object
 7   country_3_letter_code  20124 non-null  object
 8   game_end_date          20124 non-null  object
 9   game_start_date        20124 non-null  object
 10  game_location          20124 non-null  object
 11  game_name              20124 non-null  object
 12  game_season            20124 non-null  object
 13  game_year              20124 non-null  int64 
dtypes: int64(1), object(13)
memory usage: 2.3+ MB


In [47]:
# Get list of unique medals countries
med_country_list = df_medals['country_name'].sort_values(ascending=True).unique()
print(med_country_list)
print(len(med_country_list))

['Afghanistan' 'Algeria' 'Argentina' 'Armenia' 'Australasia' 'Australia'
 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Barbados' 'Belarus' 'Belgium'
 'Bermuda' 'Bohemia' 'Botswana' 'Brazil' 'Bulgaria' 'Burkina Faso'
 'Burundi' 'Cameroon' 'Canada' 'Chile' 'China' 'Chinese Taipei' 'Colombia'
 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czech Republic' "Côte d'Ivoire"
 'Denmark' 'Djibouti' 'Dominican Republic' 'Ecuador' 'Egypt' 'Eritrea'
 'Estonia' 'Ethiopia' 'Fiji' 'Finland' 'France' 'Gabon' 'Georgia'
 'Germany' 'Ghana' 'Great Britain' 'Greece' 'Grenada' 'Guatemala' 'Guyana'
 'Haiti' 'Hong Kong, China' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran'
 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jordan' 'Kazakhstan'
 'Kenya' 'Kosovo' 'Kuwait' 'Kyrgyzstan' 'Latvia' 'Lebanon' 'Liechtenstein'
 'Lithuania' 'Luxembourg' 'MIX' 'Malaysia' 'Mauritius' 'Mexico' 'Mongolia'
 'Montenegro' 'Morocco' 'Mozambique' 'Namibia' 'Netherlands'
 'Netherlands Antilles' 'New Zealand' 'Niger' 'Nigeria' 'North K

In [48]:
# Load the happiness dataset reference and data into dataframes
df_happiness_ref = pd.read_csv('../data/world-happiness/world_happiness_reference.csv')
df_happiness_ref.to_csv('../data/etl/happiness_reference.csv', index=False)
df_happiness = pd.read_csv('../data/world-happiness/world_happiness.csv')
# Rename columns
cols = {
    'Country name':'country_name', 
    'Life Ladder':'happiness',
    'Log GDP per capita':'wealth',
    'Social support':'support',
    'Healthy life expectancy at birth':'health',
    'Freedom to make life choices':'freedom',
    'Generosity':'generosity',
    'Perceptions of corruption':'corruption',
    'Positive affect':'positivity',
    'Negative affect':'negativity'
}
df_happiness = df_happiness.rename(columns=cols)
#df_happiness.info()

In [49]:
# Get list of unique happiness countries
hap_country_list = df_happiness['country_name'].unique()
print(hap_country_list)
print(len(hap_country_list))

['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Argentina' 'Armenia'
 'Australia' 'Austria' 'Azerbaijan' 'Bahrain' 'Bangladesh' 'Belarus'
 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina'
 'Botswana' 'Brazil' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia'
 'Cameroon' 'Canada' 'Central African Republic' 'Chad' 'Chile' 'China'
 'Colombia' 'Comoros' 'Congo (Brazzaville)' 'Congo (Kinshasa)'
 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Denmark' 'Djibouti'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Estonia' 'Eswatini'
 'Ethiopia' 'Finland' 'France' 'Gabon' 'Gambia' 'Georgia' 'Germany'
 'Ghana' 'Greece' 'Guatemala' 'Guinea' 'Guyana' 'Haiti' 'Honduras'
 'Hong Kong S.A.R. of China' 'Hungary' 'Iceland' 'India' 'Indonesia'
 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Ivory Coast' 'Jamaica' 'Japan'
 'Jordan' 'Kazakhstan' 'Kenya' 'Kosovo' 'Kuwait' 'Kyrgyzstan' 'Laos'
 'Latvia' 'Lebanon' 'Lesotho' 'Liberia' 'Libya' 'Lithuania' 'Luxembourg'
 'Madagascar' 'Malawi' 'M

In [50]:
df_nutrition_ref = pd.read_csv('../data/World_Bank/food_nutrition_2017_clean_ref.csv')
df_nutrition_ref.to_csv('../data/etl/nutrition_2017_reference.csv', index=False)
df_nutrition_ref

Unnamed: 0,column_name,column_description
0,country_code,Country Code
1,country_name,Country Name
2,calories_pct,Percent of the population who cannot afford su...
3,nutrients_pct,Percent of the population who cannot afford nu...
4,diet_pct,Percent of the population who cannot afford a ...
5,calories_mills,Millions of people who cannot afford sufficien...
6,nutrients_mills,Millions of people who cannot afford nutrient ...
7,diet_mills,Millions of people who cannot afford a healthy...
8,population,Population [Pop]


In [51]:
df_nutrition = pd.read_csv('../data/World_Bank/food_nutrition_2017_clean.csv', encoding = "ISO-8859-1")
df_nutrition.loc[df_nutrition['country_name'] == 'Russian Federation', 'country_name'] = 'Russia'
df_nutrition.loc[df_nutrition['country_name'] == 'United Kingdom', 'country_name'] = 'Great Britain'
df_nutrition['population'] = df_nutrition['population'].apply(lambda x: x/1000)
df_nutrition['good_diet_pct'] = df_nutrition['diet_pct'].apply(lambda x: 100-x if x > 0 else NaN)
df_nutrition.rename(columns={'diet_pct': 'bad_diet_pct'}, inplace=True)
df_nutrition = df_nutrition[['country_code', 'country_name', 'calories_pct', 'nutrients_pct', 'good_diet_pct', 'bad_diet_pct', 'calories_mills', 'nutrients_mills', 'diet_mills', 'population']]
df_nutrition.to_csv('../data/etl/nutrition_2017_by_country.csv', index=False)
df_nutrition.info()
df_nutrition.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country_code     186 non-null    object 
 1   country_name     186 non-null    object 
 2   calories_pct     155 non-null    float64
 3   nutrients_pct    155 non-null    float64
 4   good_diet_pct    151 non-null    float64
 5   bad_diet_pct     155 non-null    float64
 6   calories_mills   155 non-null    float64
 7   nutrients_mills  155 non-null    float64
 8   diet_mills       155 non-null    float64
 9   population       183 non-null    float64
dtypes: float64(8), object(2)
memory usage: 14.7+ KB


Unnamed: 0,country_code,country_name,calories_pct,nutrients_pct,good_diet_pct,bad_diet_pct,calories_mills,nutrients_mills,diet_mills,population
0,ABW,Aruba,,,,,,,,105.361
1,AGO,Angola,57.2,87.1,7.1,92.9,17.0,26.0,27.7,29816.769
2,AIA,Anguilla,,,,,,,,
3,ALB,Albania,0.0,13.0,62.2,37.8,0.0,0.4,1.1,2873.457
4,ARE,United Arab Emirates,0.0,0.0,,0.0,0.0,0.0,0.0,9487.206
5,ARG,Argentina,0.3,7.1,89.0,11.0,0.1,3.1,4.8,44044.811
6,ARM,Armenia,0.6,16.8,59.1,40.9,0.0,0.5,1.2,2944.789
7,ATG,Antigua and Barbuda,,,,,,,,95.425
8,AUS,Australia,0.2,0.5,99.3,0.7,0.1,0.1,0.2,24601.86
9,AUT,Austria,0.2,0.5,99.4,0.6,0.0,0.0,0.1,8797.566


In [52]:
# Get list of unique nutrition countries
nut_country_list = df_nutrition['country_name'].sort_values(ascending=True).unique()
print(nut_country_list)
print(len(nut_country_list))

['Albania' 'Algeria' 'Angola' 'Anguilla' 'Antigua and Barbuda' 'Argentina'
 'Armenia' 'Aruba' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas, The'
 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin'
 'Bermuda' 'Bhutan' 'Bolivia' 'Bonaire' 'Bosnia and Herzegovina'
 'Botswana' 'Brazil' 'British Virgin Islands' 'Brunei Darussalam'
 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cabo Verde' 'Cambodia' 'Cameroon'
 'Canada' 'Cayman Islands' 'Central African Republic' 'Chad' 'Chile'
 'China' 'Colombia' 'Comoros' 'Congo, Dem. Rep.' 'Congo, Rep.'
 'Costa Rica' 'Croatia' 'Curacao' 'Cyprus' 'Czech Republic'
 "CÃ\x83Â´te d'Ivoire" 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'East Asia & Pacific' 'Ecuador' 'Egypt, Arab Rep.'
 'El Salvador' 'Equatorial Guinea' 'Estonia' 'Eswatini' 'Ethiopia'
 'Europe & Central Asia' 'Fiji' 'Finland' 'France' 'Gabon' 'Gambia, The'
 'Germany' 'Ghana' 'Great Britain' 'Greece' 'Grenada' 'Guinea'
 'Guinea-Bissau' 'Guyana' 'Haiti' 'High income' 'Honduras'

In [53]:
df_gdp = pd.read_csv('../data/World_Bank/World_Bank_GDP_Cleaned.csv')
df_gdp.head(10)

Unnamed: 0,Country Name,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,Average
0,Aruba,ABW,,,,,,,,,...,2962.91,2983.64,3092.43,3276.18,3395.8,2558.91,3103.18,3544.71,,2048.89
1,Africa Eastern and Southern,AFE,21216.96,22307.47,23702.47,25779.38,28049.54,30374.91,33049.16,35933.76,...,899295.68,829829.96,940105.48,1012719.34,1006527.29,929074.09,1086772.16,1183962.13,1236163.04,376676.48
2,Afghanistan,AFG,,,,,,,,,...,19134.22,18116.57,18753.46,18053.22,18799.44,19955.93,14266.5,14502.16,,13093.44
3,Africa Western and Central,AFW,11884.13,12685.66,13606.83,14439.98,15769.11,16934.48,18048.04,16495.77,...,769367.32,692181.09,685750.16,768189.59,823933.63,787146.72,845993.05,877140.81,796586.16,279521.42
4,Angola,AGO,,,,,,,,,...,90496.42,52761.62,73690.15,79450.69,70897.96,48501.56,66505.13,104399.75,84722.96,54654.05
5,Albania,ALB,,,,,,,,,...,11386.85,11861.2,13019.73,15156.42,15401.83,15162.73,17930.57,18916.38,22977.68,7868.64
6,Andorra,AND,,,,,,,,,...,2789.88,2896.61,3000.16,3218.42,3155.15,2891.0,3324.68,3380.6,3727.67,1699.46
7,Arab World,ARB,,,,,,26772.09,28352.06,30601.97,...,2544136.35,2499965.91,2571053.72,2865543.59,2898668.72,2576553.32,2969116.81,3613682.35,3491516.94,1074237.82
8,United Arab Emirates,ARE,,,,,,,,,...,370275.47,369255.33,390516.8,427049.43,417989.72,349473.02,415178.79,507063.97,504173.45,159731.61
9,Argentina,ARG,,,,,,,,,...,594749.29,557532.32,643628.39,524819.89,447754.68,385740.51,487902.57,631133.38,640591.41,317986.22


In [54]:
df_gdp_avg = df_gdp[['Country Name', 'Average']].copy()
df_gdp_avg.rename(columns={'Country Name': 'country_name', 'Average': 'gdp_avg'}, inplace=True)
df_gdp_avg.loc[df_gdp_avg['country_name'] == 'Russian Federation', 'country_name'] = 'Russia'
df_gdp_avg.loc[df_gdp_avg['country_name'] == 'United Kingdom', 'country_name'] = 'Great Britain'
df_gdp_avg.to_csv('../data/etl/gdp_avg_by_country.csv', index=False)
df_gdp_avg.head(10)

Unnamed: 0,country_name,gdp_avg
0,Aruba,2048.89
1,Africa Eastern and Southern,376676.48
2,Afghanistan,13093.44
3,Africa Western and Central,279521.42
4,Angola,54654.05
5,Albania,7868.64
6,Andorra,1699.46
7,Arab World,1074237.82
8,United Arab Emirates,159731.61
9,Argentina,317986.22


In [55]:
# Get the count of medals for each country
drop_medal_cols = ['slug_game', 'game_end_date', 'game_start_date', 'game_location', 'game_name', 'game_year']
group_medal_cols = ['game_season', 'country_3_letter_code', 'country_name', 'discipline_title', 'event_title', 'event_gender', 'medal_type']
df_medals_season_country = df_medals.drop(columns=drop_medal_cols).groupby(
    group_medal_cols
)['participant_type'].count().reset_index()
df_medals_season_country.rename(columns={'participant_type':'medal_count', 'country_3_letter_code':'country_code'}, inplace=True)
df_medals_season_country.to_csv('../data/etl/medals_by_season_country_discip_event_type.csv', index=False)
df_medals_season_country.head(10)


Unnamed: 0,game_season,country_code,country_name,discipline_title,event_title,event_gender,medal_type,medal_count
0,Summer,AFG,Afghanistan,Taekwondo,58 - 68 kg men,Men,BRONZE,1
1,Summer,AFG,Afghanistan,Taekwondo,beijing 2008 taekwondo - 58 kg men,Men,BRONZE,1
2,Summer,AHO,Netherlands Antilles,Sailing,division ii - windsurfer men,Open,SILVER,1
3,Summer,ALG,Algeria,Athletics,1500m men,Men,GOLD,2
4,Summer,ALG,Algeria,Athletics,1500m men,Men,SILVER,1
5,Summer,ALG,Algeria,Athletics,1500m women,Women,GOLD,2
6,Summer,ALG,Algeria,Athletics,5000m men,Men,SILVER,1
7,Summer,ALG,Algeria,Athletics,800m men,Men,BRONZE,1
8,Summer,ALG,Algeria,Athletics,800m men,Men,SILVER,1
9,Summer,ALG,Algeria,Athletics,high jump men,Men,BRONZE,1


In [56]:
drop_medal_cols = ['game_season', 'discipline_title', 'event_title', 'event_gender', 'medal_type']
group_medal_cols = ['country_code', 'country_name']
df_medals_country_only = df_medals_season_country.drop(columns=drop_medal_cols).groupby(
    group_medal_cols
)['medal_count'].sum().reset_index()
df_medals_country_only.to_csv('../data/etl/medals_by_country.csv', index=False)
df_medals_country_only.head(10)

Unnamed: 0,country_code,country_name,medal_count
0,AFG,Afghanistan,2
1,AHO,Netherlands Antilles,1
2,ALG,Algeria,17
3,ANZ,Australasia,12
4,ARG,Argentina,77
5,ARM,Armenia,18
6,AUS,Australia,566
7,AUT,Austria,342
8,AZE,Azerbaijan,50
9,BAH,Bahamas,16


In [57]:
df_happiness_country = df_happiness.drop(columns=['year']).groupby('country_name').mean().reset_index()
df_happiness_country.loc[df_happiness_country['country_name'] == 'United Kingdom', 'country_name'] = 'Great Britain'
df_happiness_country.to_csv('../data/etl/happiness_avg_by_country.csv', index=False)
df_happiness_country.head(10)

Unnamed: 0,country_name,happiness,wealth,support,health,freedom,generosity,corruption,positivity,negativity
0,Afghanistan,3.219917,7.585624,0.476654,52.711667,0.480475,0.056325,0.835824,0.421804,0.37084
1,Albania,5.072808,9.416875,0.714302,68.54875,0.694877,-0.06828,0.868671,0.559857,0.294572
2,Algeria,5.39188,9.338236,0.811683,66.136363,0.520711,-0.132543,0.688052,0.540448,0.266245
3,Angola,4.420299,8.985547,0.737973,52.150001,0.455957,-0.093218,0.867018,0.625734,0.351173
4,Argentina,6.289722,10.028808,0.901894,66.7,0.777786,-0.153203,0.839019,0.737995,0.288382
5,Armenia,4.679549,9.339652,0.732543,65.866177,0.605719,-0.198377,0.820091,0.502449,0.44282
6,Australia,7.242307,10.764977,0.942319,70.570588,0.914413,0.254107,0.429199,0.737576,0.218661
7,Austria,7.178953,10.889285,0.918656,70.487501,0.894061,0.132884,0.560432,0.715841,0.190152
8,Azerbaijan,4.935251,9.527749,0.760488,62.507812,0.681153,-0.158288,0.693731,0.518247,0.240791
9,Bahrain,5.998169,10.784244,0.874401,65.655,0.862112,0.06464,0.553173,0.67094,0.351944


In [58]:
# Merge medals and happiness datasets
df_merged_medal_hap = df_medals_country_only.merge(right=df_happiness_country, how='left',
                             left_on='country_name', right_on='country_name')
df_merged_medal_hap.head(10)

Unnamed: 0,country_code,country_name,medal_count,happiness,wealth,support,health,freedom,generosity,corruption,positivity,negativity
0,AFG,Afghanistan,2,3.219917,7.585624,0.476654,52.711667,0.480475,0.056325,0.835824,0.421804,0.37084
1,AHO,Netherlands Antilles,1,,,,,,,,,
2,ALG,Algeria,17,5.39188,9.338236,0.811683,66.136363,0.520711,-0.132543,0.688052,0.540448,0.266245
3,ANZ,Australasia,12,,,,,,,,,
4,ARG,Argentina,77,6.289722,10.028808,0.901894,66.7,0.777786,-0.153203,0.839019,0.737995,0.288382
5,ARM,Armenia,18,4.679549,9.339652,0.732543,65.866177,0.605719,-0.198377,0.820091,0.502449,0.44282
6,AUS,Australia,566,7.242307,10.764977,0.942319,70.570588,0.914413,0.254107,0.429199,0.737576,0.218661
7,AUT,Austria,342,7.178953,10.889285,0.918656,70.487501,0.894061,0.132884,0.560432,0.715841,0.190152
8,AZE,Azerbaijan,50,4.935251,9.527749,0.760488,62.507812,0.681153,-0.158288,0.693731,0.518247,0.240791
9,BAH,Bahamas,16,,,,,,,,,


In [59]:
# Merge with nutrition dataset
df_merged_medal_hap_nut = df_merged_medal_hap.merge(right=df_nutrition, how='left',
                             left_on='country_name', right_on='country_name')
df_merged_medal_hap_nut.drop(columns=['country_code_x', 'country_code_y'], inplace=True)
df_merged_medal_hap_nut.reset_index(drop=True, inplace=True)
df_merged_medal_hap_nut.head()

Unnamed: 0,country_name,medal_count,happiness,wealth,support,health,freedom,generosity,corruption,positivity,negativity,calories_pct,nutrients_pct,good_diet_pct,bad_diet_pct,calories_mills,nutrients_mills,diet_mills,population
0,Afghanistan,2,3.219917,7.585624,0.476654,52.711667,0.480475,0.056325,0.835824,0.421804,0.37084,,,,,,,,
1,Netherlands Antilles,1,,,,,,,,,,,,,,,,,
2,Algeria,17,5.39188,9.338236,0.811683,66.136363,0.520711,-0.132543,0.688052,0.540448,0.266245,0.2,7.2,64.8,35.2,0.1,3.0,14.6,41389.174
3,Australasia,12,,,,,,,,,,,,,,,,,
4,Argentina,77,6.289722,10.028808,0.901894,66.7,0.777786,-0.153203,0.839019,0.737995,0.288382,0.3,7.1,89.0,11.0,0.1,3.1,4.8,44044.811


In [60]:
df_gdp_avg.head()

Unnamed: 0,country_name,gdp_avg
0,Aruba,2048.89
1,Africa Eastern and Southern,376676.48
2,Afghanistan,13093.44
3,Africa Western and Central,279521.42
4,Angola,54654.05


In [61]:
# df_gdp_avg
df_merged_medal_hap_nut_gdp = df_merged_medal_hap_nut.merge(right=df_gdp_avg, how='left',
                             left_on='country_name', right_on='country_name')
df_merged_medal_hap_nut_gdp.reset_index(drop=True, inplace=True)
df_merged_medal_hap_nut_gdp.head()

Unnamed: 0,country_name,medal_count,happiness,wealth,support,health,freedom,generosity,corruption,positivity,negativity,calories_pct,nutrients_pct,good_diet_pct,bad_diet_pct,calories_mills,nutrients_mills,diet_mills,population,gdp_avg
0,Afghanistan,2,3.219917,7.585624,0.476654,52.711667,0.480475,0.056325,0.835824,0.421804,0.37084,,,,,,,,,13093.44
1,Netherlands Antilles,1,,,,,,,,,,,,,,,,,,
2,Algeria,17,5.39188,9.338236,0.811683,66.136363,0.520711,-0.132543,0.688052,0.540448,0.266245,0.2,7.2,64.8,35.2,0.1,3.0,14.6,41389.174,79710.57
3,Australasia,12,,,,,,,,,,,,,,,,,,
4,Argentina,77,6.289722,10.028808,0.901894,66.7,0.777786,-0.153203,0.839019,0.737995,0.288382,0.3,7.1,89.0,11.0,0.1,3.1,4.8,44044.811,317986.22


In [62]:
# Save to csv
df_merged_medal_hap_nut_gdp.to_csv('../data/etl/merged_medal_hap_nut_gdp_by_country.csv', index=False)

In [63]:
# Set country name as index
df_merged_medal_hap_nut_gdp = df_merged_medal_hap_nut_gdp.set_index('country_name')


In [64]:
df_merged_medal_hap_nut_gdp.head(10)


Unnamed: 0_level_0,medal_count,happiness,wealth,support,health,freedom,generosity,corruption,positivity,negativity,calories_pct,nutrients_pct,good_diet_pct,bad_diet_pct,calories_mills,nutrients_mills,diet_mills,population,gdp_avg
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Afghanistan,2,3.219917,7.585624,0.476654,52.711667,0.480475,0.056325,0.835824,0.421804,0.37084,,,,,,,,,13093.44
Netherlands Antilles,1,,,,,,,,,,,,,,,,,,
Algeria,17,5.39188,9.338236,0.811683,66.136363,0.520711,-0.132543,0.688052,0.540448,0.266245,0.2,7.2,64.8,35.2,0.1,3.0,14.6,41389.174,79710.57
Australasia,12,,,,,,,,,,,,,,,,,,
Argentina,77,6.289722,10.028808,0.901894,66.7,0.777786,-0.153203,0.839019,0.737995,0.288382,0.3,7.1,89.0,11.0,0.1,3.1,4.8,44044.811,317986.22
Armenia,18,4.679549,9.339652,0.732543,65.866177,0.605719,-0.198377,0.820091,0.502449,0.44282,0.6,16.8,59.1,40.9,0.0,0.5,1.2,2944.789,7407.48
Australia,566,7.242307,10.764977,0.942319,70.570588,0.914413,0.254107,0.429199,0.737576,0.218661,0.2,0.5,99.3,0.7,0.1,0.1,0.2,24601.86,521918.13
Austria,342,7.178953,10.889285,0.918656,70.487501,0.894061,0.132884,0.560432,0.715841,0.190152,0.2,0.5,99.4,0.6,0.0,0.0,0.1,8797.566,197169.69
Azerbaijan,50,4.935251,9.527749,0.760488,62.507812,0.681153,-0.158288,0.693731,0.518247,0.240791,0.0,0.0,,0.0,0.0,0.0,0.0,9854.033,30686.34
Bahamas,16,,,,,,,,,,,,,,,,,,


In [65]:
# Check correlations between columns
df_merged_medal_hap_nut_gdp.corr()

Unnamed: 0,medal_count,happiness,wealth,support,health,freedom,generosity,corruption,positivity,negativity,calories_pct,nutrients_pct,good_diet_pct,bad_diet_pct,calories_mills,nutrients_mills,diet_mills,population,gdp_avg
medal_count,1.0,0.355586,0.357803,0.291602,0.284972,0.17584,0.144428,-0.198347,0.164676,-0.204697,-0.160419,-0.30194,0.35691,-0.343019,-0.052136,-0.052336,-0.04744,0.229562,0.858683
happiness,0.355586,1.0,0.816091,0.762118,0.763315,0.652632,0.325894,-0.562221,0.601594,-0.372836,-0.445801,-0.701876,0.7619,-0.754073,-0.19263,-0.208925,-0.212301,-0.116704,0.237327
wealth,0.357803,0.816091,1.0,0.73085,0.816775,0.497857,0.092069,-0.458153,0.298551,-0.279036,-0.552569,-0.872884,0.895549,-0.892065,-0.246917,-0.218126,-0.219779,-0.103476,0.248334
support,0.291602,0.762118,0.73085,1.0,0.646804,0.499487,0.134438,-0.29214,0.488799,-0.486312,-0.501859,-0.691144,0.730155,-0.720806,-0.2171,-0.328746,-0.341011,-0.185146,0.182812
health,0.284972,0.763315,0.816775,0.646804,1.0,0.430558,0.050002,-0.350378,0.280501,-0.19066,-0.607838,-0.853124,0.851832,-0.846876,-0.293222,-0.177807,-0.173161,-0.043193,0.195896
freedom,0.17584,0.652632,0.497857,0.499487,0.430558,1.0,0.396809,-0.548813,0.70462,-0.41133,-0.282297,-0.346318,0.388841,-0.401372,-0.082066,0.008591,0.0012,0.052297,0.148059
generosity,0.144428,0.325894,0.092069,0.134438,0.050002,0.396809,1.0,-0.373938,0.401587,-0.161033,0.060758,0.093614,-0.110418,0.083391,0.101863,0.084913,0.086993,-0.015731,0.070355
corruption,-0.198347,-0.562221,-0.458153,-0.29214,-0.350378,-0.548813,-0.373938,1.0,-0.316435,0.288833,0.159478,0.259364,-0.289108,0.316306,0.137014,0.09996,0.104769,0.118001,-0.078551
positivity,0.164676,0.601594,0.298551,0.488799,0.280501,0.70462,0.401587,-0.316435,1.0,-0.361892,-0.023155,-0.092354,0.170789,-0.175315,0.066444,-0.056769,-0.068174,-0.013906,0.145508
negativity,-0.204697,-0.372836,-0.279036,-0.486312,-0.19066,-0.41133,-0.161033,0.288833,-0.361892,1.0,0.077849,0.231419,-0.259882,0.276438,0.019136,0.122149,0.119779,-0.04099,-0.146909


In [66]:
# Try normaizing the data to see if that changes the correlations
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

cols_to_scale = [
    'medal_count',
    'happiness',
    'wealth',
    'support',
    'health',
    'calories_pct',
    'nutrients_pct',
    'bad_diet_pct',
    'good_diet_pct',
    'calories_mills',
    'nutrients_mills',
    'diet_mills',
    'population',
    'gdp_avg'
]

# Normalize the numeric columns in a new dataframe
df_merged_medal_hap_nut_gdp_scaled = df_merged_medal_hap_nut_gdp.copy()
df_merged_medal_hap_nut_gdp_scaled[cols_to_scale] = scaler.fit_transform(df_merged_medal_hap_nut_gdp_scaled[cols_to_scale])

df_merged_medal_hap_nut_gdp_scaled


Unnamed: 0_level_0,medal_count,happiness,wealth,support,health,freedom,generosity,corruption,positivity,negativity,calories_pct,nutrients_pct,good_diet_pct,bad_diet_pct,calories_mills,nutrients_mills,diet_mills,population,gdp_avg
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Afghanistan,0.000341,0.000000,0.182081,0.105192,0.383175,0.480475,0.056325,0.835824,0.421804,0.370840,,,,,,,,,0.001507
Netherlands Antilles,0.000000,,,,,,,,,,,,,,,,,,
Algeria,0.005455,0.488729,0.535349,0.702498,0.780680,0.520711,-0.132543,0.688052,0.540448,0.266245,0.003135,0.080357,0.639630,0.361026,0.000947,0.003913,0.014564,0.029599,0.009311
Australasia,0.003750,,,,,,,,,,,,,,,,,,
Argentina,0.025912,0.690758,0.674545,0.863331,0.797369,0.777786,-0.153203,0.839019,0.737995,0.288382,0.004702,0.079241,0.888090,0.112821,0.000947,0.004044,0.004788,0.031502,0.037225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vietnam,0.001023,0.499012,0.462573,0.730580,0.744071,0.898700,-0.024657,0.771082,0.623233,0.204675,0.020376,0.177455,0.668378,0.332308,0.011364,0.019697,0.030623,0.067712,
West Indies Federation,0.000341,,,,,,,,,,,,,,,,,,
Yugoslavia,0.029322,,,,,,,,,,,,,,,,,,
Zambia,0.000341,0.257879,0.277354,0.548344,0.363496,0.774665,0.020810,0.821056,0.676104,0.302208,1.000000,0.912946,0.101643,0.898462,0.101326,0.018002,0.014763,0.012026,0.000950


In [67]:
# Check correlations between columns
df_merged_medal_hap_nut_gdp_scaled.corr()

# The correlations are not different so the corr() function is normailizing the data.

Unnamed: 0,medal_count,happiness,wealth,support,health,freedom,generosity,corruption,positivity,negativity,calories_pct,nutrients_pct,good_diet_pct,bad_diet_pct,calories_mills,nutrients_mills,diet_mills,population,gdp_avg
medal_count,1.0,0.355586,0.357803,0.291602,0.284972,0.17584,0.144428,-0.198347,0.164676,-0.204697,-0.160419,-0.30194,0.35691,-0.343019,-0.052136,-0.052336,-0.04744,0.229562,0.858683
happiness,0.355586,1.0,0.816091,0.762118,0.763315,0.652632,0.325894,-0.562221,0.601594,-0.372836,-0.445801,-0.701876,0.7619,-0.754073,-0.19263,-0.208925,-0.212301,-0.116704,0.237327
wealth,0.357803,0.816091,1.0,0.73085,0.816775,0.497857,0.092069,-0.458153,0.298551,-0.279036,-0.552569,-0.872884,0.895549,-0.892065,-0.246917,-0.218126,-0.219779,-0.103476,0.248334
support,0.291602,0.762118,0.73085,1.0,0.646804,0.499487,0.134438,-0.29214,0.488799,-0.486312,-0.501859,-0.691144,0.730155,-0.720806,-0.2171,-0.328746,-0.341011,-0.185146,0.182812
health,0.284972,0.763315,0.816775,0.646804,1.0,0.430558,0.050002,-0.350378,0.280501,-0.19066,-0.607838,-0.853124,0.851832,-0.846876,-0.293222,-0.177807,-0.173161,-0.043193,0.195896
freedom,0.17584,0.652632,0.497857,0.499487,0.430558,1.0,0.396809,-0.548813,0.70462,-0.41133,-0.282297,-0.346318,0.388841,-0.401372,-0.082066,0.008591,0.0012,0.052297,0.148059
generosity,0.144428,0.325894,0.092069,0.134438,0.050002,0.396809,1.0,-0.373938,0.401587,-0.161033,0.060758,0.093614,-0.110418,0.083391,0.101863,0.084913,0.086993,-0.015731,0.070355
corruption,-0.198347,-0.562221,-0.458153,-0.29214,-0.350378,-0.548813,-0.373938,1.0,-0.316435,0.288833,0.159478,0.259364,-0.289108,0.316306,0.137014,0.09996,0.104769,0.118001,-0.078551
positivity,0.164676,0.601594,0.298551,0.488799,0.280501,0.70462,0.401587,-0.316435,1.0,-0.361892,-0.023155,-0.092354,0.170789,-0.175315,0.066444,-0.056769,-0.068174,-0.013906,0.145508
negativity,-0.204697,-0.372836,-0.279036,-0.486312,-0.19066,-0.41133,-0.161033,0.288833,-0.361892,1.0,0.077849,0.231419,-0.259882,0.276438,0.019136,0.122149,0.119779,-0.04099,-0.146909
