In [65]:
# Import libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
from matplotlib import pyplot as plt

In [66]:
# Get the medal details dataset
df_medal_details = pd.read_csv('../data/etl/medals_by_season_country_discip_event_type.csv')
df_medal_details.head()

Unnamed: 0,game_season,country_code,country_name,discipline_title,event_title,event_gender,medal_type,medal_count
0,Summer,AFG,Afghanistan,Taekwondo,58 - 68 kg men,Men,BRONZE,1
1,Summer,AFG,Afghanistan,Taekwondo,beijing 2008 taekwondo - 58 kg men,Men,BRONZE,1
2,Summer,AHO,Netherlands Antilles,Sailing,division ii - windsurfer men,Open,SILVER,1
3,Summer,ALG,Algeria,Athletics,1500m men,Men,GOLD,2
4,Summer,ALG,Algeria,Athletics,1500m men,Men,SILVER,1


In [67]:
# Get all of the other datasets in the etl folder
df_hapiness = pd.read_csv('../data/etl/happiness_avg_by_country.csv')
df_hapiness.loc[df_hapiness['country'] == 'United Kingdom', 'country'] = 'Great Britain'
df_gdp = pd.read_csv('../data/etl/gdp_avg_by_country.csv')
df_gdp.loc[df_gdp['country_name'] == 'Russian Federation', 'country_name'] = 'Russia'
df_gdp.loc[df_gdp['country_name'] == 'United Kingdom', 'country_name'] = 'Great Britain'
df_nutrition = pd.read_csv('../data/etl/nutrition_2017_by_country.csv')
df_nutrition.loc[df_nutrition['country_name'] == 'Russian Federation', 'country_name'] = 'Russia'
df_nutrition.loc[df_nutrition['country_name'] == 'United Kingdom', 'country_name'] = 'Great Britain'
df_nutrition['population'] = df_nutrition['population'].apply(lambda x: x/1000)
df_nutrition['good_diet_pct'] = df_nutrition['diet_pct'].apply(lambda x: 100-x if x > 0 else 0)
df_nutrition.rename(columns={'diet_pct': 'bad_diet_pct'}, inplace=True)
df_nutrition.head()


Unnamed: 0,country_code,country_name,calories_pct,nutrients_pct,bad_diet_pct,calories_mills,nutrients_mills,diet_mills,population,good_diet_pct
0,ABW,Aruba,,,,,,,105.361,0.0
1,AGO,Angola,57.2,87.1,92.9,17.0,26.0,27.7,29816.769,7.1
2,AIA,Anguilla,,,,,,,,0.0
3,ALB,Albania,0.0,13.0,37.8,0.0,0.4,1.1,2873.457,62.2
4,ARE,United Arab Emirates,0.0,0.0,0.0,0.0,0.0,0.0,9487.206,0.0


In [68]:
# Unwanted columns - Simplify the dataset
unwanted_cols = [
    'support',
    'health',
    'freedom',
    'generosity',
    'corruption',
    'positivity',
    'negativity',
    'calories_pct',
    'nutrients_pct',
    'calories_mills',
    'nutrients_mills',
    'diet_mills'
]

In [69]:
# Function to merge a medals slice with the other datasets
def merge_hap_gdp_nut(df_medal_slice: pd.DataFrame) -> pd.DataFrame:
    df_merged = pd.merge(df_medal_slice, df_hapiness, left_on='country_name', right_on='country', how='left')
    df_merged = pd.merge(df_merged, df_gdp, on='country_name', how='left')
    df_merged = pd.merge(df_merged, df_nutrition, on='country_name', how='left')
    df_merged = df_merged.drop(columns=['country', 'country_code']).reset_index(drop=True)
    df_merged.drop(unwanted_cols, axis=1, inplace=True)
    return df_merged

In [70]:
# Function to slice the medal details dataset and prepare it for correlation
def slice_medal_details_data(data_to_select: dict[str, str]) -> pd.DataFrame:
    drop_medal_cols = ['game_season', 'country_code', 'discipline_title', 'event_title', 'event_gender', 'medal_type']
    group_medal_cols = ['country_name']

    df_slice = df_medal_details.copy()
    for key, value in data_to_select.items():
        df_slice = df_slice[df_slice[key] == value]

    df_slice = df_slice.drop(columns=drop_medal_cols)
    df_slice = df_slice.groupby(group_medal_cols)['medal_count'].sum().reset_index()
    return df_slice.reset_index(drop=True)

In [71]:
def box_plot_medals(df: pd.DataFrame, title: str) -> None:
    medals = df['medal_count']
    fig1, ax1 = plt.subplots()
    ax1.set_title(title)
    ax1.set_ylabel('Medal Count')
    ax1.boxplot(medals)
    plt.show()

In [72]:
# Women's summer games correlation
data_to_select = {
    'game_season': 'Summer', 
    'event_gender': 'Women'
}
df = slice_medal_details_data(data_to_select)
df = merge_hap_gdp_nut(df)
df.info()
df.set_index('country_name').corr()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   country_name   103 non-null    object 
 1   medal_count    103 non-null    int64  
 2   happiness      85 non-null     float64
 3   wealth         84 non-null     float64
 4   gdp_avg        84 non-null     float64
 5   bad_diet_pct   72 non-null     float64
 6   population     76 non-null     float64
 7   good_diet_pct  76 non-null     float64
dtypes: float64(6), int64(1), object(1)
memory usage: 6.6+ KB


Unnamed: 0,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
medal_count,1.0,0.234169,0.292613,0.831738,-0.269807,0.313934,0.299061
happiness,0.234169,1.0,0.80293,0.208903,-0.729592,-0.178541,0.44892
wealth,0.292613,0.80293,1.0,0.228058,-0.890775,-0.159272,0.542274
gdp_avg,0.831738,0.208903,0.228058,1.0,-0.197154,0.409605,0.227722
bad_diet_pct,-0.269807,-0.729592,-0.890775,-0.197154,1.0,0.15974,-0.804307
population,0.313934,-0.178541,-0.159272,0.409605,0.15974,1.0,-0.058454
good_diet_pct,0.299061,0.44892,0.542274,0.227722,-0.804307,-0.058454,1.0


In [73]:
# Top and Bottom Womens Summer Games
df.reset_index(drop=True, inplace=True)
df = df.sort_values(by='medal_count', ascending=False).reset_index(drop=True)
display(df.head())
display(df.tail())

Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
0,United States,712,7.029156,10.976059,8536365.96,2.0,325122.128,98.0
1,Russia,473,5.564873,10.148258,1062108.61,4.0,144496.739,96.0
2,Germany,451,6.82241,10.824794,1857701.61,0.2,82657.002,99.8
3,China,356,5.160455,9.340597,3291242.38,14.3,1396215.0,85.7
4,Australia,216,7.242307,10.764977,521918.13,0.7,24601.86,99.3


Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
98,Chile,1,6.357284,10.05155,94857.75,3.4,18470.435,96.6
99,Sri Lanka,1,4.263359,9.297134,25338.77,52.3,21444.0,47.7
100,Iran,1,4.875571,9.590507,,,,
101,Peru,1,5.601965,9.313264,102416.78,23.7,31444.299,76.3
102,Latvia,1,5.569637,10.204665,23121.8,3.4,1942.248,96.6


In [74]:
# Men's summer games correlation
data_to_select = {
    'game_season': 'Summer', 
    'event_gender': 'Men'
}
df = slice_medal_details_data(data_to_select)
df = merge_hap_gdp_nut(df)
df.info()
df.set_index('country_name').corr()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   country_name   136 non-null    object 
 1   medal_count    136 non-null    int64  
 2   happiness      110 non-null    float64
 3   wealth         109 non-null    float64
 4   gdp_avg        112 non-null    float64
 5   bad_diet_pct   93 non-null     float64
 6   population     102 non-null    float64
 7   good_diet_pct  102 non-null    float64
dtypes: float64(6), int64(1), object(1)
memory usage: 8.6+ KB


Unnamed: 0,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
medal_count,1.0,0.302195,0.323346,0.838378,-0.318831,0.180862,0.335593
happiness,0.302195,1.0,0.825564,0.220894,-0.756928,-0.12513,0.445868
wealth,0.323346,0.825564,1.0,0.219088,-0.881313,-0.123776,0.493703
gdp_avg,0.838378,0.220894,0.219088,1.0,-0.210609,0.42153,0.233127
bad_diet_pct,-0.318831,-0.756928,-0.881313,-0.210609,1.0,0.1216,-0.805067
population,0.180862,-0.12513,-0.123776,0.42153,0.1216,1.0,-0.019485
good_diet_pct,0.335593,0.445868,0.493703,0.233127,-0.805067,-0.019485,1.0


In [75]:
# Top and Bottom Mens Summer Games
df.reset_index(drop=True, inplace=True)
df = df.sort_values(by='medal_count', ascending=False).reset_index(drop=True)
display(df.head())
display(df.tail())

Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
0,United States,1793,7.029156,10.976059,8536365.96,2.0,325122.128,98.0
1,Russia,992,5.564873,10.148258,1062108.61,4.0,144496.739,96.0
2,Germany,793,6.82241,10.824794,1857701.61,0.2,82657.002,99.8
3,Great Britain,617,6.889592,10.695583,1345386.44,0.5,66058.859,99.5
4,France,554,6.659597,10.681801,1332030.59,0.1,66918.02,99.9


Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
131,Iraq,1,4.774596,9.109163,70166.73,53.3,37552.789,46.7
132,Tonga,1,,,231.03,,,
133,Togo,1,3.719667,7.550613,2674.59,,7698.476,0.0
134,Mauritius,1,5.849597,9.976086,4751.99,14.8,1264.613,85.2
135,Sudan,1,4.378741,8.475512,21197.29,86.3,40813.398,13.7


In [78]:
# Women's Winter games correlation
data_to_select = {
    'game_season': 'Winter', 
    'event_gender': 'Women'
}
df = slice_medal_details_data(data_to_select)
df = merge_hap_gdp_nut(df)
df.info()
df.set_index('country_name').corr()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   country_name   37 non-null     object 
 1   medal_count    37 non-null     int64  
 2   happiness      31 non-null     float64
 3   wealth         31 non-null     float64
 4   gdp_avg        30 non-null     float64
 5   bad_diet_pct   27 non-null     float64
 6   population     28 non-null     float64
 7   good_diet_pct  28 non-null     float64
dtypes: float64(6), int64(1), object(1)
memory usage: 2.4+ KB


Unnamed: 0,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
medal_count,1.0,0.293902,0.368509,0.49753,-0.104417,0.141647,0.095559
happiness,0.293902,1.0,0.792075,0.025073,-0.707371,-0.34649,-0.177127
wealth,0.368509,0.792075,1.0,0.129109,-0.719835,-0.528144,-0.124514
gdp_avg,0.49753,0.025073,0.129109,1.0,0.185989,0.490593,0.103407
bad_diet_pct,-0.104417,-0.707371,-0.719835,0.185989,1.0,0.692564,-0.062678
population,0.141647,-0.34649,-0.528144,0.490593,0.692564,1.0,-0.000965
good_diet_pct,0.095559,-0.177127,-0.124514,0.103407,-0.062678,-0.000965,1.0


In [79]:
# Top and Bottom Woemns Winter Games
df.reset_index(drop=True, inplace=True)
df = df.sort_values(by='medal_count', ascending=False).reset_index(drop=True)
display(df.head())
display(df.tail())

Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
0,Germany,201,6.82241,10.824794,1857701.61,0.2,82657.002,99.8
1,United States,139,7.029156,10.976059,8536365.96,2.0,325122.128,98.0
2,Russia,126,5.564873,10.148258,1062108.61,4.0,144496.739,96.0
3,Canada,100,7.296854,10.756211,755211.83,0.7,36545.295,99.3
4,Norway,79,7.463886,11.074348,186487.61,0.5,5276.968,99.5


Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
32,Spain,2,6.494954,10.547632,623171.73,1.9,46593.236,98.1
33,Denmark,1,7.664026,10.896594,161727.08,0.2,5764.98,99.8
34,Belgium,1,6.966089,10.812809,242886.71,0.3,11375.158,99.7
35,Uzbekistan,1,5.865984,8.719908,36442.72,,,
36,Yugoslavia,1,,,,,,


In [80]:
# Men's Winter games correlation
data_to_select = {
    'game_season': 'Winter', 
    'event_gender': 'Men'
}
df = slice_medal_details_data(data_to_select)
df = merge_hap_gdp_nut(df)
df.info()
df.set_index('country_name').corr()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   country_name   38 non-null     object 
 1   medal_count    38 non-null     int64  
 2   happiness      33 non-null     float64
 3   wealth         33 non-null     float64
 4   gdp_avg        32 non-null     float64
 5   bad_diet_pct   30 non-null     float64
 6   population     31 non-null     float64
 7   good_diet_pct  31 non-null     float64
dtypes: float64(6), int64(1), object(1)
memory usage: 2.5+ KB


Unnamed: 0,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
medal_count,1.0,0.50056,0.441564,0.298308,-0.280353,-4.9e-05,0.058399
happiness,0.50056,1.0,0.856339,0.069879,-0.682786,-0.285016,-0.195809
wealth,0.441564,0.856339,1.0,0.081644,-0.674269,-0.453117,-0.096583
gdp_avg,0.298308,0.069879,0.081644,1.0,0.108115,0.498329,0.094008
bad_diet_pct,-0.280353,-0.682786,-0.674269,0.108115,1.0,0.573609,-0.081156
population,-4.9e-05,-0.285016,-0.453117,0.498329,0.573609,1.0,-0.004963
good_diet_pct,0.058399,-0.195809,-0.096583,0.094008,-0.081156,-0.004963,1.0


In [81]:
# Top and Bottom Mens Winter Games
df.reset_index(drop=True, inplace=True)
df = df.sort_values(by='medal_count', ascending=False).reset_index(drop=True)
display(df.head())
display(df.tail())

Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
0,Norway,318,7.463886,11.074348,186487.61,0.5,5276.968,99.5
1,Germany,189,6.82241,10.824794,1857701.61,0.2,82657.002,99.8
2,Russia,173,5.564873,10.148258,1062108.61,4.0,144496.739,96.0
3,United States,167,7.029156,10.976059,8536365.96,2.0,325122.128,98.0
4,Austria,153,7.178953,10.889285,197169.69,0.6,8797.566,99.4


Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
33,New Zealand,2,7.2607,10.617803,76140.29,,4813.6,0.0
34,Luxembourg,2,7.058734,11.643438,25419.95,0.4,596.336,99.6
35,Ukraine,2,4.785123,9.421713,102028.93,,,
36,Romania,1,5.77022,10.115109,126545.07,11.9,19588.715,88.1
37,Bulgaria,1,4.75198,9.96113,35563.0,11.3,7075.947,88.7


In [85]:
# Alpine Skiing correlation
data_to_select = {
    'discipline_title': 'Alpine Skiing'
}
df = slice_medal_details_data(data_to_select)
df = merge_hap_gdp_nut(df)
df.set_index('country_name').corr()

Unnamed: 0,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
medal_count,1.0,0.305798,0.331354,0.135036,-0.228137,0.067566,-0.12019
happiness,0.305798,1.0,0.718135,-0.073963,-0.749038,-0.254792,-0.264539
wealth,0.331354,0.718135,1.0,0.068029,-0.627043,-0.099482,-0.093282
gdp_avg,0.135036,-0.073963,0.068029,1.0,0.10461,0.952866,0.161322
bad_diet_pct,-0.228137,-0.749038,-0.627043,0.10461,1.0,0.232156,0.108659
population,0.067566,-0.254792,-0.099482,0.952866,0.232156,1.0,0.187215
good_diet_pct,-0.12019,-0.264539,-0.093282,0.161322,0.108659,0.187215,1.0


In [86]:
# Top and Bottom Alpine Skiing
df.reset_index(drop=True, inplace=True)
df = df.sort_values(by='medal_count', ascending=False).reset_index(drop=True)
display(df.head())
display(df.tail())

Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
0,Austria,128,7.178953,10.889285,197169.69,0.6,8797.566,99.4
1,Switzerland,74,7.435587,11.125057,305620.08,0.0,8451.84,0.0
2,France,51,6.659597,10.681801,1332030.59,0.1,66918.02,99.9
3,United States,48,7.029156,10.976059,8536365.96,2.0,325122.128,98.0
4,Germany,41,6.82241,10.824794,1857701.61,0.2,82657.002,99.8


Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
17,Slovakia,1,6.102455,10.271837,,,,
18,New Zealand,1,7.2607,10.617803,76140.29,,4813.6,0.0
19,Japan,1,6.023527,10.593319,2938993.71,2.5,126785.797,97.5
20,Finland,1,7.624132,10.761079,125374.08,0.1,5508.214,99.9
21,Australia,1,7.242307,10.764977,521918.13,0.7,24601.86,99.3


In [88]:
# Athletics correlation
data_to_select = {
    'discipline_title': 'Athletics'
}
df = slice_medal_details_data(data_to_select)
df = merge_hap_gdp_nut(df)
df.set_index('country_name').corr()

Unnamed: 0,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
medal_count,1.0,0.22575,0.21394,0.853669,-0.190929,0.136839,0.225912
happiness,0.22575,1.0,0.828379,0.216188,-0.790918,-0.142476,0.474491
wealth,0.21394,0.828379,1.0,0.220412,-0.903651,-0.124657,0.560182
gdp_avg,0.853669,0.216188,0.220412,1.0,-0.226407,0.414933,0.248202
bad_diet_pct,-0.190929,-0.790918,-0.903651,-0.226407,1.0,0.103415,-0.878356
population,0.136839,-0.142476,-0.124657,0.414933,0.103415,1.0,-0.021081
good_diet_pct,0.225912,0.474491,0.560182,0.248202,-0.878356,-0.021081,1.0


In [89]:
# Top and Bottom Athletics
df.reset_index(drop=True, inplace=True)
df = df.sort_values(by='medal_count', ascending=False).reset_index(drop=True)
display(df.head())
display(df.tail())

Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
0,United States,821,7.029156,10.976059,8536365.96,2.0,325122.128,98.0
1,Germany,265,6.82241,10.824794,1857701.61,0.2,82657.002,99.8
2,Russia,254,5.564873,10.148258,1062108.61,4.0,144496.739,96.0
3,Great Britain,211,6.889592,10.695583,1345386.44,0.5,66058.859,99.5
4,Finland,114,7.624132,10.761079,125374.08,0.1,5508.214,99.9


Unnamed: 0,country_name,medal_count,happiness,wealth,gdp_avg,bad_diet_pct,population,good_diet_pct
92,Côte d'Ivoire,1,,,,,,
93,Djibouti,1,4.822565,8.053561,1313.37,64.6,944.1,35.4
94,Eritrea,1,,,977.77,,,
95,Iran,1,4.875571,9.590507,,,,
96,Zambia,1,4.365957,8.058288,8339.74,87.6,16853.608,12.4
