In [1]:
# Import libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
from matplotlib import pyplot as plt
from kaggle_olympic_games_medals import KaggleOlympicGamesMedals

In [2]:
# Instantiate the Olympic Games medals helper class, pointing it at the
# directory that holds the raw Kaggle dataset.
# The path is relative, so it resolves against the kernel's working directory.
# NOTE(review): the recorded output of this cell is "Data Loaded", so the
# constructor presumably reads the data files eagerly -- confirm in
# kaggle_olympic_games_medals.
data_dir = '../data/kaggle/olympic-games-medals'
ogm = KaggleOlympicGamesMedals(data_dir)

Data Loaded


In [3]:
# Get the medals dataframe with standardized country names.
# NOTE(review): the standardization rules live inside KaggleOlympicGamesMedals
# and are not visible from this notebook.
df_medals = ogm.get_medals_by_std_country_name()
# Quick schema check: column names, non-null counts and dtypes.
# In the recorded output game_start_date / game_end_date are object dtype
# (not datetime64) and game_year is the only integer column.
df_medals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20124 entries, 0 to 21696
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   discipline_title       20124 non-null  object
 1   slug_game              20124 non-null  object
 2   event_title            20124 non-null  object
 3   event_gender           20124 non-null  object
 4   medal_type             20124 non-null  object
 5   participant_type       20124 non-null  object
 6   country_name           20124 non-null  object
 7   country_3_letter_code  20124 non-null  object
 8   game_end_date          20124 non-null  object
 9   game_start_date        20124 non-null  object
 10  game_location          20124 non-null  object
 11  game_name              20124 non-null  object
 12  game_season            20124 non-null  object
 13  game_year              20124 non-null  int64 
dtypes: int64(1), object(13)
memory usage: 2.3+ MB


In [4]:
# Count medals for every (game, season, country, discipline, event, gender,
# medal type) combination and persist the result as an ETL artifact.

# Game-level descriptive columns that play no part in the grouping.
drop_medal_cols = [
    'country_3_letter_code',
    'game_end_date',
    'game_start_date',
    'game_location',
    'game_name',
    'game_year',
]
# Grouping keys, from coarsest (game) to finest (medal type).
group_medal_cols = [
    'slug_game',
    'game_season',
    'country_name',
    'discipline_title',
    'event_title',
    'event_gender',
    'medal_type',
]
df_medals_slug_season_country = (
    df_medals
    .drop(columns=drop_medal_cols)
    .groupby(group_medal_cols)['participant_type']
    .count()  # number of non-null participant_type rows per group
    .reset_index(name='medal_count')  # keys back to columns, count column named
)
# index=False keeps the positional index out of the CSV.
df_medals_slug_season_country.to_csv('../data/etl/medals_by_slug_season_country_discip_event_type.csv', index=False)
df_medals_slug_season_country.head(10)

Unnamed: 0,slug_game,game_season,country_name,discipline_title,event_title,event_gender,medal_type,medal_count
0,albertville-1992,Winter,Austria,Alpine Skiing,alpine combined women,Women,GOLD,1
1,albertville-1992,Winter,Austria,Alpine Skiing,alpine combined women,Women,SILVER,1
2,albertville-1992,Winter,Austria,Alpine Skiing,downhill men,Men,BRONZE,1
3,albertville-1992,Winter,Austria,Alpine Skiing,downhill men,Men,GOLD,1
4,albertville-1992,Winter,Austria,Alpine Skiing,downhill women,Women,BRONZE,1
5,albertville-1992,Winter,Austria,Alpine Skiing,giant slalom women,Women,SILVER,1
6,albertville-1992,Winter,Austria,Alpine Skiing,slalom men,Men,BRONZE,1
7,albertville-1992,Winter,Austria,Alpine Skiing,slalom women,Women,GOLD,1
8,albertville-1992,Winter,Austria,Bobsleigh,four-man men,Men,GOLD,1
9,albertville-1992,Winter,Austria,Luge,singles men,Men,BRONZE,1


In [5]:
# Get the medal details dataset.
# This reads back the CSV that the medal-count cell above writes (same path,
# written with index=False), so the frame carries the grouping columns plus
# medal_count and a fresh default integer index.
df_medal_details = pd.read_csv('../data/etl/medals_by_slug_season_country_discip_event_type.csv')
# Preview the first five rows.
df_medal_details.head()

Unnamed: 0,slug_game,game_season,country_name,discipline_title,event_title,event_gender,medal_type,medal_count
0,albertville-1992,Winter,Austria,Alpine Skiing,alpine combined women,Women,GOLD,1
1,albertville-1992,Winter,Austria,Alpine Skiing,alpine combined women,Women,SILVER,1
2,albertville-1992,Winter,Austria,Alpine Skiing,downhill men,Men,BRONZE,1
3,albertville-1992,Winter,Austria,Alpine Skiing,downhill men,Men,GOLD,1
4,albertville-1992,Winter,Austria,Alpine Skiing,downhill women,Women,BRONZE,1


In [6]:
# Reshape the long medal-count table to wide form: one row per
# (game, season, country, discipline, event, gender) and one column per
# medal type (BRONZE / GOLD / SILVER). Where a row has no medal of a given
# type the cell is NaN, which is why the counts show up as floats.
df_pivot = df_medal_details.pivot(
    index=[
        'slug_game',
        'game_season',
        'country_name',
        'discipline_title',
        'event_title',
        'event_gender',
    ],
    columns='medal_type',
    values='medal_count',
)
df_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,medal_type,BRONZE,GOLD,SILVER
slug_game,game_season,country_name,discipline_title,event_title,event_gender,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
albertville-1992,Winter,Austria,Alpine Skiing,alpine combined women,Women,,1.0,1.0
albertville-1992,Winter,Austria,Alpine Skiing,downhill men,Men,1.0,1.0,
albertville-1992,Winter,Austria,Alpine Skiing,downhill women,Women,1.0,,
albertville-1992,Winter,Austria,Alpine Skiing,giant slalom women,Women,,,1.0
albertville-1992,Winter,Austria,Alpine Skiing,slalom men,Men,1.0,,
...,...,...,...,...,...,...,...,...
vancouver-2010,Winter,United States,Snowboard,half-pipe women,Women,1.0,,1.0
vancouver-2010,Winter,United States,Snowboard,snowboard cross men,Men,,1.0,
vancouver-2010,Winter,United States,Speed skating,1000m men,Men,1.0,1.0,
vancouver-2010,Winter,United States,Speed skating,1500m men,Men,,,1.0


In [7]:
# Flatten the pivoted medal table into ordinary columns, lower-case the medal
# column names, fix the column order and add a per-row medal total.
#
# Written as a single non-mutating chain: the frame produced (and displayed)
# by the pivot cell is left untouched instead of being altered by an in-place
# reset_index, so the earlier cell's output never goes stale.
df_pivot = (
    df_pivot
    # Move the index levels back into regular columns.
    .reset_index()
    .rename(columns={'GOLD': 'gold', 'SILVER': 'silver', 'BRONZE': 'bronze'})
    # Keep only the reporting columns, in presentation order.
    .loc[:, [
        'slug_game', 'country_name', 'game_season',
        'discipline_title', 'event_title', 'event_gender',
        'gold', 'silver', 'bronze',
    ]]
    # Row-wise sum; NaN (no medal of that type) is skipped, i.e. counts as 0.
    .assign(total_medals=lambda frame: frame[['gold', 'silver', 'bronze']].sum(axis=1))
)
df_pivot

medal_type,slug_game,country_name,game_season,discipline_title,event_title,event_gender,gold,silver,bronze,total_medals
0,albertville-1992,Austria,Winter,Alpine Skiing,alpine combined women,Women,1.0,1.0,,2.0
1,albertville-1992,Austria,Winter,Alpine Skiing,downhill men,Men,1.0,,1.0,2.0
2,albertville-1992,Austria,Winter,Alpine Skiing,downhill women,Women,,,1.0,1.0
3,albertville-1992,Austria,Winter,Alpine Skiing,giant slalom women,Women,,1.0,,1.0
4,albertville-1992,Austria,Winter,Alpine Skiing,slalom men,Men,,,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
17738,vancouver-2010,United States,Winter,Snowboard,half-pipe women,Women,,1.0,1.0,2.0
17739,vancouver-2010,United States,Winter,Snowboard,snowboard cross men,Men,1.0,,,1.0
17740,vancouver-2010,United States,Winter,Speed skating,1000m men,Men,1.0,,1.0,2.0
17741,vancouver-2010,United States,Winter,Speed skating,1500m men,Men,,1.0,,1.0


In [8]:
# Collapse the per-game rows into all-time totals: drop the game identifier,
# then sum the medal columns for every
# (country, season, discipline, event, gender) combination.
# The group-wise sum skips NaN, so medal types never won show up as 0.0.
df_pivot_sum = (
    df_pivot
    .drop(columns=['slug_game'])
    .groupby(['country_name', 'game_season', 'discipline_title', 'event_title', 'event_gender'])
    .sum()
    .reset_index()
)
# Persist the aggregate; index=False keeps the positional index out of the CSV.
df_pivot_sum.to_csv('../data/etl/medals_by_type_country_season_discip_event_gender.csv', index=False)
df_pivot_sum

medal_type,country_name,game_season,discipline_title,event_title,event_gender,gold,silver,bronze,total_medals
0,Afghanistan,Summer,Taekwondo,58 - 68 kg men,Men,0.0,0.0,1.0,1.0
1,Afghanistan,Summer,Taekwondo,beijing 2008 taekwondo - 58 kg men,Men,0.0,0.0,1.0,1.0
2,Algeria,Summer,Athletics,1500m men,Men,2.0,1.0,0.0,3.0
3,Algeria,Summer,Athletics,1500m women,Women,2.0,0.0,0.0,2.0
4,Algeria,Summer,Athletics,5000m men,Men,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
8585,Zimbabwe,Summer,Hockey,hockey women,Women,1.0,0.0,0.0,1.0
8586,Zimbabwe,Summer,Swimming,100m backstroke women,Women,0.0,2.0,0.0,2.0
8587,Zimbabwe,Summer,Swimming,200m backstroke women,Women,2.0,0.0,0.0,2.0
8588,Zimbabwe,Summer,Swimming,200m individual medley women,Women,0.0,1.0,1.0,2.0
