In [1]:
# Import libraries
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization
from matplotlib import pyplot as plt

In [2]:
# Load the medal-type details dataset produced by the ETL step and preview it
medal_details_csv = '../data/etl/medals_by_type_country_season_discip_event_gender.csv'
df_medal_type_details = pd.read_csv(medal_details_csv)
df_medal_type_details.head()

Unnamed: 0,country_name,game_season,discipline_title,event_title,event_gender,gold,silver,bronze,total_medals
0,Afghanistan,Summer,Taekwondo,58 - 68 kg men,Men,0.0,0.0,1.0,1.0
1,Afghanistan,Summer,Taekwondo,beijing 2008 taekwondo - 58 kg men,Men,0.0,0.0,1.0,1.0
2,Algeria,Summer,Athletics,1500m men,Men,2.0,1.0,0.0,3.0
3,Algeria,Summer,Athletics,1500m women,Women,2.0,0.0,0.0,2.0
4,Algeria,Summer,Athletics,5000m men,Men,0.0,1.0,0.0,1.0


In [7]:
# Summarize total_medals per (country_name, game_season, event_gender),
# ordered from the fewest to the most medals.
# total_medals is selected explicitly (instead of dropping a fixed list of
# other columns) so that only this column is ever aggregated.
df_medal_summary = (
    df_medal_type_details
    .groupby(['country_name', 'game_season', 'event_gender'], as_index=False)['total_medals']
    .sum()
    .sort_values(by='total_medals', ascending=True)
    .reset_index(drop=True)
)
df_medal_summary

Unnamed: 0,country_name,game_season,event_gender,total_medals
0,Peru,Summer,Women,1.0
1,Canada,Winter,Open,1.0
2,Russia,Winter,Open,1.0
3,Samoa,Summer,Women,1.0
4,San Marino,Summer,Men,1.0
...,...,...,...,...
413,Great Britain,Summer,Men,617.0
414,United States,Summer,Women,712.0
415,Germany,Summer,Men,793.0
416,Russia,Summer,Men,992.0


In [8]:
# Function to slice the medal summary dataset
def slice_medal_summary_data(
    data_to_select: dict[str, str],
    max_num_medals: int = 1,
    df_summary: "pd.DataFrame | None" = None,
) -> pd.DataFrame:
    """Filter a medal summary and keep the groups with at most ``max_num_medals``.

    Parameters
    ----------
    data_to_select : dict[str, str]
        Mapping of column name -> required value. A row is kept only if it
        matches every pair. An empty mapping applies no filter.
    max_num_medals : int, default 1
        Inclusive upper bound on ``total_medals`` for a group to be returned.
    df_summary : pandas.DataFrame, optional
        Frame to slice. It must contain the columns ``country_name``,
        ``game_season``, ``event_gender`` and ``total_medals``. When omitted,
        the notebook-level ``df_medal_summary`` is used, so existing calls
        keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per (country_name, game_season, event_gender)
        whose summed ``total_medals`` is <= ``max_num_medals``, sorted by
        ``country_name`` with a fresh 0..n-1 index. The input is not modified.

    Raises
    ------
    KeyError
        If a key of ``data_to_select`` is not a column of the frame.
    """
    group_medal_cols = ['country_name', 'game_season', 'event_gender']
    # Fall back to the notebook-level summary only when no frame is passed in.
    df_slice = df_medal_summary if df_summary is None else df_summary

    # Fail early with a message naming the offending filter columns.
    unknown_cols = [key for key in data_to_select if key not in df_slice.columns]
    if unknown_cols:
        raise KeyError(f"Unknown column(s) in data_to_select: {unknown_cols}")

    # Boolean indexing returns new frames, so the source frame is never mutated.
    for key, value in data_to_select.items():
        df_slice = df_slice[df_slice[key] == value]

    df_slice = df_slice.groupby(group_medal_cols).sum().reset_index()
    # Keep only groups with at most max_num_medals medals (inclusive bound)
    df_slice = df_slice[df_slice['total_medals'] <= max_num_medals]
    # Non-inplace sort: sorting a filtered slice in place can trigger
    # pandas' SettingWithCopyWarning.
    return df_slice.sort_values(by='country_name', ascending=True).reset_index(drop=True)

In [12]:
# Summer Games, women's events: countries holding a single medal across all games
data_to_select = dict(game_season='Summer', event_gender='Women')
df = slice_medal_summary_data(data_to_select, max_num_medals=1)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country_name  21 non-null     object 
 1   game_season   21 non-null     object 
 2   event_gender  21 non-null     object 
 3   total_medals  21 non-null     float64
dtypes: float64(1), object(3)
memory usage: 804.0+ bytes


Unnamed: 0,country_name,game_season,event_gender,total_medals
0,Bermuda,Summer,Women,1.0
1,Bohemia,Summer,Women,1.0
2,Burundi,Summer,Women,1.0
3,Chile,Summer,Women,1.0
4,Fiji,Summer,Women,1.0
5,Georgia,Summer,Women,1.0
6,Iceland,Summer,Women,1.0
7,Iran,Summer,Women,1.0
8,Latvia,Summer,Women,1.0
9,Montenegro,Summer,Women,1.0


In [13]:
# Winter Games, women's events: countries holding a single medal across all games
data_to_select = dict(game_season='Winter', event_gender='Women')
df = slice_medal_summary_data(data_to_select, max_num_medals=1)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country_name  4 non-null      object 
 1   game_season   4 non-null      object 
 2   event_gender  4 non-null      object 
 3   total_medals  4 non-null      float64
dtypes: float64(1), object(3)
memory usage: 260.0+ bytes


Unnamed: 0,country_name,game_season,event_gender,total_medals
0,Belgium,Winter,Women,1.0
1,Denmark,Winter,Women,1.0
2,Uzbekistan,Winter,Women,1.0
3,Yugoslavia,Winter,Women,1.0


In [14]:
# Summer Games, men's events: countries holding a single medal across all games
data_to_select = dict(game_season='Summer', event_gender='Men')
df = slice_medal_summary_data(data_to_select, max_num_medals=1)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country_name  20 non-null     object 
 1   game_season   20 non-null     object 
 2   event_gender  20 non-null     object 
 3   total_medals  20 non-null     float64
dtypes: float64(1), object(3)
memory usage: 772.0+ bytes


Unnamed: 0,country_name,game_season,event_gender,total_medals
0,Barbados,Summer,Men,1.0
1,Bermuda,Summer,Men,1.0
2,Burkina Faso,Summer,Men,1.0
3,Burundi,Summer,Men,1.0
4,Cyprus,Summer,Men,1.0
5,Djibouti,Summer,Men,1.0
6,Eritrea,Summer,Men,1.0
7,Gabon,Summer,Men,1.0
8,Guatemala,Summer,Men,1.0
9,Guyana,Summer,Men,1.0


In [15]:
# Winter Games, men's events: countries holding a single medal across all games
data_to_select = dict(game_season='Winter', event_gender='Men')
df = slice_medal_summary_data(data_to_select, max_num_medals=1)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country_name  2 non-null      object 
 1   game_season   2 non-null      object 
 2   event_gender  2 non-null      object 
 3   total_medals  2 non-null      float64
dtypes: float64(1), object(3)
memory usage: 196.0+ bytes


Unnamed: 0,country_name,game_season,event_gender,total_medals
0,Bulgaria,Winter,Men,1.0
1,Romania,Winter,Men,1.0
