In [2]:
import pandas as pd
import numpy as np
import plotly_express as px
import hashlib

# Load datasets
data_athletes = pd.read_csv('Data/athlete_events.csv')
data_noc = pd.read_csv('Data/noc_regions.csv')

### Which countries are included in the dataset?

In [3]:
# Extract unique region names
unique_regions = data_noc['region'].unique()
exclusions_region = ['Individual Olympic Athletes', 'NA'] # Exclusion list for regions

# TUV was described as 'NA' in the region column but had a name in the notes column
if 'Tuvalu' not in unique_regions:
    # Manually add 'Tuvalu' to the list of unique regions
    unique_regions = list(unique_regions) + ['Tuvalu']

# Print the list of unique region names
for region in unique_regions:
    if region not in exclusions_region:
        print(region)

Afghanistan
Curacao
Albania
Algeria
Andorra
Angola
Antigua
Australia
Argentina
Armenia
Aruba
American Samoa
Austria
Azerbaijan
Bahamas
Bangladesh
Barbados
Burundi
Belgium
Benin
Bermuda
Bhutan
Bosnia and Herzegovina
Belize
Belarus
Czech Republic
Boliva
Botswana
Brazil
Bahrain
Brunei
Bulgaria
Burkina Faso
Central African Republic
Cambodia
Canada
Cayman Islands
Republic of Congo
Chad
Chile
China
Ivory Coast
Cameroon
Democratic Republic of the Congo
Cook Islands
Colombia
Comoros
Cape Verde
Costa Rica
Croatia
Greece
Cuba
Cyprus
Denmark
Djibouti
Dominica
Dominican Republic
Ecuador
Egypt
Eritrea
El Salvador
Spain
Estonia
Ethiopia
Russia
Fiji
Finland
France
Germany
Micronesia
Gabon
Gambia
UK
Guinea-Bissau
Georgia
Equatorial Guinea
Ghana
Grenada
Guatemala
Guinea
Guam
Guyana
Haiti
Honduras
Hungary
Indonesia
India
Iran
Ireland
Iraq
Iceland
Israel
Virgin Islands, US
Italy
Virgin Islands, British
Jamaica
Jordan
Japan
Kazakhstan
Kenya
Kyrgyzstan
Kiribati
South Korea
Kosovo
Saudi Arabia
Kuwait
Laos
L

### How many countries are included in the dataset?

In [4]:
# Filter out excluded regions and count
filtered_regions = [
    region for region in unique_regions
        if region not in exclusions_region]

number_of_unique_regions = len(filtered_regions)

print(f"Total number of unique regions: {number_of_unique_regions}")

Total number of unique regions: 207


### Which sports are in the event?

In [None]:
sports_list = data_athletes['Sport'].unique()

print("Types of sports:")
for sport in sports_list:
    print(sport)

### What types of medals are won?

In [None]:
medal_types = data_athletes['Medal'].unique()

# Removes the rows where no medals has been won
medal_types_filtered = np.delete(medal_types, 0)

print("Types of medals:")
for medal in medal_types_filtered:
    print(medal)

### Sex distribiution chart

In [None]:
# Remove duplicate entries based on 'ID' to ensure each athlete is counted only once.
data_unique_athletes = data_athletes.drop_duplicates(subset=['ID'])

# Count the number of male and female participants
sex_distribution = data_unique_athletes['Sex'].value_counts()

# Creating a pie chart
sex_distribution_piechart = px.pie(sex_distribution, 
            # The lambda function in the names argument is used to map "F" to "Female" and "M" to "Male".
             names=sex_distribution.index.map(lambda x: 'Female' if x == 'F' else 'Male'),
             values=sex_distribution.values,
             title='Sex distribution of all athletes',
             labels={'names' : 'Sex', "values" : 'Amount'})

sex_distribution_piechart.write_html(("../Projekt_OS_Australien/Visualisering/Sex_distribution_piechart.html"))

### Top 10 countries based on total medals won

In [None]:
# Creates a new dataframe grouping that only shows the columns "Team" and "Total Medals"
# Value_counts Counts the values for each row
# Unstack transforms the groupby into a new dataframe
# fillna changes the missing data to having a value of 0 so it wont effect the counting
# Sum counts everthing in the first axis which is "Total Medals" after we used reset_index to both reset the index and change the name of the second column.
country_medals = data_athletes.groupby("NOC")["Medal"].value_counts().unstack().fillna(0).sum(axis=1).reset_index(name="Total Medals")

top_ten_countries = country_medals.sort_values(by="Total Medals", ascending=False).head(10)

top_ten_countries_diagram = px.bar(
    top_ten_countries,
    x="NOC", y="Total Medals",
    title="Top 10 countries based on total medals won:",
    color="NOC",
    labels={"NOC" : "Countries", "Total Medals" : "Medals"}
)

top_ten_countries_diagram.write_html("../Projekt_OS_Australien/Visualisering/Top_ten_countries_medals.html")

### Age Statistics

In [32]:
# Create dataframe grouping with athletes age, sex and medal
# Dropping rows without age data and filling NaN in Medals with "No medal"
# Showing age statistics overall and by sex
athlete_age_medal = data_athletes[["Sex", "Age", "Medal"]].dropna(subset=["Age"]).fillna({"Medal": "No medal"})

age_stats = athlete_age_medal["Age"]
age_stats_by_sex = athlete_age_medal.groupby("Sex")["Age"]

In [33]:
age_stats.describe()

count    261642.000000
mean         25.556898
std           6.393561
min          10.000000
25%          21.000000
50%          24.000000
75%          28.000000
max          97.000000
Name: Age, dtype: float64

In [34]:
age_stats_by_sex.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,74098.0,23.732881,5.795252,11.0,20.0,23.0,27.0,74.0
M,187544.0,26.277562,6.474972,10.0,22.0,25.0,29.0,97.0


### Medals by age group

In [51]:
# Defining age bins
# Creating new column "Age Group" based on the age bins
# Group by "Sex", "Age Group" and "Medals" and count the medal occurrences
# Sort the columns, gold first, no medal last.
age_bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]
athlete_age_medal["Age Group"] = pd.cut(athlete_age_medal["Age"], bins=age_bins, labels=[f'{i}-{i+9}' for i in age_bins[:-1]])
age_group_medals = athlete_age_medal.groupby(["Sex", "Age Group", "Medal"], observed=False).size().unstack(fill_value=0)
age_group_medals = age_group_medals[["Gold", "Silver", "Bronze"]]

In [52]:
age_group_medals

Unnamed: 0_level_0,Medal,Gold,Silver,Bronze
Sex,Age Group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,10-19,852,837,787
F,20-29,2469,2458,2531
F,30-39,398,401,407
F,40-49,22,28,32
F,50-59,2,5,3
F,60-69,1,0,3
F,70-79,0,0,0
F,80-89,0,0,0
M,10-19,985,936,1014
M,20-29,6696,6405,6581


In [57]:
# Melt the DataFrame for easier plotting
# Create bar chart using Plotly Express
age_group_medals_melted = pd.melt(age_group_medals.reset_index(), id_vars=["Sex", "Age Group"], var_name="Medal", value_name="Count")

age_group_medals_diagram = px.bar(
    age_group_medals_melted,
    x="Age Group",
    y="Count",
    color="Medal",
    color_discrete_map={"Gold": "gold", "Silver": "silver", "Bronze": "brown"},
    barmode="group",
    facet_col="Sex",
    category_orders={"Medal": ["Gold", "Silver", "Bronze", "No medal"]},
    labels={"Count": "Medal Count", "Age Group": "Age Group"},
    title="Medal Counts per Age Group and Sex"
)

age_group_medals_diagram.write_html("../Projekt_OS_Australien/Visualisering/Age_group_medals_by_sex.html")

### Anonymization function for names in dataset

In [35]:
#Anonymize a name using SHA-256 hashing algorithm
def anonymize_names(name):
    name_bytes = name.encode() # Transforming the string to bytes with .encode(), since the hash function demands it 
    hash_object = hashlib.sha256() # Creating an SHA-256 hashobject with hashlib.sha256()
    hash_object.update(name_bytes) # Updating the hashobject with the koded name-string (name_bytes)
    hashed_name = hash_object.hexdigest() # Making the hashe object into hexdecimal format
    return hashed_name

# Using the function to anonymize the name column
data_athletes['Name'] = data_athletes['Name'].apply(anonymize_names)