In [5]:
import pandas as pd
import numpy as np
import plotly_express as px

# Load datasets
data_athletes = pd.read_csv('Data/athlete_events.csv')
data_noc = pd.read_csv('Data/noc_regions.csv')

In [2]:
# Extract unique region names
unique_regions = data_noc['region'].unique()
exclusions_region = ['Individual Olympic Athletes', 'NA'] # Exclusion list for regions

# TUV was described as 'NA' in the region column but had a name in the notes column
if 'Tuvalu' not in unique_regions:
    # Manually add 'Tuvalu' to the list of unique regions
    unique_regions = list(unique_regions) + ['Tuvalu']

# Print the list of unique region names
for region in unique_regions:
    if region not in exclusions_region:
        print(region)

Afghanistan
Curacao
Albania
Algeria
Andorra
Angola
Antigua
Australia
Argentina
Armenia
Aruba
American Samoa
Austria
Azerbaijan
Bahamas
Bangladesh
Barbados
Burundi
Belgium
Benin
Bermuda
Bhutan
Bosnia and Herzegovina
Belize
Belarus
Czech Republic
Boliva
Botswana
Brazil
Bahrain
Brunei
Bulgaria
Burkina Faso
Central African Republic
Cambodia
Canada
Cayman Islands
Republic of Congo
Chad
Chile
China
Ivory Coast
Cameroon
Democratic Republic of the Congo
Cook Islands
Colombia
Comoros
Cape Verde
Costa Rica
Croatia
Greece
Cuba
Cyprus
Denmark
Djibouti
Dominica
Dominican Republic
Ecuador
Egypt
Eritrea
El Salvador
Spain
Estonia
Ethiopia
Russia
Fiji
Finland
France
Germany
Micronesia
Gabon
Gambia
UK
Guinea-Bissau
Georgia
Equatorial Guinea
Ghana
Grenada
Guatemala
Guinea
Guam
Guyana
Haiti
Honduras
Hungary
Indonesia
India
Iran
Ireland
Iraq
Iceland
Israel
Virgin Islands, US
Italy
Virgin Islands, British
Jamaica
Jordan
Japan
Kazakhstan
Kenya
Kyrgyzstan
Kiribati
South Korea
Kosovo
Saudi Arabia
Kuwait
Laos
L

In [3]:
# Filter out excluded regions and count
filtered_regions = [
    region for region in unique_regions
        if region not in exclusions_region]

number_of_unique_regions = len(filtered_regions)

print(f"Total number of unique regions: {number_of_unique_regions}")

Total number of unique regions: 207


### Which sports are in the event?

In [21]:
sports_list = data_athletes['Sport'].unique()

print("Types of sports:")
for sport in sports_list:
    print(sport)

Types of sports:
Basketball
Judo
Football
Tug-Of-War
Speed Skating
Cross Country Skiing
Athletics
Ice Hockey
Swimming
Badminton
Sailing
Biathlon
Gymnastics
Art Competitions
Alpine Skiing
Handball
Weightlifting
Wrestling
Luge
Water Polo
Hockey
Rowing
Bobsleigh
Fencing
Equestrianism
Shooting
Boxing
Taekwondo
Cycling
Diving
Canoeing
Tennis
Modern Pentathlon
Figure Skating
Golf
Softball
Archery
Volleyball
Synchronized Swimming
Table Tennis
Nordic Combined
Baseball
Rhythmic Gymnastics
Freestyle Skiing
Rugby Sevens
Trampolining
Beach Volleyball
Triathlon
Ski Jumping
Curling
Snowboarding
Rugby
Short Track Speed Skating
Skeleton
Lacrosse
Polo
Cricket
Racquets
Motorboating
Military Ski Patrol
Croquet
Jeu De Paume
Roque
Alpinism
Basque Pelota
Aeronautics


### What types of medals are won?

In [20]:
medal_types = data_athletes['Medal'].unique()

# Removes the rows where no medals has been won
medal_types_filtered = np.delete(medal_types, 0)

print("Types of medals:")
for medal in medal_types_filtered:
    print(medal)

Types of medals:
Gold
Bronze
Silver


### Top 10 countries based on total medals won

In [6]:
# Creates a new dataframe grouping that only shows the columns "Team" and "Total Medals"
# Value_counts Counts the values for each row
# Unstack transforms the groupby into a new dataframe
# fillna changes the missing data to having a value of 0 so it wont effect the counting
# Sum counts everthing in the first axis which is "Total Medals" after we used reset_index to both reset the index and change the name of the second column.
country_medals = data_athletes.groupby("NOC")["Medal"].value_counts().unstack().fillna(0).sum(axis=1).reset_index(name="Total Medals")

top_ten_countries = country_medals.sort_values(by="Total Medals", ascending=False).head(10)

top_ten_countries_diagram = px.bar(
    top_ten_countries,
    x="NOC", y="Total Medals",
    title="Top 10 countries based on total medals won:",
    color="NOC",
    labels={"NOC" : "Countries", "Total Medals" : "Medals"}
)


