In [2]:
pip install dash

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objects as go
import plotly.subplots as sp
import pandas as pd
from plotly.subplots import make_subplots

In [4]:
# Read the athlete/Games table from a CSV located in the current working directory.
olympics_data_original = pd.read_csv('all_athlete_games.csv')
# Work on a copy so the frame as loaded stays available unchanged.
olympics_data = olympics_data_original.copy()

In [5]:
# Store ages as pandas' nullable integer type so missing ages stay <NA> instead of forcing floats.
olympics_data = olympics_data.astype({'Age': 'Int64'})

In [6]:
import re


def _clean_event(sport, event):
    """Return `event` after three substitutions applied in this order.

    1. Drop a leading "Athletics" (and the whitespace after it).
    2. Replace a trailing " metres" (only at the very end of the string) with "m".
    3. Drop a leading copy of the row's own sport name (and the whitespace after it).
    """
    without_athletics = re.sub(r'^Athletics\s*', '', event)
    with_short_unit = re.sub(r'\s*metres$', 'm', without_athletics)
    return re.sub(f'^{re.escape(sport)}\\s*', '', with_short_unit)


# Pair every row's sport with its event and rewrite the event name in place.
olympics_data['Event'] = [
    _clean_event(sport, event)
    for sport, event in zip(olympics_data['Sport'], olympics_data['Event'])
]


In [7]:
# Number of distinct event names after the clean-up (missing values count as one value).
olympics_data['Event'].nunique(dropna=False)

929

In [8]:
# Remove any run of digits that sits at the very end of a team name.
# NOTE(review): presumably meant to merge numbered squads of the same country — confirm against the data,
# since any separator placed before the digits is left in the name.
olympics_data['Team'] = olympics_data['Team'].str.replace(r'\d+$', '', regex=True)

In [9]:
# Peek at the first two swimming entries.
olympics_data.query("Sport == 'Swimming'").head(2)

Unnamed: 0,Entry ID,Name,Gender,Age,Team,NOC,Year,Season,City,Sport,Event,Medal
6,29,"Einar Ferdinand ""Einari"" Aalto",Male,26,Finland,FIN,1952,Summer,Helsinki,Swimming,Men's 400 metres Freestyle,
10,35,Arvo Ossian Aaltonen,Male,22,Finland,FIN,1912,Summer,Stockholm,Swimming,Men's 200 metres Breaststroke,


# Age

In [10]:
# Swimming entries only; copied so the derived column never touches olympics_data.
df = olympics_data[olympics_data['Sport'] == 'Swimming'].copy()

# Half-open bins [lo, hi): the edges are chosen so every label matches the ages it holds
# (e.g. [15, 18) -> "15-17"). The previous edges put age 14 in "15-17", 17 in "18-20", etc.
bins = [10, 15, 18, 21, 24, 27, 31, 36, 101]
labels = ["10-14", "15-17", "18-20", "21-23", "24-26", "27-30", "31-35", "36+"]
df["Age Group"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)

# "Age Group" is categorical: observed=False is spelled out to keep every (year, group)
# combination (count 0 when empty) and to silence the pandas FutureWarning about the default.
df_grouped = df.groupby(["Year", "Age Group"], observed=False).size().reset_index(name="Count")

color_palette = ["#2E86C1", "#1ABC9C", "#F39C12", "#E74C3C", "#8E44AD", "#16A085", "#D35400", "#34495E"]

# Bubble chart: one bubble per (year, age group), sized by the number of entries.
fig = px.scatter(df_grouped,
                 x="Year",
                 y="Age Group",
                 size="Count",
                 color="Age Group",
                 color_discrete_sequence=color_palette,
                 title="Age Distribution of Swimming Athletes Over Time",
                 labels={"Year": "Year", "Age Group": "Age Group", "Count": "Number of Athletes"},
                 opacity=0.85,
                 size_max=40)

# Styling
fig.update_layout(
    template="simple_white",
    font=dict(size=14),
    title_font=dict(size=18),
    plot_bgcolor="white",
    paper_bgcolor="white",
    xaxis=dict(
        showline=False,
        showgrid=False,
        tickmode="array",
    ),
    yaxis=dict(
        showline=False,
        showgrid=True,
        gridcolor="lightgray",
        gridwidth=1,
        griddash="dashdot",
    ),
    legend_title_text="Age Groups",
)

fig.show()

  df_grouped = df.groupby(["Year", "Age Group"]).size().reset_index(name="Count")


# Age avec moyenne

In [11]:
# Swimming entries only; copied so the derived column never touches olympics_data.
df = olympics_data[olympics_data['Sport'] == 'Swimming'].copy()

# Half-open bins [lo, hi): the edges are chosen so every label matches the ages it holds
# (e.g. [15, 18) -> "15-17"). The previous edges put age 14 in "15-17", 17 in "18-20", etc.
bins = [10, 15, 18, 21, 24, 27, 31, 36, 101]
labels = ["10-14", "15-17", "18-20", "21-23", "24-26", "27-30", "31-35", "36+"]
df["Age Group"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)

# observed=False is spelled out to keep empty (year, group) pairs and to silence the
# pandas FutureWarning raised when grouping on a categorical column.
df_grouped = df.groupby(["Year", "Age Group"], observed=False).size().reset_index(name="Count")

# Average age per year, mapped onto the same age groups so it can share the categorical y axis
# (a mean of 20.9 therefore falls in "18-20").
df_mean_age = df.groupby("Year")["Age"].mean().reset_index()
df_mean_age["Age Group"] = pd.cut(
    df_mean_age["Age"],
    bins=bins,
    labels=labels,
    right=False
)

color_palette = ["#2E86C1", "#1ABC9C", "#F39C12", "#E74C3C", "#8E44AD", "#16A085", "#D35400", "#34495E"]

fig = px.scatter(df_grouped,
                 x="Year",
                 y="Age Group",
                 size="Count",
                 color="Age Group",
                 color_discrete_sequence=color_palette,
                 title="Age Distribution of Swimming Athletes Over Time",
                 labels={"Year": "Year", "Age Group": "Age Group", "Count": "Number of Athletes"},
                 opacity=0.85,
                 size_max=40)

# One trace for the mean-age trend (line + points) instead of two, so the legend
# no longer shows an extra entry with an empty name.
fig.add_scatter(
    x=df_mean_age["Year"],
    y=df_mean_age["Age Group"],
    mode="lines+markers",
    marker=dict(color="#000000", size=6),
    line=dict(color="#000000", width=2),
    name="Mean Age Trend"
)

fig.update_layout(
    template="simple_white",
    font=dict(size=14),
    title_font=dict(size=18),
    plot_bgcolor="white",
    paper_bgcolor="white",
    xaxis=dict(
        showline=False,
        showgrid=False,
        tickmode="array",
    ),
    yaxis=dict(
        showline=False,
        showgrid=True,
        gridcolor="lightgray",
        gridwidth=1,
        griddash="dashdot",
    ),
    legend_title_text="Age Groups",
)

fig.show()





# Âge par catégorie

In [12]:
# Swimming entries only; copied so the derived column never touches olympics_data.
df = olympics_data[olympics_data["Sport"] == "Swimming"].copy()

# Half-open bins [lo, hi): the edges are chosen so every label matches the ages it holds
# (e.g. [15, 18) -> "15-17"). The previous edges put age 14 in "15-17", 17 in "18-20", etc.
bins = [10, 15, 18, 21, 24, 27, 31, 36, 101]
labels = ["10-14", "15-17", "18-20", "21-23", "24-26", "27-30", "31-35", "36+"]
df["Age Group"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)

# observed=False is spelled out to keep the previous result (empty combinations are
# present with Count 0) and to silence the pandas FutureWarning for categorical groupers.
df_grouped = df.groupby(["Year", "Age Group", "Event"], observed=False).size().reset_index(name="Count")

color_palette = ["#2E86C1", "#1ABC9C", "#F39C12", "#E74C3C", "#8E44AD", "#16A085", "#D35400", "#34495E"]

# Events offered in the dropdown; the first one is the initial selection.
event_names = df_grouped["Event"].unique()

app = dash.Dash(__name__)

app.layout = html.Div([
    html.Label("Select Event:", style={"font-weight": "bold", "font-size": "16px"}),
    dcc.Dropdown(
        id="event-dropdown",
        options=[{"label": event, "value": event} for event in event_names],
        value=event_names[0],
        clearable=False,
        style={"width": "50%", "margin-bottom": "20px"}
    ),

    # Bubble Chart
    dcc.Graph(id="bubble-chart")
])


@app.callback(
    Output("bubble-chart", "figure"),
    [Input("event-dropdown", "value")]
)
def update_chart(selected_event):
    """Build the age-group bubble chart for the event picked in the dropdown.

    Args:
        selected_event: Event name, one of the values of df_grouped["Event"].

    Returns:
        A plotly figure with one bubble per (year, age group) for that event,
        sized by the number of entries.
    """
    filtered_df = df_grouped[df_grouped["Event"] == selected_event]
    fig = px.scatter(filtered_df,
                     x="Year",
                     y="Age Group",
                     size="Count",
                     color="Age Group",
                     color_discrete_sequence=color_palette,
                     title=f"Age Distribution of {selected_event} Over Time",
                     labels={"Year": "Year", "Age Group": "Age Group", "Count": "Number of Athletes"},
                     opacity=0.85,
                     size_max=40)
    fig.update_layout(
        template="simple_white",
        font=dict(size=14),
        title_font=dict(size=18),
        plot_bgcolor="white",
        paper_bgcolor="white",
        xaxis=dict(showline=False, showgrid=False, tickmode="array"),
        yaxis=dict(showline=False, showgrid=True, gridcolor="lightgray", gridwidth=1, griddash="dashdot"),
        legend_title_text="Age Groups"
    )

    return fig

# Run app
if __name__ == "__main__":
    # app.run_server is deprecated and no longer available in recent Dash releases;
    # prefer app.run and fall back to run_server only on versions that lack it.
    launch_app = app.run if hasattr(app, "run") else app.run_server
    launch_app(debug=True)





# Age par médaille

In [13]:
# Rebuild the swimming frame and its age groups here so this cell does not depend on
# whatever `df` an earlier cell happened to leave behind.
df = olympics_data[olympics_data["Sport"] == "Swimming"].copy()

# Half-open bins [lo, hi) whose edges match the labels (e.g. [15, 18) -> "15-17").
bins = [10, 15, 18, 21, 24, 27, 31, 36, 101]
labels = ["10-14", "15-17", "18-20", "21-23", "24-26", "27-30", "31-35", "36+"]
df["Age Group"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)

# Rows without a medal have a missing "Medal" key and are dropped by groupby.
# observed=False is explicit to silence the pandas FutureWarning for categorical groupers.
df_grouped = df.groupby(["Medal", "Age Group"], observed=False).size().reset_index(name="Count")

medal_colors = {"Gold": "#FFD700", "Silver": "#C0C0C0", "Bronze": "#CD7F32"}

fig = px.scatter(df_grouped,
                 x="Medal",
                 y="Age Group",
                 size="Count",
                 color="Medal",
                 color_discrete_map=medal_colors,
                 title="Age Distribution of Swimming Athletes by Medal",
                 labels={"Medal": "Medal Type", "Age Group": "Age Group", "Count": "Number of Athletes"},
                 opacity=0.85,
                 size_max=40)

fig.update_layout(
    template="simple_white",
    font=dict(size=14),
    title_font=dict(size=18),
    plot_bgcolor="white",
    paper_bgcolor="white",
    xaxis=dict(
        showline=False,
        showgrid=False,
        tickmode="array",
        # Fixed podium order on the x axis.
        categoryorder="array",
        categoryarray=["Gold", "Silver", "Bronze"]
    ),
    yaxis=dict(
        showline=False,
        showgrid=True,
        gridcolor="lightgray",
        gridwidth=1,
        griddash="dashdot",
    ),
    legend_title_text="Medal Type",
)

fig.show()





# Participations

In [14]:
# To be improved: athletes are identified by name only, so namesakes are merged.

swimming_entries = olympics_data[olympics_data["Sport"] == "Swimming"]

# One row per athlete and Games, flagged True when ANY of the athlete's events at those
# Games earned a medal. (Keeping only the first row per athlete/year could pick a
# medal-less event and hide a medal won in another event the same year.)
df = (
    swimming_entries
    .assign(Won_Medal=swimming_entries["Medal"].notna())
    .groupby(["Name", "Year"], as_index=False)["Won_Medal"]
    .any()
    .sort_values(["Name", "Year"])
)

# 1 for an athlete's first Games, 2 for the second, ...
df["Participation_Number"] = df.groupby("Name").cumcount() + 1

df["Medal_Status"] = np.where(df["Won_Medal"], "Medal Won", "No Medal")

# Rows: participation number; columns: the two statuses (forced to exist even if one never occurs).
participation_counts = (
    df.groupby(["Participation_Number", "Medal_Status"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=["Medal Won", "No Medal"], fill_value=0)
)

participation_percentages = participation_counts.div(participation_counts.sum(axis=1), axis=0) * 100

colors = {"Medal Won": "#F4C542", "No Medal": "#A0A0A0"}

participation_levels = sorted(participation_percentages.index)
num_participations = len(participation_levels)

# One donut per participation number, side by side.
fig = sp.make_subplots(
    rows=1, cols=num_participations,
    specs=[[{"type": "domain"}] * num_participations]
)

for i, p in enumerate(participation_levels):
    values = [
        participation_percentages.loc[p, "Medal Won"],
        participation_percentages.loc[p, "No Medal"]
    ]

    fig.add_trace(go.Pie(
        labels=["Medal Won", "No Medal"],
        values=values,
        marker=dict(colors=[colors["Medal Won"], colors["No Medal"]]),
        hole=0.4,
        textinfo='percent',
        showlegend=(i == 0)  # a single shared legend is enough
    ), row=1, col=i+1)

    # Participation number written in the donut hole; x is the approximate centre of
    # the i-th of num_participations equal-width columns, in paper coordinates.
    x_position = (i + 0.5) / num_participations

    fig.add_annotation(
        x=x_position,
        y=0.5,
        text=f"{p}",
        showarrow=False,
        font=dict(size=16, color="black", family="Arial Black"),
        xref="paper",
        yref="paper"
    )

fig.update_layout(
    title_text="Medal Distribution by Olympic Participation",
    height=400,
    width=130 * num_participations,
    margin=dict(t=80, b=50, l=50, r=50),
    font=dict(size=12)
)

fig.show()

# Meilleurs athlètes

In [15]:
# Swimming entries, with missing medals made explicit.
df = olympics_data.loc[olympics_data["Sport"] == "Swimming"].copy()
df["Medal"] = df["Medal"].fillna("No Medal")

# Medal tally per athlete and medal colour (entries without a medal are left out).
medal_rows = df.loc[df["Medal"] != "No Medal"]
medal_counts = medal_rows.groupby(["Name", "Medal"]).size().reset_index(name="Count")

# Ten athletes with the most medals overall; keep only their rows for plotting.
top_athletes = medal_counts.groupby("Name")["Count"].sum().nlargest(10)
is_top_athlete = medal_counts["Name"].isin(top_athletes.index)
medal_counts = medal_counts[is_top_athlete]
medal_colors = {"Gold": "#FFD700", "Silver": "#C0C0C0", "Bronze": "#CD7F32"}

# Same athletes, listed from the smallest to the largest medal total.
ordered_athletes = top_athletes.index[::-1]

# Horizontal stacked bars: one bar per athlete, one segment per medal colour.
fig = px.bar(
    medal_counts,
    x="Count",
    y="Name",
    color="Medal",
    orientation="h",
    title="Olympic Hall of Fame – Who Dominates the Pool?",
    labels={"Count": "Total Medals", "Name": "Athletes"},
    color_discrete_map=medal_colors,
    category_orders={"Name": ordered_athletes},
)

fig.update_layout(
    template="plotly_white",
    xaxis_title="Total Medals",
    yaxis_title="",
    showlegend=True,
)

fig.show()

# Pays

In [16]:
# Medal-winning swimming entries and the all-time medal tally per NOC.
df_medals = olympics_data[olympics_data["Sport"] == "Swimming"]
df_medals = df_medals[df_medals["Medal"].notna()]
total_medal_counts = df_medals["NOC"].value_counts()

# Select Top 3 countries + Canada (athlete's country) + "Other".
# Canada is only appended when it is not already in the top 3, to avoid a duplicate entry.
top_countries = total_medal_counts.head(3).index.tolist()
if "CAN" not in top_countries:
    top_countries.append("CAN")
groups = top_countries + ["Other"]

color_palette = ["#00ECC2", "#0078FF", "#cbe58e", "#FF4359"]
country_colors = {country: color_palette[i] for i, country in enumerate(top_countries)}
country_colors["Other"] = "rgba(170, 170, 170, 0.3)"  # gray for others

# Each group gets a FIXED integer code and a colour stop at that code's position, and the
# heatmap range is pinned to [1, len(groups)] below. The cell colours therefore always agree
# with the legend, whatever the ranking of the groups or whether one of them has no square.
country_mapping = {country: i + 1 for i, country in enumerate(groups)}
label_map = {code: country for country, code in country_mapping.items()}
colorscale = [(i / (len(groups) - 1), country_colors[country]) for i, country in enumerate(groups)]

df_grouped = total_medal_counts.reset_index()
df_grouped.columns = ["Country", "Medals"]
df_grouped["Country"] = df_grouped["Country"].where(df_grouped["Country"].isin(top_countries), "Other")

df_grouped = df_grouped.groupby("Country")["Medals"].sum().reset_index()

# Largest share first, "Other" always last.
df_grouped = df_grouped.sort_values(by="Medals", ascending=False)
df_grouped = pd.concat([
    df_grouped[df_grouped["Country"] != "Other"],
    df_grouped[df_grouped["Country"] == "Other"]
])

# Whole-percent shares; rounding can leave the total slightly off 100, so the largest
# share absorbs the difference one square at a time.
df_grouped["Normalized"] = (df_grouped["Medals"] / df_grouped["Medals"].sum() * 100).round().astype(int)

while df_grouped["Normalized"].sum() > 100:
    df_grouped.loc[df_grouped["Normalized"].idxmax(), "Normalized"] -= 1
while df_grouped["Normalized"].sum() < 100:
    df_grouped.loc[df_grouped["Normalized"].idxmax(), "Normalized"] += 1

# 100 codes (one per square), filled column by column into the 10x10 grid.
flat_list = np.repeat(
    df_grouped["Country"].map(country_mapping).to_numpy(),
    df_grouped["Normalized"].to_numpy()
)
waffle_array = flat_list.reshape(10, 10, order="F")

# Country name behind every square, used for the hover text.
labels = np.vectorize(label_map.get)(waffle_array)

# Waffle chart
fig = go.Figure()

fig.add_trace(
    go.Heatmap(
        x=np.arange(10), y=np.arange(10),
        z=waffle_array, customdata=labels,
        xgap=3, ygap=3,
        colorscale=colorscale,
        zmin=1, zmax=len(groups),
        showscale=False,
        hovertemplate="%{customdata}<extra></extra>"
    )
)

# Empty scatter traces whose only purpose is to create the legend entries.
for country, color in country_colors.items():
    fig.add_trace(
        go.Scatter(
            x=[None], y=[None], mode='markers',
            marker=dict(color=color, symbol='circle', size=10),
            name=country
        )
    )

fig.update_layout(
    title="Swimming Medals – All Editions",
    width=400, height=400,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(showgrid=False, zeroline=False, showticklabels=False)
fig.update_yaxes(showgrid=False, zeroline=False, showticklabels=False)

fig.show()

In [17]:
# Inclusive (first year, last year) ranges, one waffle per range.
year_groups = [
    (1896, 1920),
    (1924, 1952),
    (1956, 1984),
    (1988, 2008),
    (2012, 2020)
]

df_swimming = olympics_data[olympics_data["Sport"] == "Swimming"]
df_medals = df_swimming[df_swimming["Medal"].notna()]

total_medal_counts = df_medals["NOC"].value_counts()

# Top 3 countries over all editions + Canada (added only if not already present).
top_countries = total_medal_counts.head(3).index.tolist()
if "CAN" not in top_countries:
    top_countries.append("CAN")
print(top_countries)

color_palette = ["#00ECC2", "#0078FF", "#cbe58e", "#FF4359"]
country_colors = {country: color_palette[i] for i, country in enumerate(top_countries)}
print(country_colors)
country_colors["Other"] = "rgba(170, 170, 170, 0.3)"

# Each group gets a FIXED integer code and a colour stop at that code's position, and every
# heatmap uses the same pinned range [1, len(groups)]. Previously the codes followed each
# period's own ranking, so a different ranking or a country without medals in a period
# painted squares in a colour that did not match the legend.
groups = top_countries + ["Other"]
country_mapping = {country: i + 1 for i, country in enumerate(groups)}
label_map = {code: country for country, code in country_mapping.items()}
colorscale = [(i / (len(groups) - 1), country_colors[country]) for i, country in enumerate(groups)]

fig = make_subplots(
    rows=1, cols=len(year_groups),
    subplot_titles=[f"{start}-{end}" for start, end in year_groups]
)

for idx, (start, end) in enumerate(year_groups):

    df_period = df_medals[df_medals["Year"].between(start, end)]
    if df_period.empty:
        # No medal in this range: leave the panel empty rather than dividing by zero.
        continue

    df_grouped = df_period["NOC"].value_counts().reset_index()
    df_grouped.columns = ["Country", "Medals"]

    df_grouped["Country"] = df_grouped["Country"].where(df_grouped["Country"].isin(top_countries), "Other")
    df_grouped = df_grouped.groupby("Country")["Medals"].sum().reset_index()

    # Largest share of the period first, "Other" always last.
    df_grouped = df_grouped.sort_values(by="Medals", ascending=False)
    df_grouped = pd.concat([
        df_grouped[df_grouped["Country"] != "Other"],
        df_grouped[df_grouped["Country"] == "Other"]
    ])

    # Whole-percent shares; the largest share absorbs any rounding drift so the total is 100.
    df_grouped["Normalized"] = (df_grouped["Medals"] / df_grouped["Medals"].sum() * 100).round().astype(int)

    while df_grouped["Normalized"].sum() > 100:
        df_grouped.loc[df_grouped["Normalized"].idxmax(), "Normalized"] -= 1
    while df_grouped["Normalized"].sum() < 100:
        df_grouped.loc[df_grouped["Normalized"].idxmax(), "Normalized"] += 1

    # 100 codes (one per square), filled column-wise into the 10x10 grid.
    flat_list = np.repeat(
        df_grouped["Country"].map(country_mapping).to_numpy(),
        df_grouped["Normalized"].to_numpy()
    )
    waffle_array = flat_list.reshape(10, 10, order="F")

    # Country name behind every square, used for the hover text.
    labels = np.vectorize(label_map.get)(waffle_array)

    fig.add_trace(
        go.Heatmap(
            x=np.arange(10), y=np.arange(10),
            z=waffle_array, customdata=labels,
            xgap=3, ygap=3,
            colorscale=colorscale,
            zmin=1, zmax=len(groups),
            showscale=False,
            hovertemplate="%{customdata}<extra></extra>"
        ),
        row=1, col=idx + 1
    )

# Empty scatter traces whose only purpose is to create the legend entries.
legend_entries = [
    go.Scatter(
        x=[None], y=[None], mode='markers',
        marker=dict(color=color, symbol='circle', size=10),
        name=country
    )
    for country, color in country_colors.items()
]

# Add the legend entries to the figure
fig.add_traces(legend_entries)

fig.update_layout(
    title="Swimming Medals by Period (Each Square Represents 1% of Medals Won)",
    width=1500, height=400,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)

# Without row/col these apply to the axes of every panel.
fig.update_xaxes(showgrid=False, zeroline=False, showticklabels=False)
fig.update_yaxes(showgrid=False, zeroline=False, showticklabels=False)

fig.show()


['USA', 'AUS', 'GER', 'CAN']
{'USA': '#00ECC2', 'AUS': '#0078FF', 'GER': '#cbe58e', 'CAN': '#FF4359'}


# Pays/Participations

In [18]:
# NOTE: despite their names, `df_medals` and `total_medal_counts` hold ALL swimming entries
# in this cell (participations, medal or not). The names are kept unchanged for compatibility.
df_medals = olympics_data[(olympics_data["Sport"] == "Swimming")]

total_medal_counts = df_medals["NOC"].value_counts()

# Top 3 countries by participations + Canada (added only if not already present).
top_countries = total_medal_counts.head(3).index.tolist()
if "CAN" not in top_countries:
    top_countries.append("CAN")
groups = top_countries + ["Other"]

color_palette = ["#00ECC2", "#0078FF", "#cbe58e", "#FF4359"]
country_colors = {country: color_palette[i] for i, country in enumerate(top_countries)}
country_colors["Other"] = "rgba(170, 170, 170, 0.3)"  # Light gray for others

# Each group gets a FIXED integer code and a colour stop at that code's position, and the
# heatmap range is pinned to [1, len(groups)] below. The cell colours therefore always agree
# with the legend, whatever the ranking of the groups or whether one of them has no square.
country_mapping = {country: i + 1 for i, country in enumerate(groups)}
label_map = {code: country for country, code in country_mapping.items()}
colorscale = [(i / (len(groups) - 1), country_colors[country]) for i, country in enumerate(groups)]

df_grouped = total_medal_counts.reset_index()
df_grouped.columns = ["Country", "Participations"]
df_grouped["Country"] = df_grouped["Country"].where(df_grouped["Country"].isin(top_countries), "Other")

df_grouped = df_grouped.groupby("Country")["Participations"].sum().reset_index()

# Largest share first, "Other" always last.
df_grouped = df_grouped.sort_values(by="Participations", ascending=False)
df_grouped = pd.concat([
    df_grouped[df_grouped["Country"] != "Other"],
    df_grouped[df_grouped["Country"] == "Other"]
])

# Whole-percent shares; the largest share absorbs any rounding drift so the total is 100.
df_grouped["Normalized"] = (df_grouped["Participations"] / df_grouped["Participations"].sum() * 100).round().astype(int)

while df_grouped["Normalized"].sum() > 100:
    df_grouped.loc[df_grouped["Normalized"].idxmax(), "Normalized"] -= 1
while df_grouped["Normalized"].sum() < 100:
    df_grouped.loc[df_grouped["Normalized"].idxmax(), "Normalized"] += 1

# 100 codes (one per square), filled column by column into the 10x10 grid.
flat_list = np.repeat(
    df_grouped["Country"].map(country_mapping).to_numpy(),
    df_grouped["Normalized"].to_numpy()
)
waffle_array = flat_list.reshape(10, 10, order="F")

# Country name behind every square, used for the hover text.
labels = np.vectorize(label_map.get)(waffle_array)

fig = go.Figure()

fig.add_trace(
    go.Heatmap(
        x=np.arange(10), y=np.arange(10),
        z=waffle_array, customdata=labels,
        xgap=3, ygap=3,
        colorscale=colorscale,
        zmin=1, zmax=len(groups),
        showscale=False,
        hovertemplate="%{customdata}<extra></extra>"
    )
)

# Empty scatter traces whose only purpose is to create the legend entries.
for country, color in country_colors.items():
    fig.add_trace(
        go.Scatter(
            x=[None], y=[None], mode='markers',
            marker=dict(color=color, symbol='circle', size=10),
            name=country
        )
    )


fig.update_layout(
    title="Swimming Participations – All Editions",
    width=400, height=400,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(showgrid=False, zeroline=False, showticklabels=False)
fig.update_yaxes(showgrid=False, zeroline=False, showticklabels=False)

fig.show()

In [19]:
# Inclusive (first year, last year) ranges, one waffle per range.
year_groups = [
    (1896, 1920),
    (1924, 1952),
    (1956, 1984),
    (1988, 2008),
    (2012, 2020)
]

df_swimming = olympics_data[olympics_data["Sport"] == "Swimming"]
total_participation_counts = df_swimming["NOC"].value_counts()

# Top 3 countries by participations over all editions + Canada (added only if not already present).
top_countries = total_participation_counts.head(3).index.tolist()
if "CAN" not in top_countries:
    top_countries.append("CAN")

color_palette = ["#00ECC2", "#0078FF", "#cbe58e", "#FF4359"]
country_colors = {country: color_palette[i] for i, country in enumerate(top_countries)}
country_colors["Other"] = "rgba(170, 170, 170, 0.3)"

# Each group gets a FIXED integer code and a colour stop at that code's position, and every
# heatmap uses the same pinned range [1, len(groups)]. Previously the codes followed each
# period's own ranking, so a different ranking or a country absent from a period painted
# squares in a colour that did not match the legend.
groups = top_countries + ["Other"]
country_mapping = {country: i + 1 for i, country in enumerate(groups)}
label_map = {code: country for country, code in country_mapping.items()}
colorscale = [(i / (len(groups) - 1), country_colors[country]) for i, country in enumerate(groups)]

fig = make_subplots(
    rows=1, cols=len(year_groups),
    subplot_titles=[f"{start}-{end}" for start, end in year_groups]
)

for idx, (start, end) in enumerate(year_groups):
    df_period = df_swimming[df_swimming["Year"].between(start, end)]
    if df_period.empty:
        # No entry in this range: leave the panel empty rather than dividing by zero.
        continue

    df_grouped = df_period["NOC"].value_counts().reset_index()
    df_grouped.columns = ["Country", "Participations"]
    df_grouped["Country"] = df_grouped["Country"].where(df_grouped["Country"].isin(top_countries), "Other")

    df_grouped = df_grouped.groupby("Country")["Participations"].sum().reset_index()

    # Largest share of the period first, "Other" always last.
    df_grouped = df_grouped.sort_values(by="Participations", ascending=False)
    df_grouped = pd.concat([
        df_grouped[df_grouped["Country"] != "Other"],
        df_grouped[df_grouped["Country"] == "Other"]
    ])

    # Whole-percent shares; the largest share absorbs any rounding drift so the total is 100.
    df_grouped["Normalized"] = (df_grouped["Participations"] / df_grouped["Participations"].sum() * 100).round().astype(int)

    while df_grouped["Normalized"].sum() > 100:
        df_grouped.loc[df_grouped["Normalized"].idxmax(), "Normalized"] -= 1
    while df_grouped["Normalized"].sum() < 100:
        df_grouped.loc[df_grouped["Normalized"].idxmax(), "Normalized"] += 1

    # 100 codes (one per square), filled column by column into the 10x10 grid.
    flat_list = np.repeat(
        df_grouped["Country"].map(country_mapping).to_numpy(),
        df_grouped["Normalized"].to_numpy()
    )
    waffle_array = flat_list.reshape(10, 10, order="F")

    # Country name behind every square, used for the hover text.
    labels = np.vectorize(label_map.get)(waffle_array)

    fig.add_trace(
        go.Heatmap(
            x=np.arange(10), y=np.arange(10),
            z=waffle_array, customdata=labels,
            xgap=3, ygap=3,
            colorscale=colorscale,
            zmin=1, zmax=len(groups),
            showscale=False,
            hovertemplate="%{customdata}<extra></extra>"
        ),
        row=1, col=idx + 1
    )

# Empty scatter traces whose only purpose is to create the legend entries.
legend_entries = [
    go.Scatter(
        x=[None], y=[None], mode='markers',
        marker=dict(color=color, symbol='circle', size=10),
        name=country
    )
    for country, color in country_colors.items()
]

fig.add_traces(legend_entries)

fig.update_layout(
    title="Swimming Participations by Period (Each Square Represents 1% of Participations)",
    width=1500, height=400,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)

# Without row/col these apply to the axes of every panel.
fig.update_xaxes(showgrid=False, zeroline=False, showticklabels=False)
fig.update_yaxes(showgrid=False, zeroline=False, showticklabels=False)

fig.show()

# Genre

In [20]:
# Event name of every Athletics entry (one row per participation).
# The variable used to be called `swimming_events` although it filters on Athletics.
athletics_events = olympics_data.loc[olympics_data["Sport"] == "Athletics", "Event"]

df = athletics_events.to_frame(name="Event")

# Event name without its gender prefix, and the gender read from that prefix.
# "Mixed" events get no gender and therefore drop out of the pivot below.
df['Clean_Event'] = df['Event'].str.replace(r"Men's |Women's |Mixed ", '', regex=True)
df['Gender'] = df['Event'].str.extract(r"(Men's|Women's)", expand=False)

# Participations per event (rows) and gender (columns).
event_counts = df.pivot_table(index='Clean_Event', columns='Gender', aggfunc='size', fill_value=0).reset_index()

# Keep only the events contested by both genders.
both_genders = event_counts[(event_counts["Men's"] > 0) & (event_counts["Women's"] > 0)]

event_counts_melted = both_genders.melt(id_vars="Clean_Event", var_name="Gender", value_name="Count")

event_counts_melted = event_counts_melted[event_counts_melted["Count"] > 0]

fig = px.scatter(
    event_counts_melted,
    x="Count",
    y="Clean_Event",
    color="Gender",
    title="Number of Men's and Women's Participations in Athletics",
    # Hover label aligned with the x-axis title (it used to read "Number of Events").
    labels={"Clean_Event": "Event", "Count": "Number of Participants"},
    color_discrete_map={"Men's": "blue", "Women's": "pink"},
    symbol="Gender"
)

# Dotted connector between the men's and women's points of each event; the three columns
# are walked together instead of re-filtering the frame twice per event.
for event, men_count, women_count in zip(
    both_genders["Clean_Event"], both_genders["Men's"], both_genders["Women's"]
):
    fig.add_trace(go.Scatter(
        x=[men_count, women_count],
        y=[event, event],
        mode="lines",
        line=dict(color="gray", width=2, dash="dot"),
        showlegend=False
    ))

fig.update_layout(
    width=1000,
    height=700,
    xaxis_title="Number of Participants",
    yaxis=dict(
        title="Category",
        categoryorder="total ascending",
        # Force a tick label for every event.
        tickmode="array",
        tickvals=event_counts_melted["Clean_Event"].unique(),
    ),
    legend_title="Gender",
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(size=14)
)


fig.show()

In [21]:
# Athletics entries counted per Games year and gender.
athletics_data = olympics_data.loc[olympics_data["Sport"] == "Athletics"]
gender_counts = (
    athletics_data
    .groupby(["Year", "Gender"])
    .size()
    .reset_index(name="Count")
)

# Wide table: one row per year, one column per gender (0 where a gender has no entry).
pivot_df = gender_counts.pivot(index="Year", columns="Gender", values="Count").fillna(0)

# Yearly total, then each gender's share of it in percent.
pivot_df["Total"] = pivot_df.sum(axis=1)
for gender in ("Female", "Male"):
    pivot_df[f"{gender} %"] = (pivot_df[gender] / pivot_df["Total"]) * 100

# Year becomes a regular column of strings so the x axis is categorical.
pivot_df = pivot_df.reset_index()
pivot_df["Year"] = pivot_df["Year"].astype(str)

fig = px.bar(
    pivot_df,
    x="Year",
    y=["Female %", "Male %"],
    labels={"value": "Percentage of Athletes", "Year": "Olympic Year"},
    title="Evolution of Male and Female Participation in Olympic Athletics",
    color_discrete_map={"Female %": "pink", "Male %": "blue"},
)

fig.update_layout(
    barmode="relative",
    xaxis=dict(
        type="category",
        tickmode="array",
        tickvals=pivot_df["Year"],
        ticktext=pivot_df["Year"],
        tickangle=-90,
    ),
    yaxis=dict(title="% Share of Athletes"),
    plot_bgcolor="white",
)

# Reference line at parity.
fig.add_hline(
    y=50,
    line_dash="dash",
    line_color="black",
    annotation_text="50%",
    annotation_position="right",
    annotation_font_size=14,
    annotation_font_color="black",
)

fig.show()

# Longueur de carrière

In [22]:
df = olympics_data.copy()

# Career length in YEARS: year of the athlete's last Games minus year of the first,
# measured within one sport (0 for a single appearance). The previous version counted
# the number of distinct Games instead, and mixed all sports of a same name together.
years_by_athlete = df.groupby(['Sport', 'Name'])['Year']
df['Career Length'] = years_by_athlete.transform('max') - years_by_athlete.transform('min')

# Average over ATHLETES: keep one row per (sport, athlete) so that athletes entered in
# many events do not weigh more than the others.
athlete_careers = df.drop_duplicates(subset=['Sport', 'Name'])
avg_career_length = athlete_careers.groupby('Sport')['Career Length'].mean().reset_index()

avg_career_length = avg_career_length.sort_values('Career Length', ascending=False)

# Swimming is highlighted in red, every other sport is gray.
avg_career_length['Color'] = avg_career_length['Sport'].apply(lambda x: 'red' if x == 'Swimming' else 'gray')

fig = px.bar(
    avg_career_length,
    x='Sport',
    y='Career Length',
    color='Color',
    color_discrete_map={'red': 'red', 'gray': 'gray'},
    labels={'Career Length': 'Average Career Length (Years)'},
    title='Average Olympic Career Length by Sport',
    # Keep the bars in descending order even though they are split in two colour traces.
    category_orders={'Sport': avg_career_length['Sport'].tolist()}
)

fig.update_layout(
    xaxis_title='Sport',
    yaxis_title='Average Career Length (Years)',
    showlegend=False,
    template='simple_white'
)

fig.show()