This notebook is an investigation into which plots are interesting and which aren't.

In [None]:
import plotly.graph_objects as go
import json
from collections import Counter

# Load the race results used by every cell below. JSON is UTF-8 by
# specification, so decode it explicitly instead of relying on the platform's
# default encoding (which can mangle non-ASCII names on some systems).
with open('results.json', 'r', encoding='utf-8') as fp:
    data = json.load(fp)

def save_figure(fig: go.Figure, name: str) -> None:
    """Write a figure's JSON serialisation to a file.

    Args:
        fig: Figure to serialise; the output of its ``to_json()`` is written
            unchanged.
        name: Path of the file to create or overwrite.
    """
    # Pin the encoding so the file contents do not depend on the platform
    # default; JSON consumers expect UTF-8.
    with open(name, 'w', encoding='utf-8') as fp:
        fp.write(fig.to_json())

# Numeric event id -> display name.
# NOTE(review): the ids presumably come from the results source - confirm.
EVENTS = {
    1: "Marathon",
    8: "Wheelchair Marathon",
    2: "Half Marathon",
    6: "Wheelchair Half Marathon",
    3: "10km",
    9: "Wheelchair 10km",
    4: "5km",
    5: "2km",
}

# Event names the per-event plots focus on: every event in EVENTS except the
# wheelchair ones.
EVENTS_OF_INTEREST = {
    event_name
    for event_name in EVENTS.values()
    if not event_name.startswith("Wheelchair")
}


In [None]:
# Tally result rows per event name, then list the events in ascending
# event-id order so the bars follow the id ordering of EVENTS.
counter = Counter(row["event_name"] for row in data)
events = [EVENTS[event_id] for event_id in sorted(EVENTS)]
# Counter returns 0 for events that have no rows at all.
participants = [counter[event_name] for event_name in events]

bar_trace = {"x": events, "y": participants, "type": "bar"}
bar_layout = {
    "title": "Number of Participants in each Event",
    "xaxis_title": "Event",
    "yaxis_title": "Number of Participants",
}
fig = go.Figure(data=bar_trace, layout=bar_layout)
fig.show()
save_figure(fig, 'number_of_participants_in_each_event.json')

In [None]:
# Total number of result rows loaded from results.json.
len(data)

The Sunshine Coast Marathon Festival reported record breaking numbers over the weekend with news outlets reporting: "A record breaking 12,000 participants". As we can see, this is almost correct but I'm only seeing 11,635 results, so I'm going to have to fact check the OurSC news outlet on that one. Nevertheless, this is still an excellent turnout.


In [None]:
# A time containing ':' is a recorded finishing time and is bucketed as
# 'Finished'; any other value is kept verbatim as its own status label.
finish_status = ['Finished' if ':' in row['time'] else row['time'] for row in data]
counter = Counter(finish_status)

# most_common() orders the statuses by count, largest first.
ranked_statuses = counter.most_common()
x = [status for status, _ in ranked_statuses]
y = [count for _, count in ranked_statuses]

status_trace = {'x': x, 'y': y, 'type': 'bar'}
status_layout = {
    "title": "How Many People Finished their Races",
    "xaxis_title": "Race Status",
    "yaxis_title": "Number of People",
}
fig = go.Figure(data=status_trace, layout=status_layout)
fig.show()
save_figure(fig, 'how_many_people_finished_their_races.json')

Of the 11,635 participants, most finished their events. We can see a whopping 1703 participants in either the "Not Started" or "DNS" categories. Why these are separate categories (since they both mean "didn't start") I don't know, but I'd guess they are recorded by different mechanisms and so capture slightly different situations. Perhaps 1692 people didn't pick up their race bib, and a further 11 who did pick up their gear still didn't show up to their race.

Also we can see the 156 DNFs, which is remarkably low really, showing that if you start your race, you have only a ~1.5% chance of not finishing it.

I wonder how these rates change depending on the race entered...

In [None]:
import plotly.express as px
import pandas as pd

df = pd.DataFrame(data)

# Work on a copy so the raw results in `df` stay untouched for later cells.
test_df = df.copy()

# Collapse every recorded finishing time (any value containing ':') into the
# single status 'finished'; other values are kept as they are.
test_df['time'] = test_df['time'].apply(lambda x: "finished" if ':' in x else x)

# Keep only the events this analysis focuses on.
test_df = test_df[test_df['event_name'].isin(EVENTS_OF_INTEREST)]

# Number of rows for each (event, status) pair.
grouped_df = test_df.groupby(['event_name', 'time']).size().reset_index(name='count')

# Number of rows for each event, used as the denominator below.
total_counts = test_df.groupby('event_name').size().reset_index(name='total_count')

# Attach each event's total to its per-status rows.
merged_df = pd.merge(grouped_df, total_counts, on='event_name')

# Share of the event's entrants that ended up in each status, as a percentage.
merged_df['Percentage (%)'] = merged_df['count'] / merged_df['total_count'] * 100

# Display order for the events (x axis) and for the statuses (colours/legend).
event_order = ["Marathon", "Half Marathon", "10km", "5km", "2km"]
color_order = ['finished', 'Not started', 'DNS', 'DNF', "QRY", "DQ"]

fig = px.bar(
    merged_df,
    x='event_name',
    y='Percentage (%)',
    color='time',
    # Pin the order of every status via `color_order` so the legend and colour
    # assignment do not depend on the row order of `merged_df`.
    category_orders={'event_name': event_order, 'time': color_order},
)

# 'finished' dwarfs the other statuses, so start with it toggled off; it can
# still be switched back on from the legend.
for trace in fig.data:
    if trace.name == 'finished':
        trace.visible = 'legendonly'

fig.update_layout(
    title="Race Results by Event Type as a Percentage",
    xaxis_title="Event",
    title_x=0.5,
)

fig.show()
save_figure(fig, 'race_result_by_event_type.json')

Now that we're looking at the individual events, we can see that if you ran the marathon, there is actually a 4.6% chance that you would have dropped out, but for all other lengths, it was less than 1%. We can also see that the rate of disqualifications is significantly lower for all non-marathon events. This makes sense since the entire festival is centered around the marathon.

In [None]:
# Label every entrant as "AUSTRALIA" or "OTHER", then count the rows per label.
is_australian = df['country'] == "AUSTRALIA"
australia_or_not_df = (
    df.assign(australia_or_not=df['country'].where(is_australian, 'OTHER'))
    .groupby(['australia_or_not'])
    .size()
    .reset_index(name='count')
    # Both rows share one (empty) x value so they are drawn at the same bar
    # position.
    .assign(Country='')
)

fig = px.bar(
    australia_or_not_df,
    x='Country',
    y='count',
    color='australia_or_not',
    text='australia_or_not',
)
fig.update_layout(
    showlegend=False,
    title='Australian Entrants Vs Other',
    xaxis_title='Entrants',
    yaxis_title='Number of People',
)

fig.show()
save_figure(fig, 'australia_vs_other.json')

We can see that almost 2% of race entrants registered under countries other than Australia. Here is the breakdown of where the non-Australian athletes came from:

In [None]:
# Count entrants per country, drop Australia, and order the remaining
# countries from most to fewest entrants.
others_df = (
    df.groupby(['country'])
    .size()
    .reset_index(name='count')
    .loc[lambda counts: counts['country'] != 'AUSTRALIA']
    .sort_values(by=['count'], ascending=False)
    # Every row shares the x value 'Other' so all countries are drawn at the
    # same bar position.
    .assign(Country='Other')
)

fig = px.bar(others_df, x='Country', y='count', color='country', text='country')
fig.update_layout(
    showlegend=False,
    title='A Breakdown of Other Countries',
    xaxis_title='',
    yaxis_title='Number of People',
)
fig.show()
save_figure(fig, 'other_countries_breakdown.json')

We have great representation from a large number of countries. I can figure out an athlete's country by looking at the flag displayed on the website. The flags seem to use ISO 3166-1 alpha-2 (two-letter) country codes, which let me link them to countries, but there are still 2 athletes who have a blank country in the data. I wonder if these athletes somehow managed to register without a country, or if they registered with a country that doesn't have a valid ISO code. I suppose the question "How many countries are there?" is actually very contentious, with no consensus between governments worldwide, so it makes sense that we can run into an edge case like this.

In [None]:
import plotly.graph_objects as go
from collections import defaultdict
import datetime as dt

# Finishing times grouped by event name. Each time is stored as a datetime
# offset from a fixed, arbitrary date so it can be plotted on an axis whose
# ticks are formatted as HH:MM:SS.
times = defaultdict(list)
base_date = dt.datetime(2000, 1, 1)

for row in data:
    if row['event_name'] not in EVENTS_OF_INTEREST:
        continue

    time_parts = row['time'].split(':')
    if len(time_parts) == 1:
        # Values without ':' are not finishing times; skip them.
        continue

    hours, minutes, seconds = (int(part) for part in time_parts)
    elapsed = dt.timedelta(hours=hours, minutes=minutes, seconds=seconds)
    times[row['event_name']].append(base_date + elapsed)

half_marathon_times = times['Half Marathon']

# Single histogram of the half marathon finishing times.
fig = go.Figure(data=[go.Histogram(x=half_marathon_times, name='idk')])

fig.update_layout(
    title="Number of Participants in the Half Marathon",
    yaxis_title="Number of Participants",
    # Show the x-axis ticks as HH:MM:SS.
    xaxis={"tickformat": "%H:%M:%S", 'title': 'Time (HH:MM:SS)'},
)

fig.show()

You'll see that the finishing times have a very long tail. In future time distributions, I'm going to remove the slowest 0.5% of runners. I'd rather not, but in an era where most of our phone usage is vertical, this will make the visualisations much easier to see.

In [None]:
import plotly.graph_objects as go
from collections import defaultdict
import datetime as dt
from plotly.subplots import make_subplots

# Finishing times grouped by event name and then by gender. The inner mapping
# creates a list for any gender value on demand, so a row whose gender is
# neither "Male" nor "Female" is stored instead of raising KeyError.
times = defaultdict(lambda: defaultdict(list))

for row in data:
    if row['event_name'] not in EVENTS_OF_INTEREST:
        continue

    time = row['time']
    if len(time.split(':')) == 1:
        # Values without ':' are not finishing times; skip them.
        continue

    # Store the time as a datetime offset from a fixed, arbitrary date so it
    # can be plotted on an axis whose ticks are formatted as HH:MM:SS.
    hours, minutes, seconds = time.split(':')
    time = dt.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
    time = dt.datetime(2000, 1, 1) + time
    times[row['event_name']][row['gender']].append(time)

# Drop the slowest 0.5% of each group so the long tail does not stretch the
# x axis.
for event in EVENTS_OF_INTEREST:
    for gender in ['Male', 'Female']:
        if gender not in times[event]:
            # No finishers recorded for this event/gender; nothing to trim.
            continue
        series = pd.Series(times[event][gender])
        series = series[series < series.quantile(0.995)]
        times[event][gender] = list(series)

half_marathon_times_male = times['Half Marathon']['Male']
half_marathon_times_female = times['Half Marathon']['Female']

# One histogram per gender, stacked vertically on a shared x axis.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True)
fig.add_trace(go.Histogram(
    x=half_marathon_times_male,
    name='Male',
), row=1, col=1)
fig.add_trace(go.Histogram(
    x=half_marathon_times_female,
    name='Female',
), row=2, col=1)

fig.update_layout(
    title="Number of Participants in the Half Marathon by Gender",
)
# The layout keys `xaxis`/`yaxis` only address the first (top) subplot, so
# style the axes through update_xaxes/update_yaxes instead: format the ticks of
# every x axis as HH:MM:SS, title the bottom x axis, and title both y axes.
fig.update_xaxes(tickformat="%H:%M:%S")
fig.update_xaxes(title_text="Time (HH:MM:SS)", row=2, col=1)
fig.update_yaxes(title_text="Number of Participants")

fig.show()

Finally just throw in a plot with filters for category, event, and gender, just for fun.

The race results also include everyone's names, but I've been intentionally avoiding doing too much with names, because people get a little icky about using personal information. Still, here are the top 10 most common entrant names by gender, just out of curiosity.

In [None]:
# Pair each entrant's first name (the text before the first space) with their
# gender, then count first names separately for each gender.
first_name_gender = [(row['name'].split(' ')[0], row['gender']) for row in data]
male_counter = Counter(name for name, gender in first_name_gender if gender == 'Male')
female_counter = Counter(name for name, gender in first_name_gender if gender == 'Female')

# The ten most frequent first names per gender, most common first.
top_male_names = male_counter.most_common(10)
male_x = [name for name, _ in top_male_names]
male_y = [count for _, count in top_male_names]

top_female_names = female_counter.most_common(10)
female_x = [name for name, _ in top_female_names]
female_y = [count for _, count in top_female_names]

# Separate x axes: each subplot has its own set of names.
fig = make_subplots(rows=2, cols=1, shared_xaxes=False)
fig.add_trace(go.Bar(x=male_x, y=male_y, name='Male Names'), row=1, col=1)
fig.add_trace(go.Bar(x=female_x, y=female_y, name='Female Names'), row=2, col=1)
# NOTE(review): "xaxis_title" and the 'title' inside 'xaxis' both target the
# first x axis with different values - confirm which one is intended.
names_layout = {
    "title": "10 Most Popular Named Entrants",
    "xaxis_title": "First Name",
    "yaxis_title": "Number of People",
    'xaxis': {'title': '', 'anchor': 'y1'},
    'xaxis2': {'title': 'First Name', 'anchor': 'y2'},
}
fig.update_layout(names_layout)
fig.show()
save_figure(fig, 'most_popular_names.json')