# 1 Setting up the environment

## 1.1 Requirements & packages

Make sure the environment is configured as per the README file, the requirements are installed, and the relevant libraries, packages and modules are imported.

In [35]:
!pip install -r ../requirements.txt



In [36]:
import os
import json
import requests
import sqlite3

import numpy as np
import pandas as pd 
from sqlalchemy import create_engine
from lets_plot import * # This imports all of ggplot2's functions
LetsPlot.setup_html()
import plotly.express as px
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
from raceplotly.plots import barplot

from pprint import pprint

## 1.2 Connect to the gymternet database

In [37]:
%load_ext sql
%config SqlMagic.autocommit=True

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [38]:
%sql sqlite:///../data/clean/gymternet.db --alias gymternet 
engine = create_engine('sqlite:///../data/clean/gymternet.db')

--------
# 3 EXTRA CREDIT WORK
## 3.1 Ten of the top 10 GOATs of all time

Do you think I'm only interested in 10s? Children focus on 10s. I'm interested in the truth. I'm interested in what matters. 10s are shiny, certainly, but any good person-who-can-do-basic-mathematics can see that a gymnast who gets a 10 one week and then a 5 the next week is not as useful as a gymnast that gets a 9.9 week after week.

With this in mind, let's find the gymnasts who have the highest average scores across the apparatus and across the seasons.

It's easy to do well if you never compete. For the purposes of this analysis, I am only interested in gymnasts who compete a minimum of 6 times over the course of the season (per apparatus). This will necessarily impact the data from the 2020 and 2021 seasons, which were impacted heavily by COVID restrictions.

### 3.1.1 Retrieve the data from the database

Getting the averages is straightforward enough!

In [56]:
%%sql --alias gymternet

SELECT 
    g.gymnast_id AS 'Gymnast ID',
    g.first_name AS 'First Name',
    g.last_name AS 'Last Name',
    t.team_name AS 'Team',
    AVG(r.vt_score) AS 'Vault',
    COUNT(r.vt_score) AS 'VT Count',
    AVG(r.ub_score) AS 'Uneven Bars',
    COUNT(r.ub_score) AS 'UB Count',
    AVG(r.bb_score) AS 'Balance Beam',
    COUNT(r.bb_score) AS 'BB Count',
    AVG(r.fx_score) AS 'Floor Exercise',
    COUNT(r.fx_score) AS 'FX Count',
    AVG(r.aa_score) AS 'All Around',
    COUNT(r.aa_score) AS 'AA Count',
    m.year AS 'Season'
FROM gymnast_results AS r
LEFT JOIN gymnasts AS g
ON g.gymnast_id = r.gymnast_id
LEFT JOIN teams as t
ON t.team_id = g.team_id
LEFT JOIN meets as m
ON m.meet_id = r.meet_id
GROUP BY g.gymnast_id, m.year;

Gymnast ID,First Name,Last Name,Team,Vault,VT Count,Uneven Bars,UB Count,Balance Beam,BB Count,Floor Exercise,FX Count,All Around,AA Count,Season
249,Kristina,Heymann,California,9.829166666666666,12,,0,,0,,0,,0,2015
251,Serena,Leong,California,9.839285714285714,7,,0,9.62857142857143,7,9.65,7,,0,2015
251,Serena,Leong,California,9.7625,8,,0,,0,9.771875,8,,0,2016
258,Jordan,Widener,California,,0,9.76216216216216,37,,0,,0,,0,2015
1158,Breanna,Hughes,Utah,9.825,2,9.756976744186046,43,,0,9.725,8,,0,2015
1158,Breanna,Hughes,Utah,9.88181818181818,66,9.798484848484849,66,9.630833333333332,60,9.882575757575758,66,39.19166666666667,60,2016
1642,Crystal,Paz,California,9.55,5,,0,9.275,8,9.7,10,,0,2015
2505,Georgia,Dabritz,Utah,9.90566037735849,53,9.963068181818182,66,9.70904255319149,47,9.92253787878788,66,39.48048780487805,41,2015
2508,Kailah,Delaney,Utah,9.885377358490564,53,,0,9.648484848484848,33,,0,,0,2015
2508,Kailah,Delaney,Utah,9.85483870967742,62,,0,9.425833333333332,60,9.86875,8,,0,2016


But getting the means are a giant pain with SQLite. In this case, I'll got for a maximalist approach and whittle down the data in pandas.

In [57]:
%%sql --alias gymternet

SELECT 
    g.gymnast_id AS 'Gymnast ID',
    g.first_name AS 'First Name',
    g.last_name AS 'Last Name',
    t.team_name AS 'Team',
    r.vt_score AS 'Vault',
    r.ub_score AS 'Uneven Bars',
    r.bb_score AS 'Balance Beam',
    r.fx_score AS 'Floor Exercise',
    m.year AS 'Season'
FROM gymnast_results AS r
LEFT JOIN gymnasts AS g
ON g.gymnast_id = r.gymnast_id
LEFT JOIN teams as t
ON t.team_id = g.team_id
LEFT JOIN meets as m
ON m.meet_id = r.meet_id;

Gymnast ID,First Name,Last Name,Team,Vault,Uneven Bars,Balance Beam,Floor Exercise,Season
30950,Sierra,Ballard,LSU,,,9.2,9.9,2024
30950,Sierra,Ballard,LSU,,,9.2,9.9,2024
30952,Haleigh,Bryant,LSU,9.95,9.875,9.925,9.925,2024
30952,Haleigh,Bryant,LSU,9.95,9.875,9.925,9.925,2024
31947,Ashley,Cowan,LSU,,9.8,,,2024
31947,Ashley,Cowan,LSU,,9.8,,,2024
32453,Amari,Drayton,LSU,9.925,,,9.925,2024
32453,Amari,Drayton,LSU,9.925,,,9.925,2024
30953,Olivia,Dunne,LSU,,,,9.875,2024
30953,Olivia,Dunne,LSU,,,,9.875,2024


### 2.3.2 Import the data into a dataframe

In [58]:
# Export the above query to a new df
average_scores_query = """
SELECT 
    g.gymnast_id AS 'Gymnast ID',
    g.first_name AS 'First Name',
    g.last_name AS 'Last Name',
    t.team_name AS 'Team',
    AVG(r.vt_score) AS 'Vault',
    COUNT(r.vt_score) AS 'VT Count',
    AVG(r.ub_score) AS 'Uneven Bars',
    COUNT(r.ub_score) AS 'UB Count',
    AVG(r.bb_score) AS 'Balance Beam',
    COUNT(r.bb_score) AS 'BB Count',
    AVG(r.fx_score) AS 'Floor Exercise',
    COUNT(r.fx_score) AS 'FX Count',
    AVG(r.aa_score) AS 'All Around',
    COUNT(r.aa_score) AS 'AA Count',
    m.year AS 'Season'
FROM gymnast_results AS r
LEFT JOIN gymnasts AS g
ON g.gymnast_id = r.gymnast_id
LEFT JOIN teams as t
ON t.team_id = g.team_id
LEFT JOIN meets as m
ON m.meet_id = r.meet_id
GROUP BY g.gymnast_id, m.year;
"""

# Execute the query and store the result in a DataFrame
average_scores_df = pd.read_sql_query(average_scores_query, engine)

# Preview the df
average_scores_df.head()

Unnamed: 0,Gymnast ID,First Name,Last Name,Team,Vault,VT Count,Uneven Bars,UB Count,Balance Beam,BB Count,Floor Exercise,FX Count,All Around,AA Count,Season
0,249,Kristina,Heymann,California,9.829167,12,,0,,0,,0,,0,2015
1,251,Serena,Leong,California,9.839286,7,,0,9.628571,7,9.65,7,,0,2015
2,251,Serena,Leong,California,9.7625,8,,0,,0,9.771875,8,,0,2016
3,258,Jordan,Widener,California,,0,9.762162,37,,0,,0,,0,2015
4,1158,Breanna,Hughes,Utah,9.825,2,9.756977,43,,0,9.725,8,,0,2015


In [59]:
# Export the above query to a new df
median_scores_query = """
SELECT 
    g.gymnast_id AS 'Gymnast ID',
    g.first_name AS 'First Name',
    g.last_name AS 'Last Name',
    t.team_name AS 'Team',
    r.vt_score AS 'Vault',
    r.ub_score AS 'Uneven Bars',
    r.bb_score AS 'Balance Beam',
    r.fx_score AS 'Floor Exercise',
    m.year AS 'Season'
FROM gymnast_results AS r
LEFT JOIN gymnasts AS g
ON g.gymnast_id = r.gymnast_id
LEFT JOIN teams as t
ON t.team_id = g.team_id
LEFT JOIN meets as m
ON m.meet_id = r.meet_id;
"""

# Execute the query and store the result in a DataFrame
median_scores_df = pd.read_sql_query(median_scores_query, engine)

# Preview the df
median_scores_df.head()

Unnamed: 0,Gymnast ID,First Name,Last Name,Team,Vault,Uneven Bars,Balance Beam,Floor Exercise,Season
0,30950,Sierra,Ballard,LSU,,,9.2,9.9,2024
1,30950,Sierra,Ballard,LSU,,,9.2,9.9,2024
2,30952,Haleigh,Bryant,LSU,9.95,9.875,9.925,9.925,2024
3,30952,Haleigh,Bryant,LSU,9.95,9.875,9.925,9.925,2024
4,31947,Ashley,Cowan,LSU,,9.8,,,2024


### 2.3.3 Prepare the data for plotting

In each season, a GOAT candidate must have competed at least 6 times per apparatus (or All Around) of candidacy.

I could go ahead and exclude all rows where `VT Count`, `UB Count`, `BB Count`, `FX Count` AND `AA Count` ALL are less than 6, but I think it's probably going to be easier to work with dataframes specific to the apparatus. This gives me an opportunity to limit the DataFrames to the top-20 in each apparatus as well.

In [60]:
# Before we start, join First Name and Last Name into new column, Name
average_scores_df['Name'] = average_scores_df['First Name'] + ' ' + average_scores_df['Last Name']

#Drop first name and last name columns
average_scores_df = average_scores_df.drop(columns=['First Name', 'Last Name'])

# First up, the VT DF

# Drop any rows where a gymnast has performed fewer than 6 vaults
vault_average_df = average_scores_df[(average_scores_df['VT Count'] >= 6)]

# Sort by average vault score in descending order
vault_average_df = vault_average_df.sort_values(by='Vault', ascending=False)

# Get the first 20 unique values in the 'Gymnast ID' column
top_20_ave_vaulters = vault_average_df['Gymnast ID'].drop_duplicates().head(20).tolist()

# Filter the original DataFrame to include all rows with these unique values
vault_average_df = vault_average_df[vault_average_df['Gymnast ID'].isin(top_20_ave_vaulters)]

# Preview the df
vault_average_df.head()

Unnamed: 0,Gymnast ID,Team,Vault,VT Count,Uneven Bars,UB Count,Balance Beam,BB Count,Floor Exercise,FX Count,All Around,AA Count,Season,Name
6941,30055,Oklahoma,9.956818,11,9.867857,21,9.853846,13,,0,,0,2020,Olivia Trautman
6944,30055,Oklahoma,9.954717,53,9.916509,53,9.851866,67,,0,,0,2023,Olivia Trautman
9166,30952,LSU,9.948529,51,9.857843,51,9.676163,43,9.712255,51,39.205233,43,2021,Haleigh Bryant
4640,24714,Oklahoma,9.948148,27,9.953448,29,9.858333,27,9.938889,18,39.738889,18,2020,Maggie Nichols
1215,23070,Stanford,9.938837,62,9.828302,53,9.9,6,9.878947,19,,0,2015,Elizabeth Price


In [61]:
# Next, the UB DF

# Drop any rows where a gymnast has performed fewer than 6 bars routines
bars_average_df = average_scores_df[(average_scores_df['UB Count'] >= 6)]

# Sort by average bars score in descending order
bars_average_df = bars_average_df.sort_values(by='Uneven Bars', ascending=False)

# Get the first 20 unique values in the 'Gymnast ID' column
top_20_ave_barsers = bars_average_df['Gymnast ID'].drop_duplicates().head(20).tolist()

# Filter the original DataFrame to include all rows with these unique values
bars_average_df = bars_average_df[bars_average_df['Gymnast ID'].isin(top_20_ave_barsers)]

# Preview the df
bars_average_df.head()

Unnamed: 0,Gymnast ID,Team,Vault,VT Count,Uneven Bars,UB Count,Balance Beam,BB Count,Floor Exercise,FX Count,All Around,AA Count,Season,Name
10870,31789,UCLA,9.906818,77,9.977597,77,9.817532,77,9.957143,77,39.659091,77,2023,Jordan Chiles
7,2505,Utah,9.90566,53,9.963068,66,9.709043,47,9.922538,66,39.480488,41,2015,Georgia Dabritz
9904,31267,Auburn,9.9,20,9.956818,22,9.897727,22,9.894444,18,39.705556,18,2023,Sunisa Lee
569,22717,Florida,9.937245,49,9.955455,55,9.893056,36,9.825833,30,39.6125,22,2015,Bridget Sloan
4639,24714,Oklahoma,9.93625,40,9.955,60,9.916379,58,9.925,16,39.721875,16,2019,Maggie Nichols


In [62]:
# Next, the BB DF

# Drop any rows where a gymnast has performed fewer than 6 beam routines
beam_average_df = average_scores_df[(average_scores_df['BB Count'] >= 6)]

# Sort by average beam score in descending order
beam_average_df = beam_average_df.sort_values(by='Balance Beam', ascending=False)

# Get the first 20 unique values in the 'Gymnast ID' column
top_20_ave_beamers = beam_average_df['Gymnast ID'].drop_duplicates().head(20).tolist()

# Filter the original DataFrame to include all rows with these unique values
beam_average_df = beam_average_df[beam_average_df['Gymnast ID'].isin(top_20_ave_beamers)]

# Preview the df
beam_average_df.head()

Unnamed: 0,Gymnast ID,Team,Vault,VT Count,Uneven Bars,UB Count,Balance Beam,BB Count,Floor Exercise,FX Count,All Around,AA Count,Season,Name
8151,30545,Utah,9.845,55,9.900649,77,9.967532,77,9.917063,63,39.638182,55,2023,Maile O'Keefe
4638,24714,Oklahoma,9.92,60,9.947177,62,9.954032,62,9.94496,62,39.770602,54,2018,Maggie Nichols
8741,30786,Alabama,9.866667,48,9.924185,46,9.950815,46,9.932143,42,39.704762,42,2021,Luisa Blanco
1359,23140,UCLA,,0,9.843939,66,9.949414,64,,0,,0,2018,Peng-Peng Lee
7949,30466,Missouri,9.837924,59,9.784322,59,9.945339,59,9.85572,59,39.423305,59,2022,Sienna Schreiber


In [63]:
# Next, the FX DF

# Drop any rows where a gymnast has performed fewer than 6 floor routines
floor_average_df = average_scores_df[(average_scores_df['FX Count'] >= 6)]

# Sort by average beam score in descending order
floor_average_df = floor_average_df.sort_values(by='Floor Exercise', ascending=False)

# Get the first 20 unique values in the 'Gymnast ID' column
top_20_ave_floorers = floor_average_df['Gymnast ID'].drop_duplicates().head(20).tolist()

# Filter the original DataFrame to include all rows with these unique values
floor_average_df = floor_average_df[floor_average_df['Gymnast ID'].isin(top_20_ave_floorers)]

# Preview the df
floor_average_df.head()

Unnamed: 0,Gymnast ID,Team,Vault,VT Count,Uneven Bars,UB Count,Balance Beam,BB Count,Floor Exercise,FX Count,All Around,AA Count,Season,Name
6643,29946,Florida,9.930952,63,9.949603,63,9.94316,53,9.992857,49,39.830053,47,2022,Trinity Thomas
3513,24237,UCLA,9.7875,4,,0,9.927381,63,9.964796,49,,0,2019,Katelyn Ohashi
6641,29946,Florida,9.876316,19,9.847368,19,9.928947,19,9.959211,19,39.611842,19,2020,Trinity Thomas
9226,30973,Minnesota,9.894318,66,9.836742,66,,0,9.957787,61,,0,2022,Mya Hooten
6940,30055,Oklahoma,9.879508,61,9.875,2,9.873305,59,9.957787,61,39.725,2,2019,Olivia Trautman


In [64]:
# Finally, the AA DF

# Drop any rows where a gymnast has performed fewer than 6 floor routines
aa_average_df = average_scores_df[(average_scores_df['AA Count'] >= 6)]

# Sort by average beam score in descending order
aa_average_df = aa_average_df.sort_values(by='All Around', ascending=False)

# Get the first 20 unique values in the 'Gymnast ID' column
top_20_ave_aaers = aa_average_df['Gymnast ID'].drop_duplicates().head(20).tolist()

# Filter the original DataFrame to include all rows with these unique values
aa_average_df = aa_average_df[aa_average_df['Gymnast ID'].isin(top_20_ave_aaers)]

 

# Preview the df
aa_average_df.head()

aa_average_df.sort_values(by='All Around', ascending=True)

Unnamed: 0,Gymnast ID,Team,Vault,VT Count,Uneven Bars,UB Count,Balance Beam,BB Count,Floor Exercise,FX Count,All Around,AA Count,Season,Name
3379,24174,LSU,9.838679,53,9.766393,61,9.820082,61,9.761111,9,38.875000,9,2016,Sarah Finnegan
9166,30952,LSU,9.948529,51,9.857843,51,9.676163,43,9.712255,51,39.205233,43,2021,Haleigh Bryant
7856,30433,Kentucky,9.725000,18,9.800000,18,9.815909,22,9.884091,22,39.231250,16,2020,Raena Worley
8148,30545,Utah,9.775000,12,9.760000,20,9.887500,20,9.853125,16,39.240000,10,2020,Maile O'Keefe
10869,31789,UCLA,9.853261,46,9.796136,55,9.607500,40,9.917857,49,39.306250,40,2022,Jordan Chiles
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4640,24714,Oklahoma,9.948148,27,9.953448,29,9.858333,27,9.938889,18,39.738889,18,2020,Maggie Nichols
6644,29946,Florida,9.908036,56,9.951786,56,9.915556,45,8.136585,41,39.741667,24,2023,Trinity Thomas
9332,31018,Oregon State,9.928571,56,9.942857,56,9.934375,56,9.949554,56,39.755357,56,2022,Jade Carey
4638,24714,Oklahoma,9.920000,60,9.947177,62,9.954032,62,9.944960,62,39.770602,54,2018,Maggie Nichols


In [65]:
merged_ave_df = pd.concat([vault_average_df, bars_average_df, beam_average_df, floor_average_df, aa_average_df])

# Preview the df
merged_ave_df.head()

# Group on gymnast_id, create new column with list of seasons, and average the scores
merged_ave_df = merged_ave_df.groupby('Gymnast ID').agg({'Vault': 'mean', 'Uneven Bars': 'mean', 'Balance Beam': 'mean', 'Floor Exercise': 'mean', 'All Around': 'mean', 'Season': lambda x: list(x)}).reset_index()
# Preview the df
merged_ave_df.head()

Unnamed: 0,Gymnast ID,Vault,Uneven Bars,Balance Beam,Floor Exercise,All Around,Season
0,2505,9.90566,9.963068,9.709043,9.922538,39.480488,[2015]
1,2518,9.923077,9.855769,9.765385,8.459615,38.003846,[2015]
2,22713,9.892308,9.730288,9.870385,9.947885,39.411413,[2015]
3,22715,9.92348,9.898766,9.855629,9.837291,39.535795,"[2017, 2018, 2015, 2016, 2018, 2017, 2015, 201..."
4,22717,9.912853,9.933051,9.85625,9.846713,39.534783,"[2015, 2016, 2015, 2016, 2015, 2016]"


In [66]:
my_raceplot = barplot(aa_average_df,
                      item_column='Name',
                      value_column='All Around',
                      time_column='Season')

my_raceplot.plot(title = "Top AA'ers over time",
                 item_label = 'Gymnast',
                 value_label = 'Ave. Score',
                 frame_duration = 800)

# Who got the highest average/median of all time
# Who got the highest average/median each year

# Which teams own the most goats?

### 2.3.4 Prepare the plots

In [67]:
aa_average_df_sorted = aa_average_df.sort_values(by='All Around', ascending=False)

aa_average_df_sorted['All Around'].max()

39.83005319148936

In [68]:
# Ensure all gymnasts are included in every season
all_gymnasts = aa_average_df['Name'].unique()
all_seasons = aa_average_df['Season'].unique()
aa_average_df_full = aa_average_df.set_index(['Name', 'Season']).unstack(fill_value=0).stack().reset_index()

# Define the range of the data
min_score = 38.75
max_score = 39.85

# Plot with color scaling
top_aaers_ave = px.bar(aa_average_df_full, 
                       y="Name", 
                       x="All Around", 
                       animation_frame="Season", 
                       color="All Around", 
                       hover_name="All Around",
                       range_x=[38.75, 39.85],  # Set the x-axis range
                       range_color=[min_score, max_score],  # Set the color scale range to 9-10
                       color_continuous_scale=px.colors.sequential.Viridis,
                       opacity=0.8
                      )

# Customize the layout
top_aaers_ave.update_layout(
    title="Best (on average) All Around Gymnasts",
    title_font=dict(size=12, family='Helvetica', color='black'),
    yaxis_title="Gymnast",
    yaxis_title_font=dict(size=12, family='Helvetica', color='black'),
    xaxis_title="Score",
    xaxis_title_font=dict(size=12, family='Helvetica', color='black'),
    font=dict(size=10, family='Helvetica', color='black'),
    plot_bgcolor='white',  # Set plot background to white
    paper_bgcolor='white',  # Set paper background to white
    xaxis=dict(
        gridcolor='#EEEEEE'  # Set x-axis grid lines to light grey
    ),
    yaxis=dict(
        gridcolor='#EEEEEE'  # Set y-axis grid lines to light grey
    ),
    coloraxis_colorbar=dict(
        title="Score",
        tickvals=[],  # Hide tick values
    ),
    legend=dict(
        orientation="h",  # horizontal legend
        yanchor="bottom",  
        y=-1,  
        xanchor="center",  
        x=0.5  
    )
)

# Export the plot to html file
top_aaers_ave.write_html("../docs/figures/06.top_aaers_per_year.html")

# Show the plot
top_aaers_ave.show()




