In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
%matplotlib inline
from datetime import datetime
import re
#from nssstats.plots import std_plot
#from nssstats.plots import iqr_plot
#from nssstats.plots import quadrant_plot, half_plot
#from ipywidgets import interact, FloatSlider

In [None]:
sprinters = pd.read_csv("../data/Worlds_Fastest_Sprinters_Stats.csv")

# **Data** **Basics**

In [None]:
sprinters.head()

In [None]:
# Call the method: a bare `sprinters.info` only shows the bound-method repr
# instead of the column / dtype / non-null summary.
sprinters.info()

In [None]:
sprinters.shape

In [None]:
print(sprinters.dtypes)


In [None]:
sprinters.describe()


In [None]:
# Pairwise correlations between the numeric columns. The method has to be
# called, and numeric_only=True skips text columns such as 'Athlete'/'Country'.
sprinters.corr(numeric_only=True)


In [None]:
sprinters.isnull().sum()


# **General** **EDA**

Let's make a column for the total PR time

In [None]:
# Combined personal-record time over the 100m, 200m and 400m, rounded to 2 decimals.
# A missing PR in any event leaves the total as NaN.
sprinters['Total_Time_PRs'] = (
    sprinters['100_PR']
    .add(sprinters['200_PR'])
    .add(sprinters['400_PR'])
    .round(2)
)
sprinters.head()

Let's make a column for the total career average time

In [None]:
# Combined average season-best time over the 100m, 200m and 400m, rounded to 2 decimals.
# A missing average in any event leaves the total as NaN.
sprinters['Total_Time_Career_Avg'] = (
    sprinters['Avg Season Best 100m']
    .add(sprinters['Avg Season Best 200m'])
    .add(sprinters['Avg Season Best 400m'])
    .round(2)
)
sprinters.head()

Let's make a column for the actual age of the athletes

In [None]:
# Ensure the DOB column is in datetime format; invalid dates become NaT
sprinters['DOB'] = pd.to_datetime(sprinters['DOB'], errors='coerce')

# Reference date for the age calculation
today = datetime.now()
current_year = today.year

# Age in completed years: subtract one when this year's birthday has not
# happened yet (a plain year difference overstates those ages by one).
dob = sprinters['DOB']
birthday_passed = (dob.dt.month < today.month) | (
    (dob.dt.month == today.month) & (dob.dt.day <= today.day)
)
sprinters['Age'] = current_year - dob.dt.year - (~birthday_passed).astype(int)

# Rows with an unparseable DOB (NaT) keep a NaN age.
# NOTE(review): this is age as of today; it does not account for athletes
# who are no longer living ('Status' column) - confirm that is intended.
print(sprinters[['DOB', 'Age']].head())


Let's make a column for each athlete's average top-25 all-time rank

In [None]:
sprinters['avg_top25_rnk'] = sprinters[['T25_100_AT_RK_NUM', 'T25_200_AT_RK_NUM', 'T25_400_AT_RK_NUM']].mean(axis=1)
sprinters

*Let's make a few dataframes regarding the athletes' times*

Data Frame 1 (PRs)

In [None]:
sprinters_pr = sprinters[['Athlete','100_PR','200_PR','400_PR','Total_Time_PRs']]
sprinters_pr

In [None]:
sprinters_pr = sprinters_pr.sort_values(by='Total_Time_PRs', ascending=False)
sprinters_pr

In [None]:
sprinters_pr = pd.DataFrame(sprinters_pr)

Let's make a dataframe without null values in the PR

In [None]:
sprinters_pr_clean = sprinters_pr.dropna(subset=['100_PR','200_PR','400_PR'])
sprinters_pr_clean

In [None]:
sprinters_pr_clean = pd.DataFrame(sprinters_pr_clean)
sprinters_pr_clean.head(3)

In [None]:
sprinters_pr_clean.shape

Only 41 athletes (60%) have competed in all three events

In [None]:
sprinters_pr_clean = sprinters_pr_clean.sort_values(by='Total_Time_PRs')
sprinters_pr_clean

In [None]:
# Top 10 athletes by combined PR time for the figure-factory table.
# Sorting here keeps the cell self-contained; .head(10) keeps the fastest
# athlete (the old [1:10] slice dropped the leader and returned nine rows).
sprinters_pr_clean_sample = sprinters_pr_clean.sort_values(by='Total_Time_PRs').head(10)

# Customize colors (pass colorscale=colorscale to ff.create_table)
#colorscale = [[0, '#4d004c'],[.5, '#f2e5ff'],[1, '#ffffff']]
# Customize font colors (pass font_colors=font to ff.create_table)
#font=['#FCFCFC', '#00EE00', '#008B00', '#004F00', '#660000', '#CD0000', '#FF3030']

table_data = sprinters_pr_clean

# TODO: optionally pair the table with a chart. The earlier draft (kept as dead
# string literals) sketched fig.add_trace(go.Scatter(...)) / go.Bar(...) traces on
# a second axis pair (xaxis='x2', yaxis='y2') for '100_PR', '200_PR', '400_PR'
# and 'Total_Time_PRs', followed by fig.update_layout(...) to set the domains.
# It needs `import plotly.graph_objects as go` and quoted column references.

fig = ff.create_table(sprinters_pr_clean_sample)
fig.show()

#fig.write_html("sprinters_pr_top10_ff.html")
#fig.write_image("sprinters_pr_top10_ff.svg")


In [None]:
#Let's make the entire dataframe a figure factory table


fig =  ff.create_table(sprinters_pr_clean)
fig.show()

#fig.write_html("sprinters_pr_ff.html")
#fig.write_image("sprinters_pr_ff.svg")

***Note***:

If you go by Personal Records, Michael Johnson is the fastest short sprinter with the lowest total time.

Even with Usain Bolt's world record times in the 100m & 200m, his 400m PR causes him to fall all the way down to 10th.

Data Frame 2 (Career Average)

In [None]:
sprinters_career_avg = sprinters[['Athlete','Avg Season Best 100m','Avg Season Best 200m','Avg Season Best 400m','Total_Time_Career_Avg']]
sprinters_career_avg

In [None]:
sprinters_career_avg = sprinters_career_avg.sort_values(by='Total_Time_Career_Avg', ascending=False)
sprinters_career_avg

In [None]:
sprinters_career_avg = pd.DataFrame(sprinters_career_avg)

Let's make a dataframe without null values in the Average Times

In [None]:
sprinters_career_avg_clean = sprinters_career_avg.dropna(subset=['Avg Season Best 100m','Avg Season Best 200m','Avg Season Best 400m'])
sprinters_career_avg_clean

In [None]:
sprinters_career_avg_clean = pd.DataFrame(sprinters_career_avg_clean)
sprinters_career_avg_clean.head(3)

In [None]:
sprinters_career_avg_clean.shape

In [None]:
sprinters_career_avg_clean = sprinters_career_avg_clean.sort_values(by='Total_Time_Career_Avg')
sprinters_career_avg_clean

In [None]:
# Top 10 athletes by combined career-average time for the figure-factory table.
# Sorting here keeps the cell self-contained; .head(10) keeps the fastest
# athlete (the old [1:10] slice dropped the leader and returned nine rows).
sprinters_career_avg_clean_sample = (
    sprinters_career_avg_clean.sort_values(by='Total_Time_Career_Avg').head(10)
)

# Customize colors (pass colorscale=colorscale to ff.create_table)
#colorscale = [[0, '#4d004c'],[.5, '#f2e5ff'],[1, '#ffffff']]
# Customize font colors (pass font_colors=font to ff.create_table)
#font=['#FCFCFC', '#00EE00', '#008B00', '#004F00', '#660000', '#CD0000', '#FF3030']

table_data = sprinters_career_avg_clean

# TODO: optionally pair the table with a chart. The earlier draft (kept as dead
# string literals) sketched fig.add_trace(go.Scatter(...)) / go.Bar(...) traces on
# a second axis pair (xaxis='x2', yaxis='y2') for the per-event columns and
# 'Total_Time_Career_Avg', followed by fig.update_layout(...) to set the domains.
# It needs `import plotly.graph_objects as go` and quoted column references.

fig = ff.create_table(sprinters_career_avg_clean_sample)
fig.show()

#fig.write_html("sprinters_career_avg_top10_ff.html")
#fig.write_image("sprinters_career_avg_top10_ff.svg")


In [None]:
#Let's make the entire dataframe a figure factory table


fig =  ff.create_table(sprinters_career_avg_clean)
fig.show()

#fig.write_html("sprinters_career_avg_ff.html")
#fig.write_image("sprinters_career_avg_ff.svg")

***Note***:

If you go by total career average across all three events, Michael Johnson is the fastest short sprinter with the lowest total average time.

Once again, Usain Bolt's slower times in the 400m cause him to drop down to 9th, even though he has the fastest average 100m & 200m (see code / analysis below).

Data Frame 3 (PRs & Career Average)

In [None]:
sprint_career = sprinters[['Athlete','Avg Season Best 100m','Avg Season Best 200m','Avg Season Best 400m', '100_PR','200_PR','400_PR', 'Total_Time_PRs', 'Total_Time_Career_Avg', 'avg_top25_rnk']]
sprint_career

In [None]:
sprint_career = pd.DataFrame(sprint_career)

*Let's make a data frame for Top 25 All-Time Career Rankings*

In [None]:
sprinters_alltime_rnk = sprinters[['Athlete','Country','DOB','Age', 'T25_100_AT_RK_NUM', 'T25_200_AT_RK_NUM', 'T25_400_AT_RK_NUM','avg_top25_rnk']]
sprinters_alltime_rnk.head(3)

Let's create a dataframe without any null values

In [None]:
sprinters_alltime_rnk_clean = sprinters_alltime_rnk.dropna(subset=['T25_100_AT_RK_NUM', 'T25_200_AT_RK_NUM', 'T25_400_AT_RK_NUM','avg_top25_rnk'])
sprinters_alltime_rnk_clean

In [None]:
sprinters_alltime_rnk_clean.shape

In [None]:
sprinters_alltime_rnk_clean = sprinters_alltime_rnk_clean.sort_values(by='avg_top25_rnk')
sprinters_alltime_rnk_clean

*Let's make a dataframe for each event*

Let's load a second dataset so we can join the number of seasons to each event's dataframe (years competed in each event).

In [None]:
sprinters_df2 = pd.read_csv("../data/Worlds_Fastest_Sprinters_Master_List_Yearly_Progression.csv")
sprinters_df2.head(3)

In [None]:
# Count the yearly-progression rows per athlete for the 100m; the count is
# stored as 'Years' and later used as seasons competed.
is_100m = sprinters_df2['Event'] == '100m'
yrs_competed_100m = (
    sprinters_df2[is_100m]
    .groupby('Athlete')
    .size()
    .reset_index(name='Years')
)

In [None]:
yrs_competed_100m = yrs_competed_100m.sort_values(by='Years', ascending=False)
yrs_competed_100m.head()

In [None]:
# Count the yearly-progression rows per athlete for the 200m; the count is
# stored as 'Years' and later used as seasons competed.
is_200m = sprinters_df2['Event'] == '200m'
yrs_competed_200m = (
    sprinters_df2[is_200m]
    .groupby('Athlete')
    .size()
    .reset_index(name='Years')
)

In [None]:
yrs_competed_200m = yrs_competed_200m.sort_values(by='Years', ascending=False)
yrs_competed_200m.head()

In [None]:
# Count the yearly-progression rows per athlete for the 400m; the count is
# stored as 'Years' and later used as seasons competed.
is_400m = sprinters_df2['Event'] == '400m'
yrs_competed_400m = (
    sprinters_df2[is_400m]
    .groupby('Athlete')
    .size()
    .reset_index(name='Years')
)

In [None]:
yrs_competed_400m = yrs_competed_400m.sort_values(by='Years', ascending=False)
yrs_competed_400m.head()

100m

In [None]:
# 100m-specific columns of the master table. .copy() makes this an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df_100m = sprinters[['Athlete', 'Country','Continent','Status', 'DOB','Year Born','Month Born','Decade Born','Avg Season Best 100m','100_PR','T25_100_All_Time_Rank','T25_100_AT_RK_NUM']].copy()
df_100m

In [None]:
# Gap between career-average season best and PR in the 100m (2 decimals).
# .assign returns a new frame, so nothing is written onto a slice of `sprinters`.
df_100m = df_100m.assign(
    Avg_100m_PR_Diff=(df_100m['Avg Season Best 100m'] - df_100m['100_PR']).round(2)
)
df_100m.head(3)

In [None]:
df_100m = pd.merge(df_100m, yrs_competed_100m, on=['Athlete'],how='left')
df_100m.head(3)

In [None]:
df_100m = df_100m.rename(columns={'Years': 'Years_Competed_100m'})
df_100m.head(3)

In [None]:
#df_100m = df_100m.sort_values(by='Avg Season Best 100m', ascending=False)
#df_100m

In [None]:
df_100m = df_100m.sort_values(by='Avg Season Best 100m')
df_100m

**Top 5 Career Averages in the 100m (Rounded to the nearest hundredth)**



1.   Usain Bolt: 9.80 (11 Seasons)
2.   Christian Coleman: 9.93 (9 Seasons)
3.   Justin Gatlin: 9.94 (17 Seasons)
4.   Asafa Powell: 9.99 (20 Seasons)
5.   Ferdinand Omanyala: 10.00 (6 Seasons)








In [None]:
# @title Year Born 100m


df_100m['Year Born'].plot(kind='hist', bins=20, title='Year Born')
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('year_born_100.png', format='png', dpi=300)
#plt.savefig('year_born_100.jpg', format='jpg', dpi=300)

In [None]:
# @title Year Born vs Avg Season Best 100m

df_100m.plot(kind='scatter', x='Year Born', y='Avg Season Best 100m', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('year_born_avg_100.png', format='png', dpi=300)
#plt.savefig('year_born_avg_100.jpg', format='jpg', dpi=300)

In [None]:
# @title Avg Season Best 100m vs 100_PR

df_100m.plot(kind='scatter', x='Avg Season Best 100m', y='100_PR', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('avg_100__vs_pr.png', format='png', dpi=300)
#plt.savefig('avg_100_vs_pr.jpg', format='jpg', dpi=300)

Version 2

In [None]:
# Plot the 100m frame built above. The template line `df_100m = px.data.iris()`
# is removed: it replaced the sprinter data with the iris sample, which has
# none of these columns and broke every later 100m cell.
fig = px.scatter(df_100m, x="Avg Season Best 100m", y="100_PR")
fig.show()

#fig.write_html("100m_avg_vs_pr_scatter.html")
#fig.write_image("100m_avg_vs_pr_scatter.svg")

Version 3

In [None]:
# Use the sprinter frame (not px.data.iris()). Rows with a missing average or PR
# are dropped first because plotly rejects NaN values in the marker `size` column.
scatter_100m = df_100m.dropna(subset=['Avg Season Best 100m', '100_PR'])
fig = px.scatter(scatter_100m, x="Avg Season Best 100m", y="100_PR", color="Country",
                 size='Avg Season Best 100m', hover_data=['100_PR'])
fig.show()

#fig.write_html("100m_avg_vs_pr_scatter_2.html")
#fig.write_image("100m_avg_vs_pr_scatter_2.svg")

Version 4 (With Error Bars)

In [None]:
# Error bars of 1% of the career-average time, drawn on the sprinter frame
# (the px.data.iris() template line is removed; it clobbered df_100m).
df_100m["e"] = df_100m["Avg Season Best 100m"]/100
fig = px.scatter(df_100m, x="Avg Season Best 100m", y="100_PR", color="Country",
                 error_x="e", error_y="e")
fig.show()

#fig.write_html("100m_avg_vs_pr_scatter_3.html")
#fig.write_image("100m_avg_vs_pr_scatter_3.svg")

Version 5 (Using Dash)

In [None]:
from dash import Dash, dcc, html, Input, Output

In [None]:


# Slider bounds come from the data (rounded outward to 0.1 s) so the filter
# spans the real range of 100m averages instead of the iris template's 0-2.5.
avg_100m_min = round(float(np.floor(df_100m['Avg Season Best 100m'].min() * 10) / 10), 1)
avg_100m_max = round(float(np.ceil(df_100m['Avg Season Best 100m'].max() * 10) / 10), 1)

app = Dash(__name__)


app.layout = html.Div([
    html.H4('Career Average 100m vs 100m PR'),
    dcc.Graph(id="scatter-plot"),
    html.P("Filter by Career Average 100m:"),
    dcc.RangeSlider(
        id='range-slider',
        min=avg_100m_min, max=avg_100m_max, step=0.05,
        marks={avg_100m_min: f'{avg_100m_min:.1f}', avg_100m_max: f'{avg_100m_max:.1f}'},
        value=[avg_100m_min, avg_100m_max]
    ),
])


@app.callback(
    Output("scatter-plot", "figure"),
    Input("range-slider", "value"))
def update_bar_chart(slider_range):
    """Redraw the 100m scatter for the selected range of career averages.

    Args:
        slider_range: two-element [low, high] value from the RangeSlider.

    Returns:
        A plotly figure of 'Avg Season Best 100m' vs '100_PR' for athletes
        whose career average lies within [low, high] (inclusive).
    """
    low, high = slider_range
    # Rows with missing values are dropped: plotly rejects NaN in the `size` column.
    plot_df = df_100m.dropna(subset=['Avg Season Best 100m', '100_PR'])
    mask = plot_df['Avg Season Best 100m'].between(low, high)
    fig = px.scatter(
        plot_df[mask], x="Avg Season Best 100m", y="100_PR",
        color="Country", size='Avg Season Best 100m',
        hover_data=['100_PR'])
    return fig


# Dash 3 removed app.run_server; app.run is the supported entry point.
app.run(debug=True)

#fig.write_html("100m_avg_vs_pr_scatter_dash.html")
#fig.write_image("100m_avg_vs_pr_scatter_dash.svg")

# ***Statistical Analysis 100m***

In [None]:
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression

Linear Regression

In [None]:
# Placeholder for consistency: currently just a copy of the career average.
# Replace with the per-season standard deviation once yearly data is joined in.
df_100m['consistency'] = df_100m['Avg Season Best 100m']

### 1. Regression Analysis ###
# Linear regression: career-average time explained by years competed and PR
feature_cols = ['Years_Competed_100m', '100_PR']
target_col = 'Avg Season Best 100m'

# LinearRegression cannot handle NaN, so fit only on athletes with complete 100m data
complete_rows = df_100m[feature_cols + [target_col]].notna().all(axis=1)
X = df_100m.loc[complete_rows, feature_cols]
y = df_100m.loc[complete_rows, target_col]

# Fit model
model = LinearRegression()
model.fit(X, y)

# Predictions and residuals (left as NaN for athletes excluded from the fit)
df_100m['predicted_time'] = np.nan
df_100m.loc[complete_rows, 'predicted_time'] = model.predict(X)
df_100m['residuals'] = df_100m[target_col] - df_100m['predicted_time']

print("Regression coefficients (slope):", model.coef_)
print("Intercept:", model.intercept_)


Z-Score Standardization

In [None]:
# Z-scores for average time, years competed and PR. nan_policy='omit' ignores
# missing values when computing mean/std, so one NaN no longer turns the whole
# column into NaN (rows with a missing value still get a NaN z-score).
df_100m['z_time'] = zscore(df_100m['Avg Season Best 100m'], nan_policy='omit')
df_100m['z_years'] = zscore(df_100m['Years_Competed_100m'], nan_policy='omit')
df_100m['z_PR'] = zscore(df_100m['100_PR'], nan_policy='omit')

# Mean of the three z-scores. The sum is parenthesised so all three terms are
# divided by 3 (previously only z_PR was), matching the 200m and 400m sections.
# NOTE(review): lower is better for the two time z-scores but higher is better
# for z_years - confirm whether z_years should be sign-flipped before averaging.
df_100m['z_combined'] = (df_100m['z_time'] + df_100m['z_years'] + df_100m['z_PR']) / 3

Efficiency / Ratio Analysis

In [None]:
# Efficiency score (average_time per year competed)
df_100m['efficiency_score'] = df_100m['Avg Season Best 100m'] / df_100m['Years_Competed_100m']

# Efficiency score: How close the sprinter's average time is to their personal best
df_100m['efficiency_score_pr'] = df_100m['100_PR'] / df_100m['Avg Season Best 100m']

Ranking System

In [None]:
# Rank each athlete on four criteria, then average the ranks into a final score.
# Each entry maps: output rank column -> (source column, ascending?)
rank_specs_100m = {
    'rank_personal_best': ('100_PR', True),                 # lower PR is better
    'rank_average_time': ('Avg Season Best 100m', True),    # lower average is better
    'rank_consistency': ('consistency', True),              # lower spread is better
    'rank_years_competed': ('Years_Competed_100m', False),  # longer careers are better
}
for rank_col, (source_col, lower_is_better) in rank_specs_100m.items():
    df_100m[rank_col] = df_100m[source_col].rank(ascending=lower_is_better)

# Final ranking: mean of the four individual ranks
df_100m['final_rank'] = df_100m[list(rank_specs_100m)].mean(axis=1)

Scatter Plot Visualization

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Years_Competed_100m', y='Avg Season Best 100m', data=df_100m, s=100, hue='final_rank', palette='coolwarm')
plt.title('Years Competed vs. Career Average 100m Time')
plt.xlabel('Years Competed')
plt.ylabel('Average 100m Time (s)')
plt.show()

#plt.savefig('avg_100_vs_yrs_competed_ranked.png', format='png', dpi=300)
#plt.savefig('avg_100_vs_yrs_competed_ranked.jpg', format='jpg', dpi=300)

Interactive Scatter Plot

In [None]:
# @title Years Competed vs. Career Average 100m Time

# Plot the ranked 100m frame (the px.data.iris() template line is removed; it
# wiped out every column computed above). Rows with a missing value are dropped
# because plotly rejects NaN in the marker `size` column.
ranked_100m = df_100m.dropna(subset=['Years_Competed_100m', 'Avg Season Best 100m'])
fig = px.scatter(ranked_100m, x="Years_Competed_100m", y="Avg Season Best 100m", color="final_rank",
                 size='Years_Competed_100m', hover_data=['Avg Season Best 100m']) #Potentially switch out career average for personal record regarding hover data.
fig.show()

#fig.write_html("avg_100_vs_yrs_competed_ranked.html")
#fig.write_image("avg_100_vs_yrs_competed_ranked.svg")

In [None]:
print(df_100m[['Athlete', 'Avg Season Best 100m', 'Years_Competed_100m', '100_PR', 'residuals', 'z_combined', 'efficiency_score', 'efficiency_score_pr','final_rank']])

In [None]:
df_100m_stat_analysis = df_100m[['Athlete', 'Avg Season Best 100m', 'Years_Competed_100m', '100_PR', 'residuals', 'z_combined', 'efficiency_score', 'efficiency_score_pr','final_rank']]

In [None]:
df_100m_stat_analysis = df_100m_stat_analysis.sort_values(by='final_rank')
df_100m_stat_analysis.head(3)

In [None]:
# Top 10 athletes by final rank for the figure-factory table.
# Sorting here keeps the cell self-contained; .head(10) keeps the best-ranked
# athlete (the old [1:10] slice dropped row 0 and returned nine rows).
df_100m_stat_analysis_sample = df_100m_stat_analysis.sort_values(by='final_rank').head(10)

# Customize colors (pass colorscale=colorscale to ff.create_table)
#colorscale = [[0, '#4d004c'],[.5, '#f2e5ff'],[1, '#ffffff']]
# Customize font colors (pass font_colors=font to ff.create_table)
#font=['#FCFCFC', '#00EE00', '#008B00', '#004F00', '#660000', '#CD0000', '#FF3030']

table_data = df_100m_stat_analysis


fig = ff.create_table(df_100m_stat_analysis_sample)
fig.show()

#fig.write_html("df_100m_stat_analysis_sample_ff.html")
#fig.write_image("df_100m_stat_analysis_sample_ff.svg")


In [None]:
fig =  ff.create_table(df_100m_stat_analysis)
fig.show()

#fig.write_html("df_100m_stat_analysis_ff.html")
#fig.write_image("df_100m_stat_analysis_ff.svg")

In [None]:
fig =  ff.create_table(df_100m)
fig.show()

#fig.write_html("df_100m_ff.html")
#fig.write_image("df_100m_ff.svg")

In [None]:
df_100m = pd.DataFrame(df_100m)

In [None]:
df_100m_stat_analysis = pd.DataFrame(df_100m_stat_analysis)

200m

In [None]:
# 200m-specific columns of the master table. .copy() makes this an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df_200m = sprinters[['Athlete', 'Country','Continent','Status', 'DOB','Year Born','Month Born','Decade Born','Avg Season Best 200m','200_PR','T25_200_All_Time_Rank','T25_200_AT_RK_NUM']].copy()
df_200m

In [None]:
# Gap between career-average season best and PR in the 200m (2 decimals).
# .assign returns a new frame, so nothing is written onto a slice of `sprinters`.
df_200m = df_200m.assign(
    Avg_200m_PR_Diff=(df_200m['Avg Season Best 200m'] - df_200m['200_PR']).round(2)
)
df_200m.head(3)

In [None]:
df_200m = pd.merge(df_200m, yrs_competed_200m, on=['Athlete'],how='left')
df_200m.head(3)

In [None]:
df_200m = df_200m.rename(columns={'Years': 'Years_Competed_200m'})
df_200m.head(3)

In [None]:
#df_200m = df_200m.sort_values(by='Avg Season Best 200m', ascending=False)
#df_200m

In [None]:
df_200m = df_200m.sort_values(by='Avg Season Best 200m')
df_200m

**Top 5 Career Averages in the 200m (Rounded to the nearest hundredth)**



1.   Usain Bolt: 19.85 (15 Seasons)
2.   Michael Johnson: 19.97 (14 Seasons)
3.   Maurice Greene: 19.99 (5 Seasons)
4.   Erriyon Knighton: 20.05 (6 Seasons)
5.   Noah Lyles: 20.05 (13 Seasons)








In [None]:
# @title Year Born 200m


df_200m['Year Born'].plot(kind='hist', bins=20, title='Year Born')
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('year_born_200.png', format='png', dpi=300)
#plt.savefig('year_born_200.jpg', format='jpg', dpi=300)

In [None]:
# @title Year Born vs Avg Season Best 200m

df_200m.plot(kind='scatter', x='Year Born', y='Avg Season Best 200m', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('year_born_avg_200.png', format='png', dpi=300)
#plt.savefig('year_born_avg_200.jpg', format='jpg', dpi=300)

In [None]:
# @title Avg Season Best 200m vs 200_PR

df_200m.plot(kind='scatter', x='Avg Season Best 200m', y='200_PR', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('avg_200_pr.png', format='png', dpi=300)
#plt.savefig('avg_200_pr.jpg', format='jpg', dpi=300)

Version 2

In [None]:
# Plot the 200m frame built above. The template line `df_200m = px.data.iris()`
# is removed: it replaced the sprinter data with the iris sample, which has
# none of these columns and broke every later 200m cell.
fig = px.scatter(df_200m, x="Avg Season Best 200m", y="200_PR")
fig.show()

#fig.write_html("200m_avg_vs_pr_scatter.html")
#fig.write_image("200m_avg_vs_pr_scatter.svg")

Version 3

In [None]:
# Use the sprinter frame (not px.data.iris()). Rows with a missing average or PR
# are dropped first because plotly rejects NaN values in the marker `size` column.
scatter_200m = df_200m.dropna(subset=['Avg Season Best 200m', '200_PR'])
fig = px.scatter(scatter_200m, x="Avg Season Best 200m", y="200_PR", color="Country",
                 size='Avg Season Best 200m', hover_data=['200_PR'])
fig.show()

#fig.write_html("200m_avg_vs_pr_scatter_2.html")
#fig.write_image("200m_avg_vs_pr_scatter_2.svg")

Version 4 (With Error Bars)

In [None]:
# Error bars of 1% of the career-average time, drawn on the sprinter frame
# (the px.data.iris() template line is removed; it clobbered df_200m).
df_200m["e"] = df_200m["Avg Season Best 200m"]/100
fig = px.scatter(df_200m, x="Avg Season Best 200m", y="200_PR", color="Country",
                 error_x="e", error_y="e")
fig.show()

#fig.write_html("200m_avg_vs_pr_scatter_3.html")
#fig.write_image("200m_avg_vs_pr_scatter_3.svg")

Version 5 (Using Dash)

In [None]:


# Slider bounds come from the data (rounded outward to 0.1 s) so the filter
# spans the real range of 200m averages instead of the iris template's 0-2.5.
avg_200m_min = round(float(np.floor(df_200m['Avg Season Best 200m'].min() * 10) / 10), 1)
avg_200m_max = round(float(np.ceil(df_200m['Avg Season Best 200m'].max() * 10) / 10), 1)

app = Dash(__name__)


app.layout = html.Div([
    html.H4('Career Average 200m vs 200m PR'),
    dcc.Graph(id="scatter-plot"),
    html.P("Filter by Career Average 200m:"),
    dcc.RangeSlider(
        id='range-slider',
        min=avg_200m_min, max=avg_200m_max, step=0.05,
        marks={avg_200m_min: f'{avg_200m_min:.1f}', avg_200m_max: f'{avg_200m_max:.1f}'},
        value=[avg_200m_min, avg_200m_max]
    ),
])


@app.callback(
    Output("scatter-plot", "figure"),
    Input("range-slider", "value"))
def update_bar_chart(slider_range):
    """Redraw the 200m scatter for the selected range of career averages.

    Args:
        slider_range: two-element [low, high] value from the RangeSlider.

    Returns:
        A plotly figure of 'Avg Season Best 200m' vs '200_PR' for athletes
        whose career average lies within [low, high] (inclusive).
    """
    low, high = slider_range
    # Rows with missing values are dropped: plotly rejects NaN in the `size` column.
    plot_df = df_200m.dropna(subset=['Avg Season Best 200m', '200_PR'])
    # Filter and plot 200m columns only (the old code mixed in df_100m and 100m names).
    mask = plot_df['Avg Season Best 200m'].between(low, high)
    fig = px.scatter(
        plot_df[mask], x="Avg Season Best 200m", y="200_PR",
        color="Country", size='Avg Season Best 200m',
        hover_data=['200_PR'])
    return fig


# Dash 3 removed app.run_server; app.run is the supported entry point.
app.run(debug=True)

#fig.write_html("200m_avg_vs_pr_scatter_dash.html")
#fig.write_image("200m_avg_vs_pr_scatter_dash.svg")

# ***Statistical Analysis 200m***

Linear Regression

In [None]:
# Placeholder for consistency: currently just a copy of the career average.
# Replace with the per-season standard deviation once yearly data is joined in.
df_200m['consistency'] = df_200m['Avg Season Best 200m']

### 1. Regression Analysis ###
# Linear regression: career-average time explained by years competed and PR
feature_cols = ['Years_Competed_200m', '200_PR']
target_col = 'Avg Season Best 200m'

# LinearRegression cannot handle NaN, so fit only on athletes with complete 200m data
complete_rows = df_200m[feature_cols + [target_col]].notna().all(axis=1)
X = df_200m.loc[complete_rows, feature_cols]
y = df_200m.loc[complete_rows, target_col]

# Fit model
model = LinearRegression()
model.fit(X, y)

# Predictions and residuals (left as NaN for athletes excluded from the fit)
df_200m['predicted_time'] = np.nan
df_200m.loc[complete_rows, 'predicted_time'] = model.predict(X)
df_200m['residuals'] = df_200m[target_col] - df_200m['predicted_time']

print("Regression coefficients (slope):", model.coef_)
print("Intercept:", model.intercept_)


Z-Score Standardization

In [None]:
# Z-scores for average time, years competed and PR. nan_policy='omit' ignores
# missing values when computing mean/std, so one NaN no longer turns the whole
# column into NaN (rows with a missing value still get a NaN z-score).
df_200m['z_time'] = zscore(df_200m['Avg Season Best 200m'], nan_policy='omit')
df_200m['z_years'] = zscore(df_200m['Years_Competed_200m'], nan_policy='omit')
df_200m['z_PR'] = zscore(df_200m['200_PR'], nan_policy='omit')

# Mean of the three z-scores.
# NOTE(review): lower is better for the two time z-scores but higher is better
# for z_years - confirm whether z_years should be sign-flipped before averaging.
df_200m['z_combined'] = (df_200m['z_time'] + df_200m['z_years'] + df_200m['z_PR']) / 3

Efficiency / Ratio Analysis

In [None]:
# Efficiency score (average_time per year competed)
df_200m['efficiency_score'] = df_200m['Avg Season Best 200m'] / df_200m['Years_Competed_200m']

# Efficiency score: How close the sprinter's average time is to their personal best
df_200m['efficiency_score_pr'] = df_200m['200_PR'] / df_200m['Avg Season Best 200m']

Ranking System

In [None]:
# Combine rankings based on personal best, average time, consistency, and longevity (years competed)
# The personal-best rank uses '200_PR'; df_200m has no '100_PR' column, so the old lookup raised KeyError.
df_200m['rank_personal_best'] = df_200m['200_PR'].rank(ascending=True)  # Lower personal best is better
df_200m['rank_average_time'] = df_200m['Avg Season Best 200m'].rank(ascending=True)  # Lower is better
df_200m['rank_consistency'] = df_200m['consistency'].rank(ascending=True)  # Lower std dev is better
df_200m['rank_years_competed'] = df_200m['Years_Competed_200m'].rank(ascending=False)  # Longer careers are better

#Final ranking: mean of the four individual ranks
df_200m['final_rank'] = df_200m[['rank_personal_best','rank_average_time', 'rank_consistency', 'rank_years_competed']].mean(axis=1)

Scatter Plot Visualization

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Years_Competed_200m', y='Avg Season Best 200m', data=df_200m, s=100, hue='final_rank', palette='coolwarm')
plt.title('Years Competed vs. Career Average 200m Time')
plt.xlabel('Years Competed')
plt.ylabel('Average 200m Time (s)')
plt.show()

#plt.savefig('avg_200_vs_yrs_competed_ranked.png', format='png', dpi=300)
#plt.savefig('avg_200_vs_yrs_competed_ranked.jpg', format='jpg', dpi=300)

Interactive Scatter Plot

In [None]:
# @title Years Competed vs. Career Average 200m Time

# Plot the ranked 200m frame (the px.data.iris() template line is removed; it
# wiped out every column computed above). Rows with a missing value are dropped
# because plotly rejects NaN in the marker `size` column.
ranked_200m = df_200m.dropna(subset=['Years_Competed_200m', 'Avg Season Best 200m'])
fig = px.scatter(ranked_200m, x="Years_Competed_200m", y="Avg Season Best 200m", color="final_rank",
                 size='Years_Competed_200m', hover_data=['Avg Season Best 200m']) #Potentially switch out career average for personal record regarding hover data.
fig.show()

#fig.write_html("avg_200_vs_yrs_competed_ranked.html")
#fig.write_image("avg_200_vs_yrs_competed_ranked.svg")

In [None]:
# A comma was missing between 'efficiency_score_pr' and 'final_rank'; Python joined the two
# adjacent literals into one non-existent column name ('efficiency_score_prfinal_rank').
print(df_200m[['Athlete', 'Avg Season Best 200m', 'Years_Competed_200m','200_PR', 'residuals', 'z_combined', 'efficiency_score', 'efficiency_score_pr', 'final_rank']])

In [None]:
# Summary columns for the 200m statistical analysis. Fixes two KeyErrors: the stray leading
# space in ' 200_PR' and the missing comma between 'efficiency_score_pr' and 'final_rank'.
df_200m_stat_analysis = df_200m[['Athlete', 'Avg Season Best 200m', 'Years_Competed_200m', '200_PR', 'residuals', 'z_combined', 'efficiency_score', 'efficiency_score_pr', 'final_rank']]

In [None]:
df_200m_stat_analysis = df_200m_stat_analysis.sort_values(by='final_rank')
df_200m_stat_analysis.head(3)

In [None]:
# Top 10 athletes by final rank for the figure-factory table.
# Sorting here keeps the cell self-contained; .head(10) keeps the best-ranked
# athlete (the old [1:10] slice dropped row 0 and returned nine rows).
df_200m_stat_analysis_sample = df_200m_stat_analysis.sort_values(by='final_rank').head(10)

# Customize colors (pass colorscale=colorscale to ff.create_table)
#colorscale = [[0, '#4d004c'],[.5, '#f2e5ff'],[1, '#ffffff']]
# Customize font colors (pass font_colors=font to ff.create_table)
#font=['#FCFCFC', '#00EE00', '#008B00', '#004F00', '#660000', '#CD0000', '#FF3030']

table_data = df_200m_stat_analysis


fig = ff.create_table(df_200m_stat_analysis_sample)
fig.show()

#fig.write_html("df_200m_stat_analysis_sample_ff.html")
#fig.write_image("df_200m_stat_analysis_sample_ff.svg")


In [None]:
fig =  ff.create_table(df_200m_stat_analysis)
fig.show()

#fig.write_html("df_200m_stat_analysis_ff.html")
#fig.write_image("df_200m_stat_analysis_ff.svg")

In [None]:
fig =  ff.create_table(df_200m)
fig.show()

#fig.write_html("df_200m_ff.html")
#fig.write_image("df_200m_ff.svg")

In [None]:
df_200m_stat_analysis = pd.DataFrame(df_200m_stat_analysis)

In [None]:
df_200m = pd.DataFrame(df_200m)

400m

In [None]:
# 400m-specific columns of the master table. .copy() makes this an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df_400m = sprinters[['Athlete', 'Country','Continent','Status', 'DOB','Year Born','Month Born','Decade Born','Avg Season Best 400m','400_PR','T25_400_All_Time_Rank','T25_400_AT_RK_NUM']].copy()
df_400m

In [None]:
# Gap between career-average season best and PR in the 400m (2 decimals).
# .assign returns a new frame, so nothing is written onto a slice of `sprinters`.
df_400m = df_400m.assign(
    Avg_400m_PR_Diff=(df_400m['Avg Season Best 400m'] - df_400m['400_PR']).round(2)
)
df_400m.head(3)

In [None]:
df_400m = pd.merge(df_400m, yrs_competed_400m, on=['Athlete'],how='left')
df_400m.head(3)

In [None]:
df_400m = df_400m.rename(columns={'Years': 'Years_Competed_400m'})
df_400m.head(3)

In [None]:
#df_400m = df_400m.sort_values(by='Avg Season Best 400m', ascending=False)
#df_400m

In [None]:
df_400m = df_400m.sort_values(by='Avg Season Best 400m')
df_400m

**Top 5 Career Averages in the 400m (Rounded to the nearest hundredth)**



1.   Larry James: 43.97 (1 Season)
2.   Michael Johnson: 44.22 (14 Seasons)
3.   Steven Gardiner: 44.39 (10 Seasons)
4.   Lee Evans: 44.41 (2 Seasons)
5.   Wayde van Niekerk: 44.48 (11 Seasons)








In [None]:
# @title Year Born 400m


df_400m['Year Born'].plot(kind='hist', bins=20, title='Year Born')
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('year_born_400.png', format='png', dpi=300)
#plt.savefig('year_born_400.jpg', format='jpg', dpi=300)

In [None]:
# @title Year Born vs Avg Season Best 400m

df_400m.plot(kind='scatter', x='Year Born', y='Avg Season Best 400m', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('year_born_avg_400.png', format='png', dpi=300)
#plt.savefig('year_born_avg_400.jpg', format='jpg', dpi=300)

In [None]:
# @title Avg Season Best 400m vs 400_PR

df_400m.plot(kind='scatter', x='Avg Season Best 400m', y='400_PR', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('avg_400_pr.png', format='png', dpi=300)
#plt.savefig('avg_400_pr.jpg', format='jpg', dpi=300)

Version 2

In [None]:
# Plot the 400m frame built above. The template line `df_400m = px.data.iris()`
# is removed: it replaced the sprinter data with the iris sample, which has
# none of these columns and broke every later 400m cell.
fig = px.scatter(df_400m, x="Avg Season Best 400m", y="400_PR")
fig.show()

#fig.write_html("400m_avg_vs_pr_scatter.html")
#fig.write_image("400m_avg_vs_pr_scatter.svg")

Version 3

In [None]:
# Use the sprinter frame (not px.data.iris()). Rows with a missing average or PR
# are dropped first because plotly rejects NaN values in the marker `size` column.
scatter_400m = df_400m.dropna(subset=['Avg Season Best 400m', '400_PR'])
fig = px.scatter(scatter_400m, x="Avg Season Best 400m", y="400_PR", color="Country",
                 size='Avg Season Best 400m', hover_data=['400_PR'])
fig.show()

#fig.write_html("400m_avg_vs_pr_scatter_2.html")
#fig.write_image("400m_avg_vs_pr_scatter_2.svg")

Version 4 (With Error Bars)

In [None]:
# Error bars of 1% of the career-average time, drawn on the sprinter frame
# (the px.data.iris() template line is removed; it clobbered df_400m).
df_400m["e"] = df_400m["Avg Season Best 400m"]/100
fig = px.scatter(df_400m, x="Avg Season Best 400m", y="400_PR", color="Country",
                 error_x="e", error_y="e")
fig.show()

# Export names use the 400m prefix so they cannot overwrite the 100m exports.
#fig.write_html("400m_avg_vs_pr_scatter_3.html")
#fig.write_image("400m_avg_vs_pr_scatter_3.svg")

Version 5 (Using Dash)

In [None]:


# Slider bounds come from the data (rounded outward to 0.1 s) so the filter
# spans the real range of 400m averages instead of the iris template's 0-2.5.
avg_400m_min = round(float(np.floor(df_400m['Avg Season Best 400m'].min() * 10) / 10), 1)
avg_400m_max = round(float(np.ceil(df_400m['Avg Season Best 400m'].max() * 10) / 10), 1)

app = Dash(__name__)


app.layout = html.Div([
    html.H4('Career Average 400m vs 400m PR'),
    dcc.Graph(id="scatter-plot"),
    html.P("Filter by Career Average 400m:"),
    dcc.RangeSlider(
        id='range-slider',
        min=avg_400m_min, max=avg_400m_max, step=0.05,
        marks={avg_400m_min: f'{avg_400m_min:.1f}', avg_400m_max: f'{avg_400m_max:.1f}'},
        value=[avg_400m_min, avg_400m_max]
    ),
])


@app.callback(
    Output("scatter-plot", "figure"),
    Input("range-slider", "value"))
def update_bar_chart(slider_range):
    """Redraw the 400m scatter for the selected range of career averages.

    Args:
        slider_range: two-element [low, high] value from the RangeSlider.

    Returns:
        A plotly figure of 'Avg Season Best 400m' vs '400_PR' for athletes
        whose career average lies within [low, high] (inclusive).
    """
    low, high = slider_range
    # Rows with missing values are dropped: plotly rejects NaN in the `size` column.
    plot_df = df_400m.dropna(subset=['Avg Season Best 400m', '400_PR'])
    mask = plot_df['Avg Season Best 400m'].between(low, high)
    fig = px.scatter(
        plot_df[mask], x="Avg Season Best 400m", y="400_PR",
        color="Country", size='Avg Season Best 400m',
        hover_data=['400_PR'])
    return fig


# Dash 3 removed app.run_server; app.run is the supported entry point.
app.run(debug=True)

#fig.write_html("400m_avg_vs_pr_scatter_dash.html")
#fig.write_image("400m_avg_vs_pr_scatter_dash.svg")

# ***Statistical Analysis 400m***

Linear Regression

In [None]:
# Placeholder "consistency" column: for now it is just a copy of the
# career-average 400m time, not a standard deviation.
df_400m['consistency'] = df_400m['Avg Season Best 400m']  # Placeholder: You could replace with actual std per year data if available

### 1. Regression Analysis ###
# Linear regression: predict the career-average 400m time from two features,
# years competed and the 400m PR.
X = df_400m[['Years_Competed_400m', '400_PR']]
y = df_400m['Avg Season Best 400m']

# Fit model
# NOTE(review): LinearRegression is not imported in the notebook's first cell;
# presumably sklearn.linear_model is imported in an earlier cell - confirm.
model = LinearRegression()
model.fit(X, y)

# Predictions and residuals (actual career average minus the model's prediction)
df_400m['predicted_time'] = model.predict(X)
df_400m['residuals'] = df_400m['Avg Season Best 400m'] - df_400m['predicted_time']

# One coefficient per feature, in the column order of X
print("Regression coefficients (slope):", model.coef_)
print("Intercept:", model.intercept_)


Z-Score Standardization

In [None]:
# Z-scores for career-average time, years competed and personal record
# NOTE(review): `zscore` is not imported in the notebook's first cell;
# presumably scipy.stats.zscore is imported in an earlier cell - confirm.
df_400m['z_time'] = zscore(df_400m['Avg Season Best 400m'])
df_400m['z_years'] = zscore(df_400m['Years_Competed_400m'])
df_400m['z_PR'] = zscore(df_400m['400_PR'])

# Combined z-score: unweighted mean of the three z-scores above.
# NOTE(review): the ranking cell treats lower times as better but more years as
# better, while here all three terms are averaged with the same sign - confirm
# that this is the intended direction for z_years.
df_400m['z_combined'] = (df_400m['z_time'] + df_400m['z_years'] + df_400m['z_PR']  ) / 3

Efficiency / Ratio Analysis

In [None]:
# Efficiency score (average_time per year competed)
df_400m['efficiency_score'] = df_400m['Avg Season Best 400m'] / df_400m['Years_Competed_400m']

# Efficiency score: How close the sprinter's average time is to their personal best
df_400m['efficiency_score_pr'] = df_400m['400_PR'] / df_400m['Avg Season Best 400m']

Ranking System

In [None]:
# Rank every athlete on four criteria (rank 1 = best) and average the ranks.
# All four ranks are taken from the 400m frame; the PR rank previously read
# from `df_100m`, a copy-paste leftover from the 100m section.
df_400m['rank_personal_best'] = df_400m['400_PR'].rank(ascending=True)  # Lower personal best is better
df_400m['rank_average_time'] = df_400m['Avg Season Best 400m'].rank(ascending=True)  # Lower is better
df_400m['rank_consistency'] = df_400m['consistency'].rank(ascending=True)  # Lower value is better
df_400m['rank_years_competed'] = df_400m['Years_Competed_400m'].rank(ascending=False)  # Longer careers are better

# Final ranking: mean of the four individual ranks (lower = better overall)
rank_columns = ['rank_personal_best', 'rank_average_time', 'rank_consistency', 'rank_years_competed']
df_400m['final_rank'] = df_400m[rank_columns].mean(axis=1)

Scatter Plot Visualization

In [None]:
# Years competed vs career-average 400m time, coloured by the combined rank.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Years_Competed_400m', y='Avg Season Best 400m', data=df_400m, s=100, hue='final_rank', palette='coolwarm')
# The title previously said "100m" although every plotted column is a 400m column.
plt.title('Years Competed vs. Career Average 400m Time')
plt.xlabel('Years Competed')
plt.ylabel('Average 400m Time (s)')
plt.show()

#plt.savefig('avg_400_vs_yrs_competed_ranked.png', format='png', dpi=300)
#plt.savefig('avg_400_vs_yrs_competed_ranked.jpg', format='jpg', dpi=300)

Interactive Scatter Plot

In [None]:
# @title Years Competed vs. Career Average 400m Time

# Plot the analysed 400m frame directly. Reassigning `df_400m` to
# `px.data.iris()` here threw away the residual / z-score / rank columns
# computed above, so `final_rank` no longer existed and the summary cells that
# follow could not run.
fig = px.scatter(df_400m, x="Years_Competed_400m", y="Avg Season Best 400m", color="final_rank",
                 size='Years_Competed_400m', hover_data=['Avg Season Best 400m']) #Potentially switch out career average for personal record regarding hover data.
fig.show()

#fig.write_html("avg_400_vs_yrs_competed_ranked.html")
#fig.write_image("avg_400_vs_yrs_competed_ranked.svg")

In [None]:
# Summary columns of the 400m analysis. A comma was missing between the last
# two names, so Python joined them into the single (non-existent) column
# 'efficiency_score_prfinal_rank'.
print(df_400m[['Athlete', 'Avg Season Best 400m', 'Years_Competed_400m', '400_PR', 'residuals', 'z_combined', 'efficiency_score', 'efficiency_score_pr', 'final_rank']])

In [None]:
# Subset of the 400m frame holding only the statistical-analysis columns.
# 'efficiency_score_pr' and 'final_rank' are now separate entries; without the
# comma the two string literals were concatenated into one invalid column name.
df_400m_stat_analysis = df_400m[['Athlete', 'Avg Season Best 400m', 'Years_Competed_400m', '400_PR', 'residuals', 'z_combined', 'efficiency_score', 'efficiency_score_pr', 'final_rank']]

In [None]:
df_400m_stat_analysis = df_400m_stat_analysis.sort_values(by='final_rank')
df_400m_stat_analysis.head(3)

In [None]:
# How many of the top-ranked rows to render as a figure-factory table.
# `.head(10)` keeps the best-ranked athlete; the earlier `[1:10]` slice started
# at position 1 and therefore dropped the first row of the frame sorted by
# `final_rank`.
df_400m_stat_analysis_sample = df_400m_stat_analysis.head(10)

#Customize Colors (Add colorscale=colorscale in parentheses of ff.create table)
#colorscale = [[0, '#4d004c'],[.5, '#f2e5ff'],[1, '#ffffff']]
#Customize Font Colors (Add font_colors=font in parentheses of ff.create table)
#font=['#FCFCFC', '#00EE00', '#008B00', '#004F00', '#660000', '#CD0000', '#FF3030']

fig = ff.create_table(df_400m_stat_analysis_sample)
fig.show()

#fig.write_html("df_400m_stat_analysis_sample_ff.html")
#fig.write_image("df_400m_stat_analysis_sample_ff.svg")


In [None]:
fig =  ff.create_table(df_400m_stat_analysis)
fig.show()

#fig.write_html("df_400m_stat_analysis_ff.html")
#fig.write_image("df_400m_stat_analysis_ff.svg")

In [None]:
fig =  ff.create_table(df_400m)
fig.show()

#fig.write_html("df_400m_ff.html")
#fig.write_image("df_400m_ff.svg")

In [None]:
df_400m = pd.DataFrame(df_400m)

In [None]:
df_400m_stat_analysis = pd.DataFrame(df_400m_stat_analysis)

*Let's create an honors dataframe*

In [None]:
honors= sprinters[['Athlete', 'Country','Continent','Status', 'DOB','Year Born','Month Born','Decade Born','Honors']]
honors

In [None]:
honors = pd.DataFrame(honors)

*Let's create a dataframe for sprinters that are ranked in the Top 25 All-time in the 100m, 200m and 400m (Similar to the one above, but with all the columns)*

In [None]:
'''
all_rnk_sprinters = sprinters.dropna(subset=['T25_100_AT_RK_NUM', 'T25_200_AT_RK_NUM', 'T25_400_AT_RK_NUM'])

print(all_rnk_sprinters)
'''

How many sprinters are ranked in the top 25 in all three events?

In [None]:
#len(all_rnk_sprinters)

In [None]:
#all_rnk_sprinters = pd.DataFrame(all_rnk_sprinters)

We are going to make a subset of this data frame

In [None]:
#all_rnk_sprinters_2 = all_rnk_sprinters[['Athlete','T25_100_AT_RK_NUM', 'T25_200_AT_RK_NUM', 'T25_400_AT_RK_NUM']]

Country

In [None]:
country = pd.DataFrame(sprinters.Country.value_counts().reset_index())
country

In [None]:
country.columns = ['Country','Sprinters']
country

In [None]:
country.Sprinters.nlargest(5)


In [None]:
country.Sprinters.nsmallest(5)

In [None]:
# Bar chart of sprinters per country. It plots the `country` frame built above
# (`hs_affiliation` is not defined in this section) and labels each bar with
# its count; the frame has no 'Number of Sprinters' column.
fig = px.bar(country, x='Country', y='Sprinters', text='Sprinters', title='Top All-Time Sprinters By Country')
fig.show()

The United States has produced the most Top 25 All-Time ranked short sprinters in the world (33), followed by Jamaica (9), with Canada and South Africa tied for third (2 each).

In [None]:
country = pd.DataFrame(country)
country.head(3)


Active vs Retired (Status)

In [None]:
status = pd.DataFrame(sprinters.Status.value_counts().reset_index())
status

In [None]:
status.columns = ['Status','Sprinters']
status

In [None]:
plt.pie(sprinters.Status.value_counts())
plt.show()

#plt.savefig('sprinter_status.png', format='png', dpi=300)
#plt.savefig('sprinter_status.jpg', format='jpg', dpi=300)

Interactive Chart(s)

In [None]:
fig = px.pie(status, values='Sprinters', names='Status')
fig.show()

#fig.write_html("sprinter_status.html")
#fig.write_image("sprinter_status.svg")

In [None]:
# Donut chart of active vs retired sprinters, drawn from the `status` counts
# rather than hand-typed totals so it follows the data. `hole` turns the pie
# into a donut; plotly express is used because `go` (plotly.graph_objects) is
# not among the imports in the notebook's first cell.
fig = px.pie(status, values='Sprinters', names='Status', hole=.3)
fig.show()

#fig.write_html("sprinter_status_donut_chart.html")
#fig.write_image("sprinter_status_donut_chart.svg")

In [None]:
# Share of all sprinters in each status, in percent. The denominator is the
# number of rows in `sprinters` instead of a hard-coded 68, so the column stays
# correct if the dataset grows.
status['Percent'] = round((status['Sprinters'] / len(sprinters)) * 100, 1)
status.head()

In [None]:
# Bar chart of sprinters per status, plotted from the `status` frame
# (`hs_affiliation` is not defined in this section; 'Number of Sprinters' is
# not a column of `status`). The figure is bound to `fig` so the export lines
# below refer to this chart when uncommented.
fig = px.bar(status, x='Status', y='Sprinters', text='Sprinters', title='Status of Top All-Time Sprinters')
fig.show()

#fig.write_html("sprinter_status_bar_chart.html")
#fig.write_image("sprinter_status_bar_chart.svg")

In [None]:
status = pd.DataFrame(status)
status.head(3)

Continent

In [None]:
continent = pd.DataFrame(sprinters.Continent.value_counts().reset_index())
continent

In [None]:
continent.columns = ['Continent','Sprinters']
continent

In [None]:
# @title Continent vs Sprinters

figsize = (12, 1.2 * len(continent['Continent'].unique()))
plt.figure(figsize=figsize)
sns.violinplot(continent, x='Sprinters', y='Continent', inner='stick', palette='Dark2')
sns.despine(top=True, right=True, bottom=True, left=True)

#plt.savefig('sprinter_continent_bar_chart.png', format='png', dpi=300)
#plt.savefig('sprinter_continent_bar_chart.jpg', format='jpg', dpi=300)

In [None]:
# Bar chart of sprinters per continent, plotted from the `continent` frame
# (`hs_affiliation` is not defined in this section; 'Number of Sprinters' is
# not a column of `continent`). The figure is bound to `fig` so the export
# lines below refer to this chart when uncommented.
fig = px.bar(continent, x='Continent', y='Sprinters', text='Sprinters', title='Top All-Time Sprinters By Continent')
fig.show()

#fig.write_html("sprinter_continent_bar_chart.html")
#fig.write_image("sprinter_continent_bar_chart.svg")

In [None]:
continent = pd.DataFrame(continent)
continent.head(3)

Year Born

In [None]:
year = pd.DataFrame(sprinters['Year Born'].value_counts().reset_index())
year

In [None]:
year.columns = ['Year','Sprinters']
year

In [None]:
year.Sprinters.nlargest(5)


In [None]:
year.Sprinters.nsmallest(5)

In [None]:
# Bar chart of sprinters per birth year, plotted from the `year` frame
# (`hs_affiliation` is not defined in this section; 'Number of Sprinters' is
# not a column of `year`). The figure is bound to `fig` so the export lines
# below refer to this chart when uncommented.
fig = px.bar(year, x='Year', y='Sprinters', text='Sprinters', title='Year Top All-Time Sprinters Were Born')
fig.show()


#fig.write_html("sprinter_year_born_bar_chart.html")
#fig.write_image("sprinter_year_born_bar_chart.svg")

In [None]:
year = pd.DataFrame(year)
year.head(3)

Month Born

In [None]:
month = pd.DataFrame(sprinters['Month Born'].value_counts().reset_index())
month

In [None]:
month.columns = ['Month','Sprinters']
month

In [None]:
month.Sprinters.nlargest(5)

In [None]:
month.Sprinters.nsmallest(5)

In [None]:
# Bar chart of sprinters per birth month, plotted from the `month` frame
# (`hs_affiliation` is not defined in this section; 'Number of Sprinters' is
# not a column of `month`). The figure is bound to `fig` so the export lines
# below refer to this chart when uncommented.
fig = px.bar(month, x='Month', y='Sprinters', text='Sprinters', title='Month Top All-Time Sprinters Were Born')
fig.show()


#fig.write_html("sprinter_month_born_bar_chart.html")
#fig.write_image("sprinter_month_born_bar_chart.svg")

In [None]:
month = pd.DataFrame(month)
month.head(3)

Decade Born

In [None]:
decade = pd.DataFrame(sprinters['Decade Born'].value_counts().reset_index())
decade

In [None]:
decade.columns = ['Decade','Sprinters']
decade

In [None]:
# @title Decade vs Sprinters

figsize = (12, 1.2 * len(decade['Decade'].unique()))
plt.figure(figsize=figsize)
sns.violinplot(decade, x='Sprinters', y='Decade', inner='stick', palette='Dark2')
sns.despine(top=True, right=True, bottom=True, left=True)

#plt.savefig('sprinter_decade_born_bar_chart.png', format='png', dpi=300)
#plt.savefig('sprinter_decade_born_bar_chart.jpg', format='jpg', dpi=300)

In [None]:
# Bar chart of sprinters per birth decade, plotted from the `decade` frame
# (`hs_affiliation` is not defined in this section; 'Number of Sprinters' is
# not a column of `decade`). The figure is bound to `fig` so the export lines
# below refer to this chart when uncommented.
fig = px.bar(decade, x='Decade', y='Sprinters', text='Sprinters', title='Decade Top All-Time Sprinters Were Born')
fig.show()


#fig.write_html("sprinter_decade_born_bar_chart.html")
#fig.write_image("sprinter_decade_born_bar_chart.svg")

In [None]:
decade = pd.DataFrame(decade)
decade.head(3)

**Athlete Honors**

Let's create a dataframe for the Olympic Champions

In [None]:
olympic_champions = sprinters[sprinters['Honors'].str.contains("Olympic Champion", case=False, na=False)]
olympic_champions

In [None]:
olympic_champions.shape

32 Sprinters (47%) are Olympic Champions

In [None]:
olympic_champions = pd.DataFrame(olympic_champions)
olympic_champions.head(3)

Let's create a dataframe for the World Outdoor Champions

In [None]:
# Outdoor world champions only. As a plain substring, "World Champion" also
# matches "World Championships silver/bronze medallist" (the strings used by
# the medalist cells below), so medallists were counted as champions. The word
# boundary after "Champion" excludes "Championships".
world_out_champion = sprinters[sprinters['Honors'].str.contains(r"World Champion\b", case=False, na=False)]
world_out_champion

In [None]:
world_out_champion.shape

In [None]:
world_out_champion = pd.DataFrame(world_out_champion)
world_out_champion.head(3)

Let's create a dataframe for the World Indoor Champions

In [None]:
# Indoor world champions only. The word boundary keeps "World Indoor
# Championships silver/bronze medallist" from matching, which the plain
# substring "World Indoor Champion" would otherwise do.
world_ind_champion = sprinters[sprinters['Honors'].str.contains(r"World Indoor Champion\b", case=False, na=False)]
world_ind_champion

In [None]:
world_ind_champion.shape

In [None]:
# Wrap the indoor champions themselves. This line previously wrapped
# `world_out_champion`, silently replacing the indoor frame with the outdoor
# one (which then went into the "World Indoor Champions" Excel sheet).
world_ind_champion = pd.DataFrame(world_ind_champion)
world_ind_champion.head(3)

Let's create a dataframe for the athletes that won both the Olympics and World Championships


In [None]:
# Athletes holding both an Olympic title and an outdoor world title.
# r"World Champion\b" is used for the world title because the plain substring
# "World Champion" also matches "World Championships ... medallist".
is_olympic_champion = sprinters['Honors'].str.contains("Olympic Champion", case=False, na=False)
is_world_champion = sprinters['Honors'].str.contains(r"World Champion\b", case=False, na=False)
oly_world_champ = sprinters[is_olympic_champion & is_world_champion]
oly_world_champ

In [None]:
oly_world_champ.shape

In [None]:
oly_world_champ = pd.DataFrame(oly_world_champ)
oly_world_champ.head(3)

How many sprinters won a global championship (Olympic, World Indoor or World Outdoor)?

In [None]:
# Number of athletes with at least one global title (Olympic, World outdoor or
# World indoor). `.sum()` is now called; without the parentheses the variable
# held the bound method instead of a count. The word boundary keeps
# "World (Indoor) Championships ... medallist" from counting as a title.
global_champions_count = sprinters['Honors'].str.contains(r"Olympic Champion|World (?:Indoor )?Champion\b", case=False, na=False).sum()

In [None]:
# Athletes with at least one global title (Olympic, World outdoor or World
# indoor). One regular expression replaces the three OR-ed substring tests; the
# word boundary after "Champion" stops "World (Indoor) Championships ...
# medallist" from being treated as a title.
global_champions = sprinters[sprinters['Honors'].str.contains(r"Olympic Champion|World (?:Indoor )?Champion\b", case=False, na=False)]
global_champions

In [None]:
global_champions.shape

In [None]:
global_champions = pd.DataFrame(global_champions)
global_champions.head(3)

Let's make a dataframe for Olympic Medalist

In [None]:
olympic_medalist = sprinters[sprinters['Honors'].str.contains("Olympic Champion", case=False, na=False) | sprinters['Honors'].str.contains("Olympic Games silver medallist", case=False, na=False) | sprinters['Honors'].str.contains("Olympic Games bronze medallist", case=False, na=False)]
olympic_medalist

In [None]:
olympic_medalist = pd.DataFrame(olympic_medalist)
olympic_medalist.head(3)

Let's make a dataframe for World Championship Outdoor Medalist

In [None]:
wc_out_medalist = sprinters[sprinters['Honors'].str.contains("World Champion", case=False, na=False) | sprinters['Honors'].str.contains("World Championships silver medallist", case=False, na=False) | sprinters['Honors'].str.contains("World Championships bronze medallist", case=False, na=False)]
wc_out_medalist

In [None]:
wc_out_medalist.shape

In [None]:
wc_out_medalist = pd.DataFrame(wc_out_medalist)
wc_out_medalist.head(3)

Let's make a dataframe for World Championship Indoor Medalist

In [None]:
wc_ind_medalist = sprinters[sprinters['Honors'].str.contains("World Indoor Champion", case=False, na=False) | sprinters['Honors'].str.contains("World Indoor Championships silver medallist", case=False, na=False) | sprinters['Honors'].str.contains("World Indoor Championships bronze medallist", case=False, na=False)]
wc_ind_medalist

In [None]:
wc_ind_medalist.shape

In [None]:
wc_ind_medalist = pd.DataFrame(wc_ind_medalist)
wc_ind_medalist.head(3)

Let's make a dataframe for Global Medalist

In [None]:
# Athletes with any medal at a global championship. "Global" follows the
# notebook's own definition (Olympic, World outdoor or World indoor), so the
# Olympic medal patterns are included alongside the World ones. The indoor
# bronze pattern previously read "bronzw" and could never match.
global_medal_patterns = [
    r"Olympic Champion",
    r"Olympic Games (?:silver|bronze) medallist",
    r"World (?:Indoor )?Champion\b",
    r"World (?:Indoor )?Championships (?:silver|bronze) medallist",
]
global_medalist = sprinters[sprinters['Honors'].str.contains("|".join(global_medal_patterns), case=False, na=False)]

global_medalist

In [None]:
global_medalist.shape

In [None]:
global_medalist = pd.DataFrame(global_medalist)
global_medalist.head(3)

Let's make a dataframe for Diamond League Final Winners

In [None]:
dl_final_winner = sprinters[sprinters['Honors'].str.contains("Diamond League Final winner", case=False, na=False)]
dl_final_winner

In [None]:
dl_final_winner.shape

In [None]:
dl_final_winner = pd.DataFrame(dl_final_winner)
dl_final_winner.head(3)

Let's make a dataframe for Golden League Final Winners

In [None]:
gl_final_winner = sprinters[sprinters['Honors'].str.contains("Golden League Final winner", case=False, na=False)]
gl_final_winner

In [None]:
gl_final_winner = pd.DataFrame(gl_final_winner)
gl_final_winner.head(3)

Let's make a dataframe of all the Golden and Diamond League Winners

In [None]:
gl_dl_winner = sprinters[sprinters['Honors'].str.contains("Diamond League Final winner", case=False, na=False) |
                            sprinters['Honors'].str.contains("Golden League Final winner", case=False, na=False)]

gl_dl_winner

In [None]:
gl_dl_winner = pd.DataFrame(gl_dl_winner)
gl_dl_winner.head(3)

# **Visualizations Based On Data**

*Geographic Visualizations*

Sprinter Nationality Geographic Map

In [None]:
# Create a geographic map using Plotly
fig = px.choropleth(country,
                    locations="Country",
                    locationmode='country names',
                    color="Sprinters",
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Sprinters' Countries")

# Show the map
fig.show()

#fig.write_html("sprinters_countries_map.html")
#fig.write_image("sprinters_countries_map.svg")

Sprinter Continent Geographic Map

In [None]:
# Prepare a DataFrame with unique country-continent pairs
country_continent = sprinters[['Country', 'Continent']].drop_duplicates()

# Create a map coloring countries based on their continent
fig = px.choropleth(country_continent,
                    locations="Country",
                    locationmode='country names',
                    color="Continent",
                    #color_discrete_sequence=px.colors.qualitative.Set1, #Parameter to customize the color scheme
                    hover_name="Country",
                    title="Sprinters by Continent")

#fig.update_geos(scope='world')  # Use this adjust map scope Options: 'africa', 'asia', 'europe', 'north america', 'south america', 'usa', 'world'


# Show the map
fig.show()


#fig.write_html("sprinters_continent_map.html")
#fig.write_image("sprinters_continent_map.svg")


Sprint Age / Longevity Geographic Map

In [None]:
# Calculate the average age per country (this will act as a proxy for career longevity)
avg_age_per_country = sprinters.groupby('Country')['Age'].mean().reset_index()
avg_age_per_country.columns = ['Country', 'Average Age']

# Create a geographic map using Plotly
fig = px.choropleth(avg_age_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="Average Age",
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Average Age of Sprinters per Country (Sprinter Longevity)")

# Show the map
fig.show()

#fig.write_html("sprinters_current_age_country_map.html")
#fig.write_image("sprinters_current_age_country_map.svg")

Heatmap of Personal Records (PRs)

In [None]:
# Calculate the average 100m PR per country
avg_100m_pr_per_country = df_100m.groupby('Country')['100_PR'].mean().reset_index()

fig = px.choropleth(avg_100m_pr_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="100_PR",
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Average 100m Personal Records by Country (Faster = Darker)")

fig.show()
#fig.write_html("avg_100m_PR_country_map.html")
#fig.write_image("avg_100m_PR_country_map.svg")

In [None]:
# Calculate the average 200m PR per country
avg_200m_pr_per_country = df_200m.groupby('Country')['200_PR'].mean().reset_index()

fig = px.choropleth(avg_200m_pr_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="200_PR",
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Average 200m Personal Records by Country (Faster = Darker)")

fig.show()
#fig.write_html("avg_200m_PR_country_map.html")
#fig.write_image("avg_200m_PR_country_map.svg")

In [None]:
# Calculate the average 400m PR per country
avg_400m_pr_per_country = df_400m.groupby('Country')['400_PR'].mean().reset_index()

fig = px.choropleth(avg_400m_pr_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="400_PR",
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Average 400m Personal Records by Country (Faster = Darker)")

fig.show()
#fig.write_html("avg_400m_PR_country_map.html")
#fig.write_image("avg_400m_PR_country_map.svg")

Map of Medal Winners or Honors

In [None]:
# Olympic and outdoor world champions per country. The word boundary after
# "World Champion" keeps "World Championships silver/bronze medallist" entries
# out of a map that is titled as a count of champions.
honors_df = honors[honors['Honors'].str.contains(r"Olympic Champion|World Champion\b", case=False, na=False)]
honors_count_per_country = honors_df['Country'].value_counts().reset_index()
honors_count_per_country.columns = ['Country', 'Honors Count']

fig = px.choropleth(honors_count_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="Honors Count",
                    hover_name="Country",
                    color_continuous_scale="OrRd",
                    title="Countries with the Most Olympic and World Champions")

fig.show()
#fig.write_html("oly_world_champs_country_map.html")
#fig.write_image("oly_world_champs_country_map.svg")

 Sprinters' Birth Month Distribution

In [None]:
# Extract birth months from DOB
sprinters['Birth Month'] = pd.DatetimeIndex(sprinters['DOB']).month

# Find the most common birth month for sprinters in each country
common_birth_month = sprinters.groupby('Country')['Birth Month'].agg(lambda x: x.value_counts().index[0]).reset_index()

fig = px.choropleth(common_birth_month,
                    locations="Country",
                    locationmode='country names',
                    color="Birth Month",
                    hover_name="Country",
                    title="Most Common Birth Month for Sprinters by Country")

fig.show()
#fig.write_html("birth_month_distribution_country_map.html")
#fig.write_image("birth_month_distribution_country_map.svg")

 Sprinter Performance by Decade

In [None]:
# Extract the birth year and decade
sprinters['Birth Year'] = pd.DatetimeIndex(sprinters['DOB']).year
sprinters['Decade'] = (sprinters['Birth Year'] // 10) * 10

# Calculate the average 100m PR by country and decade
avg_100m_per_decade = sprinters.groupby(['Country', 'Decade'])['100_PR'].mean().reset_index()

fig = px.choropleth(avg_100m_per_decade,
                    locations="Country",
                    locationmode='country names',
                    color="100_PR",
                    animation_frame="Decade",  # Animate through decades
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Average 100m Personal Records by Decade")

fig.show()
#fig.write_html("100m_sprinter_performance_by_decade_country_map.html")
#fig.write_image("100m_sprinter_performance_by_decade_country_map.svg")

In [None]:
# Extract the birth year and decade
sprinters['Birth Year'] = pd.DatetimeIndex(sprinters['DOB']).year
sprinters['Decade'] = (sprinters['Birth Year'] // 10) * 10

# Calculate the average 200m PR by country and birth decade
avg_200m_per_decade = sprinters.groupby(['Country', 'Decade'])['200_PR'].mean().reset_index()

fig = px.choropleth(avg_200m_per_decade,
                    locations="Country",
                    locationmode='country names',
                    color="200_PR",
                    animation_frame="Decade",  # Animate through decades
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Average 200m Personal Records by Decade")

fig.show()
#fig.write_html("200m_sprinter_performance_by_decade_country_map.html")
#fig.write_image("200m_sprinter_performance_by_decade_country_map.svg")

In [None]:
# Extract the birth year and decade
sprinters['Birth Year'] = pd.DatetimeIndex(sprinters['DOB']).year
sprinters['Decade'] = (sprinters['Birth Year'] // 10) * 10

# Calculate the average 400m PR by country and birth decade
avg_400m_per_decade = sprinters.groupby(['Country', 'Decade'])['400_PR'].mean().reset_index()

fig = px.choropleth(avg_400m_per_decade,
                    locations="Country",
                    locationmode='country names',
                    color="400_PR",
                    animation_frame="Decade",  # Animate through decades
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Average 400m Personal Records by Decade")

fig.show()
#fig.write_html("400m_sprinter_performance_by_decade_country_map.html")
#fig.write_image("400m_sprinter_performance_by_decade_country_map.svg")

Map of Rising Stars (Youngest Sprinters)

In [None]:
# Filter sprinters younger than 25
young_sprinters = sprinters[sprinters['Age'] < 25]

# Count young sprinters by country
young_sprinters_per_country = young_sprinters['Country'].value_counts().reset_index()
young_sprinters_per_country.columns = ['Country', 'Number of Young Sprinters']

fig = px.choropleth(young_sprinters_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="Number of Young Sprinters",
                    hover_name="Country",
                    color_continuous_scale="Purples",
                    title="Countries with the Most Young Sprinters (Under 25)")

fig.show()
#fig.write_html("young_sprinters_country_map.html")
#fig.write_image("young_sprinters_country_map.svg")

Map Showing Top Sprinters per Country

In [None]:
# Fastest 100m PR per country. Athletes without a 100m PR are dropped first:
# for a country whose sprinters all lack a 100m time, idxmin has no valid label
# to return (NaN or an error, depending on the pandas version) and the .loc
# lookup fails.
ranked_100m = sprinters.dropna(subset=['100_PR'])
best_100m_sprinter_per_country = ranked_100m.loc[ranked_100m.groupby('Country')['100_PR'].idxmin()]

# Plot the map; the athlete's name is added to the hover box so the map shows
# who the top sprinter is, not only the time.
fig = px.choropleth(best_100m_sprinter_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="100_PR",
                    hover_name="Country",
                    hover_data=['Athlete'],
                    color_continuous_scale="Blues",
                    title="Top 100m Sprinter per Country (Fastest PR)")

fig.show()


In [None]:
# Fastest 200m PR per country. Athletes without a 200m PR are dropped first:
# for a country whose sprinters all lack a 200m time, idxmin has no valid label
# to return (NaN or an error, depending on the pandas version) and the .loc
# lookup fails.
ranked_200m = sprinters.dropna(subset=['200_PR'])
best_200m_sprinter_per_country = ranked_200m.loc[ranked_200m.groupby('Country')['200_PR'].idxmin()]

# Plot the map; the athlete's name is added to the hover box so the map shows
# who the top sprinter is, not only the time.
fig = px.choropleth(best_200m_sprinter_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="200_PR",
                    hover_name="Country",
                    hover_data=['Athlete'],
                    color_continuous_scale="Blues",
                    title="Top 200m Sprinter per Country (Fastest PR)")

fig.show()


In [None]:
# Fastest 400m PR per country. Athletes without a 400m PR are dropped first:
# for a country whose sprinters all lack a 400m time, idxmin has no valid label
# to return (NaN or an error, depending on the pandas version) and the .loc
# lookup fails.
ranked_400m = sprinters.dropna(subset=['400_PR'])
best_400m_sprinter_per_country = ranked_400m.loc[ranked_400m.groupby('Country')['400_PR'].idxmin()]

# Plot the map; the athlete's name is added to the hover box so the map shows
# who the top sprinter is, not only the time.
fig = px.choropleth(best_400m_sprinter_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="400_PR",
                    hover_name="Country",
                    hover_data=['Athlete'],
                    color_continuous_scale="Blues",
                    title="Top 400m Sprinter per Country (Fastest PR)")

fig.show()


Sprinter Density by Country

In [None]:
# Count the number of sprinters per country
sprinters_per_country = data_cleaned['Country'].value_counts().reset_index()
sprinters_per_country.columns = ['Country', 'Number of Sprinters']

# Assuming you have population data for each country in a separate dataframe (e.g., 'population_data.csv')
population_data = pd.read_csv('path_to_population_data.csv')

# Merge population data with sprinters data
merged_data = pd.merge(sprinters_per_country, population_data, on='Country')

# Calculate sprinter density (sprinters per 100,000 people)
merged_data['Sprinter Density'] = (merged_data['Number of Sprinters'] / merged_data['Population']) * 100000

# Plot the map
fig = px.choropleth(merged_data,
                    locations="Country",
                    locationmode='country names',
                    color="Sprinter Density",
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Sprinter Density by Country (per 100,000 people)")

fig.show()


Sprinters' Age Distribution by Country/Continent

In [None]:
# Calculate the average age per country
# NOTE(review): `data_cleaned` is not defined in the visible part of the
# notebook - confirm it exists (or use `sprinters`) before running this cell.
data_cleaned['DOB'] = pd.to_datetime(data_cleaned['DOB'], errors='coerce')
# `pd.datetime` was removed in pandas 2.0; use the `datetime` class imported in
# the notebook's first cell, as the earlier Age calculation already does.
data_cleaned['Age'] = datetime.now().year - data_cleaned['DOB'].dt.year
avg_age_per_country = data_cleaned.groupby('Country')['Age'].mean().reset_index()

# Plot the map
fig = px.choropleth(avg_age_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="Age",
                    hover_name="Country",
                    color_continuous_scale="Blues",
                    title="Average Age of Sprinters by Country")

fig.show()


Sprinters' Speed Heatmap

In [None]:
# Calculate the average 100m PR per country
avg_100m_pr_per_country = data_cleaned.groupby('Country')['100_PR'].mean().reset_index()

# Plot the map
fig = px.choropleth(avg_100m_pr_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="100_PR",
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Average 100m Personal Records by Country")

fig.show()


Career Longevity by Country

In [None]:
# Calculate the average age per country (proxy for career longevity)
avg_age_per_country = data_cleaned.groupby('Country')['Age'].mean().reset_index()

# Plot the map
fig = px.choropleth(avg_age_per_country,
                    locations="Country",
                    locationmode='country names',
                    color="Age",
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Average Career Longevity by Country (based on age)")

fig.show()


*General Visualizations*

Status Vs Year Born

In [None]:
figsize = (12, 1.2 * len(honors['Status'].unique()))
plt.figure(figsize=figsize)
sns.violinplot(honors, x='Year Born', y='Status', inner='stick', palette='Dark2')
sns.despine(top=True, right=True, bottom=True, left=True)

Distribution of Personal Records (PRs)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram for 100m PRs
plt.figure(figsize=(10,6))
sns.histplot(data_cleaned['100_PR'].dropna(), bins=20, kde=True)
plt.title("Distribution of 100m Personal Records")
plt.xlabel("100m PR (seconds)")
plt.ylabel("Count")
plt.show()

# Boxplot for 100m, 200m, 400m PRs
plt.figure(figsize=(10,6))
sns.boxplot(data=data_cleaned[['100_PR', '200_PR', '400_PR']])
plt.title("Boxplot of 100m, 200m, and 400m Personal Records")
plt.ylabel("Time (seconds)")
plt.show()


Correlation Heatmap

In [None]:
# Correlation matrix
corr_matrix = data_cleaned[['100_PR', '200_PR', '400_PR', 'Age']].corr()

# Plot the heatmap
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Heatmap of Sprinter Data")
plt.show()


PRs by Age Groups

In [None]:
# Create age groups
bins = [18, 25, 30, 35, 40, 50]
labels = ['18-25', '26-30', '31-35', '36-40', '41-50']
data_cleaned['Age Group'] = pd.cut(data_cleaned['Age'], bins=bins, labels=labels)

# Boxplot for 100m PR by Age Group
plt.figure(figsize=(10,6))
sns.boxplot(x='Age Group', y='100_PR', data=data_cleaned)
plt.title("100m PRs by Age Group")
plt.xlabel("Age Group")
plt.ylabel("100m PR (seconds)")
plt.show()


Sprint Event Specialization

In [None]:
# Find each sprinter's best event. Raw times cannot be compared across
# distances (a 100m time is always smaller than a 200m or 400m time, so the
# minimum raw PR is the 100m for anyone who has one). Each PR is first
# standardised against the other athletes in the same event; the event with the
# lowest z-score is the one where the athlete is furthest ahead of the field.
pr_columns = ['100_PR', '200_PR', '400_PR']
pr_zscores = (sprinters[pr_columns] - sprinters[pr_columns].mean()) / sprinters[pr_columns].std()
# idxmin is only evaluated for athletes with at least one recorded PR; the rest
# are left as NaN by index alignment on assignment.
has_any_pr = pr_zscores.notna().any(axis=1)
sprinters['Best Event'] = pr_zscores[has_any_pr].idxmin(axis=1)

# Count the number of sprinters specialized in each event
event_specialization = sprinters['Best Event'].value_counts()

# Pie chart for event specialization
plt.figure(figsize=(8,8))
plt.pie(event_specialization.values, labels=event_specialization.index, autopct='%1.1f%%', startangle=140)
plt.title("Sprinter Specialization by Event (100m, 200m, 400m)")
plt.show()


Code for Visualizations (Can be used after more data gathering or for follow up project)

Honors by Event

In [None]:
# Count the number of honors for each event
honors_df = data_cleaned[data_cleaned['Honors'].str.contains("Olympic Champion|World Champion", case=False, na=False)]
event_honors = honors_df[['100_PR', '200_PR', '400_PR']].count()

# Bar plot for honors by event
plt.figure(figsize=(10,6))
sns.barplot(x=event_honors.index, y=event_honors.values)
plt.title("Number of Honors by Event (100m, 200m, 400m)")
plt.xlabel("Event")
plt.ylabel("Number of Honors")
plt.show()


Comparison of Performance by Gender

In [None]:
# Boxplot for 100m PRs by gender
plt.figure(figsize=(10,6))
sns.boxplot(x='Gender', y='100_PR', data=data_cleaned)
plt.title("Comparison of 100m PR by Gender")
plt.xlabel("Gender")
plt.ylabel("100m PR (seconds)")
plt.show()


Let's put all the data frames created into an excel workbook

In [None]:
# Sheet name -> frame, in workbook order. Fixes relative to the earlier
# call-by-call version:
#   * `olympic_champions` is the name the frame is created under above
#     (`olympic_champion` does not exist);
#   * the indoor medalists get their own sheet instead of reusing
#     'WC Outdoor Medalist';
#   * two sheet names were longer than Excel's 31-character limit and are
#     shortened.
excel_sheets = {
    'Sprinters_PRs': sprinters_pr,
    'Sprinters_PRs_Clean': sprinters_pr_clean,
    'Sprint_Career_Avg': sprinters_career_avg,
    'Sprint_Career_Avg_Clean': sprinters_career_avg_clean,
    'Sprint_Career': sprint_career,
    'Sprinter_All-Time_Rank': sprinters_alltime_rnk,
    'Sprinter_All-Time_Rank_Clean': sprinters_alltime_rnk_clean,
    '100m': df_100m,
    '100m Statisitcal Analysis': df_100m_stat_analysis,
    '200m': df_200m,
    '200m Statisitcal Analysis': df_200m_stat_analysis,
    '400m': df_400m,
    '400m Statisitcal Analysis': df_400m_stat_analysis,
    'Honors': honors,
    #'All_Ranked_Sprinters': all_rnk_sprinters,
    #'All_Ranked_Sprinters_2': all_rnk_sprinters_2,
    'Country': country,
    'Status': status,
    'Continent': continent,
    'Year_Born': year,
    'Month_Born': month,
    'Decade_Born': decade,
    'Olympic Champions': olympic_champions,
    'World Outdoor Champions': world_out_champion,
    'World Indoor Champions': world_ind_champion,
    'Olympic & World Out Champions': oly_world_champ,
    'Global Champions': global_champions,
    'Olympic Medalist': olympic_medalist,
    'WC Outdoor Medalist': wc_out_medalist,
    'WC Indoor Medalist': wc_ind_medalist,
    'Global Medalist': global_medalist,
    'Diamond League Final Winner': dl_final_winner,
    'Golden League Final Winner': gl_final_winner,
    'DL or GL Final Winner': gl_dl_winner,
}

# The context manager saves and closes the workbook even if one sheet fails.
with pd.ExcelWriter('Worlds_Fatest_Sprinters_EDA.xlsx') as xlwriter:
    for sheet_name, frame in excel_sheets.items():
        frame.to_excel(xlwriter, sheet_name=sheet_name)