# **200m Career Data**

In [None]:
from datetime import datetime
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import statsmodels.formula.api as sm
from ipywidgets import interact, FloatSlider
from scipy import stats
from scipy.stats import chi2
from scipy.stats import probplot
from scipy.stats import t, sem
from sklearn.model_selection import train_test_split
from statsmodels.stats.proportion import proportion_confint

from nssstats.cm import cm_analysis
from nssstats.plots import iqr_plot
from nssstats.plots import quadrant_plot, half_plot
from nssstats.plots import std_plot

%matplotlib inline

In [None]:
sprinters = pd.read_csv("Worlds_Fastest_Sprinters_Stats.csv")

# **Data** **Basics**

In [None]:
sprinters.head()

In [None]:
# Call info() so the column / dtype / non-null summary is printed; without the
# parentheses the cell only shows the bound-method repr.
sprinters.info()

In [None]:
sprinters.shape

In [None]:
print(sprinters.dtypes)


In [None]:
sprinters.describe()


In [None]:
# Pairwise correlations between the numeric columns. The method has to be
# called, and numeric_only=True keeps text columns such as Athlete/Country
# from raising on recent pandas versions.
sprinters.corr(numeric_only=True)


In [None]:
sprinters.isnull().sum()


# **General** **EDA**

Let's make a column for the total PR time

In [None]:
# Combined personal-record time over the 100m, 200m and 400m, to the hundredth.
combined_pr_time = sprinters['100_PR'].add(sprinters['200_PR']).add(sprinters['400_PR'])
sprinters['Total_Time_PRs'] = combined_pr_time.round(2)
sprinters.head()

Let's make a column for the total career average time

In [None]:
# Combined average season best over the 100m, 200m and 400m, to the hundredth.
combined_sb_avg = (
    sprinters['Avg_Season_Best_100m']
    .add(sprinters['Avg_Season_Best_200m'])
    .add(sprinters['Avg_Season_Best_400m'])
)
sprinters['Total_Time_SB_Avg'] = combined_sb_avg.round(2)
sprinters.head()

Let's make a column for the actual age of the athletes

In [None]:
# Ensure the DOB column is in datetime format; unparseable dates become NaT.
sprinters['DOB'] = pd.to_datetime(sprinters['DOB'], errors='coerce')

# Today's date, plus the current year under its original name.
today = datetime.now()
current_year = today.year

# Age in completed years: the calendar-year difference, minus one for athletes
# whose birthday has not come round yet this year. A plain year subtraction
# overstates those athletes' age by one.
dob = sprinters['DOB']
birthday_passed = (dob.dt.month < today.month) | (
    (dob.dt.month == today.month) & (dob.dt.day <= today.day)
)
sprinters['Age'] = current_year - dob.dt.year - (~birthday_passed).astype(int)

# Rows with a missing DOB end up with a missing (NaN) Age.
print(sprinters[['DOB', 'Age']].head())


Let's add the sprinter's photo to the database by merging it with the photo csv

In [None]:
sprinter_photo = pd.read_csv("Sprinter_Photo.csv")

In [None]:
sprinter_photo.head(3)

In [None]:
# Left join on Athlete: every sprinter row is kept, and the columns from the
# photo CSV are filled in where an athlete has a match.
sprinters = sprinters.merge(sprinter_photo, how='left', on=['Athlete'])
sprinters.head(3)

Let's add a second dataset so we can join the number of seasons to each event's dataframe (years competed in each event).

In [None]:
sprinters_df2 = pd.read_csv("Worlds_Fastest_Sprinters_Master_List_Yearly_Progression.csv")
sprinters_df2.head(3)

In [None]:
# One row per athlete with the number of 200m rows in the yearly-progression data.
is_200m_season = sprinters_df2['Event'] == '200m'
yrs_competed_200m = (
    sprinters_df2.loc[is_200m_season]
    .groupby('Athlete')
    .size()
    .reset_index(name='Years')
)

In [None]:
yrs_competed_200m = yrs_competed_200m.sort_values(by='Years', ascending=False)
yrs_competed_200m.head()

Let's add a third dataset, which incorporates every race in each athlete's career.

In [None]:
sprinters_df3 = pd.read_csv("Sprinter_Career.csv")
sprinters_df3.head(3)

In [None]:
All_200m_Races = sprinters_df3[sprinters_df3['Event'] == '200m']
All_200m_Races.head(3)

Let's drop all the races that were DNS, DNF, or DQ

In [None]:
All_200m_Races = All_200m_Races[~All_200m_Races['Time'].isin(['DNS', 'DNF', 'DQ'])]
All_200m_Races.head(3)

Let's make sure that the time column is now a numeric datatype

In [None]:
# Convert Time to numbers; any mark that is still not numeric becomes NaN.
# assign() returns a new frame, so nothing is written into the filtered slice
# of sprinters_df3 (the pattern that triggers pandas' SettingWithCopyWarning).
All_200m_Races = All_200m_Races.assign(
    Time=pd.to_numeric(All_200m_Races['Time'], errors='coerce')
)

Let's drop all the Indoor marks

In [None]:
# Keep every race whose Meet_Type is anything other than 'Indoor'.
not_indoor = All_200m_Races['Meet_Type'].ne('Indoor')
All_200m_Races = All_200m_Races.loc[not_indoor]
All_200m_Races.head(3)

Let's drop times that aren't legal from the dataframe

In [None]:
# Keep every race whose Legal flag is anything other than 'NO'.
not_flagged_illegal = All_200m_Races['Legal'].ne('NO')
All_200m_Races = All_200m_Races.loc[not_flagged_illegal]
All_200m_Races.head(3)

Let's look at a couple of visualizations of this dataframe

In [None]:
All_200m_Races['Time'].hist();

In [None]:
# Normal Q-Q plot of the career 200m times. Times that were coerced to NaN are
# dropped first; probplot has no NaN handling, so they would otherwise
# contaminate the ordered values and the fitted reference line.
probplot(All_200m_Races['Time'].dropna(), plot=plt);

In [None]:
plt.figure(figsize = (10,6))

std_plot(All_200m_Races['Time'], edgecolor = 'black', linewidth = 2)

In [None]:
plt.figure(figsize = (10,6))

iqr_plot(All_200m_Races['Time'], bins = 25, edgecolor = 'black', linewidth = 2)

In [None]:
plt.figure(figsize = (10,6))
sns.boxplot(x = All_200m_Races['Time']);

In [None]:
# One horizontal box per athlete showing the spread of their 200m times.
# The x axis keeps its default numeric ticks so the times can be read off the
# plot. Forcing two blank ticks at 0 and 1 would hide every label and can
# stretch the axis down to 0.
sns.boxplot(data = All_200m_Races, y = 'Athlete', x = 'Time')
plt.xlabel('200m Time')
plt.title('200m Times Over Entire Career');

Let's get the Career Average for Each Athlete

In [None]:
# Mean 200m time per athlete over the cleaned career races.
Career_average_200m = (
    All_200m_Races
    .groupby('Athlete', as_index=False)['Time']
    .mean()
    .rename(columns={'Time': 'Career_Avg_200m'})
)

In [None]:
#All_200m_Races = pd.merge(All_200m_Races, Career_average_200m, on=['Athlete'],how='left')
#All_200m_Races.head(3)

Total Races for Each Athlete

In [None]:
# Number of rows (races) per athlete in the cleaned career data.
races_per_athlete = All_200m_Races.groupby('Athlete').size()
athlete_race_count_200m = races_per_athlete.rename('total_races_200m').reset_index()
athlete_race_count_200m.head(3)

Number of races for each athlete by year

In [None]:
#athlete_race_count_per_year_200m = hundred_meter_races.groupby(['Athlete', 'Year']).size().reset_index(name='races_per_year_200m')
#athlete_race_count_per_year_200m.head(3)

Merge the seasons-competed data with the total-races data next

In [None]:
seasons_and_races_200m = pd.merge(athlete_race_count_200m, yrs_competed_200m, on=['Athlete'],how='left')
seasons_and_races_200m.head(3)

Let's add the career average for each athlete

In [None]:
seasons_and_races_200m = pd.merge(seasons_and_races_200m, Career_average_200m, on=['Athlete'],how='left')
seasons_and_races_200m.head(3)

In [None]:
# Average number of 200m races per season competed, to the hundredth.
seasons_and_races_200m['Avg_Races_Year_200m'] = (
    seasons_and_races_200m['total_races_200m'] / seasons_and_races_200m['Years']
).round(2)
# Show the frame that was just modified (not the unrelated sprinters frame).
seasons_and_races_200m.head()

In [None]:
# 200m-specific view of the sprinter table. The explicit .copy() makes df_200m
# an independent frame, so the columns added to it in later cells neither
# touch `sprinters` nor raise SettingWithCopyWarning.
df_200m = sprinters[['Athlete', 'Country','Continent','Status', 'DOB','Year Born','Month Born','Decade Born','Avg_Season_Best_200m','200_PR','T25_200_All_Time_Rank','T25_200_AT_RK_NUM']].copy()
df_200m.head()

In [None]:
df_200m['SB_Avg_200m_PR_Diff'] = round(df_200m['Avg_Season_Best_200m'] - df_200m['200_PR'],2)
df_200m.head(3)

In [None]:
df_200m = pd.merge(df_200m, seasons_and_races_200m, on=['Athlete'],how='left')
df_200m.head(3)

In [None]:
df_200m = df_200m.rename(columns={'Years': 'Years_Competed_200m'})
df_200m.head(3)

In [None]:
df_200m['Career_Avg_200m_PR_Diff'] = round(df_200m['Career_Avg_200m'] - df_200m['200_PR'],2)
df_200m.head(3)

In [None]:
#df_200m = df_200m.sort_values(by='Avg_Season_Best_200m', ascending=False)
#df_200m

In [None]:
#df_200m = df_200m.sort_values(by='Avg_Season_Best_200m')
#df_200m

In [None]:
df_200m = df_200m.sort_values(by='Career_Avg_200m')
df_200m

**Top 5 200m Times (PRs)**



1.   Usain Bolt: 19.19 (15 Seasons)
2.   Yohan Blake: 19.26 ( Seasons)
3.   Noah Lyles: 19.31 (13 Seasons)
4.   Michael Johnson: 19.32 (14 Seasons)
5.   Letsile Tebogo: 19.46 ( Seasons)








**Top 5 Career Season Best Averages in the 200m (Rounded to the nearest hundredth)**



1.   Usain Bolt: 19.85 (15 Seasons)
2.   Michael Johnson: 19.97 (14 Seasons)
3.   Maurice Greene: 19.99 (5 Seasons)
4.   Erriyon Knighton: 20.05 (6 Seasons)
5.   Noah Lyles: 20.05 (13 Seasons)








**Top 5 Career Averages in the 200m (Rounded to the nearest hundredth)**



1.   Michael Johnson: 20.19 (14 Seasons)
2.   Justin Gatlin: 20.25 (13 Seasons)
3.   Carl Lewis: 20.25 (15 Seasons)
4.   Letsile Tebogo: 20.27 (6 Seasons)
5.   Maurice Greene: 20.28 (5 Seasons)








# ***EDA Visualizations***

In [None]:
# @title Year Born 200m


df_200m['Year Born'].plot(kind='hist', bins=20, title='Year Born')
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('year_born_200.png', format='png', dpi=300)
#plt.savefig('year_born_200.jpg', format='jpg', dpi=300)

In [None]:
plt.figure(figsize = (10,6))

# Number of athletes per country in the 200m table.
df_200m.groupby('Country')['Athlete'].count().plot(kind = 'bar')
plt.title('200m Sprinters by Country')
plt.ylabel('count')
plt.xticks(rotation = 0);

In [None]:
plt.figure(figsize = (10,6))

# Number of athletes per continent in the 200m table.
df_200m.groupby('Continent')['Athlete'].count().plot(kind = 'bar')
plt.title('200m Sprinters by Continent')
plt.ylabel('count')
plt.xticks(rotation = 0);

In [None]:
# @title Year Born vs Avg Season Best 200m

df_200m.plot(kind='scatter', x='Year Born', y='Avg_Season_Best_200m', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('year_born_SB_avg_200.png', format='png', dpi=300)
#plt.savefig('year_born_SB_avg_200.jpg', format='jpg', dpi=300)

In [None]:
# @title Year Born vs Career Avg  200m

df_200m.plot(kind='scatter', x='Year Born', y='Career_Avg_200m', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('year_born_Career_avg_200.png', format='png', dpi=300)
#plt.savefig('year_born_Career_avg_200.jpg', format='jpg', dpi=300)

In [None]:
# @title Avg Season Best 200m vs 200_PR

df_200m.plot(kind='scatter', x='Avg_Season_Best_200m', y='200_PR', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('SB_avg_200_pr.png', format='png', dpi=300)
#plt.savefig('SB_avg_200_pr.jpg', format='jpg', dpi=300)

Version 2

In [None]:
# Interactive scatter of average season best vs. PR, drawn from the sprinter
# data already held in df_200m. Loading px.data.iris() here would replace that
# data with the iris demo set, which has none of these columns.
fig = px.scatter(df_200m, x="Avg_Season_Best_200m", y="200_PR")
fig.show()

#fig.write_html("SB_200m_avg_vs_pr_scatter.html")
#fig.write_image("SB_200m_avg_vs_pr_scatter.svg")

Version 3

In [None]:
# Same scatter, coloured by country and sized by the season-best average.
# df_200m is used as-is (no px.data.iris() overwrite). Rows missing either
# plotted value are left out because plotly rejects NaN marker sizes.
scatter_data = df_200m.dropna(subset=['Avg_Season_Best_200m', '200_PR'])
fig = px.scatter(scatter_data, x="Avg_Season_Best_200m", y="200_PR", color="Country",
                 size='Avg_Season_Best_200m', hover_data=['200_PR'])
fig.show()

#fig.write_html("SB_200m_avg_vs_pr_scatter_2.html")
#fig.write_image("SB_200m_avg_vs_pr_scatter_2.svg")

Version 4 (With Error Bars)

In [None]:
# Scatter with error bars of 1% of the season-best average on both axes.
# The helper column "e" is added to a temporary frame via assign(), so df_200m
# itself is neither overwritten with iris data nor given an extra column.
scatter_data = df_200m.assign(e=df_200m["Avg_Season_Best_200m"] / 100)
fig = px.scatter(scatter_data, x="Avg_Season_Best_200m", y="200_PR", color="Country",
                 error_x="e", error_y="e")
fig.show()

#fig.write_html("SB_200m_avg_vs_pr_scatter_3.html")
#fig.write_image("SB_200m_avg_vs_pr_scatter_3.svg")

Version 5 (Using Dash)

In [None]:


# The notebook's import cell does not import Dash, so bring in the pieces this
# app uses here.
from dash import Dash, dcc, html, Input, Output

# Slider bounds are taken from the data (rounded outwards to 0.1 s) so the
# filter covers the real range of season-best averages.
sb_avg_200m = df_200m['Avg_Season_Best_200m'].dropna()
slider_min = float(np.floor(sb_avg_200m.min() * 10) / 10)
slider_max = float(np.ceil(sb_avg_200m.max() * 10) / 10)

app = Dash(__name__)


app.layout = html.Div([
    html.H4('Average Season Best 200m vs 200m PR'),
    dcc.Graph(id="scatter-plot"),
    html.P("Filter by Career Season Best Average 200m:"),
    dcc.RangeSlider(
        id='range-slider',
        min=slider_min, max=slider_max, step=0.1,
        marks={slider_min: f'{slider_min:.1f}', slider_max: f'{slider_max:.1f}'},
        value=[slider_min, slider_max]
    ),
])


@app.callback(
    Output("scatter-plot", "figure"),
    Input("range-slider", "value"))
def update_bar_chart(slider_range):
    """Redraw the scatter for athletes whose season-best average is in range.

    Parameters
    ----------
    slider_range : sequence of two numbers
        Lower and upper bound (inclusive) selected on the range slider.

    Returns
    -------
    plotly figure
        Scatter of Avg_Season_Best_200m against 200_PR for the selected rows
        of df_200m.
    """
    low, high = slider_range
    # between() is inclusive and False for NaN, so athletes without a
    # season-best average (which plotly could not size) are filtered out too.
    mask = df_200m['Avg_Season_Best_200m'].between(low, high)
    fig = px.scatter(
        df_200m[mask], x="Avg_Season_Best_200m", y="200_PR",
        color="Country", size='Avg_Season_Best_200m',
        hover_data=['200_PR'])
    return fig


# app.run is the current entry point; run_server is the older spelling.
app.run(debug=True)

#fig.write_html("SB_200m_avg_vs_pr_scatter_dash.html")
#fig.write_image("SB_200m_avg_vs_pr_scatter_dash.svg")

In [None]:
# @title Career Avg 200m vs 200_PR

df_200m.plot(kind='scatter', x='Career_Avg_200m', y='200_PR', s=32, alpha=.8)
plt.gca().spines[['top', 'right',]].set_visible(False)

#plt.savefig('career_avg_200__vs_pr.png', format='png', dpi=300)
#plt.savefig('career_avg_200_vs_pr.jpg', format='jpg', dpi=300)

Let's look at this more in depth

In [None]:
plt.figure(figsize = (12,8))

# Shade each point by the number of seasons competed. The sequential 'Blues'
# palette needs a numeric hue, and 'cylinders' is not a column of this data.
sns.scatterplot(data = df_200m, x = 'Career_Avg_200m', y = '200_PR',
                hue = 'Years_Competed_200m', palette = 'Blues', edgecolor = 'black'
               )
plt.title('Career Average 200m vs. 200m PR');

In [None]:
quadrant_plot(df_200m['Career_Avg_200m'], df_200m['200_PR'], labels = ['Career Average 200m', '200m PR'], figsize = (12,8))

In [None]:
quadrant_plot(df_200m['Career_Avg_200m'],
              df_200m['200_PR'],
              labels = ['Career Average 200m', '200m PR'],
              quadrant = 4,
              figsize = (12,8))

In [None]:
quadrant_plot(df_200m['Career_Avg_200m'],
              df_200m['200_PR'],
              labels = ['Career Average 200m', '200m PR'],
              quadrant = 2,
              figsize = (12,8))

In [None]:
quadrant_plot(df_200m['Career_Avg_200m'],
              df_200m['200_PR'],
              labels = ['Career Average 200m', '200m PR'],
              quadrant = 3,
              figsize = (12,8))

In [None]:
quadrant_plot(df_200m['Career_Avg_200m'],
              df_200m['200_PR'],
              labels = ['Career Average 200m', '200m PR'],
              quadrant = 1,
              figsize = (12,8))

In [None]:
# Left half of the career-average vs. PR plot. The y series is the 200m PR, to
# match the labels; df_200m carries no 100m PR column.
half_plot(df_200m['Career_Avg_200m'], df_200m['200_PR'], labels = ['Career_Avg_200m', '200_PR'],
          figsize = (12,8), half = 'left')

In [None]:
half_plot(df_200m['Career_Avg_200m'], df_200m['200_PR'], labels = ['Career_Avg_200m', '200_PR'],
          figsize = (12,8), half = 'right')

In [None]:
fig, ax = plt.subplots(figsize = (12,8))
df_200m.plot(kind = 'scatter', x = 'Career_Avg_200m', y = '200_PR', ax = ax)

# Fit the straight trend line only on athletes that have both values, since
# np.polyfit cannot produce a usable fit from NaN inputs.
trend_data = df_200m[['Career_Avg_200m', '200_PR']].dropna()

# The line spans the observed career averages: both ends of the range come
# from the x variable, not from the PR column.
x = np.linspace(trend_data['Career_Avg_200m'].min(), trend_data['Career_Avg_200m'].max(), 100)
z = np.polyfit(trend_data['Career_Avg_200m'], trend_data['200_PR'], 1)
p = np.poly1d(z)
ax.plot(x, p(x), "r--")

ax.set_title('Career Average 200m vs. 200m PR');

Heatmaps

In [None]:
# Mean 200m PR for each country, one row per country.
avg_200m_pr_per_country = df_200m.groupby('Country', as_index=False)['200_PR'].mean()

# World map coloured by that mean.
choropleth_options = dict(
    locations="Country",
    locationmode='country names',
    color="200_PR",
    hover_name="Country",
    color_continuous_scale="Viridis",
    title="Average 200m Personal Records by Country (Faster = Darker)",
)
fig = px.choropleth(avg_200m_pr_per_country, **choropleth_options)

fig.show()
#fig.write_html("avg_200m_PR_country_map.html")
#fig.write_image("avg_200m_PR_country_map.svg")

In [None]:
# Extract birth months from DOB (missing where DOB is missing)
sprinters['Birth Month'] = pd.DatetimeIndex(sprinters['DOB']).month

# Most common birth month for sprinters in each country. Rows without a birth
# month are dropped first: a country whose athletes all lack a DOB would
# otherwise give an empty value_counts(), and index[0] would raise IndexError.
common_birth_month = (
    sprinters.dropna(subset=['Birth Month'])
    .groupby('Country')['Birth Month']
    .agg(lambda months: int(months.value_counts().index[0]))
    .reset_index()
)

fig = px.choropleth(common_birth_month,
                    locations="Country",
                    locationmode='country names',
                    color="Birth Month",
                    hover_name="Country",
                    title="Most Common Birth Month for Sprinters by Country")

fig.show()
#fig.write_html("birth_month_distribution_country_map.html")
#fig.write_image("birth_month_distribution_country_map.svg")

In [None]:
# Extract the birth year and decade
sprinters['Birth Year'] = pd.DatetimeIndex(sprinters['DOB']).year
sprinters['Decade'] = (sprinters['Birth Year'] // 10) * 10

# Average 200m PR by country and birth decade.
# groupby drops rows with a missing decade, so the cast to int is safe and the
# slider shows 1980 rather than 1980.0. Plotly builds animation frames in order
# of first appearance, so the rows are sorted by decade to make the animation
# run chronologically.
avg_200m_per_decade = (
    sprinters.groupby(['Country', 'Decade'])['200_PR']
    .mean()
    .reset_index()
    .astype({'Decade': int})
    .sort_values('Decade')
)

fig = px.choropleth(avg_200m_per_decade,
                    locations="Country",
                    locationmode='country names',
                    color="200_PR",
                    animation_frame="Decade",  # Animate through decades
                    hover_name="Country",
                    color_continuous_scale="Viridis",
                    title="Average 200m Personal Records by Decade")

fig.show()
#fig.write_html("200m_sprinter_performance_by_decade_country_map.html")
#fig.write_image("200m_sprinter_performance_by_decade_country_map.svg")

# ***Statistical Analysis 200m***

In [None]:
from scipy.stats import zscore
from sklearn.linear_model import LinearRegression

Linear Regression

In [None]:
# Placeholder "consistency" column: for now it is just a copy of the average
# season best. Replace with a real per-season spread if that data is available.
df_200m['consistency'] = df_200m['Avg_Season_Best_200m']

### 1. Regression Analysis ###
# Linear regression of the average season best on years competed and the 200m PR.
features = ['Years_Competed_200m', '200_PR']
target = 'Avg_Season_Best_200m'

# LinearRegression rejects NaN, and the left merges upstream can leave athletes
# without 200m values, so fit only on rows where every input is present.
complete_rows = df_200m.dropna(subset=features + [target])
X = complete_rows[features]
y = complete_rows[target]

# Fit model
model = LinearRegression()
model.fit(X, y)

# Predictions and residuals. Assignment aligns on the index, so athletes that
# were left out of the fit get NaN for both columns.
df_200m['predicted_time'] = pd.Series(model.predict(X), index=complete_rows.index)
df_200m['residuals'] = df_200m[target] - df_200m['predicted_time']

print("Regression coefficients (slope):", model.coef_)
print("Intercept:", model.intercept_)


Z-Score Standardization

In [None]:
# Z-scores of each 200m metric. nan_policy='omit' computes the mean and standard
# deviation from the available values only; with the default policy a single
# missing value turns the entire column into NaN.
df_200m['z_time'] = zscore(df_200m['Avg_Season_Best_200m'], nan_policy='omit')
df_200m['z_years'] = zscore(df_200m['Years_Competed_200m'], nan_policy='omit')
df_200m['z_PR'] = zscore(df_200m['200_PR'], nan_policy='omit')
df_200m['z_career_avg_season'] = zscore(df_200m['Career_Avg_200m'], nan_policy='omit')
df_200m['z_total_races'] = zscore(df_200m['total_races_200m'], nan_policy='omit')
df_200m['z_races_per_year'] = zscore(df_200m['Avg_Races_Year_200m'], nan_policy='omit')

# Combined score: plain average of the six z-scores.
# NOTE(review): lower is better for the three time metrics but higher looks
# better for years/races, so this average mixes directions - confirm intent.
z_columns = ['z_time', 'z_years', 'z_PR', 'z_career_avg_season', 'z_total_races', 'z_races_per_year']
df_200m['z_combined'] = df_200m[z_columns].sum(axis=1, skipna=False) / len(z_columns)

Efficiency / Ratio Analysis

In [None]:
# Efficiency score: average season best divided by years competed.
df_200m['efficiency_score'] = df_200m['Avg_Season_Best_200m'] / df_200m['Years_Competed_200m']

# Season-best efficiency: PR as a fraction of the average season best
# (closer to 1 means the typical season best is closer to the PR).
df_200m['efficiency_score_sb'] = df_200m['200_PR'] / df_200m['Avg_Season_Best_200m']

# Career efficiency: PR as a fraction of the career average over all races
# (closer to 1 means the typical race is closer to the PR).
df_200m['efficiency_score_pr'] = df_200m['200_PR'] / df_200m['Career_Avg_200m']

Ranking System

In [None]:
# Rank every athlete on each 200m metric (rank 1 = best), then average the ranks.
df_200m['rank_personal_best'] = df_200m['200_PR'].rank(ascending=True)  # Lower personal best is better
df_200m['rank_average_sb'] = df_200m['Avg_Season_Best_200m'].rank(ascending=True)  # Lower is better
df_200m['rank_career_avg'] = df_200m['Career_Avg_200m'].rank(ascending=True)  # Lower is better
# Consistency is ranked on the size of the regression residual (smaller is
# better). Ranking the placeholder 'consistency' column would simply repeat
# rank_average_sb and count the season-best average twice.
df_200m['rank_consistency'] = df_200m['residuals'].abs().rank(ascending=True)
df_200m['rank_years_competed'] = df_200m['Years_Competed_200m'].rank(ascending=False)  # Longer careers are better
df_200m['rank_total_races'] = df_200m['total_races_200m'].rank(ascending=False)  # More races is better
df_200m['rank_races_per_year'] = df_200m['Avg_Races_Year_200m'].rank(ascending=False)  # More races per year is better

# Final ranking: mean of the individual ranks (missing ranks are skipped).
rank_columns = ['rank_personal_best', 'rank_average_sb', 'rank_career_avg', 'rank_consistency',
                'rank_years_competed', 'rank_total_races', 'rank_races_per_year']
df_200m['final_rank'] = df_200m[rank_columns].mean(axis=1)

Scatter Plot Visualization

In [None]:
# Years competed against the average season best, coloured by final rank.
# The title and y label name the plotted column (season-best average, not the
# career average).
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Years_Competed_200m', y='Avg_Season_Best_200m', data=df_200m, s=100, hue='final_rank', palette='coolwarm')
plt.title('Years Competed vs. Average Season Best 200m Time')
plt.xlabel('Years Competed')
plt.ylabel('Average Season Best 200m Time (s)')
plt.show()

#plt.savefig('SB_avg_200_vs_yrs_competed_ranked.png', format='png', dpi=300)
#plt.savefig('SB_avg_200_vs_yrs_competed_ranked.jpg', format='jpg', dpi=300)

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Years_Competed_200m', y='Career_Avg_200m', data=df_200m, s=100, hue='final_rank', palette='coolwarm')
plt.title('Years Competed vs. Career Average 200m Time')
plt.xlabel('Years Competed')
plt.ylabel('Career Average 200m Time (s)')
plt.show()

#plt.savefig('Career_avg_200_vs_yrs_competed_ranked.png', format='png', dpi=300)
#plt.savefig('Career_avg_200_vs_yrs_competed_ranked.jpg', format='jpg', dpi=300)

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Years_Competed_200m', y='200_PR', data=df_200m, s=100, hue='final_rank', palette='coolwarm')
plt.title('Years Competed vs. 200m PR')
plt.xlabel('Years Competed')
plt.ylabel('200m PR (s)')
plt.show()

#plt.savefig('200_PR_vs_yrs_competed_ranked.png', format='png', dpi=300)
#plt.savefig('200m_PR_vs_yrs_competed_ranked.jpg', format='jpg', dpi=300)

Interactive Scatter Plot

In [None]:
# @title Years Competed vs. Average Season Best 200m Time

# Plot straight from df_200m; reloading px.data.iris() here would throw away
# the sprinter data and every column computed above. Rows missing a plotted
# value are left out because plotly rejects NaN marker sizes.
scatter_data = df_200m.dropna(subset=['Years_Competed_200m', 'Avg_Season_Best_200m'])
fig = px.scatter(scatter_data, x="Years_Competed_200m", y="Avg_Season_Best_200m", color="final_rank",
                 size='Years_Competed_200m', hover_data=['Avg_Season_Best_200m']) #Potentially switch out career average for personal record regarding hover data.
fig.show()

#fig.write_html("avg_200_vs_yrs_competed_ranked.html")
#fig.write_image("avg_200_vs_yrs_competed_ranked.svg")

In [None]:
# @title Years Competed vs. Career Average 200m Time

# Plot straight from df_200m; reloading px.data.iris() here would throw away
# the sprinter data and every column computed above. Rows missing a plotted
# value are left out because plotly rejects NaN marker sizes.
scatter_data = df_200m.dropna(subset=['Years_Competed_200m', 'Career_Avg_200m'])
fig = px.scatter(scatter_data, x="Years_Competed_200m", y="Career_Avg_200m", color="final_rank",
                 size='Years_Competed_200m', hover_data=['Career_Avg_200m']) #Potentially switch out career average for personal record regarding hover data.
fig.show()

#fig.write_html("Career_avg_200_vs_yrs_competed_ranked.html")
#fig.write_image("Career_avg_200_vs_yrs_competed_ranked.svg")

In [None]:
# @title Years Competed vs. 200m PR

# Plot straight from df_200m; reloading px.data.iris() here would throw away
# the sprinter data and every column computed above. Rows missing a plotted
# value are left out because plotly rejects NaN marker sizes.
scatter_data = df_200m.dropna(subset=['Years_Competed_200m', '200_PR'])
fig = px.scatter(scatter_data, x="Years_Competed_200m", y="200_PR", color="final_rank",
                 size='Years_Competed_200m', hover_data=['200_PR'])
fig.show()

#fig.write_html("200m_PR_vs_yrs_competed_ranked.html")
#fig.write_image("200m_PR_vs_yrs_competed_ranked.svg")

In [None]:
# Columns of the statistical summary, in display order. Each name is its own
# list item: two adjacent string literals without a comma are silently joined
# into one (non-existent) column name.
summary_columns = ['Athlete', 'Avg_Season_Best_200m', 'Years_Competed_200m', '200_PR',
                   'residuals', 'z_combined', 'efficiency_score', 'efficiency_score_sb',
                   'efficiency_score_pr', 'final_rank']
# Print whichever of those columns have been computed so far.
print(df_200m[[col for col in summary_columns if col in df_200m.columns]])

In [None]:
# Stand-alone table of the statistical summary columns. Each name is its own
# list item ('200_PR' without a leading space, and a comma before 'final_rank').
# .copy() detaches the table from df_200m.
stat_columns = ['Athlete', 'Avg_Season_Best_200m', 'Years_Competed_200m', '200_PR',
                'residuals', 'z_combined', 'efficiency_score', 'efficiency_score_sb',
                'efficiency_score_pr', 'final_rank']
df_200m_stat_analysis = df_200m[[col for col in stat_columns if col in df_200m.columns]].copy()

In [None]:
df_200m_stat_analysis = df_200m_stat_analysis.sort_values(by='final_rank')
df_200m_stat_analysis.head(3)

In [None]:
# How many rows of the table to turn into a figure-factory table.
# head(10) starts at the first row; a [1:10] slice would skip it and return
# only nine rows.
df_200m_stat_analysis_sample = df_200m_stat_analysis.head(10)

#Customize Colors (Add colorscale=colorscale in parentheses of ff.create table)
#colorscale = [[0, '#4d004c'],[.5, '#f2e5ff'],[1, '#ffffff']]
#Customize Font Colors (Add font_colors=font in parentheses of ff.create table)
#font=['#FCFCFC', '#00EE00', '#008B00', '#004F00', '#660000', '#CD0000', '#FF3030']

table_data = df_200m_stat_analysis


fig =  ff.create_table(df_200m_stat_analysis_sample)
fig.show()

#fig.write_html("df_200m_stat_analysis_sample_ff.html")
#fig.write_image("df_200m_stat_analysis_sample_ff.svg")


In [None]:
fig =  ff.create_table(df_200m_stat_analysis)
fig.show()

#fig.write_html("df_200m_stat_analysis_ff.html")
#fig.write_image("df_200m_stat_analysis_ff.svg")

In [None]:
fig =  ff.create_table(df_200m)
fig.show()

#fig.write_html("df_200m_ff.html")
#fig.write_image("df_200m_ff.svg")

In [None]:
df_200m_stat_analysis = pd.DataFrame(df_200m_stat_analysis)

In [None]:
df_200m = pd.DataFrame(df_200m)

# **Analysis of Personal Best Data (PB/PR)**

In [None]:
sprinters['200_PR'].mean()

In [None]:
sprinters['200_PR'].median()

In [None]:
sprinters['200_PR'].max()

In [None]:
sprinters.nlargest(1,'200_PR')

In [None]:
sprinters.nsmallest(1,'200_PR')

In [None]:
sprinters['200_PR'].max()- sprinters['200_PR'].min()


Variance and Standard Deviation


In [None]:
sprinters['200_PR_deviation'] = sprinters['200_PR'] - sprinters['200_PR'].mean()
sprinters.head()

In [None]:
sprinters['200_PR'].std()


In [None]:
sprinters['200_PR_deviation'].mean()


In [None]:
sprinters['squared_200_PR_deviation'] = sprinters['200_PR_deviation']**2
sprinters

Population Standard Deviation



In [None]:
np.sqrt(sprinters['squared_200_PR_deviation'].mean())

In [None]:
sprinters['200_PR'].var(ddof = 0)


In [None]:
sprinters['200_PR'].std(ddof = 0)


In [None]:
plt.figure(figsize = (10,6))

std_plot(sprinters['200_PR'], edgecolor = 'black', linewidth = 2)

z-scores

In [None]:
sprinters['200_PR_z-score'] = (sprinters['200_PR'] - sprinters['200_PR'].mean()) / sprinters['200_PR'].std(ddof = 0)


In [None]:
sprinters['200_PR_z-score'].std()


Let's look at the 200m PR z-score for Usain Bolt

In [None]:
sprinters.loc[(sprinters.Athlete == 'Usain Bolt')]


Quartiles and Quantiles/Percentiles


In [None]:
sprinters['200_PR'].quantile(q = 0.25)


In [None]:
sprinters['200_PR'].quantile(q = 0.5)


In [None]:
sprinters['200_PR'].quantile(q = 0.75)


In [None]:
sprinters['200_PR'].describe()


Interquartile Range



In [None]:
sprinters['200_PR'].quantile(q = 0.75) - sprinters['200_PR'].quantile(q = 0.25)


In [None]:
plt.figure(figsize = (10,6))

iqr_plot(sprinters['200_PR'], bins = 25, edgecolor = 'black', linewidth = 2)

Observing Outliers in the Dataset



In [None]:
plt.figure(figsize = (10,6))
sns.boxplot(x = sprinters['200_PR']);

# **Analysis of Season Best Data**

In [None]:
# 200m rows of the yearly-progression (season best) data. .copy() makes this an
# independent frame, so the deviation and z-score columns added in later cells
# do not raise SettingWithCopyWarning.
SB_200m = sprinters_df2[sprinters_df2['Event'] == '200m'].copy()

In [None]:
SB_200m['Time'].mean()

In [None]:
SB_200m['Time'].median()

In [None]:
SB_200m['Time'].max()

In [None]:
SB_200m.nlargest(1,'Time')

In [None]:
SB_200m.nsmallest(1,'Time')

In [None]:
SB_200m['Time'].max()- SB_200m['Time'].min()


Variance and Standard Deviation


In [None]:
# Deviation of every 200m season best from the overall mean season best.
SB_200m['200m_SB_deviation'] = SB_200m['Time'] - SB_200m['Time'].mean()
# Show the frame that was just modified (this notebook has no SB_100m).
SB_200m.head()

In [None]:
SB_200m['Time'].std()


In [None]:
SB_200m['200m_SB_deviation'].mean()


In [None]:
SB_200m['squared_200m_SB_deviation'] = SB_200m['200m_SB_deviation']**2
SB_200m

Population Standard Deviation



In [None]:
np.sqrt(SB_200m['squared_200m_SB_deviation'].mean())

In [None]:
SB_200m['Time'].var(ddof = 0)


In [None]:
SB_200m['Time'].std(ddof = 0)


In [None]:
plt.figure(figsize = (10,6))

std_plot(SB_200m['Time'], edgecolor = 'black', linewidth = 2)

z-scores

In [None]:
SB_200m['200m_SB_z-score'] = (SB_200m['Time'] - SB_200m['Time'].mean()) / SB_200m['Time'].std(ddof = 0)


In [None]:
SB_200m['200m_SB_z-score'].std()


Let's look at the 200m season-best z-scores for Usain Bolt

In [None]:
SB_200m.loc[(SB_200m.Athlete == 'Usain Bolt')]


Quartiles and Quantiles/Percentiles


In [None]:
SB_200m['Time'].quantile(q = 0.25)


In [None]:
SB_200m['Time'].quantile(q = 0.5)


In [None]:
SB_200m['Time'].quantile(q = 0.75)


In [None]:
SB_200m['Time'].describe()


Interquartile Range



In [None]:
SB_200m['Time'].quantile(q = 0.75) - SB_200m['Time'].quantile(q = 0.25)


In [None]:
plt.figure(figsize = (10,6))

iqr_plot(SB_200m['Time'], bins = 25, edgecolor = 'black', linewidth = 2)

Observing Outliers in the Dataset



In [None]:
plt.figure(figsize = (10,6))
sns.boxplot(x = SB_200m['Time']);

**Statistical Tests**

*ANOVA test comparing event times across athletes*

This test will check if there are statistically significant differences in the average times for different athletes in a specific event.

It will identify whether the differences in mean times between athletes are greater than would be expected by random chance.

200m ANOVA

In [None]:
# One array of 200m season-best times per athlete. Missing times are dropped
# and athletes left with no times are skipped, so they cannot turn the result
# into NaN.
athlete_SB_times_200 = [
    group['Time'].dropna().values
    for name, group in SB_200m.groupby('Athlete')
    if group['Time'].notna().any()
]

# Run one-way ANOVA across athletes (uses the list built above).
f_stat, p_val = stats.f_oneway(*athlete_SB_times_200)
print(f"ANOVA result: F-statistic = {f_stat}, p-value = {p_val}")


*T-test - Compare Two Athletes' Performances*



We will use the t-test to compare the performance of two athletes in a particular event. This will assess whether the difference between the two athletes' performances is statistically significant.

200m T-test

In [None]:
# The two athletes whose 200m season bests are compared. Change these to
# compare a different pair.
# assumes both names appear in the Athlete column exactly as written - verify
ATHLETE_A = 'Usain Bolt'
ATHLETE_B = 'Michael Johnson'

# Filter data for the two athletes in the 200m event
is_200m = sprinters_df2['Event'] == '200m'
athlete_200_SB_a = sprinters_df2[(sprinters_df2['Athlete'] == ATHLETE_A) & is_200m]['Time'].dropna()
athlete_200_SB_b = sprinters_df2[(sprinters_df2['Athlete'] == ATHLETE_B) & is_200m]['Time'].dropna()

# Run independent t-test
t_stat, p_val = stats.ttest_ind(athlete_200_SB_a, athlete_200_SB_b)
print(f"T-test result ({ATHLETE_A} vs {ATHLETE_B}): T-statistic = {t_stat}, p-value = {p_val}")


*Correlation Analysis – Time vs. Year for a Specific Event*

This will allow us to explore whether there's a trend in athletes' performance over time by checking the correlation between Year and Time in different events.

200m Correlation Analysis

In [None]:
# Pearson correlation between season (Year) and 200m season-best time.
correlation_200_SB = SB_200m['Year'].corr(SB_200m['Time'])
# Print the value computed above (there is no variable called `correlation`).
print(f"Correlation between Year and Time (200m): {correlation_200_SB}")


*Time Series Analysis – Track Athlete Performance Over Time*

Let's perform time series analysis to track an individual athlete’s performance. This will allow us to detect patterns, trends, or seasonal effects in an athlete's performance over time.

In [None]:
# Athlete whose 200m season bests are tracked over time.
# assumes the name appears in the Athlete column exactly as written - verify
ATHLETE = 'Usain Bolt'

# Keep only that athlete's 200m rows, so the moving average never mixes times
# from different events, and order them by season.
athlete_rows = sprinters_df2[
    (sprinters_df2['Athlete'] == ATHLETE) & (sprinters_df2['Event'] == '200m')
].sort_values(by='Year')

# Rolling (moving) average of the time over 3 consecutive seasons; the first
# two seasons have no full window and stay NaN.
athlete_data_SB = athlete_rows.assign(
    Moving_Avg=athlete_rows['Time'].rolling(window=3).mean()
)

# Plot the moving average
athlete_data_SB[['Year', 'Moving_Avg']].plot(x='Year', y='Moving_Avg')


*Linear Regression – Predict Time Based on Year and Other Variables*



We can use linear regression to predict race times based on year, athlete, location, or other factors. This can help us model how times change over time or in different conditions.

*Linear Regression 200m*

In [None]:
# 200m rows of the yearly-progression data: Year is the single predictor and
# Time the response.
event_data_SB = sprinters_df2.loc[sprinters_df2['Event'] == '200m']
X = event_data_SB[['Year']]  # further features such as 'Location' or 'Athlete' could be added here
y = event_data_SB['Time']

# Create and fit the linear regression in one step (fit returns the estimator).
model = LinearRegression().fit(X, y)

# Report the fitted slope(s) and intercept.
print(f"Coefficient: {model.coef_}, Intercept: {model.intercept_}")


Logistic Regression Model

In [None]:
# Fit a logistic regression of Year on Time with the statsmodels formula API
# (imported in this notebook as `sm`).
# NOTE(review): a logit model needs a binary 0/1 response, and Year looks like
# a calendar year here, so this fit is expected to fail - confirm the intended
# target column.
logreg = sm.logit('Year ~ Time', data = event_data_SB).fit()

In [None]:
logreg.summary()

In [None]:
logreg.predict(event_data_SB)

Logistic Regression Inference

In [None]:
# Fit a full model (Time plus one extra predictor) and a reduced model (Time
# only) on the same data so their log-likelihoods can be compared.
# NOTE(review): 'Insert_Column_Here' reads like a placeholder that must be
# replaced with a real column of event_data_SB before this can run - confirm.
logreg_full = sm.logit('Year ~ Time + Insert_Column_Here', data = event_data_SB).fit()
logreg_reduced = sm.logit('Year ~ Time', data = event_data_SB).fit()

In [None]:
logreg_full.llf

In [None]:
logreg_reduced.llf

In [None]:
G2 = -2 * (logreg_reduced.llf - logreg_full.llf)
G2

Chi Squared Distribution

In [None]:
df = logreg_full.df_model - logreg_reduced.df_model

In [None]:
chi2.sf(G2, df = df)

Let's condense the logistic regression inference and chi-squared distribution into one cell.

In [None]:
# Likelihood-ratio test in one cell: fit the full and the reduced model, then
# compare them.
# NOTE(review): 'Insert_Column_Here' reads like a placeholder that must be
# replaced with a real column of event_data_SB before this can run - confirm.
logreg_full = sm.logit('Year ~ Time + Insert_Column_Here', data = event_data_SB).fit()
logreg_reduced = sm.logit('Year ~ Time', data = event_data_SB).fit()

# G2 is -2 times the difference in log-likelihood (reduced minus full).
G2 = -2 * (logreg_reduced.llf - logreg_full.llf)
# Difference in model degrees of freedom between the two fits.
df = logreg_full.df_model - logreg_reduced.df_model
# chi2.sf is the upper-tail probability of G2 under a chi-squared distribution
# with `df` degrees of freedom.
print(f'p-value: {chi2.sf(G2, df = df)}')

Generating Predictions

We'll stratify to ensure that the proportion of the stratify variable is the same in the training data and in the test data.

In [None]:
# 75/25 train/test split of the 200m season-best data, stratified on Year.
# The stratify column comes from the same frame that is being split
# (event_data_SB); there is no event_data_CD in this notebook.
# NOTE(review): stratifying needs at least two rows per Year value - confirm
# that holds for this data.
SB_train, SB_test = train_test_split(event_data_SB, test_size = 0.25, stratify = event_data_SB['Year'], random_state = 321)

We'll fit a model using several of our predictor variables.

In [None]:
# Fit the prediction model from a formula with several predictors.
# NOTE(review): the terms after Time ('Insert', 'Columns', 'Here', ...) read
# like placeholders to be replaced with real columns - confirm.
# NOTE(review): the model is fitted on the full event_data_SB rather than on
# SB_train, so any SB_test rows scored afterwards were seen during fitting -
# confirm whether SB_train was intended.
logreg_pred = sm.logit('Year ~ Time + Insert + Columns + Here + In + These + Blank + Spaces + Check + Dataframe',
                       data = event_data_SB).fit()

In [None]:
# Predicted class for each test row: True when the predicted probability
# exceeds 0.5. Scores SB_test, the held-out part of the split; no CD_test is
# defined in this notebook.
y_pred = logreg_pred.predict(SB_test) > 0.5

Let's see how well the predicted values match up to the true labels.

In [None]:
pd.crosstab(SB_test['Year'], y_pred)

In [None]:
cm_analysis(SB_test['Year'], y_pred, labels = [0, 1], figsize = (7,6))

Visualizations

In [None]:
SB_200m['Time'].hist();

In [None]:
probplot(SB_200m['Time'], plot=plt);

In [None]:
plt.figure(figsize = (10,6))

std_plot(SB_200m['Time'], edgecolor = 'black', linewidth = 2)

In [None]:
plt.figure(figsize = (10,6))

iqr_plot(SB_200m['Time'], bins = 25, edgecolor = 'black', linewidth = 2)

In [None]:
plt.figure(figsize = (10,6))
sns.boxplot(x = SB_200m['Time']);

# **Analysis of Career Data**

In [None]:
# Career (every race) data for the 200m - this section analyses the 200m, so
# filter on '200m' rather than '100m'. .copy() makes CD_200m an independent
# frame, so the columns added in later cells do not raise SettingWithCopyWarning.
CD_200m = sprinters_df3[sprinters_df3['Event'] == '200m'].copy()

# The raw career file stores marks such as DNS/DNF/DQ in Time, so convert the
# column to numbers (those marks become NaN) before computing statistics.
# NOTE(review): unlike All_200m_Races, this still includes indoor and
# non-legal marks - confirm whether they should be excluded here too.
CD_200m['Time'] = pd.to_numeric(CD_200m['Time'], errors='coerce')

In [None]:
CD_200m['Time'].mean()

In [None]:
CD_200m['Time'].median()

In [None]:
CD_200m['Time'].max()

In [None]:
CD_200m.nlargest(1,'Time')

In [None]:
CD_200m.nsmallest(1,'Time')

In [None]:
CD_200m['Time'].max()- CD_200m['Time'].min()


Variance and Standard Deviation


In [None]:
# Deviation of each time from the mean time, stored as a new column
CD_200m['200m_CD_deviation'] = CD_200m['Time'] - CD_200m['Time'].mean()
CD_200m.head()

In [None]:
# Standard deviation of Time with pandas' default ddof = 1 (sample standard deviation)
CD_200m['Time'].std()


In [None]:
# Mean of the deviations from the mean; expected to be zero up to floating-point error
CD_200m['200m_CD_deviation'].mean()


In [None]:
# Squared deviation of each time from the mean, stored as a new column
CD_200m['squared_200m_CD_deviation'] = CD_200m['200m_CD_deviation']**2
# Preview the first rows only instead of dumping the whole frame into the notebook output
CD_200m.head()

Population Standard Deviation



In [None]:
# Population standard deviation by hand: square root of the mean squared deviation
np.sqrt(CD_200m['squared_200m_CD_deviation'].mean())

In [None]:
# Population variance of Time (ddof = 0 divides by n rather than n - 1)
CD_200m['Time'].var(ddof = 0)


In [None]:
# Population standard deviation of Time (ddof = 0); should match the hand computation
CD_200m['Time'].std(ddof = 0)


In [None]:
# New 10x6 figure for the plot below
plt.figure(figsize = (10,6))

# nssstats std_plot of the Time column of CD_200m
# (presumably a histogram marked up with standard deviations -- confirm against nssstats)
std_plot(CD_200m['Time'], edgecolor = 'black', linewidth = 2)

z-scores

In [None]:
# z-score of each time: subtract the mean, then divide by the population (ddof = 0) standard deviation
CD_200m['200m_CD_z-score'] = (CD_200m['Time'] - CD_200m['Time'].mean()) / CD_200m['Time'].std(ddof = 0)


In [None]:
# Sanity check: the z-scores were standardized with the population standard
# deviation (ddof = 0), so their standard deviation equals 1 only when it is
# computed with the same ddof; pandas' default ddof = 1 gives sqrt(n / (n - 1)).
CD_200m['200m_CD_z-score'].std(ddof = 0)


Let's look at the 200m time z-scores for Usain Bolt

In [None]:
# All CD_200m rows whose Athlete value is 'Usain Bolt'
CD_200m.loc[CD_200m['Athlete'] == 'Usain Bolt']


Quartiles and Quantiles/Percentiles


In [None]:
# First quartile (25th percentile) of Time
CD_200m['Time'].quantile(q = 0.25)


In [None]:
# Second quartile (50th percentile, i.e. the median) of Time
CD_200m['Time'].quantile(q = 0.5)


In [None]:
# Third quartile (75th percentile) of Time
CD_200m['Time'].quantile(q = 0.75)


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of Time
CD_200m['Time'].describe()


Interquartile Range



In [None]:
# Interquartile range: third quartile minus first quartile
CD_200m['Time'].quantile(q = 0.75) - CD_200m['Time'].quantile(q = 0.25)


In [None]:
# New 10x6 figure for the plot below
plt.figure(figsize = (10,6))

# nssstats iqr_plot of the Time column of CD_200m using 25 bins
# (presumably a histogram marked up with the quartiles -- confirm against nssstats)
iqr_plot(CD_200m['Time'], bins = 25, edgecolor = 'black', linewidth = 2)

Observing Outliers in the Dataset



In [None]:
# Horizontal box plot of the Time column of CD_200m on a 10x6 figure
plt.figure(figsize = (10,6))
sns.boxplot(x = CD_200m['Time']);

Visualizations

In [None]:
# Histogram of the Time column of CD_200m (trailing ';' hides the returned object's repr)
CD_200m['Time'].hist();

In [None]:
# Probability plot of the Time column of CD_200m, drawn with matplotlib
probplot(CD_200m['Time'], plot=plt);

In [None]:
# New 10x6 figure for the plot below
plt.figure(figsize = (10,6))

# nssstats std_plot of the Time column of CD_200m (same call as in the standard deviation section)
std_plot(CD_200m['Time'], edgecolor = 'black', linewidth = 2)

In [None]:
# New 10x6 figure for the plot below
plt.figure(figsize = (10,6))

# nssstats iqr_plot of the Time column of CD_200m with 25 bins (same call as in the IQR section)
iqr_plot(CD_200m['Time'], bins = 25, edgecolor = 'black', linewidth = 2)

In [None]:
# Horizontal box plot of the Time column of CD_200m on a 10x6 figure
plt.figure(figsize = (10,6))
sns.boxplot(x = CD_200m['Time']);

**Statistical Tests**

200m ANOVA

In [None]:
# Imported here because the notebook's import cell does not bring in f_oneway.
from scipy.stats import f_oneway

# One array of 200m career times per athlete.
# The list is now defined and used under the same name; previously it was
# created as athlete_SB_times_200 but the test read athlete_times_200.
athlete_CD_times_200 = [group['Time'].values for name, group in CD_200m.groupby('Athlete')]

# One-way ANOVA: tests whether the mean time is the same for every athlete
f_stat, p_val = f_oneway(*athlete_CD_times_200)
print(f"ANOVA result: F-statistic = {f_stat}, p-value = {p_val}")


200m T-test

In [None]:
# Filter data for two specific athletes in the 200m event
athlete_200_CD_a = sprinters_df3[(sprinters_df3['Athlete'] == 'Athlete A') & (sprinters_df3['Event'] == '200m')]['Time']
athlete_200_CD_b = sprinters_df3[(sprinters_df3['Athlete'] == 'Athlete B') & (sprinters_df3['Event'] == '200m')]['Time']

# Run independent t-test
t_stat, p_val = stats.ttest_ind(athlete_200_CD_a, athlete_200_CD_b)
print(f"T-test result: T-statistic = {t_stat}, p-value = {p_val}")


200m Correlation Analysis

In [None]:
# Pearson correlation between Year and Time within the career data.
# Both columns come from CD_200m so the values are paired row-for-row
# (the original mixed in the season-best frame's Year column), and the
# printed value is the variable actually computed here.
correlation_200_CD = CD_200m['Year'].corr(CD_200m['Time'])
print(f"Correlation between Year and Time (200m): {correlation_200_CD}")


*Time Series Analysis – Track Athlete Performance Over Time*

In [None]:
# Filter data for one athlete
athlete_data_CD = sprinters_df3[sprinters_df3['Athlete'] == 'Athlete A'].sort_values(by='Year')

# Calculate the rolling average (moving average) for the time over 3 events
athlete_data_CD['Moving_Avg'] = athlete_data_CD['Time'].rolling(window=3).mean()

# Plot the moving average
athlete_data_CD[['Year', 'Moving_Avg']].plot(x='Year', y='Moving_Avg')


*Linear Regression – Predict Time Based on Year and Other Variables*



*Linear Regression 200m*

In [None]:
# Imported here because the notebook's import cell does not bring in LinearRegression.
from sklearn.linear_model import LinearRegression

# Prepare the data: predict Time from Year for the 200m event
event_data_CD = sprinters_df3[sprinters_df3['Event'] == '200m']
X = event_data_CD[['Year']]  # You can add other features such as 'Location', 'Athlete'
y = event_data_CD['Time']

# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Print the coefficients and intercept
print(f"Coefficient: {model.coef_}, Intercept: {model.intercept_}")


Logistic Regression Model

In [None]:
# Logistic regression of Year on Time, fitted on the 200m career data.
# NOTE(review): a logit model needs a 0/1 response -- confirm `Year` is coded that way.
logreg = sm.logit('Year ~ Time', data = event_data_CD).fit()

In [None]:
# Summary table of the fitted logistic regression
logreg.summary()

In [None]:
# Model predictions for every row of event_data_CD
logreg.predict(event_data_CD)

Logistic Regression Inference

In [None]:
# Full model (extra predictor) and reduced model (Time only) for a likelihood-ratio comparison.
# 'Insert_Column_Here' is a placeholder that must be replaced with a real column name.
logreg_full = sm.logit('Year ~ Time + Insert_Column_Here', data = event_data_CD).fit()
logreg_reduced = sm.logit('Year ~ Time', data = event_data_CD).fit()

In [None]:
# Log-likelihood of the full model
logreg_full.llf

In [None]:
# Log-likelihood of the reduced model
logreg_reduced.llf

In [None]:
# Likelihood-ratio statistic: -2 * (reduced log-likelihood - full log-likelihood)
G2 = -2 * (logreg_reduced.llf - logreg_full.llf)
G2

Chi Squared Distribution

In [None]:
# Degrees of freedom for the likelihood-ratio test: difference in df_model between the full and reduced fits
df2 = logreg_full.df_model - logreg_reduced.df_model

In [None]:
# Upper-tail chi-squared probability of G2 (the likelihood-ratio p-value).
# scipy's chi2 names its shape parameter `df`; passing `df2 = ...` as the
# keyword raises a TypeError, so the value df2 is passed under the keyword df.
chi2.sf(G2, df = df2)

Let's condense the logistic regression inference and the chi-squared test into one cell.

In [None]:
# Fit the full and the reduced logistic regression models on the career data.
# 'Insert_Column_Here' is a placeholder that must be replaced with a real column name.
logreg_full = sm.logit('Year ~ Time + Insert_Column_Here', data = event_data_CD).fit()
logreg_reduced = sm.logit('Year ~ Time', data = event_data_CD).fit()

# Likelihood-ratio statistic and its degrees of freedom
G2 = -2 * (logreg_reduced.llf - logreg_full.llf)
df2 = logreg_full.df_model - logreg_reduced.df_model
# scipy's chi2 names its shape parameter `df`; `df2 = ...` as the keyword raises a TypeError
print(f'p-value: {chi2.sf(G2, df = df2)}')

Generating Predictions

We'll stratify to ensure that the proportion of the stratify variable is the same in the training data and in the test data.

In [None]:
# Hold out 25% of the career data for testing, stratified on 'Year', with a fixed seed for reproducibility
CD_train, CD_test = train_test_split(event_data_CD, test_size = 0.25, stratify = event_data_CD['Year'], random_state = 321)

We'll fit a model using several of our predictor variables.

In [None]:
# Fit the prediction model on the training split only, so that scoring it on
# CD_test is a genuine out-of-sample evaluation (fitting on the full frame
# would leak the test rows into the model).
# The terms after 'Time' are placeholders to be replaced with real column names.
logreg_pred = sm.logit('Year ~ Time + Insert + Columns + Here + In + These + Blank + Spaces + Check + Dataframe',
                       data = CD_train).fit()

In [None]:
# Classify each test row as positive when its predicted probability exceeds 0.5
y_pred = logreg_pred.predict(CD_test) > 0.5

Let's see how well the predicted values match up to the true labels.

In [None]:
# Cross-tabulate the true 'Year' labels (rows) against the predictions (columns)
pd.crosstab(CD_test['Year'], y_pred)

In [None]:
# Pass the true labels and predictions to nssstats' cm_analysis, with class labels 0 and 1
cm_analysis(CD_test['Year'], y_pred, labels = [0, 1], figsize = (7,6))

**Update these and place them where necessary (If necessary)**

Simple Linear Regression

In [None]:
# Template: ordinary least squares fit of one response on one predictor.
# 'Insert_variable_1' / 'Insert_variable_2' are placeholders for real column names.
# NOTE(review): `possum` is not defined in the cells visible here -- replace it with the intended DataFrame.
lr = sm.ols(
    formula = 'Insert_variable_1 ~ Insert_variable_2',
    data = possum
).fit()

In [None]:
# Summary table of the fitted linear regression
lr.summary()

In [None]:
# Grid of 250 evenly spaced predictor values spanning the observed minimum to maximum
x_pred = pd.DataFrame({'Insert_variable_2': np.linspace(start = possum['Insert_variable_2'].min(),
                                             stop = possum['Insert_variable_2'].max(),
                                             num = 250)
                      })

# Fitted values of the model along the grid
pred = lr.predict(x_pred)

# Scatter plot of the observed data
possum.plot(
    kind = 'scatter',
    x = 'Insert_variable_2',
    y = 'Insert_variable_1',
    figsize = (10,6)
)

# Overlay the fitted regression line on the scatter plot
plt.plot(x_pred['Insert_variable_2'], pred, color = 'black');

In [None]:
# R-squared reported by the fitted model
lr.rsquared

Let's verify that this is the correct $R^2$ value.

To compute TSS, we need to look at the difference between the target values and the average value of the target variable.

In [None]:
# Total sum of squares: squared differences between the target values and their mean
tss = ((possum['Insert_variable_1'] - possum['Insert_variable_1'].mean())**2).sum()

For RSS, we need to consider the difference between the target and the predicted value.

In [None]:
# Residual sum of squares: squared differences between the target values and the fitted values
rss = ((possum['Insert_variable_1'] - lr.fittedvalues)**2).sum()

Now, we can verify that we get the same result

In [None]:
# R-squared by hand: share of the total sum of squares not left in the residuals
(tss - rss) / tss

In [None]:
# p-value of the predictor's coefficient
lr.pvalues['Insert_variable_2']

In [None]:
# 95% confidence intervals for the model coefficients (alpha is the significance level)
lr.conf_int(alpha = 0.05)

In [None]:
# Summary statistics of the predictor column
possum['Insert_variable_2'].describe()

Predictions using the linear regression model

In [None]:
# Point prediction for a single predictor value (91 is an example value)
lr.predict(pd.DataFrame({'Insert_variable_2': [91]})) #Enter time in Square brackets

In [None]:
# Confidence interval for the mean response at the given predictor value
lr.get_prediction(pd.DataFrame({'Insert_variable_2': [91]})).conf_int()

To get a prediction interval, which tells us what we can expect for a new observation, we can specify obs = True.

In [None]:
# Prediction interval for a new observation at the given predictor value (obs = True).
# The column name must match the formula's predictor exactly; the original key
# was misspelled ('Insert_variablel_2'), so the model could not find its predictor.
lr.get_prediction(pd.DataFrame({'Insert_variable_2': [91]})).conf_int(obs = True)

Let's get all of these predictions at once using the `summary_frame` method.

In [None]:
# Point prediction together with its confidence and prediction intervals in one table
lr.get_prediction(pd.DataFrame({'Insert_variable_2': [91]})).summary_frame()

Multi-Linear Regression Models

In [None]:
# Nested models for comparison: the reduced model has one predictor, the full model adds a second.
# The 'Insert_variable_*' names are placeholders for real column names.
lr_reduced = sm.ols('Insert_variable_1 ~ Insert_variable_2', data = possum).fit()
lr_full = sm.ols('Insert_variable_1 ~ Insert_variable_2 + Insert_variable_3', data = possum).fit()

In [None]:
# Imported directly because no `stats` name in the notebook's import cell
# provides `stats.stats.anova_lm`.
from statsmodels.stats.anova import anova_lm

# F-test comparing the reduced model against the full model
anova_lm(lr_reduced, lr_full)

Interactions in the multi-linear regression model

In [None]:
# Full model including an interaction term (the ':' operator in the formula).
# The 'Insert_variable_*' names are placeholders for real column names.
# NOTE(review): the interaction uses a hard-coded `sex` column -- confirm it exists in the
# DataFrame used here or replace it with the intended variable.
lr_full =sm.ols('Insert_variable_1 ~ Insert_variable_2 + Insert_variable_a1 + Insert_variable_2:sex', data = possum).fit()
lr_full.summary()

In [None]:
# Imported directly because no `stats` name in the notebook's import cell
# provides `stats.stats.anova_lm`.
from statsmodels.stats.anova import anova_lm

# Reduced model (main effects only) and full model (main effects plus interaction).
# The 'Insert_variable_*' names are placeholders for real column names.
lr_reduced =sm.ols('Insert_variable_1 ~ Insert_variable_2 + Insert_variable_a1', data = possum).fit()
lr_full =sm.ols('Insert_variable_1 ~ Insert_variable_2 + Insert_variable_a1 + Insert_variable_2:sex', data = possum).fit()

# F-test of whether the interaction term improves on the reduced model
anova_lm(lr_reduced, lr_full)

In [None]:
# Template: fit a simple linear regression ('variable_a' / 'variable_b' are placeholders)
lr_df = sm.ols('variable_a ~ variable_b', data = cars).fit()

# Residuals-versus-predictor plot.
# The x values come from `cars`, the frame the model was fitted on; `df` holds the
# likelihood-ratio degrees of freedom elsewhere in this notebook, not a DataFrame.
plt.figure(figsize = (10,6))
plt.scatter(cars['variable_b'], lr_df.resid)
# Draw a zero reference line across the current x-range, then restore the limits
xmin, xmax = plt.xlim()
plt.hlines(y = 0, xmin = xmin, xmax = xmax)
plt.xlim(xmin, xmax);

In [None]:
# Name of the predictor column ('variable_b' is a placeholder)
var = 'variable_b'

# Grid of 250 evenly spaced predictor values spanning the observed minimum to maximum
x_pred = pd.DataFrame({
    var: np.linspace(start = cars[var].min(),
                               stop = cars[var].max(), num = 250)
})

# Predicted mean plus confidence and prediction bounds along the grid.
# NOTE(review): `lr_poly_log` is not defined in the cells visible here -- fit it before running this cell.
pred = lr_poly_log.get_prediction(x_pred).summary_frame()

# Scatter plot of the observed data
cars.plot(kind = 'scatter', x = var, y = 'variable_a', figsize = (10,6))

# All curves are passed through np.exp, presumably because the model was fitted
# on a log-transformed response -- TODO confirm.
plt.plot(x_pred[var], np.exp(pred['mean']), color = 'grey', label = 'predicted mean')

# Confidence interval for the mean (only the lower curve carries the legend label)
plt.plot(x_pred[var], np.exp(pred['mean_ci_lower']), color = 'blue', label = 'confidence interval')
plt.plot(x_pred[var], np.exp(pred['mean_ci_upper']), color = 'blue')

# Prediction interval for new observations (only the lower curve carries the legend label)
plt.plot(x_pred[var], np.exp(pred['obs_ci_lower']), color = 'black', label = 'prediction interval')
plt.plot(x_pred[var], np.exp(pred['obs_ci_upper']), color = 'black')

plt.legend();

# **Let's put all the data frames created into an excel workbook**

In [None]:
# Write each data frame to its own sheet of a single workbook.
# The context manager saves and closes the file even if one of the writes
# raises, so a failed export cannot leave an open handle behind.
with pd.ExcelWriter('200M_Analysis.xlsx') as xlwriter:
    df_200m.to_excel(xlwriter, sheet_name='200m')
    df_200m_stat_analysis.to_excel(xlwriter, sheet_name='200m Statisitcal Analysis')