In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import plotly.express as px
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from scipy.stats import chi2_contingency
from sklearn.cluster import KMeans
from nssstats.plots import std_plot
from nssstats.plots import iqr_plot
from nssstats.plots import quadrant_plot, half_plot
from ipywidgets import interact, FloatSlider

In [None]:
# Load the yearly-progression master list; the path is relative to the notebook's working directory.
sprinters = pd.read_csv("../data/Worlds_Fastest_Sprinters_Master_List_Yearly_Progression.csv")
sprinters

**Data Basics**

In [None]:
sprinters.head()

In [None]:
# info() has to be called: the bare attribute only echoes the bound-method repr
# instead of printing the column / dtype / non-null summary.
sprinters.info()

In [None]:
sprinters.shape

In [None]:
print(sprinters.dtypes)


In [None]:
sprinters.describe()


In [None]:
# Pairwise correlations of the numeric columns. The original `sprinters.corr`
# never called the method; restricting to numeric dtypes first keeps text
# columns (e.g. Athlete, Event) from raising on newer pandas versions.
sprinters.select_dtypes(include='number').corr()


**Statistical Analysis**

*100m*

In [None]:
# Take an explicit copy: later cells add derived columns (deviation, squared
# deviation, z-score) to this frame, and assigning to a filtered slice of
# `sprinters` triggers SettingWithCopyWarning.
df_100m = sprinters[sprinters['Event'] == '100m'].copy()
df_100m

In [None]:
df_100m['Time'].mean()

In [None]:
df_100m['Time'].median()

In [None]:
df_100m['Time'].max()

In [None]:
df_100m.nlargest(1,'Time')

In [None]:
df_100m.nsmallest(1,'Time')

In [None]:
df_100m['Time'].max()- df_100m['Time'].min()


Variance and Standard Deviation


In [None]:
df_100m['100m_deviation'] = df_100m['Time'] - df_100m['Time'].mean()
df_100m.head()

In [None]:
df_100m['Time'].std()


In [None]:
df_100m['100m_deviation'].mean()


In [None]:
df_100m['squared_100m_deviation'] = df_100m['100m_deviation']**2
df_100m

Population Standard Deviation



In [None]:
# Population standard deviation computed by hand: square root of the mean squared deviation.
np.sqrt(df_100m['squared_100m_deviation'].mean())

In [None]:
df_100m['Time'].var(ddof = 0)


In [None]:
df_100m['Time'].std(ddof = 0)


In [None]:
plt.figure(figsize = (10,6))

# Plot the 100m times only: this section analyses df_100m, and passing
# `sprinters['Time']` would pool 100m, 200m and 400m times into one distribution.
std_plot(df_100m['Time'], edgecolor = 'black', linewidth = 2)

z-scores

In [None]:
df_100m['100m_z-score'] = (df_100m['Time'] - df_100m['Time'].mean()) / df_100m['Time'].std(ddof = 0)


In [None]:
# Series.std() defaults to the sample formula (ddof=1) while the z-scores were scaled
# with ddof=0, so this comes out slightly above 1 rather than exactly 1.
df_100m['100m_z-score'].std()


Let's look at the time z-scores for Usain Bolt

In [None]:
df_100m.loc[(df_100m.Athlete == 'Usain Bolt')]


Quartiles and Quantiles/Percentiles


In [None]:
df_100m['Time'].quantile(q = 0.25)


In [None]:
df_100m['Time'].quantile(q = 0.5)


In [None]:
df_100m['Time'].quantile(q = 0.75)


In [None]:
df_100m['Time'].describe()


Interquartile Range



In [None]:
df_100m['Time'].quantile(q = 0.75) - df_100m['Time'].quantile(q = 0.25)


In [None]:
plt.figure(figsize = (10,6))

iqr_plot(df_100m['Time'], bins = 25, edgecolor = 'black', linewidth = 2)

Observing Outliers in the Dataset



In [None]:
plt.figure(figsize = (10,6))
sns.boxplot(x = df_100m['Time']);

*200m*

In [None]:
# Take an explicit copy: later cells add derived columns (deviation, squared
# deviation, z-score) to this frame, and assigning to a filtered slice of
# `sprinters` triggers SettingWithCopyWarning.
df_200m = sprinters[sprinters['Event'] == '200m'].copy()
df_200m

In [None]:
df_200m['Time'].mean()

In [None]:
df_200m['Time'].median()

In [None]:
df_200m['Time'].max()

In [None]:
df_200m.nlargest(1,'Time')

In [None]:
df_200m.nsmallest(1,'Time')

In [None]:
df_200m['Time'].max()- df_200m['Time'].min()


Variance and Standard Deviation


In [None]:
df_200m['200m_deviation'] = df_200m['Time'] - df_200m['Time'].mean()
df_200m.head()

In [None]:
df_200m['Time'].std()


In [None]:
df_200m['200m_deviation'].mean()


In [None]:
df_200m['squared_200m_deviation'] = df_200m['200m_deviation']**2
df_200m

Population Standard Deviation



In [None]:
# Population standard deviation computed by hand: square root of the mean squared deviation.
np.sqrt(df_200m['squared_200m_deviation'].mean())

In [None]:
df_200m['Time'].var(ddof = 0)


In [None]:
df_200m['Time'].std(ddof = 0)


In [None]:
plt.figure(figsize = (10,6))

std_plot(df_200m['Time'], edgecolor = 'black', linewidth = 2)

z-scores

In [None]:
# Standardise with the 200m population standard deviation. Dividing by the
# spread of every event in `sprinters` (as before) does not yield unit-variance
# z-scores and is inconsistent with the 100m and 400m cells.
df_200m['200m_z-score'] = (df_200m['Time'] - df_200m['Time'].mean()) / df_200m['Time'].std(ddof = 0)


In [None]:
df_200m['200m_z-score'].std()


Let's look at the time z-scores for Usain Bolt

In [None]:
# Show Usain Bolt's rows from the 200m frame; this 200m section previously
# re-displayed the 100m frame.
df_200m.loc[(df_200m.Athlete == 'Usain Bolt')]


Quartiles and Quantiles/Percentiles


In [None]:
df_200m['Time'].quantile(q = 0.25)


In [None]:
df_200m['Time'].quantile(q = 0.5)


In [None]:
df_200m['Time'].quantile(q = 0.75)


In [None]:
df_200m['Time'].describe()


Interquartile Range



In [None]:
df_200m['Time'].quantile(q = 0.75) - df_200m['Time'].quantile(q = 0.25)


In [None]:
plt.figure(figsize = (10,6))

iqr_plot(df_200m['Time'], bins = 25, edgecolor = 'black', linewidth = 2)

Observing Outliers in the Dataset



In [None]:
plt.figure(figsize = (10,6))
sns.boxplot(x = df_200m['Time']);

*400m*

In [None]:
# Take an explicit copy: later cells add derived columns (deviation, squared
# deviation, z-score) to this frame, and assigning to a filtered slice of
# `sprinters` triggers SettingWithCopyWarning.
df_400m = sprinters[sprinters['Event'] == '400m'].copy()
df_400m

In [None]:
df_400m['Time'].mean()

In [None]:
df_400m['Time'].median()

In [None]:
df_400m['Time'].max()

In [None]:
df_400m.nlargest(1,'Time')

In [None]:
df_400m.nsmallest(1,'Time')

In [None]:
df_400m['Time'].max()- df_400m['Time'].min()


Variance and Standard Deviation


In [None]:
df_400m['400m_deviation'] = df_400m['Time'] - df_400m['Time'].mean()
df_400m.head()

In [None]:
df_400m['Time'].std()


In [None]:
df_400m['400m_deviation'].mean()


In [None]:
df_400m['squared_400m_deviation'] = df_400m['400m_deviation']**2
df_400m

Population Standard Deviation



In [None]:
# Population standard deviation computed by hand: square root of the mean squared deviation.
np.sqrt(df_400m['squared_400m_deviation'].mean())

In [None]:
df_400m['Time'].var(ddof = 0)


In [None]:
df_400m['Time'].std(ddof = 0)


In [None]:
plt.figure(figsize = (10,6))

std_plot(df_400m['Time'], edgecolor = 'black', linewidth = 2)

z-scores

In [None]:
df_400m['400m_z-score'] = (df_400m['Time'] - df_400m['Time'].mean()) / df_400m['Time'].std(ddof = 0)


In [None]:
# Series.std() defaults to the sample formula (ddof=1) while the z-scores were scaled
# with ddof=0, so this comes out slightly above 1 rather than exactly 1.
df_400m['400m_z-score'].std()


Let's look at the time z-scores for Usain Bolt

In [None]:
df_400m.loc[(df_400m.Athlete == 'Usain Bolt')]


Quartiles and Quantiles/Percentiles


In [None]:
df_400m['Time'].quantile(q = 0.25)


In [None]:
df_400m['Time'].quantile(q = 0.5)


In [None]:
df_400m['Time'].quantile(q = 0.75)


In [None]:
df_400m['Time'].describe()


Interquartile Range



In [None]:
df_400m['Time'].quantile(q = 0.75) - df_400m['Time'].quantile(q = 0.25)


In [None]:
plt.figure(figsize = (10,6))

iqr_plot(df_400m['Time'], bins = 25, edgecolor = 'black', linewidth = 2)

Observing Outliers in the Dataset



In [None]:
plt.figure(figsize = (10,6))
sns.boxplot(x = df_400m['Time']);

# **Statistical Tests**


*ANOVA test comparing event times across athletes*

This test will check if there are statistically significant differences in the average times for different athletes in a specific event.

It will identify whether the differences in mean times between athletes are greater than would be expected by random chance.

100m ANOVA

In [None]:
# One array of 100m times per athlete, taken straight from the grouped Time column
athlete_times_100 = [
    times.values
    for _, times in df_100m.groupby('Athlete')['Time']
]

# One-way ANOVA across every athlete's group of times
f_stat, p_val = stats.f_oneway(*athlete_times_100)
print(f"ANOVA result: F-statistic = {f_stat}, p-value = {p_val}")


200m ANOVA

In [None]:
# One array of 200m times per athlete, taken straight from the grouped Time column
athlete_times_200 = [
    times.values
    for _, times in df_200m.groupby('Athlete')['Time']
]

# One-way ANOVA across every athlete's group of times
f_stat, p_val = stats.f_oneway(*athlete_times_200)
print(f"ANOVA result: F-statistic = {f_stat}, p-value = {p_val}")


400m ANOVA

In [None]:
# One array of 400m times per athlete, taken straight from the grouped Time column
athlete_times_400 = [
    times.values
    for _, times in df_400m.groupby('Athlete')['Time']
]

# One-way ANOVA across every athlete's group of times
f_stat, p_val = stats.f_oneway(*athlete_times_400)
print(f"ANOVA result: F-statistic = {f_stat}, p-value = {p_val}")


*T-test - Compare Two Athletes' Performances*



We will use the T-test to compare the performance of two athletes in a particular event. This will assess if the difference between the two athletes' performance is statistically significant.

100m T-test

In [None]:
# Filter data for two specific athletes in the 100m event
# NOTE(review): 'Athlete A' / 'Athlete B' look like placeholder names — confirm they match
# values in the Athlete column; if they do not, both selections are empty.
athlete_100_a = sprinters[(sprinters['Athlete'] == 'Athlete A') & (sprinters['Event'] == '100m')]['Time']
athlete_100_b = sprinters[(sprinters['Athlete'] == 'Athlete B') & (sprinters['Event'] == '100m')]['Time']

# Run independent t-test on the two athletes' 100m times
t_stat, p_val = stats.ttest_ind(athlete_100_a, athlete_100_b)
print(f"T-test result: T-statistic = {t_stat}, p-value = {p_val}")


200m T-test

In [None]:
# 200m times for each of the two athletes, selected by row mask and column label in one step
# NOTE(review): 'Athlete A' / 'Athlete B' look like placeholder names — confirm against the data.
athlete_200_a = sprinters.loc[
    (sprinters['Athlete'] == 'Athlete A') & (sprinters['Event'] == '200m'),
    'Time',
]
athlete_200_b = sprinters.loc[
    (sprinters['Athlete'] == 'Athlete B') & (sprinters['Event'] == '200m'),
    'Time',
]

# Independent two-sample t-test on the two sets of times
t_stat, p_val = stats.ttest_ind(athlete_200_a, athlete_200_b)
print(f"T-test result: T-statistic = {t_stat}, p-value = {p_val}")

400m T-test

In [None]:
# 400m times for each of the two athletes, selected by row mask and column label in one step
# NOTE(review): 'Athlete A' / 'Athlete B' look like placeholder names — confirm against the data.
athlete_400_a = sprinters.loc[
    (sprinters['Athlete'] == 'Athlete A') & (sprinters['Event'] == '400m'),
    'Time',
]
athlete_400_b = sprinters.loc[
    (sprinters['Athlete'] == 'Athlete B') & (sprinters['Event'] == '400m'),
    'Time',
]

# Independent two-sample t-test on the two sets of times
t_stat, p_val = stats.ttest_ind(athlete_400_a, athlete_400_b)
print(f"T-test result: T-statistic = {t_stat}, p-value = {p_val}")

*Correlation Analysis – Time vs. Year for a Specific Event*

This will allow us to explore whether there's a trend in athletes' performance over time by checking the correlation between Year and Time in different events.

100m Correlation Analysis

In [None]:
# Correlation between Year and Time over the 100m rows.
correlation_100 = df_100m['Year'].corr(df_100m['Time'])
# Print the value computed above; the original referenced the undefined name
# `correlation`, which raises NameError on a fresh kernel.
print(f"Correlation between Year and Time (100m): {correlation_100}")


200m Correlation Analysis

In [None]:
# Correlation between Year and Time over the 200m rows.
correlation_200 = df_200m['Year'].corr(df_200m['Time'])
# Print the value computed above; the original referenced the undefined name
# `correlation`, which raises NameError on a fresh kernel.
print(f"Correlation between Year and Time (200m): {correlation_200}")

400m Correlation Analysis

In [None]:
# Correlation between Year and Time over the 400m rows.
correlation_400 = df_400m['Year'].corr(df_400m['Time'])
# Print the value computed above; the original referenced the undefined name
# `correlation`, which raises NameError on a fresh kernel.
print(f"Correlation between Year and Time (400m): {correlation_400}")

Chi-Square Test – Analyze Event Participation Across Different Countries

We can use a Chi-Square test to determine if there's an association between the event type and country.

We'll need to import and merge our data that provides the countries of these athletes

In [None]:
# Load the second CSV (the merge below joins it to `sprinters` on 'Athlete') and preview three rows.
sprinters_df2 = pd.read_csv("../data/Worlds_Fastest_Sprinters_Stats.csv")
sprinters_df2.head(3)

In [None]:
# Outer-join the progression rows with the per-athlete stats on the shared 'Athlete' key,
# keeping athletes that appear in only one of the two tables.
sprinters_df3 = sprinters.merge(sprinters_df2, how='outer', on='Athlete')
sprinters_df3.head()


Let's adjust the dataframe to provide only the columns we want.

In [None]:
# Keep only the columns needed downstream, in this order
sprinters_df3 = sprinters_df3.loc[
    :,
    [
        'Athlete',
        'Country',
        'Continent',
        'DOB',
        'Event',
        'Year',
        'Time',
        'Location',
        'Date',
    ],
]
sprinters_df3.head()

Now that the data has been merged, let's conduct the Chi-Square Test

In [None]:
# Cross-tabulate events (rows) against countries (columns) to get observed counts
contingency_table = pd.crosstab(
    index=sprinters_df3['Event'],
    columns=sprinters_df3['Country'],
)

# Chi-square test of independence on the observed counts
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-Square test: chi2 = {chi2}, p-value = {p}")


*Time Series Analysis – Track Athlete Performance Over Time*

Let's perform time series analysis to track an individual athlete’s performance. This will allow us to detect patterns, trends, or seasonal effects in an athlete's performance over time.

In [None]:
# Filter data for one athlete and order the rows chronologically by Year
# NOTE(review): 'Athlete A' looks like a placeholder name — confirm it matches a value in the Athlete column.
athlete_data = sprinters[sprinters['Athlete'] == 'Athlete A'].sort_values(by='Year')

# Calculate the rolling average (moving average) for the time over 3 consecutive rows
# NOTE(review): rows are not filtered by Event, so if this athlete has rows for several
# events the window averages times from different distances — confirm intent.
# The first two rows have no complete 3-row window, so their Moving_Avg is NaN.
athlete_data['Moving_Avg'] = athlete_data['Time'].rolling(window=3).mean()

# Plot the moving average against Year
athlete_data[['Year', 'Moving_Avg']].plot(x='Year', y='Moving_Avg')


*K-Means Clustering – Group Athletes Based on Performance*

We want to group athletes based on their performance metrics across multiple events. This is useful for identifying similar performance profiles among athletes.

In [None]:
# Mean recorded time per athlete, pooled over every row (all events) the athlete has.
# 'Time' is selected before aggregating so the non-numeric columns are never averaged;
# calling .mean() on the whole grouped frame raises a TypeError on pandas >= 2.0.
performance_data = sprinters.groupby('Athlete')[['Time']].mean().dropna()

# Fit KMeans clustering with 3 clusters; the fixed seed keeps the cluster labels
# reproducible between notebook runs.
kmeans = KMeans(n_clusters=3, random_state=42)
performance_data['Cluster'] = kmeans.fit_predict(performance_data)

# Display cluster assignments
print(performance_data)


*Linear Regression – Predict Time Based on Year and Other Variables*



We can use linear regression to predict race times based on year, athlete, location, or other factors. This can help us model how times change over time or in different conditions.

*Linear Regression 100m*

In [None]:
# Restrict to the 100m rows; Year is the single predictor and Time the target
event_data = sprinters.loc[sprinters['Event'] == '100m']
X = event_data[['Year']]  # further features such as 'Location' or 'Athlete' could be added here
y = event_data['Time']

# Build and fit the ordinary least-squares model in one step
model = LinearRegression().fit(X, y)

# Report the fitted slope(s) and intercept
print(f"Coefficient: {model.coef_}, Intercept: {model.intercept_}")


*Linear Regression 200m*

In [None]:
# Restrict to the 200m rows; Year is the single predictor and Time the target
event_data = sprinters.loc[sprinters['Event'] == '200m']
X = event_data[['Year']]  # further features such as 'Location' or 'Athlete' could be added here
y = event_data['Time']

# Build and fit the ordinary least-squares model in one step
model = LinearRegression().fit(X, y)

# Report the fitted slope(s) and intercept
print(f"Coefficient: {model.coef_}, Intercept: {model.intercept_}")


*Linear Regression 400m*

In [None]:
# Prepare the data (predicting time in the 400m event from the year)
event_data = sprinters[sprinters['Event'] == '400m']
X = event_data[['Year']]  # You can add other features such as 'Location', 'Athlete'
y = event_data['Time']

# Fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Print the coefficients and intercept
print(f"Coefficient: {model.coef_}, Intercept: {model.intercept_}")
