In [18]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_1samp
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple

In [19]:
# Load the three source tables used by this analysis. The paths are relative
# to the notebook's working directory, so the "dataset" folder must sit
# next to the notebook.
drivers, drivers_standings, races = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/drivers.csv",
        "dataset/driver_standings.csv",
        "dataset/races.csv",
    )
)

In [20]:
# Thresholds that decide which drivers are counted as retired.
LAST_RACE_YEAR_CUTOFF = 2020  # drivers whose final race is after this year are excluded
MIN_RETIREMENT_AGE = 29  # a smaller (final race year - birth year) is not treated as retirement

# One row per (driver, race) pairing: the driver's details (driverId, dob, ...)
# joined to every race he has a standings entry for, together with the year
# in which that race was held.
driver_races_per_year = (
    drivers[["driverId", "dob", "forename", "surname", "nationality"]]
    .merge(drivers_standings[["raceId", "driverId"]], on=["driverId"], how="inner")
    .merge(races[["raceId", "year"]], on=["raceId"], how="inner")
)

# keep only the year of the date of birth
driver_races_per_year["dob"] = pd.DatetimeIndex(driver_races_per_year["dob"]).year

# full name in a single column, which makes it easier to google a driver
driver_races_per_year["Name"] = (
    driver_races_per_year["forename"] + " " + driver_races_per_year["surname"]
)

# One row per driver: name, birth year, year of the final race, nationality.
# "max" over the race years yields the year of the driver's last race.
last_race_per_driver = driver_races_per_year.groupby(
    by="driverId", as_index=False
).agg({"Name": "first", "dob": "first", "year": "max", "nationality": "first"})

# Filter and derive in a single chain. `.assign` returns a new frame, so the
# retirement_age column is never written into a filtered slice of another
# frame (which is what raises pandas' SettingWithCopyWarning). The trailing
# `.copy()` makes `data` an independent frame that later cells can extend.
data = (
    last_race_per_driver
    # remove drivers whose last race was after the cutoff year
    .loc[lambda df: df["year"] <= LAST_RACE_YEAR_CUTOFF]
    # retirement age = year of the final race - birth year
    .assign(retirement_age=lambda df: df["year"] - df["dob"])
    # drivers below the minimum age are not considered retired
    .loc[lambda df: df["retirement_age"] >= MIN_RETIREMENT_AGE]
    .copy()
)

In [21]:
# Show the per-driver table first, then the overall mean retirement age.
display(data)
overall_mean_age = data["retirement_age"].mean()
print(overall_mean_age)

# Mean retirement age for each nationality. With as_index=False the grouping
# key is returned as a regular "nationality" column next to the mean.
data_per_country = data.groupby("nationality", as_index=False)["retirement_age"].mean()
display(data_per_country)

Unnamed: 0,driverId,Name,dob,year,nationality,retirement_age
1,2,Nick Heidfeld,1977,2011,German,34
2,3,Nico Rosberg,1985,2016,German,31
4,5,Heikki Kovalainen,1981,2013,Finnish,32
6,7,Sébastien Bourdais,1979,2009,French,30
9,10,Timo Glock,1982,2012,German,30
...,...,...,...,...,...,...
802,811,Bruno Senna,1983,2012,Brazilian,29
804,813,Pastor Maldonado,1985,2015,Venezuelan,30
805,814,Paul di Resta,1986,2017,British,31
818,827,André Lotterer,1981,2014,German,33


36.034201954397396


Unnamed: 0,nationality,retirement_age
0,American,37.164062
1,American-Italian,41.0
2,Argentine,35.666667
3,Argentine-Italian,31.0
4,Australian,34.333333
5,Austrian,32.6
6,Belgian,37.25
7,Brazilian,33.894737
8,British,35.218487
9,Canadian,36.111111


In [22]:
# One-sample t-test: is the mean retirement age different from the
# hypothesised value?
# NOTE(review): 36 is very close to the sample mean printed by the previous
# cell -- confirm the hypothesised value was chosen independently of this sample.
null_hypothesis = 36
alpha = 0.05  # significance level used for the decision below

test_result = ttest_1samp(data["retirement_age"], null_hypothesis)
t_statistic = test_result.statistic
p_value = test_result.pvalue

# report the test statistic and the p-value
print("t-statistic:", t_statistic)
print("p-value:", p_value)

# the null hypothesis is rejected only when the p-value does not exceed alpha
print(
    "Reject the null hypothesis"
    if p_value <= alpha
    else "Fail to reject the null hypothesis"
)

t-statistic: 0.15373719302602776
p-value: 0.8778675541721859
Fail to reject the null hypothesis
