In [None]:
import pandas as pd
import numpy as np

In [None]:
# Registry of the raw parquet tables used by this notebook, keyed by table name.
raw_dfs: dict[str, pd.DataFrame] = {
    "results": pd.read_parquet(".data_parquet/results.parquet"),
    "pit_stops": pd.read_parquet(".data_parquet/pit_stops.parquet"),
}

In [None]:
# (Re)build the registry of raw tables; this cell additionally needs the races table.
raw_dfs: dict[str, pd.DataFrame] = {
    "results": pd.read_parquet(".data_parquet/results.parquet"),
    "pit_stops": pd.read_parquet(".data_parquet/pit_stops.parquet"),
    "races": pd.read_parquet(".data_parquet/races.parquet"),
}

# Index labels of the races held in 1970 or later; they are matched against
# `raceId` below, so the races table is presumably indexed by race id.
races_after_1970 = raw_dfs["races"].query("year >= 1970").index

# Keep only the first-place result of each of those races.
# `engine="python"` is passed through unchanged from the original query.
winners_df = raw_dfs["results"].query(
    "position == 1.0 & raceId in @races_after_1970",
    engine="python",
)
display("Winners dataframe", winners_df)
# Features computed for clustering wins:
# - Time gap (in milliseconds) from the winner to the second-place and third-place finishers
# - Number of pit stops made by the winning driver
def calculate_cluster_info(
    row: pd.Series,
    results: "pd.DataFrame | None" = None,
    pit_stops: "pd.DataFrame | None" = None,
) -> pd.Series:
    """Return a copy of a winner's result row extended with clustering features.

    Three fields are appended to the row:

    * ``difference_second`` - ``milliseconds`` of the 2nd-place finisher of the
      same race minus the winner's ``milliseconds``.
    * ``difference_third`` - same, for the 3rd-place finisher.
    * ``total_pit_stops`` - number of pit-stop records for this driver in this
      race, or NaN when the race has no pit-stop records at all.

    Any value that cannot be computed (e.g. no finisher recorded at that
    position) is set to NaN instead of raising, so the function can be used
    with ``DataFrame.apply(..., axis=1)`` over incomplete data.

    Parameters
    ----------
    row
        One row of the results table. ``row.name`` must be a tuple whose first
        two items are the race id and the driver id, and the row must expose a
        ``milliseconds`` value.
    results
        Results table that can be queried by ``raceId`` and ``position`` and
        has a ``milliseconds`` column. Defaults to ``raw_dfs["results"]``.
    pit_stops
        Pit-stop table with a MultiIndex whose first level holds race ids and
        that can be queried by ``raceId`` and ``driverId``. Defaults to
        ``raw_dfs["pit_stops"]``.

    Returns
    -------
    pd.Series
        A new Series; the ``row`` passed in is left untouched.
    """
    if results is None:
        results = raw_dfs["results"]
    if pit_stops is None:
        pit_stops = raw_dfs["pit_stops"]

    race_id, driver_id = row.name[0], row.name[1]

    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by `DataFrame.apply`.
    enriched = row.copy()

    # NOTE: the `@name` references in the query strings are resolved from this
    # function's local variables, so the queries must stay in this frame.
    for place, column in ((2.0, "difference_second"), (3.0, "difference_third")):
        try:
            place_millis = results.query("raceId == @race_id & position == @place")["milliseconds"].values[0]
            enriched[column] = place_millis - row["milliseconds"]
        except Exception:
            # Best effort (e.g. IndexError when nobody finished at `place`).
            # `Exception` rather than a bare `except` so that interrupting the
            # kernel (KeyboardInterrupt) still stops a long-running `apply`.
            # `np.nan` is used because the `np.NAN` alias no longer exists in
            # NumPy 2.0.
            enriched[column] = np.nan

    try:
        if race_id in pit_stops.index.levels[0]:
            enriched["total_pit_stops"] = pit_stops.query(
                "raceId == @race_id & driverId == @driver_id"
            ).index.size
        else:
            # No pit stops recorded for this race: the count is unknown, not zero.
            enriched["total_pit_stops"] = np.nan
    except Exception:
        enriched["total_pit_stops"] = np.nan

    return enriched

# Extend every winning row with the clustering features, one call per row.
# NOTE: `winner_df` (enriched) and `winners_df` (raw selection) differ by a single letter.
winner_df = winners_df.apply(calculate_cluster_info, axis="columns")
winner_df

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Feature matrix: one row per win. Rows with any missing feature are dropped
# because the scaler, PCA and k-means below do not accept NaN.
df_to_pca = winner_df[
    ["grid", "milliseconds", "fastestLapSpeed", "difference_second", "difference_third", "total_pit_stops"]
].dropna()

# Standardise every feature so that columns on different scales contribute
# comparably to the principal components.
scaler = StandardScaler()
df_to_pca_scaled = scaler.fit_transform(df_to_pca)

# Project onto the first two principal components so the wins can be drawn in 2-D.
pca = PCA(n_components=2)
pca_to_df = pca.fit_transform(df_to_pca_scaled)
pca_df = pd.DataFrame(pca_to_df, columns=["x", "y"], index=df_to_pca.index)

# k-means starts from random centroids: fix the seed so cluster labels (and
# therefore the colours in the plot) are reproducible across runs, and state
# `n_init` explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(pca_df)
pca_df["cluster"] = kmeans.labels_

fig, ax = plt.subplots(figsize=(16, 9))
ax.scatter(pca_df["x"], pca_df["y"], c=pca_df["cluster"], cmap="viridis")
# Label every point with its index entry so individual wins can be looked up.
for point_label, point_x, point_y in zip(pca_df.index, pca_df["x"], pca_df["y"]):
    ax.annotate(point_label, (point_x, point_y))
ax.set(
    title="Winning drives: k-means clusters in PCA space",
    xlabel="Principal component 1",
    ylabel="Principal component 2",
)
plt.show()


In [None]:
# Hand-picked labels to inspect more closely; they are looked up in the races
# table and on the first index level of `winner_df`.
# TODO: record why these particular races were chosen.
wtf_races = [
    847, 1066, 1104,
    1015, 994, 1047,
    960, 900, 1092,
]

# Reload the races table so this cell does not depend on the table still being in `raw_dfs`.
raw_dfs["races"] = pd.read_parquet(".data_parquet/races.parquet")

# Race metadata first, then the enriched winner rows for the same races.
selected_races = raw_dfs["races"].loc[wtf_races]
display(selected_races)
selected_winners = winner_df.loc[(wtf_races, slice(None))]
display(selected_winners)