In [None]:
import pandas as pd
import numpy as np

In [None]:
# Registry of the raw tables used by the notebook, keyed by table name.
raw_dfs: dict[str, pd.DataFrame] = {
    "results": pd.read_parquet(".data_parquet/results.parquet"),
    "pit_stops": pd.read_parquet(".data_parquet/pit_stops.parquet"),
}

In [None]:
# Registry of the raw tables this cell works with, keyed by table name.
raw_dfs: dict[str, pd.DataFrame] = {
    "results": pd.read_parquet(".data_parquet/results.parquet"),
    "pit_stops": pd.read_parquet(".data_parquet/pit_stops.parquet"),
    "races": pd.read_parquet(".data_parquet/races.parquet"),
}

# Index labels of every race held in 1970 or later; the filter below matches
# them against `raceId`, so the races table is presumably indexed by raceId.
races_after_1970 = raw_dfs["races"].query("year >= 1970").index

# Keep only first-place results from those races. engine="python" is passed
# explicitly for the `in @...` membership test.
winners_df = raw_dfs["results"].query(
    "position == 1.0 & raceId in @races_after_1970",
    engine="python",
)
display("Winners dataframe", winners_df)
# Data for clustering wins:
# - Time gap from the winner to the second- and third-place finishers
# - Number of pit stops made by the winning driver
def _gap_to_position(results, race_id, position, winner_millis):
    """Return how far (in `milliseconds` units) a finisher trailed the winner.

    Looks up the first result of ``race_id`` whose ``position`` equals
    ``position`` and subtracts ``winner_millis`` from its ``milliseconds``.
    Returns NaN when the race has no such result or when the two times cannot
    be subtracted (e.g. a non-numeric placeholder).
    """
    finisher_millis = results.query("raceId == @race_id & position == @position")["milliseconds"]
    if finisher_millis.empty:
        return np.nan
    try:
        return finisher_millis.iloc[0] - winner_millis
    except TypeError:
        # Non-numeric time value: treat the gap as missing instead of failing.
        return np.nan


def calculate_cluster_info(row):
    """Add the clustering features for one winning result.

    Intended for ``winners_df.apply(calculate_cluster_info, axis=1)``. Reads
    the module-level ``raw_dfs["results"]`` and ``raw_dfs["pit_stops"]`` tables.

    Parameters
    ----------
    row : pd.Series
        One winning result. ``row.name`` must be a ``(raceId, driverId)``
        tuple; the winner's time is read from ``row["milliseconds"]``.

    Returns
    -------
    pd.Series
        A copy of ``row`` (the input is left untouched) with three extra
        entries:

        * ``difference_second`` / ``difference_third`` - time gap from the
          winner to the second / third placed result of the same race, NaN
          when it cannot be computed.
        * ``total_pit_stops`` - number of pit-stop rows recorded for the
          winner in that race, NaN when the race has no pit-stop rows at all
          (so "no data" is distinguishable from "zero stops").
    """
    race_id, driver_id = row.name[0], row.name[1]
    results = raw_dfs["results"]
    pit_stops = raw_dfs["pit_stops"]
    # A missing winner time propagates as NaN gaps rather than raising.
    winner_millis = row.get("milliseconds", np.nan)

    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # not supported by pandas.
    enriched = row.copy()
    enriched["difference_second"] = _gap_to_position(results, race_id, 2.0, winner_millis)
    enriched["difference_third"] = _gap_to_position(results, race_id, 3.0, winner_millis)

    # Assumes pit_stops carries a MultiIndex whose first level is raceId.
    if race_id in pit_stops.index.levels[0]:
        enriched["total_pit_stops"] = pit_stops.query(
            "raceId == @race_id & driverId == @driver_id"
        ).index.size
    else:
        enriched["total_pit_stops"] = np.nan
    return enriched

# Row-wise pass over the winners: each row comes back from
# calculate_cluster_info with difference_second, difference_third and
# total_pit_stops added.
winner_df = winners_df.apply(calculate_cluster_info, axis="columns")
winner_df

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import plotly.express as px

# --- Tunable analysis parameters ---------------------------------------------
# Features fed to the scaler / PCA, in column order.
FEATURE_COLUMNS = ["grid", "milliseconds", "fastestLapSpeed", "difference_second", "difference_third", "total_pit_stops"]
# Multiplier applied to the standardised `grid` column, which up-weights the
# starting position relative to the other features before the PCA.
GRID_WEIGHT = 1.25
N_CLUSTERS = 3
# Fixes the K-Means initialisation so cluster labels are reproducible on re-run.
RANDOM_SEED = 42

# Coerce every feature to a number (unparseable values become NaN) and keep
# only the wins for which all features are available.
win_features = (
    winner_df[FEATURE_COLUMNS]
    .apply(pd.to_numeric, errors="coerce")
    .dropna()
)
assert len(win_features) >= N_CLUSTERS, "not enough complete rows to cluster"

# Standardise, then re-weight the grid column.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(win_features.values)
data_scaled[:, FEATURE_COLUMNS.index("grid")] *= GRID_WEIGHT

# Project onto two principal components and cluster in that 2-D space.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(data_scaled)
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_SEED)
kmeans.fit(pca_result)

# Attach the projection and cluster label, add race metadata (the races table
# is already in raw_dfs from the earlier cell, so it is not re-read here), and
# build a readable "<year> <race name>" label. Building the frame in a single
# chain avoids assigning into a column-sliced frame.
df_to_pca = (
    win_features
    .assign(x=pca_result[:, 0], y=pca_result[:, 1], cluster=kmeans.labels_)
    .join(raw_dfs["races"], lsuffix="_results", rsuffix="_races")
    .loc[:, ['grid', 'milliseconds', 'difference_second', 'difference_third', 'total_pit_stops', 'year', 'name', 'x', 'y', 'cluster']]
    .assign(name=lambda frame: frame['year'].astype(str) + ' ' + frame['name'])
)

# Hover fields shared by both scatter plots. x / y are hidden because they are
# already encoded by the marker position.
HOVER_FIELDS = {
    'name': True,
    'x': False,
    'y': False,
    'cluster': False,
    'grid': True,
    'milliseconds': True,
    'difference_second': True,
    'difference_third': True,
    'total_pit_stops': True,
}


def plot_wins(data, color, title):
    """Scatter the wins in PCA space, coloured by the column named ``color``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the ``x`` / ``y`` coordinates, the ``color`` column and
        every column listed in ``HOVER_FIELDS``.
    color : str
        Name of the column used for the marker colour; it is hidden from the
        hover box because the legend already shows it.
    title : str
        Figure title.

    Returns
    -------
    plotly.graph_objects.Figure

    Notes
    -----
    No ``color_continuous_scale`` is passed: both colour columns are cast to
    ``str`` before plotting, so plotly uses discrete colours and a continuous
    scale would be ignored.
    """
    return px.scatter(
        data,
        x="x",
        y="y",
        color=color,
        title=title,
        labels={"x": "PCA component 1", "y": "PCA component 2"},
        hover_data={**HOVER_FIELDS, color: False},
    )


# Cast to str so plotly treats cluster labels as categories, not a numeric scale.
df_to_pca["cluster"] = df_to_pca["cluster"].astype(str)
fig = plot_wins(df_to_pca, color="cluster", title="Race wins in PCA space, coloured by K-Means cluster")
fig.show()

# Same projection, coloured by season (also categorical).
df_to_pca["year"] = df_to_pca["year"].astype(str)
fig2 = plot_wins(df_to_pca, color="year", title="Race wins in PCA space, coloured by year")
fig2.show()