In [None]:
import pandas as pd
import numpy as np

In [None]:
# Raw input tables keyed by a short name. Note that the "laps" entry is backed
# by lap_times.parquet, not laps.parquet.
raw_dfs: dict[str, pd.DataFrame] = {
    "results": pd.read_parquet(".data_parquet/results.parquet"),
    "pit_stops": pd.read_parquet(".data_parquet/pit_stops.parquet"),
    "races": pd.read_parquet(".data_parquet/races.parquet"),
    "laps": pd.read_parquet(".data_parquet/lap_times.parquet"),
}

In [None]:
# Result rows whose finishing position equals 1.0, i.e. the race winners.
winners_df = (
    raw_dfs["results"]
    .query("position == 1.0", engine="python")
)

def calculate_cluster_info(row, dfs=None):
    """Add clustering features to one race winner's result row.

    Written for ``DataFrame.apply(calculate_cluster_info, axis=1)`` on a
    results frame; the race and driver ids are read from ``row.name[0]`` and
    ``row.name[1]``.

    Features written to the returned row:

    * ``difference_coefficient`` -- difference between the first two non-null
      ``milliseconds`` values of the race's results ordered by ``position``.
    * ``fl_percentage`` -- with ``n = min(driver lap count, 15)``, the share of
      the driver's ``n`` fastest lap times whose value also occurs among the
      race's ``n`` fastest lap times (matching is by lap-time value).
    * ``lap_spread_coefficient`` -- standard deviation of the driver's lap
      times after discarding the five slowest ones.
    * ``total_pit_stops`` -- number of ``pit_stops`` rows for this driver in
      this race.

    Every feature is best-effort: when its inputs are missing or the lookup
    fails, the feature is set to NaN instead of raising.

    Args:
        row: One row of the results frame, as passed by ``DataFrame.apply``.
        dfs: Optional mapping with the ``"results"``, ``"laps"`` and
            ``"pit_stops"`` frames. Defaults to the module-level ``raw_dfs``.

    Returns:
        A copy of ``row`` with the four feature entries added.
    """
    if dfs is None:
        dfs = raw_dfs
    raceId, driverId = row.name[0], row.name[1]
    fastest_lap_count = 15
    slowest_laps_dropped = 5

    # pandas does not support apply() callbacks that mutate the object they
    # are handed, so the new fields are written to a copy.
    row = row.copy()

    # `except Exception` keeps the best-effort behaviour while still letting
    # KeyboardInterrupt / SystemExit through. np.nan is used because the
    # np.NAN alias no longer exists in NumPy 2.
    try:
        finish_times = (
            dfs["results"]
            .loc[raceId]
            .sort_values("position")["milliseconds"]
            .dropna()
            .values
        )
        if len(finish_times) < 2:
            raise ValueError("No data available")
        row["difference_coefficient"] = np.diff(finish_times)[0]
    except Exception:
        row["difference_coefficient"] = np.nan

    try:
        laps = dfs["laps"]
        # Explicit row-axis lookups; a bare tuple handed to DataFrame.loc is
        # ambiguous between "rows, columns" and "MultiIndex key".
        driver_laps = np.sort(
            laps.loc[(raceId, driverId), :]["milliseconds"].dropna().values
        )
        if driver_laps.size == 0:
            raise ValueError("No lap times available")
        num_laps = min(driver_laps.size, fastest_lap_count)
        race_laps = np.sort(laps.loc[raceId]["milliseconds"].dropna().values)
        in_race_fastest = np.isin(driver_laps[:num_laps], race_laps[:num_laps])
        fl_percentage = np.count_nonzero(in_race_fastest) / num_laps
        # NOTE(review): the spread uses every lap except the five slowest, not
        # the same "15 fastest" subset as fl_percentage -- confirm this is the
        # intended definition.
        trimmed_laps = driver_laps[:-slowest_laps_dropped]
        lap_spread = np.std(trimmed_laps) if trimmed_laps.size else np.nan
    except Exception:
        fl_percentage = np.nan
        lap_spread = np.nan
    row["fl_percentage"] = fl_percentage
    row["lap_spread_coefficient"] = lap_spread

    try:
        pit_stops = dfs["pit_stops"]
        if raceId not in pit_stops.index.levels[0]:
            raise ValueError("No pit stops recorded for this race")
        # NOTE(review): a driver without any rows in a race that does have
        # pit-stop data raises KeyError here and therefore also yields NaN
        # (not 0) -- confirm that is what the clustering should see.
        row["total_pit_stops"] = pit_stops.loc[(raceId, driverId), :].index.size
    except Exception:
        row["total_pit_stops"] = np.nan
    return row

# Enrich every winner row with the clustering features, one row at a time,
# then display the resulting frame as the cell output.
winners_with_extra_info_df = winners_df.apply(
    calculate_cluster_info,
    axis="columns",
)
winners_with_extra_info_df

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import plotly.express as px

# Numeric feature matrix for the winners; rows with any non-numeric or missing
# feature are dropped before scaling.
df_to_pca = winners_with_extra_info_df[["grid", "difference_coefficient", "fl_percentage", "lap_spread_coefficient", "total_pit_stops"]].copy()
df_to_pca = df_to_pca.apply(lambda a: pd.to_numeric(a, errors="coerce"))
df_to_pca = df_to_pca.dropna()

# Standardise the features, project them onto two principal components and
# cluster the projected points.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_to_pca.values)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(data_scaled)
# Fixed seed so the cluster assignment is reproducible across runs.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(pca_result)
df_to_pca["x"] = pca_result[:, 0]
df_to_pca["y"] = pca_result[:, 1]
df_to_pca["cluster"] = kmeans.labels_

# Attach race metadata (already loaded into raw_dfs["races"]) for labelling.
df_to_pca = df_to_pca.join(raw_dfs["races"], lsuffix="_results", rsuffix="_races")
# .copy() so the column assignment below writes to an independent frame rather
# than to a slice of the joined one.
df_to_pca = df_to_pca[['grid', 'difference_coefficient', 'year', 'name', 'x', 'y', 'cluster']].copy()
df_to_pca['name'] = df_to_pca['year'].astype(str) + ' ' + df_to_pca['name']

# Scatter of the two PCA components coloured by cluster; the cluster labels
# are cast to strings before being used as the colour column.
df_to_pca["cluster"] = df_to_pca["cluster"].astype(str)
fig = px.scatter(
    df_to_pca,
    x="x",
    y="y",
    color="cluster",
    hover_data=dict(
        name=True,
        x=False,
        y=False,
        cluster=False,
        grid=True,
        difference_coefficient=True,
    ),
    color_continuous_scale=px.colors.sequential.Burgyl,
)
fig.show()

# Same scatter coloured by season; the year is cast to a string before being
# used as the colour column.
df_to_pca["year"] = df_to_pca["year"].astype(str)
fig2 = px.scatter(
    df_to_pca,
    x="x",
    y="y",
    color="year",
    hover_data=dict(
        name=True,
        x=False,
        y=False,
        cluster=False,
        grid=True,
        difference_coefficient=True,
        year=False,
    ),
    color_continuous_scale=px.colors.sequential.RdBu,
)
fig2.show()

In [None]:
from dash import Dash, dcc, html, Input, Output
import plotly.express as px
import plotly.graph_objects as go

# Dash application: a heading, the graph filled in by the callback, and a
# checklist selecting which feature columns are used for the plot.
app = Dash(__name__)

app.layout = html.Div(children=[
    html.H4(children='Interactive scatter plot'),
    dcc.Graph(id="scatter-plot"),
    html.P(children="Select used parameters"),
    dcc.Checklist(
        id="select_checklist",
        # All selectable feature columns ...
        options=["grid", "difference_coefficient", "fl_percentage", "lap_spread_coefficient", "total_pit_stops"],
        # ... and the ones ticked initially (everything except total_pit_stops).
        value=["grid", "difference_coefficient", "fl_percentage", "lap_spread_coefficient"],
    ),
])


@app.callback(
    Output("scatter-plot", "figure"),
    Input("select_checklist", "value"))
def update_bar_chart(select_checklist):
    """Rebuild the clustered scatter plot for the ticked feature columns.

    * one feature   -> the feature on x against a constant ``zero`` column on y
    * two features  -> a plain scatter of the two columns
    * three or more -> the features are standardised and reduced to two PCA
      components, which become x and y

    The plotted (x, y) points are clustered into three k-means clusters that
    drive the colour. Rows lacking a numeric value for any selected feature
    are left out.

    Args:
        select_checklist: Column names currently ticked in the checklist; may
            be empty or ``None`` when nothing is ticked.

    Returns:
        A plotly figure. When nothing is selected, or fewer complete rows than
        clusters remain, an empty figure carrying an explanatory title is
        returned instead of raising inside the callback.
    """
    n_clusters = 3

    if not select_checklist:
        return go.Figure().update_layout(title_text="Select at least one parameter")

    df_to_pca = (
        winners_with_extra_info_df[select_checklist]
        .apply(lambda a: pd.to_numeric(a, errors="coerce"))
        .dropna()
    )
    # KMeans (and PCA) cannot be fitted on fewer samples than clusters.
    if len(df_to_pca) < n_clusters:
        return go.Figure().update_layout(
            title_text="Not enough complete rows for the selected parameters"
        )

    showcase_df = winners_with_extra_info_df.loc[df_to_pca.index].copy()
    hover_data_dict = {}

    if len(select_checklist) == 1:
        # Single feature: spread the points along x on a flat y axis.
        x = select_checklist[0]
        y = "zero"
        showcase_df[y] = 0.
        hover_data_dict[y] = False  # helper column, not worth showing on hover
    elif len(select_checklist) == 2:
        x, y = select_checklist
    else:
        # More than two features: standardise and project onto two components.
        data_scaled = StandardScaler().fit_transform(df_to_pca.values)
        pca_result = PCA(n_components=2).fit_transform(data_scaled)
        x = "x"
        y = "y"
        showcase_df[x] = pca_result[:, 0]
        showcase_df[y] = pca_result[:, 1]
        hover_data_dict[x] = False
        hover_data_dict[y] = False

    # Fixed seed so cluster colours do not reshuffle on every callback run.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(showcase_df[[x, y]])
    showcase_df["cluster"] = kmeans.labels_
    showcase_df["cluster"] = showcase_df["cluster"].astype(str)

    showcase_df = showcase_df.join(raw_dfs["races"], lsuffix="_results", rsuffix="_races")
    showcase_df['name'] = showcase_df['year'].astype(str) + ' ' + showcase_df['name']
    hover_data_dict['name'] = True
    hover_data_dict['cluster'] = False
    for key in select_checklist:
        hover_data_dict[key] = True

    fig = px.scatter(showcase_df, x=x, y=y, color="cluster", hover_data=hover_data_dict)
    return fig


# Start the Dash server with debugging enabled, using the "external" Jupyter mode.
app.run(
    debug=True,
    jupyter_mode="external",
)