In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load every raw table once, keyed by the short name the rest of the notebook
# uses (note that the "laps" key is backed by lap_times.parquet).
raw_dfs: dict[str, pd.DataFrame] = {
    table_name: pd.read_parquet(parquet_path)
    for table_name, parquet_path in {
        "results": ".data_parquet/results.parquet",
        "pit_stops": ".data_parquet/pit_stops.parquet",
        "races": ".data_parquet/races.parquet",
        "laps": ".data_parquet/lap_times.parquet",
        "constructor_standings": ".data_parquet/constructor_standings.parquet",
        "status": ".data_parquet/status.parquet",
    }.items()
}

In [None]:
# Chronological lookup table: the index ("number") is the 1-based position of
# each race once sorted by year and round; "raceId" keeps the original index
# label of the races table.
races_in_order = (
    raw_dfs["races"][["year", "round"]]
    .sort_values(["year", "round"])
    .assign(
        number=lambda frame: range(1, len(frame) + 1),
        raceId=lambda frame: frame.index,
    )
    .set_index("number")
)
races_in_order

In [None]:
# Status labels whose rows must not be counted as DNFs (they are used to tell
# DNFs apart from DNQs); the resulting index holds the matching status ids.
exclude_status = [
    "Did not qualify",
    "Did not prequalify",
    "Not classified",
    "Withdrew",
]
not_qualified_status = raw_dfs["status"][
    raw_dfs["status"]["status"].isin(exclude_status)
].index
not_qualified_status

In [None]:
def calculate_difference_coefficient(
    raceId: int,
    verbose=False
):
    """
    Measure how large the winner's advantage was from the finishing-time gaps
    between the first three finishers of a race.

    The rows of ``raw_dfs["results"]`` for ``raceId`` are sorted by
    ``position`` and the first three non-null ``milliseconds`` values are kept.
    The two consecutive gaps (1st -> 2nd and 2nd -> 3rd) are averaged with
    quadratic weights (4 and 1), so the gap right behind the winner dominates.

    Parameters
    ----------
    raceId : int
        Row label of the race in ``raw_dfs["results"]``.
    verbose : bool, default False
        When True, print every intermediate value (or the caught error).

    Returns
    -------
    float
        Weighted mean gap, in the units of the ``milliseconds`` column, or
        ``np.nan`` when the race is missing or has fewer than three timed
        finishers.
    """
    try:
        millis = (
            raw_dfs["results"]
            .loc[(raceId, slice(None))]
            .sort_values("position")["milliseconds"]
            .dropna()
            .to_numpy()
        )
        if len(millis) < 3:
            raise ValueError("No data available")
        differences_each_driver = np.diff(millis[:3])
        # Weights decrease quadratically: [4, 1] for the two gaps.
        weights = np.arange(differences_each_driver.size, 0, -1) ** 2
        dot_product = np.dot(differences_each_driver, weights)
        sum_weights = weights.sum()
        difference_coefficient = dot_product / sum_weights
        if verbose:
            print(f"Race ID: {raceId}")
            print(f"Milliseconds: {millis}")
            print(f"Differences: {differences_each_driver}")
            print(f"Weights: {weights}")
            print(f"Dot product: {dot_product}")
            print(f"Sum weights: {sum_weights}")
            print(f"Difference coefficient: {difference_coefficient}")
        return difference_coefficient
    except Exception as excp:
        # Deliberate best-effort: any lookup/data problem maps to NaN so a
        # row-wise apply over all races keeps going.
        if verbose:
            print(excp)
        # np.nan (lowercase) is the spelling that still exists in NumPy >= 2.0.
        return np.nan


# Worked examples: a comfortable win (Silverstone 2008) and then a narrow
# one (USA 2002).
for demo_race_id in (26, 139):
    print(calculate_difference_coefficient(demo_race_id, verbose=True))

In [None]:
def calculate_total_pit_stops(
    raceId: int,
    driverId: int,
    verbose=False
):
    """
    Count a driver's pit stops in a race and compare them with the field.

    Parameters
    ----------
    raceId : int
        Row label of the race in ``raw_dfs["pit_stops"]``.
    driverId : int
        Driver whose pit stops are counted.
    verbose : bool, default False
        When True, print the intermediate values (or the caught error).

    Returns
    -------
    tuple
        ``(pit_stops, difference_from_mean)`` where ``difference_from_mean`` is
        the driver's count minus the mean count over the drivers that have at
        least one pit-stop row in that race. ``(np.nan, np.nan)`` when the
        race or the driver has no pit-stop rows.
    """
    try:
        pit_stops_by_driver = (
            raw_dfs["pit_stops"]
            .loc[raceId]
            .groupby("driverId")
            .size()
        )
        # .loc makes the lookup explicitly label-based (driverId is a label,
        # never a position).
        pit_stops_winner = pit_stops_by_driver.loc[driverId]
        mean_pit_stops = pit_stops_by_driver.mean()
        difference_from_mean = pit_stops_winner - mean_pit_stops
        if verbose:
            print(f"Race ID: {raceId}; Driver ID: {driverId}")
            print(f"Winner pit stops: {pit_stops_winner}")
            print(f"Mean pit stops: {mean_pit_stops}")
            print(f"Difference from mean: {difference_from_mean}")
        return pit_stops_winner, difference_from_mean
    except Exception as excp:
        # Deliberate best-effort: missing data maps to NaN instead of aborting
        # a row-wise apply.
        if verbose:
            print(excp)
        # np.nan (lowercase) is the spelling that still exists in NumPy >= 2.0.
        return np.nan, np.nan

# Worked examples: a win with many pit stops (Canada 2011) and then one with
# few pit stops (Spain 2016).
for demo_race_id, demo_driver_id in ((847, 18), (952, 830)):
    print(calculate_total_pit_stops(demo_race_id, demo_driver_id, verbose=True))

In [None]:
def calculate_constructor_performance_coefficient(
    raceId: int,
    driverId: int,
    verbose=False
):
    """
    Share of the wins in the five races before ``raceId`` that went to the
    constructor the driver raced for in ``raceId``.

    Parameters
    ----------
    raceId : int
        Race being evaluated; must appear in ``races_in_order["raceId"]``.
    driverId : int
        Driver whose constructor is evaluated.
    verbose : bool, default False
        When True, print the intermediate values (or the caught error).

    Returns
    -------
    float
        ``wins_of_constructor / total_wins`` over the previous races (fewer
        than five near the start of the table), or ``np.nan`` when the lookup
        fails, e.g. unknown race/driver or a constructor with no result rows
        in that window.
    """
    try:
        # The (raceId, driverId) lookup yields a scalar when that pair is
        # unique and a Series when it is repeated; atleast_1d covers both and
        # keeps the first match.
        constructor_ids = raw_dfs["results"]["constructorId"].loc[(raceId, driverId)]
        constructorId = np.atleast_1d(constructor_ids)[0]
        # Get race number
        race_number = races_in_order.query("raceId == @raceId").index[0]
        # Get the last five races raceId (label slice, both ends inclusive)
        previous_five_raceId = races_in_order.loc[race_number - 5: race_number - 1]["raceId"]
        # Get the results of the last 5 races, group wins by constructor
        previous_wins_by_constructor = (
            raw_dfs["results"]
            .loc[(previous_five_raceId, slice(None))]
            .groupby("constructorId")["position"]
            .apply(lambda x: (x == 1).sum())
        )
        # Get the ratio of wins by constructor of the driver
        previous_constructor_wins_ratio = (
            previous_wins_by_constructor.loc[constructorId] /
            previous_wins_by_constructor.sum()
        )
        if verbose:
            print(f"Race ID: {raceId}; Driver ID: {driverId}")
            print(f"Constructor ID of the winning driver: {constructorId}")
            print(f"Race number: {race_number}")
            print(f"Last 5 races raceId: {previous_five_raceId}")
            print(f"Previous wins by constructor in the last 5 races: {previous_wins_by_constructor}")
            print(f"Ratio of wins by constructor: {previous_constructor_wins_ratio}")
        return previous_constructor_wins_ratio
    except Exception as excp:
        # Deliberate best-effort: missing data maps to NaN instead of aborting
        # a row-wise apply.
        if verbose:
            print(excp)
        # np.nan (lowercase) is the spelling that still exists in NumPy >= 2.0.
        return np.nan


# Worked examples: a win with the constructor in good form (Russia 2016) and
# then one with the constructor in poor form (Hungary 2021).
for demo_race_id, demo_driver_id in ((951, 3), (1062, 839)):
    print(
        calculate_constructor_performance_coefficient(
            demo_race_id, demo_driver_id, verbose=True
        )
    )

In [None]:
def calculate_dnfs_in_race(
    raceId: int,
    verbose=False
):
    """
    Count the DNFs of a race and the share of the field they represent.

    Rows whose ``statusId`` is in ``not_qualified_status`` are dropped first;
    among the remaining rows a DNF is a row with a null ``position``.

    Parameters
    ----------
    raceId : int
        Row label of the race in ``raw_dfs["results"]``.
    verbose : bool, default False
        When True, print the intermediate values (or the caught error).

    Returns
    -------
    tuple
        ``(dnf_count, dnf_ratio)``; ``dnf_ratio`` is ``np.nan`` when no rows
        remain after the status filter, and both values are ``np.nan`` when
        the race cannot be looked up.
    """
    try:
        race_results = raw_dfs["results"].loc[(raceId, slice(None))]
        # Keep only the rows that are not flagged by an excluded status.
        race_results = race_results[
            ~race_results["statusId"].isin(not_qualified_status)
        ]
        total_drivers = race_results.shape[0]
        dnf_count = race_results["position"].isna().sum()
        # Avoid a 0/0 division (and its RuntimeWarning) when every row was
        # filtered out.
        dnf_ratio = dnf_count / total_drivers if total_drivers else np.nan
        if verbose:
            print(f"Race ID: {raceId}")
            print(f"Total drivers: {total_drivers}")
            print(f"DNF count: {dnf_count}")
            print(f"DNF ratio: {dnf_ratio}")
        return dnf_count, dnf_ratio
    except Exception as excp:
        # Deliberate best-effort: missing data maps to NaN instead of aborting
        # a row-wise apply.
        if verbose:
            print(excp)
        # np.nan (lowercase) is the spelling that still exists in NumPy >= 2.0.
        return np.nan, np.nan


# Worked examples: a win with many DNFs (Monaco 1996) and then one with few
# DNFs (Europe 2011).
for demo_race_id in (229, 848):
    print(calculate_dnfs_in_race(demo_race_id, verbose=True))

In [None]:
def calculate_position_distribution(
    raceId: int,
    driverId: int,
    verbose=False
):
    """
    Summarise the positions a driver held over the laps of a race.

    Parameters
    ----------
    raceId : int
        Race label in ``raw_dfs["laps"]``.
    driverId : int
        Driver label in ``raw_dfs["laps"]``.
    verbose : bool, default False
        When True, print the intermediate values (or the caught error).

    Returns
    -------
    tuple
        ``(worst_position, mean_position)``: the maximum and the mean of the
        ``position`` column over the driver's lap rows (a driver who led every
        lap has mean 1.0). ``(np.nan, np.nan)`` when no lap rows can be
        looked up.
    """
    try:
        lap_positions = raw_dfs["laps"].loc[(raceId, driverId)]["position"]
        worst_position = lap_positions.max()
        mean_position = lap_positions.mean()
        if verbose:
            print(f"Race ID: {raceId}; Driver ID: {driverId}")
            print(f"Worst position: {worst_position}")
            print(f"Mean position: {mean_position}")
        return worst_position, mean_position
    except Exception as excp:
        # Deliberate best-effort: missing data maps to NaN instead of aborting
        # a row-wise apply.
        if verbose:
            print(excp)
        # np.nan (lowercase) is the spelling that still exists in NumPy >= 2.0.
        return np.nan, np.nan
    

# Worked examples: a lights-to-flag win (Japan 2012) and then a win with a
# recovery drive (Brazil 2012).
for demo_race_id, demo_driver_id in ((874, 20), (879, 20)):
    print(calculate_position_distribution(demo_race_id, demo_driver_id, verbose=True))

In [None]:
# Keep only the result rows of race winners (position equal to 1).
winners_df = raw_dfs["results"].loc[
    lambda results: results["position"] == 1.0
]


def calculate_cluster_info(row):
    """
    Return a copy of a winner's result row extended with the clustering features.

    Intended for ``DataFrame.apply(..., axis=1)`` on a frame whose row labels
    are ``(raceId, driverId)`` tuples (read from ``row.name``).

    Parameters
    ----------
    row : pd.Series
        One result row; ``row.name[0]`` is the race id and ``row.name[1]`` the
        driver id.

    Returns
    -------
    pd.Series
        The original entries followed by ``difference_coefficient``,
        ``total_pit_stops``, ``pit_stop_diff_from_mean``,
        ``constructor_performance_coefficient``, ``dnf_count``, ``dnf_ratio``,
        ``worst_position`` and ``mean_position``.
    """
    raceId = row.name[0]
    driverId = row.name[1]
    # pandas does not support functions that mutate the object apply() hands
    # in, so every new entry is written to a copy instead.
    enriched = row.copy()
    enriched["difference_coefficient"] = calculate_difference_coefficient(raceId)
    enriched["total_pit_stops"], enriched["pit_stop_diff_from_mean"] = calculate_total_pit_stops(
        raceId,
        driverId
    )
    enriched["constructor_performance_coefficient"] = calculate_constructor_performance_coefficient(
        raceId,
        driverId
    )
    enriched["dnf_count"], enriched["dnf_ratio"] = calculate_dnfs_in_race(raceId)
    enriched["worst_position"], enriched["mean_position"] = calculate_position_distribution(
        raceId,
        driverId
    )
    return enriched


# Compute the clustering features for every winning result, one row at a time.
winners_with_extra_info_df = winners_df.apply(
    calculate_cluster_info,
    axis="columns",
)
winners_with_extra_info_df

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import plotly.express as px

# Columns of the results table that are left out of the feature matrix.
excluded_columns = [
    "constructorId", "position", "points", "laps", "milliseconds", "fastestLap",
    "rank", "fastestLapTime", "fastestLapSpeed", "statusId", "fastestLapMillis"]
columns_used_for_pca = winners_with_extra_info_df.columns.difference(
    excluded_columns)

# Feature matrix: coerce everything to numbers and keep only complete rows.
df_to_pca = (
    winners_with_extra_info_df[columns_used_for_pca]
    .apply(lambda a: pd.to_numeric(a, errors="coerce"))
    .dropna()
)

# Standardise, project onto two principal components, then cluster the
# projected points.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_to_pca.values)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(data_scaled)
# Fixed seed so Restart & Run All reproduces the same cluster assignment.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(pca_result)
df_to_pca["x"] = pca_result[:, 0]
df_to_pca["y"] = pca_result[:, 1]
df_to_pca["cluster"] = kmeans.labels_

# Attach a readable "<year> <race name>" label for the hover box, using the
# races table that is already loaded in raw_dfs.
df_to_pca = df_to_pca.join(
    raw_dfs["races"][['name', 'year']], lsuffix="_results", rsuffix="_races")
df_to_pca['name'] = df_to_pca['year'].astype(str) + ' ' + df_to_pca['name']

df_to_pca["cluster"] = df_to_pca["cluster"].astype(str)
# Show the race label and every feature on hover; hide plot coordinates and
# the columns that are already encoded elsewhere.
hover_data_dict = {"name": True}
hover_data_dict.update({column: True for column in df_to_pca.columns})
hover_data_dict.update(x=False, y=False, cluster=False, year=False)
# NOTE(review): "cluster" is a string column here, so plotly express presumably
# treats it as categorical and ignores color_continuous_scale — confirm.
fig = px.scatter(df_to_pca, x="x", y="y", color="cluster", hover_data=hover_data_dict,
                 color_continuous_scale=px.colors.sequential.Burgyl)
fig.show()

# Same projection, coloured by season instead of by cluster.
df_to_pca["year"] = df_to_pca["year"].astype(str)
fig2 = px.scatter(df_to_pca, x="x", y="y", color="year", hover_data=hover_data_dict,
                  color_continuous_scale=px.colors.sequential.RdBu)
fig2.show()

In [None]:
from dash import Dash, dcc, html, Input, Output, dash_table
import plotly.express as px
import plotly.graph_objects as go

# Feature columns the dashboard can use, mapped to their checklist labels.
available_parameters = dict(
    grid="Grid starting position",
    difference_coefficient="Difference coefficient",
    total_pit_stops="Total pit stops",
    pit_stop_diff_from_mean="Pit stop difference from mean",
    constructor_performance_coefficient="Constructor performance coefficient",
    dnf_count="DNF count",
    dnf_ratio="DNF ratio",
    worst_position="Worst position",
    mean_position="Mean position",
)

app = Dash(__name__)


# Page layout: the scatter plot, a checklist of feature columns (all ticked
# initially), a year range slider and a table for the plotted rows.
app.layout = html.Div(
    children=[
        html.H4("Interactive scatter plot"),
        dcc.Graph(id="scatter-plot"),
        html.P("Select used parameters"),
        dcc.Checklist(
            id="select_checklist",
            options=[
                {"label": label, "value": column}
                for column, label in available_parameters.items()
            ],
            value=list(available_parameters),
        ),
        # Filter by year
        dcc.RangeSlider(
            id="year_slider",
            min=1950,
            max=2023,
            step=1,
            value=[1950, 2023],
            marks={str(decade): str(decade) for decade in range(1950, 2023, 10)},
        ),
        dash_table.DataTable(id="selected_data_table"),
    ]
)


def _year_slider_marks(first_year: int, last_year: int) -> dict:
    """Return about ten evenly spaced ``{year: label}`` slider marks for the given span."""
    # Clamp the step to 1: a span shorter than ten years would otherwise hand
    # range() a zero step, which raises ValueError.
    step = max(1, (last_year - first_year + 1) // 10)
    return {str(year): str(year) for year in range(first_year, last_year + 1, step)}


@app.callback(
    Output("scatter-plot", "figure"),
    Output("year_slider", "min"),
    Output("year_slider", "max"),
    Output("year_slider", "marks"),
    Output("selected_data_table", "columns"),
    Output("selected_data_table", "data"),
    Input("select_checklist", "value"),
    Input("year_slider", "value"))
def update_bar_chart(select_checklist, year_slider):
    """
    Rebuild the scatter plot, the slider bounds and the data table.

    Parameters
    ----------
    select_checklist : list[str]
        Feature columns of ``winners_with_extra_info_df`` that are ticked.
    year_slider : list[int]
        ``[first_year, last_year]`` currently selected on the slider.

    Returns
    -------
    tuple
        Six values matching the declared outputs: figure, slider min, slider
        max, slider marks, table columns and table records. With no feature
        ticked, or too few rows to plot, the figure is empty and the table is
        cleared, but all six values are still returned.

    Notes
    -----
    One feature is plotted against a constant zero axis, two features are
    plotted directly, and three or more are standardised and projected onto
    two principal components. The plotted points are then clustered with
    KMeans (at most three clusters).
    """
    select_checklist = list(select_checklist or [])
    all_winners = winners_with_extra_info_df

    # A row is usable only when every selected feature converts to a number.
    if select_checklist:
        numeric_features = all_winners[select_checklist].apply(
            lambda a: pd.to_numeric(a, errors="coerce")
        )
        has_all_features = numeric_features.notna().all(axis=1).to_numpy()
    else:
        numeric_features = all_winners[[]]
        has_all_features = np.ones(len(all_winners), dtype=bool)

    # Season of every winner row, aligned by position with all_winners. Using
    # positional masks (instead of re-selecting by label) cannot duplicate
    # rows when an index key appears more than once.
    race_years = (
        raw_dfs["races"]["year"]
        .reindex(all_winners.index.get_level_values(0))
        .to_numpy(dtype=float)
    )

    # Slider bounds follow the seasons for which the selected features exist.
    usable_years = race_years[has_all_features & ~np.isnan(race_years)]
    if usable_years.size:
        new_available_min_year = int(usable_years.min())
        new_available_max_year = int(usable_years.max())
    else:
        # Nothing usable: fall back to the whole races table so the slider
        # still receives valid bounds.
        new_available_min_year = int(raw_dfs["races"]["year"].min())
        new_available_max_year = int(raw_dfs["races"]["year"].max())
    marks = _year_slider_marks(new_available_min_year, new_available_max_year)

    in_year_range = (race_years >= year_slider[0]) & (race_years <= year_slider[1])
    selected_rows = has_all_features & in_year_range
    df_to_pca = numeric_features[selected_rows]
    showcase_df = all_winners[selected_rows].copy()

    needs_pca = len(select_checklist) > 2
    if (
        not select_checklist
        or len(showcase_df) == 0
        or (needs_pca and len(showcase_df) < 2)
    ):
        # Every declared output must still receive a value.
        return go.Figure(), new_available_min_year, new_available_max_year, marks, [], []

    hover_data_dict = {}
    if len(select_checklist) == 1:
        # Single feature: plot it against a constant axis.
        x = select_checklist[0]
        y = "zero"
        showcase_df[y] = 0.
    elif len(select_checklist) == 2:
        # Two features: simple scatter.
        x, y = select_checklist
    else:
        # More than 2, PCA
        scaler = StandardScaler()
        data_scaled = scaler.fit_transform(df_to_pca.values)
        pca = PCA(n_components=2)
        pca_result = pca.fit_transform(data_scaled)
        x = "x"
        y = "y"
        showcase_df[x] = pca_result[:, 0]
        showcase_df[y] = pca_result[:, 1]
        hover_data_dict[x] = False
        hover_data_dict[y] = False

    # KMeans cannot form more clusters than there are rows; the fixed seed
    # keeps the colouring stable between callbacks.
    kmeans = KMeans(n_clusters=min(3, len(showcase_df)), random_state=0)
    kmeans.fit(showcase_df[[x, y]])
    showcase_df["cluster"] = kmeans.labels_
    showcase_df["cluster"] = showcase_df["cluster"].astype(str)
    showcase_df = showcase_df.join(raw_dfs["races"], lsuffix="_results", rsuffix="_races")
    showcase_df['name'] = showcase_df['year'].astype(str) + ' ' + showcase_df['name']
    hover_data_dict['name'] = True
    hover_data_dict['cluster'] = False
    for key in select_checklist:
        hover_data_dict[key] = True
    fig = px.scatter(showcase_df, x=x, y=y, color="cluster", hover_data=hover_data_dict, color_continuous_scale=px.colors.sequential.Burgyl)
    table_columns = [{"name": col, "id": col} for col in showcase_df.columns]
    return (
        fig,
        new_available_min_year,
        new_available_max_year,
        marks,
        table_columns,
        showcase_df.to_dict("records"),
    )


app.run(jupyter_mode="external", debug=True)