In [11]:
import pandas as pd
from ipywidgets import interact, IntSlider
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Read the cleaned race-results and qualifying-results tables.
# Forward slashes are used so the relative paths resolve on Windows, Linux and
# macOS alike (a backslash is only a path separator on Windows).
results_data_file = pd.read_csv("../cleaned_data/f1db-races-race-results-CLEANED.csv")
quals_data_file = pd.read_csv("../cleaned_data/f1db-races-qualifying-results-CLEANED.csv")

# Unique race ids, in order of first appearance in the results table.
race_id_list = results_data_file['raceId'].unique()

In [13]:
# Compute the difference between Fastest Qualifying Time grid and the Starting Grid
#
# For every race, drivers are re-ranked by their best qualifying lap and the
# absolute displacement between that rank and the driver's row position in the
# qualifying table is summed over all drivers.
# NOTE(review): this treats the row order of the qualifying table as the
# starting-grid order -- confirm against the data set.

# Rows with a positive Q1 time; only used to select which races to analyse.
time_data = quals_data_file[quals_data_file['q1Millis'] > 0]
new_race_id_list = time_data['raceId'].unique()

qs_difference = {}  # here 'qs difference' stands for 'Qualifying-Starting difference'

for race_id in new_race_id_list:
    # All qualifying rows of the race, including drivers without a Q1 time.
    qual_data = quals_data_file[quals_data_file['raceId'] == race_id]

    # Best lap per driver across Q1-Q3. pandas skips NaN here, whereas the
    # builtin min() gives order-dependent results as soon as a NaN is involved.
    # Drivers with no time at all get +inf so that they sort last.
    best_times = (
        qual_data[['q1Millis', 'q2Millis', 'q3Millis']]
        .min(axis=1)
        .fillna(float('inf'))
        .tolist()
    )

    # Row indices ordered by best time. sorted() is stable, so drivers with
    # identical times keep their relative row order instead of all being
    # matched to the first occurrence (which is what list.index() would do).
    rows_by_pace = sorted(range(len(best_times)), key=best_times.__getitem__)

    # Total displacement between pace rank and row position.
    qs_difference[race_id] = sum(
        abs(pace_rank - row) for pace_rank, row in enumerate(rows_by_pace)
    )

In [23]:
# Writing methods to determine race time clustering

def new_time_open_ball(time, time_list, time_gap):
    """Return the open ball of radius ``time_gap`` centred on ``time``.

    Parameters
    ----------
    time : number
        Centre of the ball; it is always part of the result.
    time_list : iterable of numbers
        Candidate times to test against the centre.
    time_gap : number
        Radius of the ball. The comparison is strict, so a candidate exactly
        ``time_gap`` away from ``time`` is not included.

    Returns
    -------
    set
        ``time`` together with every element of ``time_list`` whose absolute
        distance to ``time`` is strictly less than ``time_gap``.
    """
    neighbours = {
        candidate
        for candidate in time_list
        if abs(time - candidate) < time_gap
    }
    return neighbours | {time}

def get_clusters(set_of_times):
    """Merge overlapping sets of times into disjoint clusters.

    Two input sets end up in the same cluster when they share an element,
    directly or through a chain of other input sets (i.e. the clusters are the
    connected components of the "overlaps" relation).

    Parameters
    ----------
    set_of_times : iterable of set
        Sets to merge, e.g. the open balls built by ``new_time_open_ball``.
        The input sets are not modified.

    Returns
    -------
    list of set
        Pairwise-disjoint clusters whose union equals the union of the input
        sets, ordered by the position of the earliest input set they contain.
        An empty input set overlaps nothing and yields its own empty cluster.
    """
    time_clusters = []

    for candidate in set_of_times:
        merged = set(candidate)  # copy so the caller's sets stay untouched
        kept = []
        insert_at = None

        # Existing clusters are pairwise disjoint, so one pass is enough to
        # absorb every cluster that touches the candidate. Merging as each set
        # arrives makes the result independent of the input order; a single
        # forward sweep from a seed set would miss links that only appear
        # through sets seen earlier in the sweep.
        for cluster in time_clusters:
            if cluster & merged:
                merged |= cluster
                if insert_at is None:
                    insert_at = len(kept)  # keep the earliest cluster's slot
            else:
                kept.append(cluster)

        if insert_at is None:
            kept.append(merged)
        else:
            kept.insert(insert_at, merged)
        time_clusters = kept

    return time_clusters

def sqmean_drivers_per_cluster(clusters_of_time, num_of_drivers):
    """Return the sum of squared cluster sizes divided by the driver count.

    Computes ``sum(len(c) ** 2 for c in clusters) / num_of_drivers``. If the
    clusters partition the drivers, this is the average size of the cluster a
    driver belongs to.

    Parameters
    ----------
    clusters_of_time : iterable of sized collections
        The clusters; only their lengths are used.
    num_of_drivers : int
        Normalising driver count.

    Returns
    -------
    float
        The normalised sum of squares, or NaN when ``num_of_drivers`` is 0
        (no drivers means the statistic is undefined, and NaN avoids a
        ``ZeroDivisionError`` in the middle of a plotting loop).
    """
    if num_of_drivers == 0:
        return float('nan')

    sum_of_squares = sum(len(cluster) ** 2 for cluster in clusters_of_time)
    return sum_of_squares / num_of_drivers

def sqmean_drivers_per_cluster_D2(clusters_of_time, num_of_drivers):
    """Return the sum of squared cluster sizes divided by the squared driver count.

    Computes ``sum(len(c) ** 2 for c in clusters) / num_of_drivers ** 2``. If
    the clusters partition the drivers, the result lies in ``(0, 1]`` and
    equals 1 exactly when all drivers share a single cluster.

    Parameters
    ----------
    clusters_of_time : iterable of sized collections
        The clusters; only their lengths are used.
    num_of_drivers : int
        Normalising driver count.

    Returns
    -------
    float
        The normalised sum of squares, or NaN when ``num_of_drivers`` is 0
        (no drivers means the statistic is undefined, and NaN avoids a
        ``ZeroDivisionError`` in the middle of a plotting loop).
    """
    if num_of_drivers == 0:
        return float('nan')

    sum_of_squares = sum(len(cluster) ** 2 for cluster in clusters_of_time)
    return sum_of_squares / num_of_drivers**2

def _race_cluster_scores(race_ids, time_gap, score_fn, x_of_race=None):
    """Build (x value, cluster score) records, one per race with timed results.

    For each race id, the positive ``timeMillis`` values of that race are read
    from ``results_data_file``, an open ball of radius ``time_gap`` is built
    around each time, the balls are merged with ``get_clusters`` and the
    clusters are scored with ``score_fn(clusters, number_of_timed_drivers)``.

    Parameters
    ----------
    race_ids : iterable
        Race ids to process, in output order.
    time_gap : number
        Radius (same unit as ``timeMillis``) passed to ``new_time_open_ball``.
    score_fn : callable
        ``sqmean_drivers_per_cluster`` or ``sqmean_drivers_per_cluster_D2``.
    x_of_race : callable, optional
        Maps a race id to the x value of its record; the race id itself is
        used when omitted.

    Returns
    -------
    list of tuple
        ``(x value, score)`` pairs. Races without any positive ``timeMillis``
        are skipped: their score would require dividing by zero drivers.
    """
    records = []

    for race_id in race_ids:
        race_rows = results_data_file[results_data_file['raceId'] == race_id]
        timed_rows = race_rows[race_rows['timeMillis'] > 0]
        finish_times = list(timed_rows['timeMillis'])
        if not finish_times:
            continue  # nothing to cluster for this race

        balls = [new_time_open_ball(t, finish_times, time_gap) for t in finish_times]
        clusters = get_clusters(balls)

        x_value = race_id if x_of_race is None else x_of_race(race_id)
        records.append((x_value, score_fn(clusters, len(finish_times))))

    return records


def _plot_cluster_regression(records, x_column, x_label, title, y_max):
    """Scatter the records with a linear regression line and show the figure.

    ``records`` are ``(x, score)`` pairs; ``x_column`` names the x column of
    the intermediate DataFrame, ``x_label`` and ``title`` are the axis label
    and figure title, and the y axis is fixed to ``[0, y_max]``.
    """
    cluster_data = pd.DataFrame(records, columns=[x_column, "cluster squared mean"])
    plt.figure(figsize=(10, 6))
    sns.regplot(data=cluster_data, x=x_column, y="cluster squared mean",
                order=1, line_kws={'color': 'red', "linewidth": 2})
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel("Cluster Squared Mean")
    plt.ylim(0, y_max)
    plt.show()


def clustering_vs_raceid_plot(time_gap):
    """Plot ``sqmean_drivers_per_cluster`` against race id for every race in ``race_id_list``.

    ``time_gap`` is the clustering radius in the unit of ``timeMillis``.
    """
    records = _race_cluster_scores(race_id_list, time_gap, sqmean_drivers_per_cluster)
    _plot_cluster_regression(
        records, "raceId", "Race ID",
        f"Time Clustering Squared Mean (Time Gap = {time_gap} ms)", 6,
    )


def clustering_vs_raceid_plot_D2(time_gap):
    """Plot ``sqmean_drivers_per_cluster_D2`` against race id for every race in ``race_id_list``.

    ``time_gap`` is the clustering radius in the unit of ``timeMillis``.
    """
    records = _race_cluster_scores(race_id_list, time_gap, sqmean_drivers_per_cluster_D2)
    _plot_cluster_regression(
        records, "raceId", "Race ID",
        f"Time Clustering Squared Mean D^2 (Time Gap = {time_gap} ms)", 1,
    )


def clustering_vs_qsdiff_plot(time_gap):
    """Plot ``sqmean_drivers_per_cluster`` against ``qs_difference`` for the races in ``new_race_id_list``.

    ``time_gap`` is the clustering radius in the unit of ``timeMillis``.
    """
    records = _race_cluster_scores(
        new_race_id_list, time_gap, sqmean_drivers_per_cluster,
        x_of_race=lambda race_id: qs_difference[race_id],
    )
    _plot_cluster_regression(
        records, "qs difference", "QS Difference",
        f"Time Clustering Squared Mean (Time Gap = {time_gap} ms)", 6,
    )


def clustering_vs_qsdiff_plot_D2(time_gap):
    """Plot ``sqmean_drivers_per_cluster_D2`` against ``qs_difference`` for the races in ``new_race_id_list``.

    ``time_gap`` is the clustering radius in the unit of ``timeMillis``.
    """
    records = _race_cluster_scores(
        new_race_id_list, time_gap, sqmean_drivers_per_cluster_D2,
        x_of_race=lambda race_id: qs_difference[race_id],
    )
    _plot_cluster_regression(
        records, "qs difference", "QS Difference",
        f"Time Clustering Squared Mean D^2 (Time Gap = {time_gap} ms)", 1,
    )

In [19]:
# Interactive Plot Cluster Squared Mean vs Race ID
# The trailing ';' keeps the cell from echoing the function object that
# interact() returns; the widget itself is still displayed.
interact(clustering_vs_raceid_plot, time_gap=IntSlider(min=100, max=4000, step=100, value=1000));

interactive(children=(IntSlider(value=1000, description='time_gap', max=4000, min=100, step=100), Output()), _…

<function __main__.clustering_vs_raceid_plot(time_gap)>

In [24]:
# Interactive Plot Cluster Squared Mean D^2 vs Race ID
# The trailing ';' keeps the cell from echoing the function object that
# interact() returns; the widget itself is still displayed.
interact(clustering_vs_raceid_plot_D2, time_gap=IntSlider(min=100, max=4000, step=100, value=1000));

interactive(children=(IntSlider(value=1000, description='time_gap', max=4000, min=100, step=100), Output()), _…

<function __main__.clustering_vs_raceid_plot_D2(time_gap)>

In [None]:
# Interactive Plot Cluster Squared Mean vs QS Difference
# The trailing ';' keeps the cell from echoing the function object that
# interact() returns; the widget itself is still displayed.
interact(clustering_vs_qsdiff_plot, time_gap=IntSlider(min=100, max=4000, step=100, value=1000));

interactive(children=(IntSlider(value=1000, description='time_gap', max=4000, min=100, step=100), Output()), _…

<function __main__.clustering_vs_qsdiff_plot(time_gap)>

In [25]:
# Interactive Plot Cluster Squared Mean D^2 vs QS Difference
# The trailing ';' keeps the cell from echoing the function object that
# interact() returns; the widget itself is still displayed.
interact(clustering_vs_qsdiff_plot_D2, time_gap=IntSlider(min=100, max=4000, step=100, value=1000));

interactive(children=(IntSlider(value=1000, description='time_gap', max=4000, min=100, step=100), Output()), _…

<function __main__.clustering_vs_qsdiff_plot_D2(time_gap)>