In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import fastf1 as ff1
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [28]:
#I. Defining all important functions that will be later called during our learning/training process.

def get_race_info(year, race_number, session='Race'):
    session = ff1.get_session(year, race_number, session)
    session.load(laps=False, telemetry=False, weather=False, messages=False)
    race_result = session.results
    return race_result[["DriverNumber", "Abbreviation", "TeamName", "LastName", "GridPosition", "Position", "Time"]]

#When building this program I started by calling a lot of times the same information so to simplify the code and make it less demanding I create this function to be able to reuse cached schedules.
def Schedule(year):
    return ff1.get_event_schedule(year, include_testing=False)

def get_calendar(year):
    calendar = Schedule(year) 
    circuits = list(calendar.Location)
    return circuits

def quali_results(year, race):
    session = ff1.get_session(year, race, 'Qualifying')
    session.load(laps = False, telemetry=False, weather=False, messages=False)
    result = session.results.copy()

    quali = result[["DriverNumber", "Abbreviation", "Position", "Q1", "Q2", "Q3"]].copy() 
    quali.rename(columns={'Position': 'QualiPosition'}, inplace=True)
    quali['Year'] = int(year)
    quali['Race'] = race
    return quali

def get_race_data(year, race):
    race_df = get_race_info(year, race).copy()
    race_df['Race'] = race
    race_df['Year'] = int(year)

    quali_df = quali_results(year, race)

    general_df = pd.merge(race_df, quali_df, on = ['Year', 'Race', 'DriverNumber', 'Abbreviation'], how = 'left')

    return general_df

def learning_table_specific_race(race_name, years): 
    rows = []
    for year in years : 
        data = get_race_data(int(year), race_name)
        rows.append(data)

    if rows : 
        df = pd.concat(rows, ignore_index = True)
        return df
        
#II. Implementing a function that cleans our dataset/preprocess it 

def clean_data_prep(data): 
    clean_data = data[['DriverNumber', 'Abbreviation', 'TeamName', 'GridPosition', 'Position', 'Time', 'Year']]
    clean_data['Year'] = clean_data['Year'].astype(int)
    encoder = LabelEncoder()
    clean_data['TeamNumber'] = encoder.fit_transform(clean_data['TeamName']) #1. I change the name of the teams into values 

    clean_data['Time_td'] = pd.to_timedelta(clean_data['Time'], errors = 'coerce') #2. Time as expressed in fastF1 API isn't readable for our model so I converted  the time stamps to time deltas to the leader/winner.
    clean_data['gap_to_winner'] = clean_data['Time_td'].dt.total_seconds()
    clean_data.loc[clean_data['Position'] == 1, 'gap_to_winner'] = 0 #This is for the first car, since it leads the race there no time delta.
    clean_data = clean_data.drop(columns=['Time', 'Time_td'])

    clean_data['GaptoP1 (sec)'] = clean_data['gap_to_winner'].fillna(9999) # I fill the empty time stamps (DnF) with '9999'
    clean_data['GridPosition'] = clean_data['GridPosition'].fillna(25) # I fill the position of drivers who don't start on the grid to P25.
    clean_data = clean_data.drop(columns=['TeamName', 'gap_to_winner'])

    return clean_data

#III. Let's now build our actual race predictor

def predict_winner(race_name, predict_year, training_years):
    predict_year = int(predict_year)
    
    all_years = training_years + [str(predict_year)]
    all_data = learning_table_specific_race(race_name, all_years)

    clean_data = clean_data_prep(all_data)
    train_data = clean_data[clean_data['Year'] != int(predict_year)].copy() 
    test_data = clean_data[clean_data['Year'] == predict_year].copy() #I selected our testing set, which is the set on which the model won't be trained on + to assess the quality of our model.

    X_train = train_data.drop(columns=['DriverNumber', 'Abbreviation', 'Position'])
    y_train = train_data['Position']
    
    X_test = test_data.drop(columns=['DriverNumber', 'Abbreviation', 'Position'])
    y_test = test_data['Position']

    #since the data doesn't participate in the same way/weight to the performance of the driver during a race, we need to scale it.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    classifier = RandomForestClassifier(n_estimators = 200, max_depth = 10, min_samples_leaf = 5, random_state = 67)
    model = classifier.fit(X_train_scaled, y_train) #Training, fitting our model to the training dataset.

    predictions = model.predict(X_test_scaled)

    results = test_data[['Abbreviation']].copy()
    results['ActualPosition'] = y_test.values
    results['PredictedPosition'] = predictions
    results = results.sort_values('PredictedPosition') #To have the full results correctly shown

    predicted_winner = results.sort_values('PredictedPosition').iloc[0]['Abbreviation']
    predicted_second = results.sort_values('PredictedPosition').iloc[1]['Abbreviation']
    predicted_third = results.sort_values('PredictedPosition').iloc[2]['Abbreviation']
    
    actual_winner = results[results['ActualPosition'] == 1]['Abbreviation'].values[0]
    
    accuracy = accuracy_score(y_test, predictions)

    print(f"\nPredicted Winner : {predicted_winner}")
    print(f"\nPredicted Second : {predicted_second}")
    print(f"\nPredicted Third : {predicted_third}")
    print(f"\nActual Winner : {actual_winner}")
    print(f"\nModel Accuracy: {round(accuracy*100,2)}% ")
    
    return predicted_winner

In [30]:
results = predict_winner('Monaco Grand Prix', '2025', ['2022', '2023', '2024'])


core           INFO 	Loading data for Monaco Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['11', '55', '1', '16', '63', '4', '14', '44', '77', '5', '10', '31', '3', '18', '6', '24', '22', '23', '47', '20']
core           INFO 	Loading data for Monaco Grand Prix - Qualifying [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['16', '55', '11', '1', '4', '63', '14', '44', '5', '31', '22', '77', '20', '3', '47', '23', '10', '18', '6', '24']
core           INFO 	Loading data for Monaco Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['1', '14', '31', '44', '63', '16', 


Predicted Winner : NOR

Predicted Second : PIA

Predicted Third : LEC

Actual Winner : NOR

Model Accuracy: 25.0% 
