In [1]:
import numpy as np 
import pandas as pd 
import fastf1
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr

**GET DATA**

In [18]:
# Get data from all races of predefined seasons


races = []
years = [2023,2024,2025]
for year in years:
    print(year)
    event_schedule = fastf1.get_event_schedule(year,include_testing=False,backend='fastf1')
    no_races = len(event_schedule["EventName"])

    for race in range(1,no_races):
        session = fastf1.get_session(year, race, 'R')
        session.load()
    
        location = session.event["EventName"]
        race = session.results[["Abbreviation","TeamName","Position"]]
        race["EventName"] = location
        race["Year"] = year
        races.append(race)
    

races = pd.concat(races, ignore_index=True)
# print(races)

2023


core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.5.3]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '14', '55', '44', '18', '63', '77', '10', '23', '22', '2', '20', '21', '27', '24', '4', '31', '16', '81']
A value is trying to be set on a copy of a slice from a DataF

2024


req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '63', '4', '44', '81', '14', '18', '24', '20', '3', '22', '23', '27', '31', '10', '77', '2']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cav

2025


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['4', '1', '63', '12', '23', '18', '27', '16', '81', '44', '10', '22', '31', '87', '30', '5', '14', '55', '7', '6']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

**PRE-PROCESSING**

In [57]:
# Make feature table
data = races.iloc[:,[3,4,0,1,2]].dropna()

# Split features and target
X = data[["Abbreviation","TeamName","EventName"]]
Y = data[["Position"]]

# One-hot encode
X_encoded = pd.get_dummies(X,columns=["EventName","TeamName","Abbreviation"],drop_first=True)
# print(X_encoded)

# Binary encode - WIP
# import category_encoders as ce
# encoder = ce.BinaryEncoder(cols=["EventName","TeamName","Abbreviation"])
# data_bin = encoder.fit_transform(data)
# print(data_bin)

# Label encoding for XGB model
label_encoder = preprocessing.LabelEncoder()
X_le = pd.DataFrame()
X_le["Abbreviation"] = label_encoder.fit_transform(X["Abbreviation"])
X_le["TeamName"] = label_encoder.fit_transform(X["TeamName"])
X_le["EventName"] = label_encoder.fit_transform(X["EventName"])
# X_le["Year"] = label_encoder.fit_transform(X["Year"]) # Only use if we want YEAR to be a feature in the model.

# print(X_le)

# Train & test split
train_split = int(len(X_le)*.70) #How many data points is 70% n--> used for training
test_split = int(len(X_le)-train_split)
test_split_sample = len(X_le) - test_split
# print(X.iloc[range(test_split_sample-10,test_split_sample+10),:]) # split exactly between two races
test_split_sample = 678

X_train = X_le.iloc[range(0,test_split_sample),[0,1,2]]
Y_train = Y.iloc[range(0,test_split_sample),:]

X_test = X_le.iloc[range(test_split_sample,len(X_le)),[0,1,2]]
Y_test = Y.iloc[range(test_split_sample,len(Y)),:]

# Don't forget about validation


**Train the model**

In [58]:
# model = RandomForestRegressor(n_estimators=100,random_state=42)
# model.fit(X_encoded.iloc[0:918,:],Y)

from xgboost import XGBRegressor

xgb_model = XGBRegressor(objective='reg:squarederror', n_estimators=100)
xgb_model.fit(X_train, Y_train)

**Test the model**

In [59]:
y_pred = xgb_model.predict(X_test)
# y_pred = model.predict(X_test)

mae = mean_absolute_error(Y_test, y_pred)

print(f"MAE: {mae:.2f}")

# Spearmans ranking 
testdata = X.iloc[range(test_split_sample,len(X_le)),[0,1,2]]
testdata["Predicted Position"] = y_pred
testdata["Actual Position"] = Y_test

# Get a list of possible eventnames
# print(X["EventName"].unique(0))
# print(testdata["EventName"].unique())
test_br = testdata[testdata["EventName"]=="Saudi Arabian Grand Prix"]

# Rank both predicted and actual positions
actual_rank = test_br["Actual Position"].rank(method='min', ascending=True)
predicted_rank = test_br["Predicted Position"].rank(method='min', ascending=True)

# Compute Spearman correlation
rho, p_value = spearmanr(actual_rank,predicted_rank)

print(f"Spearman's Rank Correlation: {rho:.3f}")
if rho == 1:
    print("Perfect ranking match")
elif rho >= 0.8:
    print("Very good ranking similarity")
elif rho >= 0.4:
    print("Some correlation")
elif rho >= 0:
    print("No correlation")
elif rho >= -1:
    print("Opposite ranking")



MAE: 4.64
Spearman's Rank Correlation: 0.442
Some correlation


In [61]:
miami25 = pd.DataFrame({
    "EventName": ["Miami Grand Prix"] * 20,
    "Abbreviation": [
        "VER",  # Max Verstappen
        "TSU",  # Yuki Tsunoda
        "HAM",  # Lewis Hamilton
        "LEC",  # Charles Leclerc
        "RUS",  # George Russell
        "ANT",  # Andrea Kimi Antonelli
        "NOR",  # Lando Norris
        "PIA",  # Oscar Piastri
        "ALO",  # Fernando Alonso
        "STR",  # Lance Stroll
        "GAS",  # Pierre Gasly
        "DOO",  # Jack Doohan
        "ALB",  # Alex Albon
        "SAI",  # Carlos Sainz
        "LAW",  # Liam Lawson
        "HAD",  # Isack Hadjar
        "HUL",  # Nico Hülkenberg
        "BOR",  # Gabriel Bortoleto
        "BEA",  # Oliver Bearman
        "OCO"   # Esteban Ocon
    ],
    "TeamName": [
        "Red Bull Racing",
        "Red Bull Racing",
        "Ferrari",
        "Ferrari",
        "Mercedes",
        "Mercedes",
        "McLaren",
        "McLaren",
        "Aston Martin",
        "Aston Martin",
        "Alpine",
        "Alpine",
        "Williams",
        "Williams",
        "RB",
        "RB",
        "Kick Sauber",
        "Kick Sauber",
        "Haas F1 Team",
        "Haas F1 Team"
    ],
    "Year": [2025] * 20
})

# Encode 2025 data
miami25_encoded = pd.get_dummies(miami25,columns=['Abbreviation', 'TeamName', 'EventName'],drop_first=True)
# Get list of columns from training data
cols = X_encoded.columns.tolist()
# Fill 2025 data with training columns (like old drivers or tracks), setting them to False
miami25_encoded = miami25_encoded.reindex(columns=cols).fillna(False)
# print(miami25_encoded)

# Label encoding for XGB model
Y_le = pd.DataFrame()
Y_le["Abbreviation"] = label_encoder.fit_transform(miami25["Abbreviation"])
Y_le["TeamName"] = label_encoder.fit_transform(miami25["TeamName"])
Y_le["EventName"] = label_encoder.fit_transform(miami25["EventName"])
# Y_le["Year"] = label_encoder.fit_transform(miami25["Year"]) # Only use if we want the YEAR to be a feature in the model.


# print(Y_le)



  miami25_encoded = miami25_encoded.reindex(columns=cols).fillna(False)


Three options to test:
- 'normal' RandomForestRegressor
- XGB model (extreme gradiant boost) with one-hot encoded data (KEEP IN MIND TO TRAIN THE MODEL WITH CORRECTLY ENCODED DATA!)
- XGB model with label encoded data (KEEP IN MIND TO TRAIN THE MODEL WITH CORRECTLY ENCODED DATA!)

In [62]:
# predicted_positions = model.predict(miami25_encoded)
# predicted_positions = xgb_model.predict(miami25_encoded)
predicted_positions = xgb_model.predict(Y_le)


miami25["Predicted Position"] = predicted_positions

miami25_sorted = miami25.sort_values("Predicted Position").reset_index(drop=True)
miami25_sorted["FinalRank"] = miami25_sorted.index + 1
print(miami25_sorted)

           EventName Abbreviation         TeamName  Year  Predicted Position  \
0   Miami Grand Prix          LAW               RB  2025            5.953818   
1   Miami Grand Prix          BEA     Haas F1 Team  2025            6.083793   
2   Miami Grand Prix          HUL      Kick Sauber  2025            6.267663   
3   Miami Grand Prix          BOR      Kick Sauber  2025            6.562505   
4   Miami Grand Prix          HAD               RB  2025            6.905750   
5   Miami Grand Prix          NOR          McLaren  2025            8.185206   
6   Miami Grand Prix          SAI         Williams  2025            9.218492   
7   Miami Grand Prix          OCO     Haas F1 Team  2025            9.808806   
8   Miami Grand Prix          ALO     Aston Martin  2025           10.387738   
9   Miami Grand Prix          GAS           Alpine  2025           10.965835   
10  Miami Grand Prix          DOO           Alpine  2025           10.965835   
11  Miami Grand Prix          ALB       