In [9]:
import numpy as np 
import pandas as pd 
import fastf1
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
from sklearn.preprocessing import OrdinalEncoder

**GET DATA**

In [12]:
# session = fastf1.get_session(2024, 'Miami', 'R')
# session.load()
# print(session.results.columns)
# print(session.results.head())

In [None]:
# Get data from all races of predefined seasons

races = []
years = [2022,2023,2024,2025]
for year in years:
    print(year)
    event_schedule = fastf1.get_event_schedule(year,include_testing=False,backend='fastf1')
    no_races = len(event_schedule["EventName"])

    for race in range(1,no_races):
        session = fastf1.get_session(year, race, 'R')
        if year == 2025 and race >= 6:
            print("Miami Grand Prix 2025 is not available, now looking into the future")
        else:
            session.load()
            try:
                weather = session.weather_data[["AirTemp","Rainfall"]]
            except:
                print("Weather data not available")
                continue
            location = session.event["EventName"]
            race = session.results[["Abbreviation","TeamName","GridPosition","Position"]]
            race["EventName"] = location
            race["Year"] = year
            try:
                race["Rainfall"] = weather["Rainfall"].any() # Was there any rain in the session
                race["AirTemp"] = weather["AirTemp"].mean() # Average air temperature in the session
            except:
                race["Rainfall"] = False
                race["AirTemp"] = 22.0 # Default air temperature in case of missing data
            races.append(race)
    

races = pd.concat(races, ignore_index=True)

**PRE-PROCESSING**

In [None]:
# Make feature table
data = races.iloc[:,[3,4,0,1,2,5,6,7]].dropna()
# print(data.tail(20))

# New feature: driver performance
data["AveragePos"] = data.groupby("Abbreviation")["Position"].shift(1).rolling(window=3,min_periods=1).mean().reset_index(drop=True)

# Split features and target
X = data[["Abbreviation","TeamName","EventName","Year","Rainfall","AirTemp","GridPosition","AveragePos"]]
Y = data[["Position"]]

# One-hot encode
X_encoded = pd.get_dummies(X,columns=["EventName","TeamName","Abbreviation","Year","Rainfall","AirTemp","GridPosition","AveragePos"],drop_first=True)
# print(X_encoded)

# Binary encode - WIP
# import category_encoders as ce
# encoder = ce.BinaryEncoder(cols=["EventName","TeamName","Abbreviation"])
# data_bin = encoder.fit_transform(data)
# print(data_bin)

# Label encoding for XGB model
label_encoder_abb = preprocessing.LabelEncoder()
label_encoder_abb.fit(X["Abbreviation"])
label_encoder_team = preprocessing.LabelEncoder()
label_encoder_team.fit(X["TeamName"])
label_encoder_event = preprocessing.LabelEncoder()
label_encoder_event.fit(X["EventName"])
label_encoder_rain = preprocessing.LabelEncoder()
label_encoder_rain.fit(X["Rainfall"])

X_le = pd.DataFrame()
X_le["Abbreviation"] = label_encoder_abb.transform(X["Abbreviation"])
X_le["TeamName"] = label_encoder_team.transform(X["TeamName"])
X_le["EventName"] = label_encoder_event.transform(X["EventName"])
X_le["Year"] = X["Year"]
X_le["Rainfall"] = label_encoder_rain.transform(X["Rainfall"])
X_le["AirTemp"] = X["AirTemp"]
X_le["GridPosition"] = X["GridPosition"]
X_le["AveragePos"] = X["AveragePos"]

# Train & test split
train_split = int(len(X_le)*.70) #How many data points is 70% n--> used for training
test_split = int(len(X_le)-train_split)
test_split_sample = len(X_le) - test_split
# print(X.iloc[range(test_split_sample-10,test_split_sample+10),:]) # split exactly between two races
test_split_sample = 678

X_train = X_le.iloc[range(0,test_split_sample),[0,1,2,3,4,5,6,7]]
Y_train = Y.iloc[range(0,test_split_sample),:]

X_test = X_le.iloc[range(test_split_sample,len(X_le)),[0,1,2,3,4,5,6,7]]
Y_test = Y.iloc[range(test_split_sample,len(Y)),:]

print(X_train)

# Don't forget about validation


      Position                 EventName Abbreviation         TeamName  \
1379       1.0  Saudi Arabian Grand Prix          PIA          McLaren   
1380       2.0  Saudi Arabian Grand Prix          VER  Red Bull Racing   
1381       3.0  Saudi Arabian Grand Prix          LEC          Ferrari   
1382       4.0  Saudi Arabian Grand Prix          NOR          McLaren   
1383       5.0  Saudi Arabian Grand Prix          RUS         Mercedes   
1384       6.0  Saudi Arabian Grand Prix          ANT         Mercedes   
1385       7.0  Saudi Arabian Grand Prix          HAM          Ferrari   
1386       8.0  Saudi Arabian Grand Prix          SAI         Williams   
1387       9.0  Saudi Arabian Grand Prix          ALB         Williams   
1388      10.0  Saudi Arabian Grand Prix          HAD     Racing Bulls   
1389      11.0  Saudi Arabian Grand Prix          ALO     Aston Martin   
1390      12.0  Saudi Arabian Grand Prix          LAW     Racing Bulls   
1391      13.0  Saudi Arabian Grand Pr

**Train the model**

In [15]:
# model = RandomForestRegressor(n_estimators=100,random_state=42)
# model.fit(X_encoded.iloc[0:918,:],Y)

from xgboost import XGBRegressor

xgb_model = XGBRegressor(objective='reg:squarederror', n_estimators=100)
xgb_model.fit(X_train, Y_train)

**Test the model**

In [16]:
y_pred = xgb_model.predict(X_test)
# y_pred = model.predict(X_test)

mae = mean_absolute_error(Y_test, y_pred)

print(f"MAE: {mae:.2f}")

# Spearmans ranking 
testdata = X.iloc[range(test_split_sample,len(X_le)),[0,1,2]]
testdata["Predicted Position"] = y_pred
testdata["Actual Position"] = Y_test

# Get a list of possible eventnames
# print(X["EventName"].unique(0))
# print(testdata["EventName"].unique())
test_br = testdata[testdata["EventName"]=="Saudi Arabian Grand Prix"]

# Rank both predicted and actual positions
actual_rank = test_br["Actual Position"].rank(method='min', ascending=True)
predicted_rank = test_br["Predicted Position"].rank(method='min', ascending=True)

# Compute Spearman correlation
rho, p_value = spearmanr(actual_rank,predicted_rank)

print(f"Spearman's Rank Correlation: {rho:.3f}")
if rho == 1:
    print("Perfect ranking match")
elif rho >= 0.8:
    print("Very good ranking similarity")
elif rho >= 0.4:
    print("Some correlation")
elif rho >= 0:
    print("No correlation")
elif rho >= -1:
    print("Opposite ranking")



MAE: 3.52
Spearman's Rank Correlation: 0.670
Some correlation


Generate qualifying positions

In [21]:
# session = fastf1.get_session(2025, 'Miami', 'Q')
# session.load()
# print(session.results)
miami25 = session.results[["Abbreviation","TeamName","Position"]]

Predict weather data using historical data

In [22]:
AirTemp_dict = data.groupby("EventName")["AirTemp"].mean().to_dict()
Rainfall_dict = data.groupby("EventName")["Rainfall"].mean().to_dict()
for key in Rainfall_dict:
    if Rainfall_dict[key] <= 0.4:
        Rainfall_dict[key] = False
    else:
        Rainfall_dict[key] = True
latest_races = data[data['Year']==2025].tail(60) # Get last 3 races
AveragePos = latest_races.groupby("Abbreviation")['Position'].mean()


2025 prediction!

In [None]:
# Miami will be the 6th race of the season
miami25["EventName"] = "Miami Grand Prix"
miami25["Year"] = 2025
miami25["Rainfall"] = Rainfall_dict["Miami Grand Prix"]
miami25["AirTemp"] = AirTemp_dict["Miami Grand Prix"]
miami25["AveragePos"] = AveragePos
miami25["GridPosition"] = miami25["Position"]
# miami25 = pd.DataFrame({
#     "EventName": ["Miami Grand Prix"] * 20,
#     "Abbreviation": [
#         "VER",  # Max Verstappen
#         "TSU",  # Yuki Tsunoda
#         "HAM",  # Lewis Hamilton
#         "LEC",  # Charles Leclerc
#         "RUS",  # George Russell
#         "ANT",  # Andrea Kimi Antonelli
#         "NOR",  # Lando Norris
#         "PIA",  # Oscar Piastri
#         "ALO",  # Fernando Alonso
#         "STR",  # Lance Stroll
#         "GAS",  # Pierre Gasly
#         "DOO",  # Jack Doohan
#         "ALB",  # Alex Albon
#         "SAI",  # Carlos Sainz
#         "LAW",  # Liam Lawson
#         "HAD",  # Isack Hadjar
#         "HUL",  # Nico Hülkenberg
#         "BOR",  # Gabriel Bortoleto
#         "BEA",  # Oliver Bearman
#         "OCO"   # Esteban Ocon
#     ],
#     "TeamName": [
#         "Red Bull Racing",
#         "Red Bull Racing",
#         "Ferrari",
#         "Ferrari",
#         "Mercedes",
#         "Mercedes",
#         "McLaren",
#         "McLaren",
#         "Aston Martin",
#         "Aston Martin",
#         "Alpine",
#         "Alpine",
#         "Williams",
#         "Williams",
#         "RB",
#         "RB",
#         "Kick Sauber",
#         "Kick Sauber",
#         "Haas F1 Team",
#         "Haas F1 Team"
#     ],
#     "Year": [2025] * 20,
#     "Rainfall": Rainfall_dict["Miami Grand Prix"],
#     "AirTemp": AirTemp_dict["Miami Grand Prix"],
#     "AveragePos": AveragePos,
#     "GridPosition": data.head(20)["GridPosition"].values.tolist()
# })

# # Encode 2025 data
# miami25_encoded = pd.get_dummies(miami25,columns=['Abbreviation', 'TeamName', 'EventName'],drop_first=True)
# # Get list of columns from training data
# cols = X_encoded.columns.tolist()
# # Fill 2025 data with training columns (like old drivers or tracks), setting them to False
# miami25_encoded = miami25_encoded.reindex(columns=cols).fillna(False)
# # print(miami25_encoded)

# Label encoding for XGB model
Y_le = pd.DataFrame()
Y_le["Year"] = miami25["Year"]
Y_le["AirTemp"] = miami25["AirTemp"]
Y_le["GridPosition"] = miami25["GridPosition"]
Y_le["AveragePos"] = miami25["AveragePos"]
Y_le["Abbreviation"] = label_encoder_abb.transform(miami25["Abbreviation"])
Y_le["TeamName"] = label_encoder_team.transform(miami25["TeamName"])
Y_le["EventName"] = label_encoder_event.transform(miami25["EventName"])
Y_le["Rainfall"] = label_encoder_rain.transform(miami25["Rainfall"])

Y_le = Y_le[["Abbreviation","TeamName","EventName","Year","Rainfall","AirTemp","GridPosition","AveragePos"]]




    Abbreviation  TeamName  EventName  Year  Rainfall    AirTemp  \
1             28        11         16  2025         0  28.752234   
4             18         7         16  2025         0  28.752234   
12             2         8         16  2025         0  28.752234   
81            21         7         16  2025         0  28.752234   
63            23         8         16  2025         0  28.752234   
55            24        12         16  2025         0  28.752234   
23             0        12         16  2025         0  28.752234   
16            15         4         16  2025         0  28.752234   
31            19         5         16  2025         0  28.752234   
22            27        11         16  2025         0  28.752234   
6             10        10         16  2025         0  28.752234   
44            11         4         16  2025         0  28.752234   
5              4         6         16  2025         0  28.752234   
7              8         2         16  2025     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  miami25["EventName"] = "Miami Grand Prix"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  miami25["Year"] = 2025
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  miami25["Rainfall"] = Rainfall_dict["Miami Grand Prix"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[ro

Three options to test:
- 'normal' RandomForestRegressor
- XGB model (extreme gradiant boost) with one-hot encoded data (KEEP IN MIND TO TRAIN THE MODEL WITH CORRECTLY ENCODED DATA!)
- XGB model with label encoded data (KEEP IN MIND TO TRAIN THE MODEL WITH CORRECTLY ENCODED DATA!)

In [25]:
# predicted_positions = model.predict(miami25_encoded)
# predicted_positions = xgb_model.predict(miami25_encoded)
predicted_positions = xgb_model.predict(Y_le)

miami25["Predicted Position"] = predicted_positions

miami25_sorted = miami25.sort_values("Predicted Position").reset_index(drop=True)
miami25_sorted["FinalRank"] = miami25_sorted.index + 1
print(miami25_sorted[["Abbreviation", "TeamName", "Predicted Position", "FinalRank"]])

   Abbreviation         TeamName  Predicted Position  FinalRank
0           TSU  Red Bull Racing            4.530632          1
1           ANT         Mercedes            5.822234          2
2           NOR          McLaren            6.584948          3
3           RUS         Mercedes            7.576278          4
4           BEA     Haas F1 Team            9.384250          5
5           SAI         Williams            9.853812          6
6           HAD     Racing Bulls            9.867655          7
7           BOR      Kick Sauber           10.454054          8
8           ALB         Williams           10.526019          9
9           HAM          Ferrari           10.574513         10
10          GAS           Alpine           10.733112         11
11          STR     Aston Martin           11.251393         12
12          ALO     Aston Martin           11.507257         13
13          PIA          McLaren           11.883094         14
14          LEC          Ferrari        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  miami25["Predicted Position"] = predicted_positions
