In [86]:
import pandas as pd

# Load the datasets
data_2023 = pd.read_csv("2024_data/2023_results.csv")
data_2024 = pd.read_csv("2024_data/2024_race_results.csv")

# Add the year column
data_2023['Year'] = 2023
data_2024['Year'] = 2024

# Keep only the required columns
columns_to_keep = ['Track', 'Driver', 'Team', 'Starting Grid', 'Position', 'Year']
data_2023 = data_2023[columns_to_keep]
data_2024 = data_2024[columns_to_keep]

# Combine the datasets
combined_data = pd.concat([data_2023, data_2024], ignore_index=True)

# Replace DQ and NC values in the Position column with 20
combined_data['Position'] = combined_data['Position'].replace({'DQ': 20, 'NC': 20})

# Convert Position column to numeric
combined_data['Position'] = pd.to_numeric(combined_data['Position'], errors='coerce')

# Feature engineering
combined_data['driver_avg_starting_position'] = combined_data.groupby('Driver')['Starting Grid'].transform('mean')
combined_data['driver_avg_finishing_position'] = combined_data.groupby('Driver')['Position'].transform('mean')
combined_data['team_avg_starting_position'] = combined_data.groupby('Team')['Starting Grid'].transform('mean')
combined_data['team_avg_finishing_position'] = combined_data.groupby('Team')['Position'].transform('mean')
combined_data['track_driver_avg_finishing_position'] = combined_data.groupby(['Track', 'Driver'])['Position'].transform('mean')
combined_data['track_team_avg_finishing_position'] = combined_data.groupby(['Track', 'Team'])['Position'].transform('mean')

# Calculate the current form of the driver (average of the last 10 races)
combined_data['driver_form'] = combined_data.groupby('Driver', group_keys=False)['Position'].apply(lambda x: x.rolling(10, min_periods=1).mean())

# Calculate the current form of the team (average of the last 10 races)
combined_data['team_form'] = combined_data.groupby('Team', group_keys=False)['Position'].apply(lambda x: x.rolling(10, min_periods=1).mean())

# Driver's race win count
combined_data['driver_win_count'] = combined_data.groupby('Driver')['Position'].transform(lambda x: (x == 1).sum())

# Team's race win count
combined_data['team_win_count'] = combined_data.groupby('Team')['Position'].transform(lambda x: (x == 1).sum())

# Driver's podium finishes count (top 3 finishes)
combined_data['driver_podium_count'] = combined_data.groupby('Driver')['Position'].transform(lambda x: (x <= 3).sum())

# Team's podium finishes count (top 3 finishes)
combined_data['team_podium_count'] = combined_data.groupby('Team')['Position'].transform(lambda x: (x <= 3).sum())

# Save the cleaned and enhanced dataset
combined_data.to_csv("cleaned_final_data.csv", index=False)

# Display the first few rows of the enhanced dataframe
print(combined_data.head())


     Track           Driver                          Team  Starting Grid  \
0  Bahrain   Max Verstappen    Red Bull Racing Honda RBPT              1   
1  Bahrain     Sergio Perez    Red Bull Racing Honda RBPT              2   
2  Bahrain  Fernando Alonso  Aston Martin Aramco Mercedes              5   
3  Bahrain     Carlos Sainz                       Ferrari              4   
4  Bahrain   Lewis Hamilton                      Mercedes              7   

   Position  Year  driver_avg_starting_position  \
0         1  2023                      2.705882   
1         2  2023                      9.058824   
2         3  2023                      7.941176   
3         4  2023                      5.545455   
4         5  2023                      6.676471   

   driver_avg_finishing_position  team_avg_starting_position  \
0                       2.058824                    5.882353   
1                       6.705882                    5.882353   
2                       7.941176            

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the cleaned dataset
combined_data = pd.read_csv("cleaned_final_data.csv")

# Select relevant features and target variable
features = combined_data[['Track', 'Driver', 'Starting Grid', 'driver_avg_starting_position', 'driver_avg_finishing_position', 
                          'team_avg_starting_position', 'team_avg_finishing_position', 'track_driver_avg_finishing_position', 
                          'track_team_avg_finishing_position', 'driver_form', 'team_form', 'team_podium_count', 'driver_podium_count', 'team_win_count', 'driver_win_count']]
target = combined_data['Position']

# Convert categorical variables to dummy variables
features = pd.get_dummies(features, columns=['Track', 'Driver'], drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

# Display the first few rows of the features to verify
X_train.head()


Mean Squared Error: 7.925469117647058
R² Score: 0.7826446061299808


Unnamed: 0,Starting Grid,driver_avg_starting_position,driver_avg_finishing_position,team_avg_starting_position,team_avg_finishing_position,track_driver_avg_finishing_position,track_team_avg_finishing_position,driver_form,team_form,team_podium_count,...,Driver_Logan Sargeant,Driver_Max Verstappen,Driver_Nico Hulkenberg,Driver_Nyck De Vries,Driver_Oliver Bearman,Driver_Oscar Piastri,Driver_Pierre Gasly,Driver_Sergio Perez,Driver_Valtteri Bottas,Driver_Yuki Tsunoda
430,15,13.421053,13.421053,14.431818,13.636364,11.0,9.5,12.714286,11.5,0,...,False,False,False,False,False,False,False,False,False,False
249,16,12.264706,12.735294,12.352941,12.264706,10.0,6.5,11.5,13.4,2,...,False,False,False,False,False,False,False,False,False,False
220,6,2.705882,2.058824,5.882353,4.382353,1.0,1.5,1.1,2.7,43,...,False,True,False,False,False,False,False,False,False,False
18,3,5.0,7.558824,5.352941,7.191176,12.0,7.75,20.0,12.0,19,...,False,False,False,False,False,False,False,False,False,False
244,6,5.545455,6.818182,5.352941,7.191176,5.0,12.5,7.7,7.5,19,...,False,False,False,False,False,False,False,False,False,False


In [89]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)


In [90]:
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 7.925469117647058
R^2 Score: 0.7826446061299808


In [92]:
# Add interaction terms
combined_data['driver_team_interaction'] = combined_data.groupby(['Driver', 'Team'])['Position'].transform('mean')
combined_data['track_starting_grid_interaction'] = combined_data.groupby(['Track', 'Starting Grid'])['Position'].transform('mean')

# Select relevant features and target variable
features = combined_data[['Track', 'Driver', 'Starting Grid', 'driver_avg_starting_position', 'driver_avg_finishing_position', 
                          'team_avg_starting_position', 'team_avg_finishing_position', 'track_driver_avg_finishing_position', 
                          'track_team_avg_finishing_position', 'driver_form', 'team_form', 
                          'driver_team_interaction', 'track_starting_grid_interaction', 'team_podium_count', 'driver_podium_count', 'team_win_count', 'driver_win_count']]
target = combined_data['Position']

# Convert categorical variables to dummy variables
features = pd.get_dummies(features, columns=['Track', 'Driver'], drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [93]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the model
model = RandomForestRegressor(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.7s
[CV] END m

In [94]:
# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 4.682008492209817
R^2 Score: 0.8715962695935447


In [115]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

# Load the datasets
data_2023 = pd.read_csv("2024_data/2023_results.csv")
data_2024 = pd.read_csv("2024_data/2024_race_results.csv")

# Add the year column
data_2023['Year'] = 2023
data_2024['Year'] = 2024

# Keep only the required columns
columns_to_keep = ['Track', 'Driver', 'Team', 'Starting Grid', 'Position', 'Year']
data_2023 = data_2023[columns_to_keep]
data_2024 = data_2024[columns_to_keep]

# Combine the datasets
combined_data = pd.concat([data_2023, data_2024], ignore_index=True)

combined_data = combined_data[~combined_data['Position'].isin(['DQ', 'NC'])]

# Convert the Finishing Position column to numeric
combined_data['Position'] = combined_data['Position'].astype(int)

# Feature engineering
combined_data['driver_avg_starting_position'] = combined_data.groupby('Driver')['Starting Grid'].transform('mean')
combined_data['driver_avg_finishing_position'] = combined_data.groupby('Driver')['Position'].transform('mean')
combined_data['team_avg_starting_position'] = combined_data.groupby('Team')['Starting Grid'].transform('mean')
combined_data['team_avg_finishing_position'] = combined_data.groupby('Team')['Position'].transform('mean')
combined_data['track_driver_avg_finishing_position'] = combined_data.groupby(['Track', 'Driver'])['Position'].transform('mean')
combined_data['track_team_avg_finishing_position'] = combined_data.groupby(['Track', 'Team'])['Position'].transform('mean')

# Calculate the current form of the driver (average of the last 10 races)
combined_data['driver_form'] = combined_data.groupby('Driver', group_keys=False)['Position'].apply(lambda x: x.rolling(10, min_periods=1).mean())

# Calculate the current form of the team (average of the last 10 races)
combined_data['team_form'] = combined_data.groupby('Team', group_keys=False)['Position'].apply(lambda x: x.rolling(10, min_periods=1).mean())

# Driver's race win count
combined_data['driver_win_count'] = combined_data.groupby('Driver')['Position'].transform(lambda x: (x == 1).sum())

# Team's race win count
combined_data['team_win_count'] = combined_data.groupby('Team')['Position'].transform(lambda x: (x == 1).sum())

# Driver's podium finishes count (top 3 finishes)
combined_data['driver_podium_count'] = combined_data.groupby('Driver')['Position'].transform(lambda x: (x <= 3).sum())

# Team's podium finishes count (top 3 finishes)
combined_data['team_podium_count'] = combined_data.groupby('Team')['Position'].transform(lambda x: (x <= 3).sum())

# Add interaction terms
combined_data['driver_team_interaction'] = combined_data.groupby(['Driver', 'Team'])['Position'].transform('mean')
combined_data['track_starting_grid_interaction'] = combined_data.groupby(['Track', 'Starting Grid'])['Position'].transform('mean')
combined_data['driver_track_interaction'] = combined_data.groupby(['Driver', 'Track'])['Position'].transform('mean')
combined_data['team_track_interaction'] = combined_data.groupby(['Team', 'Track'])['Position'].transform('mean')

combined_data.to_csv("combined_data.csv")


In [116]:

# Select relevant features and target variable
features = combined_data[['Starting Grid', 'driver_avg_starting_position', 'driver_avg_finishing_position', 
                          'team_avg_starting_position', 'team_avg_finishing_position', 'track_driver_avg_finishing_position', 
                          'track_team_avg_finishing_position', 'driver_form', 'team_form', 'driver_team_interaction', 
                          'track_starting_grid_interaction', 'driver_track_interaction', 'team_track_interaction', 
                          'team_podium_count', 'driver_podium_count', 'team_win_count', 'driver_win_count']]

target = combined_data['Position']

# Convert categorical variables to dummy variables
features = pd.get_dummies(features, drop_first=True)

# Add polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
features_poly = poly.fit_transform(features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_poly, target, test_size=0.2, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the model
model = RandomForestRegressor(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   7.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   7.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   7.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   7.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   7.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  11.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  12.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  12.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   8.4s
[CV] END m

In [117]:
import joblib

# Save the model to a file
joblib.dump(best_model, 'f1_finishing_position_predictor.pkl')


['f1_finishing_position_predictor.pkl']

In [183]:
import joblib
import pandas as pd

# Load the combined data to get the mappings
combined_data = pd.read_csv("combined_data.csv")

# Load the saved model
best_model = joblib.load('f1_finishing_position_predictor.pkl')

def safe_get_value(series, default_value=0):
    try:
        return series.values[0]
    except IndexError:
        return default_value

def predict_finishing_position(track, driver, starting_grid):
    # Create a DataFrame for the input
    input_data = {
        'Track': [track],
        'Driver': [driver],
        'Starting Grid': [starting_grid]
    }

    # Calculate the engineered features for the input data
    input_df = pd.DataFrame(input_data)
    input_df['driver_avg_starting_position'] = safe_get_value(combined_data.groupby('Driver')['Starting Grid'].transform('mean').loc[combined_data['Driver'] == driver])
    input_df['driver_avg_finishing_position'] = safe_get_value(combined_data.groupby('Driver')['Position'].transform('mean').loc[combined_data['Driver'] == driver])
    input_df['team_avg_starting_position'] = safe_get_value(combined_data.groupby('Team')['Starting Grid'].transform('mean').loc[combined_data['Driver'] == driver])
    input_df['team_avg_finishing_position'] = safe_get_value(combined_data.groupby('Team')['Position'].transform('mean').loc[combined_data['Driver'] == driver])
    input_df['track_driver_avg_finishing_position'] = safe_get_value(combined_data.groupby(['Track', 'Driver'])['Position'].transform('mean').loc[(combined_data['Track'] == track) & (combined_data['Driver'] == driver)])
    input_df['track_team_avg_finishing_position'] = safe_get_value(combined_data.groupby(['Track', 'Team'])['Position'].transform('mean').loc[(combined_data['Track'] == track) & (combined_data['Driver'] == driver)])
    input_df['driver_form'] = safe_get_value(combined_data.groupby('Driver', group_keys=False)['Position'].apply(lambda x: x.rolling(10, min_periods=1).mean()).loc[combined_data['Driver'] == driver])
    input_df['team_form'] = safe_get_value(combined_data.groupby('Team', group_keys=False)['Position'].apply(lambda x: x.rolling(10, min_periods=1).mean()).loc[combined_data['Driver'] == driver])
    input_df['driver_win_count'] = safe_get_value(combined_data.groupby('Driver')['Position'].transform(lambda x: (x == 1).sum()).loc[combined_data['Driver'] == driver])
    input_df['team_win_count'] = safe_get_value(combined_data.groupby('Team')['Position'].transform(lambda x: (x == 1).sum()).loc[combined_data['Driver'] == driver])
    input_df['driver_podium_count'] = safe_get_value(combined_data.groupby('Driver')['Position'].transform(lambda x: (x <= 3).sum()).loc[combined_data['Driver'] == driver])
    input_df['team_podium_count'] = safe_get_value(combined_data.groupby('Team')['Position'].transform(lambda x: (x <= 3).sum()).loc[combined_data['Driver'] == driver])
    input_df['driver_team_interaction'] = safe_get_value(combined_data.groupby(['Driver', 'Team'])['Position'].transform('mean').loc[(combined_data['Driver'] == driver) & (combined_data['Track'] == track)])
    input_df['track_starting_grid_interaction'] = safe_get_value(combined_data.groupby(['Track', 'Starting Grid'])['Position'].transform('mean').loc[(combined_data['Track'] == track) & (combined_data['Starting Grid'] == starting_grid)])
    input_df['driver_track_interaction'] = safe_get_value(combined_data.groupby(['Driver', 'Track'])['Position'].transform('mean').loc[(combined_data['Driver'] == driver) & (combined_data['Track'] == track)])
    input_df['team_track_interaction'] = safe_get_value(combined_data.groupby(['Team', 'Track'])['Position'].transform('mean').loc[(combined_data['Track'] == track) & (combined_data['Driver'] == driver)])

    # Convert categorical variables to dummy variables
    input_df = pd.get_dummies(input_df, drop_first=True)

    # Ensure the input_df has the same columns as the training data
    missing_cols = set(features.columns) - set(input_df.columns)
    for col in missing_cols:
        input_df[col] = 0
    input_df = input_df[features.columns]

    # Add polynomial features
    input_features_poly = poly.transform(input_df)

    # Predict the finishing position
    prediction = best_model.predict(input_features_poly)
    
    return prediction[0]

# Example usage
track = "Abu Dhabi"  # Replace with input track
driver = "Logan Sargeant"  # Replace with input driver
starting_grid = 10 # Replace with input starting grid position

predicted_position = predict_finishing_position(track, driver, starting_grid)
print(f"Predicted Finishing Position for {driver} starting at {starting_grid} on {track}: {predicted_position}")


Predicted Finishing Position for Logan Sargeant starting at 10 on Abu Dhabi: 15.183511139595705
