In [1]:
import pandas as pd
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

# Load datasets
data_2023 = pd.read_csv("2024_data/2023_results.csv")
data_2024 = pd.read_csv("2024_data/2024_race_results.csv")

# Add a column for year
data_2023['Year'] = 2023
data_2024['Year'] = 2024

# Columns to keep
columns_to_keep = ['Track', 'Driver', 'Team', 'Starting Grid', 'Position', 'Year']
data_2023 = data_2023[columns_to_keep]
data_2024 = data_2024[columns_to_keep]

# Combine datasets
combined_data = pd.concat([data_2023, data_2024], ignore_index=True)

combined_data = combined_data[~combined_data['Position'].isin(['DQ', 'NC'])]

# Convert Finishing Position to numeric
combined_data['Position'] = combined_data['Position'].astype(int)

# Feature engineering
combined_data['driver_avg_starting_position'] = combined_data.groupby('Driver')['Starting Grid'].transform('mean')
combined_data['driver_avg_finishing_position'] = combined_data.groupby('Driver')['Position'].transform('mean')
combined_data['team_avg_starting_position'] = combined_data.groupby('Team')['Starting Grid'].transform('mean')
combined_data['team_avg_finishing_position'] = combined_data.groupby('Team')['Position'].transform('mean')
combined_data['track_driver_avg_finishing_position'] = combined_data.groupby(['Track', 'Driver'])['Position'].transform('mean')
combined_data['track_team_avg_finishing_position'] = combined_data.groupby(['Track', 'Team'])['Position'].transform('mean')

# Calculate the current form of the driver
combined_data['driver_form'] = combined_data.groupby('Driver', group_keys=False)['Position'].apply(lambda x: x.rolling(10, min_periods=1).mean())

# Calculate the current form of the team 
combined_data['team_form'] = combined_data.groupby('Team', group_keys=False)['Position'].apply(lambda x: x.rolling(10, min_periods=1).mean())

# Driver's race win count
combined_data['driver_win_count'] = combined_data.groupby('Driver')['Position'].transform(lambda x: (x == 1).sum())

# Team's race win count
combined_data['team_win_count'] = combined_data.groupby('Team')['Position'].transform(lambda x: (x == 1).sum())

# Driver's podium finishes count
combined_data['driver_podium_count'] = combined_data.groupby('Driver')['Position'].transform(lambda x: (x <= 3).sum())

# Team's podium finishes count
combined_data['team_podium_count'] = combined_data.groupby('Team')['Position'].transform(lambda x: (x <= 3).sum())

# Interaction terms
combined_data['driver_team_interaction'] = combined_data.groupby(['Driver', 'Team'])['Position'].transform('mean')
combined_data['track_starting_grid_interaction'] = combined_data.groupby(['Track', 'Starting Grid'])['Position'].transform('mean')
combined_data['driver_track_interaction'] = combined_data.groupby(['Driver', 'Track'])['Position'].transform('mean')
combined_data['team_track_interaction'] = combined_data.groupby(['Team', 'Track'])['Position'].transform('mean')
combined_data['Driver'] = combined_data['Driver'].replace({'Alexander Albon': 'Alex Albon', 'Guanyu Zhou': 'Zhou Guanyu'})

combined_data.to_csv("cleaned_data/combined_data.csv")


In [2]:
# Load cleaned data 
combined_data = pd.read_csv("cleaned_data/combined_data.csv")

# Select features
features = combined_data[['Starting Grid', 'driver_avg_starting_position', 'driver_avg_finishing_position', 
                          'team_avg_starting_position', 'team_avg_finishing_position', 'track_driver_avg_finishing_position', 
                          'track_team_avg_finishing_position', 'driver_form', 'team_form', 'driver_team_interaction', 
                          'track_starting_grid_interaction', 'driver_track_interaction', 'team_track_interaction', 
                          'team_podium_count', 'driver_podium_count', 'team_win_count', 'driver_win_count']]

target = combined_data['Position']

# Categorical variables to dummy variables
features = pd.get_dummies(features, drop_first=True)
joblib.dump(features.columns, 'models/features_columns.pkl')

# Add polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
features_poly = poly.fit_transform(features)
joblib.dump(poly, 'models/poly_transformer.pkl')

# Split data 
X_train, X_test, y_train, y_test = train_test_split(features_poly, target, test_size=0.2, random_state=42)

# Parameter grid 
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the model
model = RandomForestRegressor(random_state=42)

# Initialize GridSearchCVf
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Get best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'models/f1_finishing_position_predictor.pkl')

print("Best Parameters:", best_params)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Mse and r2 for evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.4s
[CV] END m

In [3]:
combined_data = pd.read_csv("cleaned_data/combined_data.csv")
best_model = joblib.load('models/f1_finishing_position_predictor.pkl')

def safe_get_value(series, default_value=0):
    try:
        return series.values[0]
    except IndexError:
        return default_value

def predict_finishing_position(track, driver, starting_grid):
    # Create a DataFrame for the input
    input_data = {
        'Track': [track],
        'Driver': [driver],
        'Starting Grid': [starting_grid]
    }

    # Calculate the engineered features for user input
    input_df = pd.DataFrame(input_data)
    input_df['driver_avg_starting_position'] = safe_get_value(combined_data.groupby('Driver')['Starting Grid'].transform('mean').loc[combined_data['Driver'] == driver])
    input_df['driver_avg_finishing_position'] = safe_get_value(combined_data.groupby('Driver')['Position'].transform('mean').loc[combined_data['Driver'] == driver])
    input_df['team_avg_starting_position'] = safe_get_value(combined_data.groupby('Team')['Starting Grid'].transform('mean').loc[combined_data['Driver'] == driver])
    input_df['team_avg_finishing_position'] = safe_get_value(combined_data.groupby('Team')['Position'].transform('mean').loc[combined_data['Driver'] == driver])
    input_df['track_driver_avg_finishing_position'] = safe_get_value(combined_data.groupby(['Track', 'Driver'])['Position'].transform('mean').loc[(combined_data['Track'] == track) & (combined_data['Driver'] == driver)])
    input_df['track_team_avg_finishing_position'] = safe_get_value(combined_data.groupby(['Track', 'Team'])['Position'].transform('mean').loc[(combined_data['Track'] == track) & (combined_data['Driver'] == driver)])
    input_df['driver_form'] = safe_get_value(combined_data.groupby('Driver', group_keys=False)['Position'].apply(lambda x: x.rolling(10, min_periods=1).mean()).loc[combined_data['Driver'] == driver])
    input_df['team_form'] = safe_get_value(combined_data.groupby('Team', group_keys=False)['Position'].apply(lambda x: x.rolling(10, min_periods=1).mean()).loc[combined_data['Driver'] == driver])
    input_df['driver_win_count'] = safe_get_value(combined_data.groupby('Driver')['Position'].transform(lambda x: (x == 1).sum()).loc[combined_data['Driver'] == driver])
    input_df['team_win_count'] = safe_get_value(combined_data.groupby('Team')['Position'].transform(lambda x: (x == 1).sum()).loc[combined_data['Driver'] == driver])
    input_df['driver_podium_count'] = safe_get_value(combined_data.groupby('Driver')['Position'].transform(lambda x: (x <= 3).sum()).loc[combined_data['Driver'] == driver])
    input_df['team_podium_count'] = safe_get_value(combined_data.groupby('Team')['Position'].transform(lambda x: (x <= 3).sum()).loc[combined_data['Driver'] == driver])
    input_df['driver_team_interaction'] = safe_get_value(combined_data.groupby(['Driver', 'Team'])['Position'].transform('mean').loc[(combined_data['Driver'] == driver) & (combined_data['Track'] == track)])
    input_df['track_starting_grid_interaction'] = safe_get_value(combined_data.groupby(['Track', 'Starting Grid'])['Position'].transform('mean').loc[(combined_data['Track'] == track) & (combined_data['Starting Grid'] == starting_grid)])
    input_df['driver_track_interaction'] = safe_get_value(combined_data.groupby(['Driver', 'Track'])['Position'].transform('mean').loc[(combined_data['Driver'] == driver) & (combined_data['Track'] == track)])
    input_df['team_track_interaction'] = safe_get_value(combined_data.groupby(['Team', 'Track'])['Position'].transform('mean').loc[(combined_data['Track'] == track) & (combined_data['Driver'] == driver)])

    # Convert categorical variables to dummy variables
    input_df = pd.get_dummies(input_df, drop_first=True)

    # Ensure the input_df has the same columns as the training data
    missing_cols = set(features.columns) - set(input_df.columns)
    for col in missing_cols:
        input_df[col] = 0
    input_df = input_df[features.columns]

    # Add polynomial features
    input_features_poly = poly.transform(input_df)

    # Predict the finishing position
    prediction = best_model.predict(input_features_poly)
    
    # Round the predicted finishing position to the nearest whole number
    rounded_prediction = round(prediction[0])
    
    return rounded_prediction

# Example of user input 

track = "Abu Dhabi"  
driver = "Alex Albon"  
starting_grid = 20 

predicted_position = predict_finishing_position(track, driver, starting_grid)
print(f"Predicted Finishing Position for {driver} starting at {starting_grid} on {track}: {predicted_position}")


Predicted Finishing Position for Alex Albon starting at 20 on Abu Dhabi: 16
