In [1]:
import pandas as pd
import numpy as np
import fastf1 as ff
import logging
logging.getLogger("fastf1").setLevel(logging.ERROR)
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
def process_session(job_info):
    """Load one session and return its accurate laps joined with weather data.

    Args:
        job_info: ``(year, event, session_type)`` tuple; the three values are
            passed straight to ``ff.get_session``.

    Returns:
        A DataFrame holding the laps whose ``IsAccurate`` flag is True and
        whose ``LapTime`` is present, each merged with a weather sample via
        ``pd.merge_asof`` on ``Time`` and tagged with ``EventName`` and
        ``Year`` columns. ``None`` when loading or processing the session
        raised; the failure is logged as a warning instead of being dropped
        silently.
    """
    year, event, session_type = job_info
    try:
        session = ff.get_session(year, event, session_type)
        session.load(laps=True, telemetry=False, weather=True, messages=False)

        # merge_asof needs both frames ordered by the join key.
        laps = session.laps.sort_values('Time')
        weather = session.weather_data.sort_values('Time')

        laps_with_weather = pd.merge_asof(
            left=laps,
            right=weather,
            on='Time'
        )

        clean_laps = laps_with_weather.loc[laps_with_weather['IsAccurate'] == True].copy()
        clean_laps = clean_laps.dropna(subset=['LapTime'])
        clean_laps['EventName'] = event
        clean_laps['Year'] = year

        return clean_laps

    except Exception as exc:
        # Best-effort by design: one broken session must not abort the whole
        # batch, but the reason has to be recorded so missing races can be
        # diagnosed afterwards.
        logging.getLogger("f1_dataset").warning(
            "Skipping %s %s (%s): %s: %s",
            year, event, session_type, type(exc).__name__, exc,
        )
        return None

In [3]:
# One (year, event, session_type) job per race listed in each season's schedule.
jobs = []
for year in [2024, 2023, 2022, 2021, 2020, 2019, 2018]:
    schedule = ff.get_event_schedule(year, include_testing=False)
    for event in schedule['EventName']:
        jobs.append((year, event, 'R'))

print(f"Starting to process {len(jobs)} sessions in parallel...")
results = Parallel(n_jobs=-1)(delayed(process_session)(job) for job in tqdm(jobs))

# process_session returns None for sessions it could not process; the result
# list is in job order, so the failed jobs can be recovered by position.
all_results = [res for res in results if res is not None]
failed_jobs = [job for job, res in zip(jobs, results) if res is None]
if failed_jobs:
    print(f"WARNING: {len(failed_jobs)} of {len(jobs)} sessions returned no data: {failed_jobs}")

# pd.concat rejects an empty list with a generic message; fail with one that
# explains what actually happened.
if not all_results:
    raise ValueError("No session could be processed; nothing to save.")
final_dataset = pd.concat(all_results, ignore_index=True)

print(f"Successfully gathered {len(final_dataset)} clean laps. Saving to CSV...")
final_dataset.to_csv('Formula_1_Data_2018-24.csv', index=False)
print("Done!")

Starting to process 149 sessions in parallel...


100%|██████████| 149/149 [01:05<00:00,  2.28it/s]


Successfully gathered 139523 clean laps. Saving to CSV...
Done!


In [2]:
# Reload the lap dataset that the collection cell wrote to disk.
df = pd.read_csv('Formula_1_Data_2018-24.csv')

In [4]:
# Remove columns that no later cell shown in this notebook reads.
df = df.drop(
    columns=[
        'Deleted',
        'DeletedReason',
        'LapStartDate',
        'PitOutTime',
        'PitInTime',
    ]
)

In [5]:
# Row/column count after the column drop in the previous cell.
df.shape

(139523, 35)

In [6]:
# List the distinct values present in the TrackStatus column.
df['TrackStatus'].unique()

array([ 1, 12, 21,  2])

In [7]:
# IsAccurate is always True here because the collection step kept only such
# laps; FastF1Generated is not read by any later cell shown in this notebook.
df = df.drop(columns=['FastF1Generated', 'IsAccurate'])

In [None]:
# Columns removed when the modelling frame is built from `df`.
dropped_cols = [
    'DriverNumber',
    'FreshTyre',
    'Team',
    'SpeedI1',
    'SpeedI2',
    'SpeedST',
    'SpeedFL',
    'Time',
    'LapStartTime',
    'Sector1SessionTime',
    'Sector2SessionTime',
    'Sector3SessionTime',
    # Superseded by the numeric 'LapTime(s)' column derived from it.
    'LapTime',
    'Sector1Time',
    'Sector2Time',
    'Sector3Time',
    'IsPersonalBest',
    'Position',
    'TrackStatus',
]

In [32]:
# Parse the lap durations read back from CSV and express them as float seconds.
lap_durations = pd.to_timedelta(df['LapTime'])
df['LapTime(s)'] = lap_durations.dt.total_seconds()

In [None]:
# Fill missing TyreLife and Stint values with the median of the same
# Driver/EventName group.
group_keys = ['Driver', 'EventName']
for numeric_col in ('TyreLife', 'Stint'):
    grouped_values = df.groupby(group_keys)[numeric_col]
    df[numeric_col] = grouped_values.transform(lambda s: s.fillna(s.median()))

In [35]:
def _fill_with_group_mode(values):
    """Fill gaps with the group's most frequent value; leave all-missing groups as they are."""
    modes = values.mode()
    # mode() skips missing values, so it comes back empty when the whole group
    # is missing; indexing [0] on it would raise.
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


# Fill missing Compound values with the most frequent compound of the same
# Driver/EventName group.
df['Compound'] = df.groupby(['Driver', 'EventName'])['Compound'].transform(_fill_with_group_mode)

In [36]:
# Build the modelling frame: every column named in `dropped_cols` is removed;
# drop() returns a new frame, so `df` keeps all of its columns.
final_df = df.drop(columns = dropped_cols)

In [37]:
# Inspect dtypes and non-null counts of the remaining columns.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139523 entries, 0 to 139522
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Driver         139523 non-null  object 
 1   LapNumber      139523 non-null  float64
 2   Stint          139523 non-null  float64
 3   Compound       139523 non-null  object 
 4   TyreLife       139523 non-null  float64
 5   AirTemp        139523 non-null  float64
 6   Humidity       139523 non-null  float64
 7   Pressure       139523 non-null  float64
 8   Rainfall       139523 non-null  bool   
 9   TrackTemp      139523 non-null  float64
 10  WindDirection  139523 non-null  int64  
 11  WindSpeed      139523 non-null  float64
 12  EventName      139523 non-null  object 
 13  Year           139523 non-null  int64  
 14  LapTime(s)     139523 non-null  float64
dtypes: bool(1), float64(9), int64(2), object(3)
memory usage: 15.0+ MB


In [38]:
# Encode the rain flag as 0/1 instead of False/True.
final_df = final_df.assign(Rainfall=final_df['Rainfall'].astype(int))

In [54]:
# One-hot encode Driver and Compound. EventName is deliberately left as text:
# the training code filters rows on it before dropping it from the features.
categorical_cols = ['Driver', 'Compound']
final_df_encoded = pd.get_dummies(final_df, columns=categorical_cols)

In [63]:
# Convert every boolean column of the encoded frame to 0/1 integers.
bool_cols = final_df_encoded.select_dtypes(include='bool').columns
final_df_encoded = final_df_encoded.astype({flag_col: int for flag_col in bool_cols})

In [65]:
# Persist the encoded frame; the training cells read it back from this path.
final_df_encoded.to_csv("TrainingDataF1.csv", index = False)

In [11]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os
import re

def train_and_evaluate_circuit(df, circuit_name, min_laps=500, model_dir='models'):
    """Train XGBoost and LightGBM lap-time regressors for one circuit and keep the better one.

    Args:
        df: Frame containing an ``EventName`` column, the ``LapTime(s)``
            target and the feature columns; every column other than those two
            is used as a feature.
        circuit_name: ``EventName`` value selecting the rows to train on.
        min_laps: Minimum number of rows required for the circuit; below this
            the circuit is skipped.
        model_dir: Directory the winning model is written to. It is created
            if it does not exist.

    Returns:
        ``None`` when the circuit has fewer than ``min_laps`` rows. Otherwise
        a dict with ``Circuit``, ``<Model>_RMSE`` and ``<Model>_R2`` for both
        models, and ``Best_Model`` (the model with the lower RMSE on the
        held-out 20% split; XGBoost wins ties).

    Side effects:
        Prints progress and writes ``<model_dir>/<sanitised circuit>_model.joblib``.
    """
    print(f"--- Processing: {circuit_name} ---")

    circuit_df = df[df['EventName'] == circuit_name].copy()

    if len(circuit_df) < min_laps:
        print(f"Skipping {circuit_name}, not enough data.\n")
        return None

    y = circuit_df['LapTime(s)']
    X = circuit_df.drop(columns=['LapTime(s)', 'EventName'])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = {
        'XGBoost': xgb.XGBRegressor(random_state=42),
        'LightGBM': lgb.LGBMRegressor(random_state=42)
    }

    circuit_results = {'Circuit': circuit_name}
    trained_models = {}

    for name, model in models.items():
        print(f"  Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        circuit_results[f'{name}_RMSE'] = rmse
        circuit_results[f'{name}_R2'] = r2
        trained_models[name] = model

    best_model_name = 'XGBoost' if circuit_results['XGBoost_RMSE'] <= circuit_results['LightGBM_RMSE'] else 'LightGBM'
    circuit_results['Best_Model'] = best_model_name

    print(f"  -> Best Model: {best_model_name} (RMSE: {circuit_results[f'{best_model_name}_RMSE']:.3f})")

    best_model_object = trained_models[best_model_name]
    # Strip characters that are invalid in file names, then replace spaces.
    safe_filename = re.sub(r'[\\/*?:"<>|]', "", circuit_name).replace(" ", "_")
    # Do not rely on the caller having created the output directory.
    os.makedirs(model_dir, exist_ok=True)
    model_filename = f'{model_dir}/{safe_filename}_model.joblib'
    joblib.dump(best_model_object, model_filename)
    print(f"  -> Model saved as {model_filename}\n")

    return circuit_results

In [12]:
if __name__ == "__main__":
    # exist_ok makes this a single idempotent call instead of check-then-create.
    os.makedirs('models', exist_ok=True)

    print("Loading preprocessed dataset...")
    try:
        full_df = pd.read_csv('TrainingDataF1.csv')
        print("Dataset loaded.")
    except FileNotFoundError:
        print("ERROR: Preprocessed CSV file not found.")
        # Raise SystemExit directly with a failure status; the interactive
        # exit() helper would report status 0 for this error.
        raise SystemExit(1)

    # An 'Unnamed: 0' column appears when a CSV was saved with its index.
    if 'Unnamed: 0' in full_df.columns:
        full_df = full_df.drop(columns=['Unnamed: 0'])

    bool_cols = full_df.select_dtypes(include='bool').columns
    full_df[bool_cols] = full_df[bool_cols].astype(int)

    circuits = full_df['EventName'].unique()

    all_results_list = []

    for circuit in circuits:
        results = train_and_evaluate_circuit(full_df, circuit)
        # None means the circuit was skipped for lack of data.
        if results is not None:
            all_results_list.append(results)

    print("--- All circuits processed. ---")

    results_df = pd.DataFrame(all_results_list)

    if results_df.empty:
        # No circuit was trained: there is no Best_Model column to average over.
        avg_rmse = float('nan')
        avg_r2 = float('nan')
    else:
        # Average the metrics of whichever model won on each circuit.
        avg_rmse = results_df.apply(lambda row: row[f"{row['Best_Model']}_RMSE"], axis=1).mean()
        avg_r2 = results_df.apply(lambda row: row[f"{row['Best_Model']}_R2"], axis=1).mean()

    final_summary = {
        "total_models_trained": len(results_df),
        "average_rmse": avg_rmse,
        "average_r2": avg_r2
    }

Loading preprocessed dataset...
Dataset loaded.
--- Processing: Bahrain Grand Prix ---
  Training XGBoost...
  Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 763
[LightGBM] [Info] Number of data points in the train set: 4940, number of used features: 47
[LightGBM] [Info] Start training from score 97.395866
  -> Best Model: XGBoost (RMSE: 0.700)
  -> Model saved as models/Bahrain_Grand_Prix_model.joblib

--- Processing: Saudi Arabian Grand Prix ---
  Training XGBoost...
  Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 461
[LightGBM] [Info] Number of data points in the train set: 2258, number of used features: 39
[LightGBM] [Info] Start training from score 94.9922

In [16]:
# Display the per-circuit metrics frame built by the training cell.
print("\n--- Model Performance by Circuit ---")
results_df


--- Model Performance by Circuit ---


Unnamed: 0,Circuit,XGBoost_RMSE,XGBoost_R2,LightGBM_RMSE,LightGBM_R2,Best_Model
0,Bahrain Grand Prix,0.700314,0.863391,0.711977,0.858803,XGBoost
1,Saudi Arabian Grand Prix,0.798139,0.792485,0.911358,0.729435,XGBoost
2,Australian Grand Prix,0.686099,0.958616,0.706368,0.956135,XGBoost
3,Japanese Grand Prix,1.213372,0.930551,1.283383,0.922305,XGBoost
4,Chinese Grand Prix,0.693838,0.845481,0.682511,0.850484,LightGBM
5,Miami Grand Prix,0.52308,0.859556,0.513824,0.864483,LightGBM
6,Emilia Romagna Grand Prix,1.179796,0.941789,1.218983,0.937857,XGBoost
7,Monaco Grand Prix,1.589533,0.936413,1.552192,0.939366,LightGBM
8,Canadian Grand Prix,1.385556,0.916481,0.775564,0.973832,LightGBM
9,Spanish Grand Prix,1.352841,0.838161,1.404804,0.82549,XGBoost


In [15]:
# Display the aggregate summary: number of trained models and the mean
# RMSE / R2 of the best model per circuit.
print("\n--- Final Model Performance Summary ---")
final_summary


--- Final Model Performance Summary ---


{'total_models_trained': 36,
 'average_rmse': np.float64(0.9165628292469629),
 'average_r2': np.float64(0.8636081762386086)}