In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import fastf1 as ff1

In [2]:
def get_race_info(year, race_number, session='Race'):
    session = ff1.get_session(year, race_number, session)
    session.load(laps=False, telemetry=False, weather=False, messages=False)
    race_result = session.results
    return race_result[["DriverNumber", "Abbreviation", "TeamName", "LastName", "GridPosition", "Position", "Time"]]

#When building this program I started by calling a lot of times the same information so to simplify the code and make it less demanding I create this function to be able to reuse cached schedules.
def Schedule(year):
    return ff1.get_event_schedule(year, include_testing=False)

def get_calendar(year):
    calendar = Schedule(year) 
    circuits = list(calendar.Location)
    return circuits


def season_winners(year):
    calendar = Schedule(year) 
    races = list(calendar['EventName'])

    sum_results = []
    for race in races:
        race_result = get_race_info(year, race)
        sum_results.append(race_result.iloc[0, :])

    df = pd.DataFrame(sum_results)
    df['Race'] = races
    return df


def season_full_results(year):
    calendar = Schedule(year)
    races = list(calendar['EventName'])

    rows = []
    for race in races:
        race_results = get_race_info(year, race).copy()   
        race_results.loc[:, 'Race'] = race             
        race_results.loc[:, 'Year'] = year              
        rows.append(race_results)

    df = pd.concat(rows, ignore_index=True)
    return df


def quali_results(year, race):
    session = ff1.get_session(year, race, 'Qualifying')
    session.load(laps = False, telemetry=False, weather=False, messages=False)
    result = session.results.copy()

    quali = result[["DriverNumber", "Abbreviation", "Position", "Q1", "Q2", "Q3"]].copy() 
    quali.rename(columns={'Position': 'QualiPosition'}, inplace=True)
    quali['Year'] = year
    quali['Race'] = race
    return quali


def season_quali_results(year):
    calendar = Schedule(year)  
    races = list(calendar['EventName'])

    rows = []
    for race in races:
        rows.append(quali_results(year, race))

    df = pd.concat(rows, ignore_index=True)
    return df


def learning_table(year):
    race_df = season_full_results(year)
    quali_df = season_quali_results(year)
    general_df = pd.merge(
        race_df, quali_df,
        on=['Year', 'Race', 'DriverNumber', 'Abbreviation'],
        how='left'
    )
    return general_df


def learning_table_all_years(years=['2024']):
    rows = []
    for year in years:
        rows.append(learning_table(int(year)))

    df = pd.concat(rows, ignore_index=True)
    return df


In [3]:
#II/- Data preprocessing

In [4]:
extract = learning_table_all_years()
data = extract.copy()

core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '63', '4', '44', '81', '14', '18', '24', '20', '3', '22', '23', '27', '31', '10', '77', '2']
core           INFO 	Loading data for Saudi Arabian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '16', '81', '14', '63', '38', '4', '44', '27', '23', '20', '31', '2', '22', '3', '77', '24', '18', '10']
core           INFO 	Loading data for Australian Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
core           INFO 	Finished loading data for 19 drivers: ['55', '16', '4', '81', '11'

In [5]:
#Self-note : This dataframe extracts data from 2021-2024. In 2022 regulations changed which might have impacted results/trends in teams. But overall there were no huge changes in which team was positioned where in the constructor championship.
#During those 4 seasons, some drivers have changed or retired, this is going to be something I'll have to address later on.

In [6]:
data[data['Year'] == 2024]

Unnamed: 0,DriverNumber,Abbreviation,TeamName,LastName,GridPosition,Position,Time,Race,Year,QualiPosition,Q1,Q2,Q3
0,1,VER,Red Bull Racing,Verstappen,1.0,1.0,0 days 01:31:44.742000,Bahrain Grand Prix,2024,1.0,0 days 00:01:30.031000,0 days 00:01:29.374000,0 days 00:01:29.179000
1,11,PER,Red Bull Racing,Perez,5.0,2.0,0 days 00:00:22.457000,Bahrain Grand Prix,2024,5.0,0 days 00:01:30.221000,0 days 00:01:29.932000,0 days 00:01:29.537000
2,55,SAI,Ferrari,Sainz,4.0,3.0,0 days 00:00:25.110000,Bahrain Grand Prix,2024,4.0,0 days 00:01:29.909000,0 days 00:01:29.573000,0 days 00:01:29.507000
3,16,LEC,Ferrari,Leclerc,2.0,4.0,0 days 00:00:39.669000,Bahrain Grand Prix,2024,2.0,0 days 00:01:30.243000,0 days 00:01:29.165000,0 days 00:01:29.407000
4,63,RUS,Mercedes,Russell,3.0,5.0,0 days 00:00:46.788000,Bahrain Grand Prix,2024,3.0,0 days 00:01:30.350000,0 days 00:01:29.922000,0 days 00:01:29.485000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,20,MAG,Haas F1 Team,Magnussen,14.0,16.0,0 days 00:01:17.597000,Abu Dhabi Grand Prix,2024,15.0,0 days 00:01:23.632000,0 days 00:01:23.877000,NaT
475,30,LAW,RB,Lawson,12.0,17.0,NaT,Abu Dhabi Grand Prix,2024,12.0,0 days 00:01:23.733000,0 days 00:01:23.472000,NaT
476,77,BOT,Kick Sauber,Bottas,9.0,18.0,NaT,Abu Dhabi Grand Prix,2024,9.0,0 days 00:01:23.481000,0 days 00:01:23.341000,0 days 00:01:23.204000
477,43,COL,Williams,Colapinto,20.0,19.0,NaT,Abu Dhabi Grand Prix,2024,19.0,0 days 00:01:23.912000,NaT,NaT


In [7]:
print("----Data Summary-----")
print("Column name and type :\n",data.dtypes)
#To-do-list : 1. Remove columns that give redundant informations (e.g : driver name and driver number)
#2. Give a specific number for each team using encoding 
#3. The time as it is given in the dataframe cannot be interpreted by python so we need to modify it.

----Data Summary-----
Column name and type :
 DriverNumber              object
Abbreviation              object
TeamName                  object
LastName                  object
GridPosition             float64
Position                 float64
Time             timedelta64[ns]
Race                      object
Year                       int64
QualiPosition            float64
Q1               timedelta64[ns]
Q2               timedelta64[ns]
Q3               timedelta64[ns]
dtype: object


In [10]:
from sklearn.preprocessing import LabelEncoder 

#1. 
clean_data = data[['DriverNumber', 'TeamName', 'GridPosition', 'Position', 'Time']]
#2.
encoder = LabelEncoder() 
clean_data['TeamNumber'] = encoder.fit_transform(clean_data['TeamName'])
#3. Let's change the expression of time as it is given as time delta for each driver after P1.

clean_data['Time_td'] = pd.to_timedelta(clean_data['Time'], errors='coerce')
clean_data['gap_to_winner'] = clean_data['Time_td'].dt.total_seconds()
clean_data.loc[clean_data['Position'] == 1, 'gap_to_winner'] = 0
clean_data = clean_data.drop(columns=['Time', 'Time_td'])

#Let's fill missing data : DnF, Pit start, Penalty
clean_data['GaptoP1 (sec)'] = clean_data['gap_to_winner'].fillna(9999) #This is to handle a DnF case for which we'd have 'NaT'
clean_data['GridPosition'] = clean_data['GridPosition'].fillna(25) #This is to handle a pit start 

clean_data = clean_data.drop(columns=['TeamName', 'gap_to_winner'])
clean_data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_data['TeamNumber'] = encoder.fit_transform(clean_data['TeamName'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_data['Time_td'] = pd.to_timedelta(clean_data['Time'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_data['gap_to_winner'] = clean_data['Time_td'].dt.

Unnamed: 0,DriverNumber,GridPosition,Position,TeamNumber,GaptoP1 (sec)
0,1,1.0,1.0,8,0.000
1,11,5.0,2.0,8,22.457
2,55,4.0,3.0,2,25.110
3,16,2.0,4.0,2,39.669
4,63,3.0,5.0,6,46.788
...,...,...,...,...,...
474,20,14.0,16.0,3,77.597
475,30,12.0,17.0,7,9999.000
476,77,9.0,18.0,4,9999.000
477,43,20.0,19.0,9,9999.000


In [11]:
clean_data.isnull().sum()

DriverNumber     0
GridPosition     0
Position         0
TeamNumber       0
GaptoP1 (sec)    0
dtype: int64

In [21]:
X = clean_data.drop(columns = ['DriverNumber', 'Position'])
y = clean_data[['Position']]