In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

In [2]:
load_dotenv('../sql_credentials.env')

db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_host = os.getenv('DB_HOST')
db_name = os.getenv('DB_NAME')

db_connection_str = f'mysql+pymysql://{db_user}:{db_password}@{db_host}/{db_name}'
engine = create_engine(db_connection_str)


### Objectives:

- Gather all necesary data for modeling and EDA


#### Step 1

Here I'll query for all rows in results table, I'll fetch all foreign keys related to tables we'll need to get data from, plus the grid position. I'll join that with the race table to get other foreign keys plus the year, round, and date of the race. 

In [3]:
results_query = """
SELECT r.resultId, r.raceId, r.driverId, r.constructorId, r.grid, r.position, races.year, races.round, races.circuitId, races.date
FROM results r
JOIN races ON r.raceId = races.raceId
"""
results_df = pd.read_sql(results_query, con=engine)
results_df.info()
results_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26759 entries, 0 to 26758
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   resultId       26759 non-null  int64  
 1   raceId         26759 non-null  int64  
 2   driverId       26759 non-null  int64  
 3   constructorId  26759 non-null  int64  
 4   grid           26759 non-null  int64  
 5   position       15806 non-null  float64
 6   year           26759 non-null  int64  
 7   round          26759 non-null  int64  
 8   circuitId      26759 non-null  int64  
 9   date           26759 non-null  object 
dtypes: float64(1), int64(8), object(1)
memory usage: 2.0+ MB


Unnamed: 0,resultId,raceId,driverId,constructorId,grid,position,year,round,circuitId,date
0,1,18,1,1,1,1.0,2008,1,1,2008-03-16
1,2,18,2,2,5,2.0,2008,1,1,2008-03-16
2,3,18,3,3,7,3.0,2008,1,1,2008-03-16
3,4,18,4,4,11,4.0,2008,1,1,2008-03-16
4,5,18,5,1,3,5.0,2008,1,1,2008-03-16



#### Step 2

Here I'll calculate the driver age at the time of the race, for that I'll get the driver's DOB and compare that to each entry's date to get the drivers age at that time.

In [4]:
driver_age_query = """
SELECT d.driverId, d.dob
FROM drivers d
"""
drivers_df = pd.read_sql(driver_age_query, con=engine)

results_df = results_df.merge(drivers_df, on="driverId", how="left")
results_df['driver_age'] = results_df.apply(
    lambda row: row['date'].year - row['dob'].year - 
                ((row['date'].month, row['date'].day) < (row['dob'].month, row['dob'].day)),
    axis=1
)

results_df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,position,year,round,circuitId,date,dob,driver_age
0,1,18,1,1,1,1.0,2008,1,1,2008-03-16,1985-01-07,23
1,2,18,2,2,5,2.0,2008,1,1,2008-03-16,1977-05-10,30
2,3,18,3,3,7,3.0,2008,1,1,2008-03-16,1985-06-27,22
3,4,18,4,4,11,4.0,2008,1,1,2008-03-16,1981-07-29,26
4,5,18,5,1,3,5.0,2008,1,1,2008-03-16,1981-10-19,26



#### Step 3

Here I'll calculate the driver's experience (Number of GPs entered). For that I'll start by sorting the current `results_df` by `date` and `resultId`. Then I'll create a cumulative count of appearance for each driver.

In [5]:
results_df = results_df.sort_values(by=['date', 'resultId']).reset_index(drop=True)

results_df['driver_experience'] = results_df.groupby('driverId').cumcount()

results_df.tail()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,position,year,round,circuitId,date,dob,driver_age,driver_experience
26754,26760,1144,825,210,14,16.0,2024,24,24,2024-12-08,1992-10-05,32,185
26755,26761,1144,859,215,12,17.0,2024,24,24,2024-12-08,2002-02-11,22,10
26756,26762,1144,822,15,9,,2024,24,24,2024-12-08,1989-08-28,35,246
26757,26763,1144,861,3,20,,2024,24,24,2024-12-08,2003-05-27,21,8
26758,26764,1144,815,9,10,,2024,24,24,2024-12-08,1990-01-26,34,282


We can see that the 2nd row, which corresponds to Franco Colapinto (driverId=861) has 6 races which is correct after the 2024 Brazilian GP


#### Step 4

Similar to last step, I'll calculate the driver's experience with it's current team. For that I'll create a cumulative count of appearances for each `driverId` and `constructorId` combination

In [6]:
results_df['driver_constructor_experience'] = results_df.groupby(['driverId', 'constructorId']).cumcount()

results_df.tail()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,position,year,round,circuitId,date,dob,driver_age,driver_experience,driver_constructor_experience
26754,26760,1144,825,210,14,16.0,2024,24,24,2024-12-08,1992-10-05,32,185,144
26755,26761,1144,859,215,12,17.0,2024,24,24,2024-12-08,2002-02-11,22,10,5
26756,26762,1144,822,15,9,,2024,24,24,2024-12-08,1989-08-28,35,246,23
26757,26763,1144,861,3,20,,2024,24,24,2024-12-08,2003-05-27,21,8,8
26758,26764,1144,815,9,10,,2024,24,24,2024-12-08,1990-01-26,34,282,89


#### Step 5

Now I want to calculate both the driver's all time wins and the driver's all time wins with that specific constructor. Using a temporary `win_indicator`, and similary to last steps, using a cumulative count, then dropping the temporary column as it's no longer needed.

In [7]:
results_df['win_indicator'] = results_df['position'] == 1.0

results_df['driver_wins'] = results_df.groupby('driverId')['win_indicator'].cumsum()
results_df['constructor_wins'] = results_df.groupby(['driverId', 'constructorId'])['win_indicator'].cumsum()

results_df.drop(columns=['win_indicator'], inplace=True)


#### Step 6

For this step I need to make a new query: I need `driver_points` and `driver_standings` after each race, that I'll then merge to `results_df`

In [8]:
driver_standings_query = """
SELECT ds.raceId, ds.driverId, ds.points AS driver_points, ds.position AS driver_standing
FROM driverStandings ds
"""
driver_standings_df = pd.read_sql(driver_standings_query, con=engine)
results_df = results_df.merge(driver_standings_df, on=["raceId", "driverId"], how="left")

results_df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,position,year,round,circuitId,date,dob,driver_age,driver_experience,driver_constructor_experience,driver_wins,constructor_wins,driver_points,driver_standing
0,20025,833,642,51,1,1.0,1950,1,9,1950-05-13,1906-10-30,43,0,0,1,1,9.0,1.0
1,20026,833,786,51,2,2.0,1950,1,9,1950-05-13,1898-06-09,51,0,0,0,0,6.0,2.0
2,20027,833,686,51,4,3.0,1950,1,9,1950-05-13,1911-07-02,38,0,0,0,0,4.0,3.0
3,20028,833,704,154,6,4.0,1950,1,9,1950-05-13,1904-10-08,45,0,0,0,0,3.0,4.0
4,20029,833,627,154,9,5.0,1950,1,9,1950-05-13,1905-11-05,44,0,0,0,0,2.0,5.0



#### Step 7

Same as last step, but this time for constructor standings data

In [9]:
constructor_standings_query = """
SELECT cs.raceId, cs.constructorId, cs.points AS constructor_points, cs.position AS constructor_standing
FROM constructorStandings cs
"""
constructor_standings_df = pd.read_sql(constructor_standings_query, con=engine)
results_df = results_df.merge(constructor_standings_df, on=["raceId", "constructorId"], how="left")

results_df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,position,year,round,circuitId,date,dob,driver_age,driver_experience,driver_constructor_experience,driver_wins,constructor_wins,driver_points,driver_standing,constructor_points,constructor_standing
0,20025,833,642,51,1,1.0,1950,1,9,1950-05-13,1906-10-30,43,0,0,1,1,9.0,1.0,,
1,20026,833,786,51,2,2.0,1950,1,9,1950-05-13,1898-06-09,51,0,0,0,0,6.0,2.0,,
2,20027,833,686,51,4,3.0,1950,1,9,1950-05-13,1911-07-02,38,0,0,0,0,4.0,3.0,,
3,20028,833,704,154,6,4.0,1950,1,9,1950-05-13,1904-10-08,45,0,0,0,0,3.0,4.0,,
4,20029,833,627,154,9,5.0,1950,1,9,1950-05-13,1905-11-05,44,0,0,0,0,2.0,5.0,,


#### Warning

Now we have a problem here:
As we can see the standings and data we got are for ***after*** each race, and since we want to predict a race result (finishing position), we need to have the standings from ***before*** the race. 
This means we will have to do some logic:
1. For the first race ever (1950 round 1), `driver_wins`, `constructor_wins`, `driver_points`, `driver_standing`, `constructor_points` and `constructor_standing` will be set to zero.
2. For the first race of every season, `driver_points`, `driver_standing`, `constructor_points` and `constructor_standing` will be 0, but `driver_wins` and `constructor_wins` will be carried over from the past race.
3. All data we have now on an entry, will be moved 1 race ahead. I.e. 2022 round 4 has the data for *after* that race, that data will be the *starting* data for the next race. So 2022 round 4 data will now be 2022 round 5 data.
4. Keeping the last step logic, if a season like 2022 has 22 rounds, we will end up with a round 23. So we need to check if the data we're handling is from the last round of the season, therefore we won't append that new entry to the new DataFrame. 


#### Step 8

Shift data. As this is a more complicated step, I'll go step by step in the code block

In [10]:
# Sort the DataFrame to ensure chronological order
results_df = results_df.sort_values(by=['year', 'round', 'driverId', 'constructorId'])

# Calculate the maximum round for each season
max_rounds = results_df.groupby('year')['round'].max().reset_index(name='max_round')
results_df = results_df.merge(max_rounds, on='year')

# Filter out the last race of each season as there's no next race to predict
results_df = results_df[results_df['round'] != results_df['max_round']]
results_df.drop(columns='max_round', inplace=True)

# Shift driver and constructor wins across all races (carry over across seasons)
results_df['driver_wins_shifted'] = results_df.groupby('driverId')['driver_wins'].shift(1).fillna(0)
results_df['constructor_wins_shifted'] = results_df.groupby('constructorId')['constructor_wins'].shift(1).fillna(0)

# Shift points and standings within each season (reset to 0 at the start of a season)
results_df['driver_points_shifted'] = results_df.groupby(['driverId', 'year'])['driver_points'].shift(1).fillna(0)
results_df['driver_standing_shifted'] = results_df.groupby(['driverId', 'year'])['driver_standing'].shift(1).fillna(0)
results_df['constructor_points_shifted'] = results_df.groupby(['constructorId', 'year'])['constructor_points'].shift(1).fillna(0)
results_df['constructor_standing_shifted'] = results_df.groupby(['constructorId', 'year'])['constructor_standing'].shift(1).fillna(0)

# The first race of each season will have NaN for points/standing shifts, which we filled with 0
# Wins are correctly carried over from the last race of the previous season (or 0 if first race ever)

In [11]:
#results_df[(results_df['year'] == 2024) & (results_df['round'] == 3) & (results_df['position'] == 1.0)].head()
results_df.columns

Index(['resultId', 'raceId', 'driverId', 'constructorId', 'grid', 'position',
       'year', 'round', 'circuitId', 'date', 'dob', 'driver_age',
       'driver_experience', 'driver_constructor_experience', 'driver_wins',
       'constructor_wins', 'driver_points', 'driver_standing',
       'constructor_points', 'constructor_standing', 'driver_wins_shifted',
       'constructor_wins_shifted', 'driver_points_shifted',
       'driver_standing_shifted', 'constructor_points_shifted',
       'constructor_standing_shifted'],
      dtype='object')

#### Step 9

Rename shifted columns and drop unnecesary ones

In [12]:
results_df = results_df.drop(columns=['driver_wins', 'constructor_wins', 'driver_points', 'driver_standing', 'constructor_points', 'constructor_standing'])
results_df = results_df.rename(columns={'driver_wins_shifted': 'driver_wins', 
                                        'constructor_wins_shifted': 'constructor_wins', 
                                        'driver_points_shifted': 'driver_points', 
                                        'driver_standing_shifted': 'driver_standing', 
                                        'constructor_points_shifted': 'constructor_points', 
                                        'constructor_standing_shifted': 'constructor_standing'})

merged_df = results_df.copy() # Defragmentate
merged_df.tail(10)

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,position,year,round,circuitId,date,dob,driver_age,driver_experience,driver_constructor_experience,driver_wins,constructor_wins,driver_points,driver_standing,constructor_points,constructor_standing
26729,26729,1143,842,214,11,5.0,2024,23,78,2024-12-01,1996-02-07,28,152,44,1.0,1.0,26.0,12.0,59.0,6.0
26730,26726,1143,844,6,5,2.0,2024,23,78,2024-12-01,1997-10-16,27,147,126,8.0,4.0,319.0,3.0,619.0,2.0
26731,26734,1143,846,1,3,10.0,2024,23,78,2024-12-01,1999-11-13,25,126,126,3.0,2.0,340.0,2.0,608.0,1.0
26732,26728,1143,847,131,1,4.0,2024,23,78,2024-12-01,1998-02-15,26,126,67,3.0,84.0,217.0,6.0,446.0,4.0
26733,26739,1143,848,3,16,15.0,2024,23,78,2024-12-01,1996-03-23,28,103,65,0.0,0.0,12.0,16.0,17.0,9.0
26734,26737,1143,852,215,14,13.0,2024,23,78,2024-12-01,2000-05-11,24,88,22,0.0,0.0,30.0,11.0,46.0,8.0
26735,26732,1143,855,15,12,8.0,2024,23,78,2024-12-01,1999-05-30,25,66,22,0.0,0.0,0.0,21.0,4.0,10.0
26736,26727,1143,857,1,4,3.0,2024,23,78,2024-12-01,2001-04-06,23,44,44,2.0,3.0,268.0,4.0,640.0,1.0
26737,26738,1143,859,215,17,14.0,2024,23,78,2024-12-01,2002-02-11,22,9,4,0.0,0.0,4.0,20.0,46.0,8.0
26738,26743,1143,861,3,19,,2024,23,78,2024-12-01,2003-05-27,21,7,7,0.0,0.0,5.0,19.0,17.0,9.0


#### Step 10

Filter `merged_df` to keep only rows where `year` is 2010 or later. Why I'm doing this? Because 2010 is the last time F1 made a big change in the points award system. So for simplicity, instead of converting all the previous races for the current point system, I'll work with all the entries from 2010 or later.


In [13]:
merged_df = merged_df[merged_df['year'] >= 2010].reset_index(drop=True)
merged_df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,position,year,round,circuitId,date,dob,driver_age,driver_experience,driver_constructor_experience,driver_wins,constructor_wins,driver_points,driver_standing,constructor_points,constructor_standing
0,20325,337,1,1,4,3.0,2010,1,3,2010-03-14,1985-01-07,25,52,52,11.0,1.0,0.0,0.0,0.0,0.0
1,20327,337,3,131,5,5.0,2010,1,3,2010-03-14,1985-06-27,24,70,0,0.0,0.0,0.0,0.0,0.0,0.0
2,20323,337,4,6,3,1.0,2010,1,3,2010-03-14,1981-07-29,28,140,0,21.0,0.0,0.0,0.0,0.0,0.0
3,20337,337,5,205,21,15.0,2010,1,3,2010-03-14,1981-10-19,28,52,0,1.0,0.0,0.0,0.0,0.0,0.0
4,20333,337,9,4,9,11.0,2010,1,3,2010-03-14,1984-12-07,25,57,0,1.0,0.0,0.0,0.0,0.0,0.0


#### Step 11

Calculate Circuit Danger Metric. What is this? One of the tables is *status* which displays the status for each *results* entry. And since each of those entries corresponds to one *race*, we can calculate how many incidents there were on each circuit, and the total of races on that circuit. So `circuit_danger` will result of dividing the total of incidents on a circuit by the total races on that circuit, from 2010 or later.

In [14]:
driver_experience_query = """
SELECT 
    c.circuitId,
    c.name,
    COUNT(*) AS count,
    total_races.total,
    COUNT(*) * 1.0 / total_races.total AS circuit_danger
FROM 
    races r
JOIN 
    results res ON r.raceId = res.raceId
JOIN 
    circuits c ON r.circuitId = c.circuitId
JOIN 
    (SELECT circuitId, COUNT(*) AS total 
     FROM races 
     WHERE year >= 2010 
     GROUP BY circuitId) AS total_races
    ON r.circuitId = total_races.circuitId
WHERE 
    res.statusId IN (3, 4) 
    AND r.year >= 2010
GROUP BY 
    c.circuitId, c.name, total_races.total
ORDER BY 
    circuit_danger DESC;
"""

circuit_df = pd.read_sql(driver_experience_query, con=engine)

circuit_df

Unnamed: 0,circuitId,name,count,total,circuit_danger
0,76,Autodromo Internazionale del Mugello,6,1,6.0
1,35,Korean International Circuit,13,4,3.25
2,6,Circuit de Monaco,36,14,2.57143
3,15,Marina Bay Street Circuit,28,13,2.15385
4,1,Albert Park Grand Prix Circuit,24,13,1.84615
5,71,Sochi Autodrom,14,8,1.75
6,7,Circuit Gilles Villeneuve,22,13,1.69231
7,12,Valencia Street Circuit,5,3,1.66667
8,68,Buddh International Circuit,5,3,1.66667
9,18,Autódromo José Carlos Pace,23,14,1.64286


#### Step 12

Here I'll merge `circuit_danger` to the `merged_df`. The rest of the values will be used for EDA.

In [15]:
circuit_danger_df = circuit_df[['circuitId', 'circuit_danger']]

merged_df = merged_df.merge(circuit_danger_df, on='circuitId', how='left')

merged_df.tail()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,position,year,round,circuitId,date,...,driver_age,driver_experience,driver_constructor_experience,driver_wins,constructor_wins,driver_points,driver_standing,constructor_points,constructor_standing,circuit_danger
6115,26737,1143,852,215,14,13.0,2024,23,78,2024-12-01,...,24,88,22,0.0,0.0,30.0,11.0,46.0,8.0,1.0
6116,26732,1143,855,15,12,8.0,2024,23,78,2024-12-01,...,25,66,22,0.0,0.0,0.0,21.0,4.0,10.0,1.0
6117,26727,1143,857,1,4,3.0,2024,23,78,2024-12-01,...,23,44,44,2.0,3.0,268.0,4.0,640.0,1.0,1.0
6118,26738,1143,859,215,17,14.0,2024,23,78,2024-12-01,...,22,9,4,0.0,0.0,4.0,20.0,46.0,8.0,1.0
6119,26743,1143,861,3,19,,2024,23,78,2024-12-01,...,21,7,7,0.0,0.0,5.0,19.0,17.0,9.0,1.0


#### Step 13

Drop Nulls

In [16]:
merged_df = merged_df.dropna()
merged_df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,position,year,round,circuitId,date,...,driver_age,driver_experience,driver_constructor_experience,driver_wins,constructor_wins,driver_points,driver_standing,constructor_points,constructor_standing,circuit_danger
0,20325,337,1,1,4,3.0,2010,1,3,2010-03-14,...,25,52,52,11.0,1.0,0.0,0.0,0.0,0.0,0.66667
1,20327,337,3,131,5,5.0,2010,1,3,2010-03-14,...,24,70,0,0.0,0.0,0.0,0.0,0.0,0.0,0.66667
2,20323,337,4,6,3,1.0,2010,1,3,2010-03-14,...,28,140,0,21.0,0.0,0.0,0.0,0.0,0.0,0.66667
3,20337,337,5,205,21,15.0,2010,1,3,2010-03-14,...,28,52,0,1.0,0.0,0.0,0.0,0.0,0.0,0.66667
4,20333,337,9,4,9,11.0,2010,1,3,2010-03-14,...,25,57,0,1.0,0.0,0.0,0.0,0.0,0.0,0.66667


In [17]:
# Export current df to .csv for EDA
merged_df.to_csv('final_df_EDA.csv', index=False)

#### Step 14

Drop unnecesary columns

In [18]:
merged_df = merged_df.drop(columns=['date', 'dob', 'resultId', 'raceId'])
merged_df.head()

Unnamed: 0,driverId,constructorId,grid,position,year,round,circuitId,driver_age,driver_experience,driver_constructor_experience,driver_wins,constructor_wins,driver_points,driver_standing,constructor_points,constructor_standing,circuit_danger
0,1,1,4,3.0,2010,1,3,25,52,52,11.0,1.0,0.0,0.0,0.0,0.0,0.66667
1,3,131,5,5.0,2010,1,3,24,70,0,0.0,0.0,0.0,0.0,0.0,0.0,0.66667
2,4,6,3,1.0,2010,1,3,28,140,0,21.0,0.0,0.0,0.0,0.0,0.0,0.66667
3,5,205,21,15.0,2010,1,3,28,52,0,1.0,0.0,0.0,0.0,0.0,0.0,0.66667
4,9,4,9,11.0,2010,1,3,25,57,0,1.0,0.0,0.0,0.0,0.0,0.0,0.66667


#### Step 15

Change IDs from `driverId`, `circuitId` and `constructorId` to their descriptive names

In [19]:
driver_id_query = """
SELECT driverId, driverRef
FROM drivers
"""
drivers_name = pd.read_sql(driver_id_query, con=engine)

merged_df = merged_df.merge(drivers_name[['driverId', 'driverRef']], on='driverId', how='left')
merged_df = merged_df.drop(columns=['driverId'])
merged_df = merged_df.rename(columns={'driverRef': 'driver'})

In [20]:
circuit_id_query = """
SELECT circuitId, circuitRef
FROM circuits
"""
circuits_name = pd.read_sql(circuit_id_query, con=engine)

merged_df = merged_df.merge(circuits_name[['circuitId', 'circuitRef']], on='circuitId', how='left')
merged_df = merged_df.drop(columns=['circuitId'])
merged_df = merged_df.rename(columns={'circuitRef': 'circuit'})


In [21]:
constructor_id_query = """
SELECT constructorId, constructorRef
FROM constructors
"""
constructors_name = pd.read_sql(constructor_id_query, con=engine)

merged_df = merged_df.merge(constructors_name[['constructorId', 'constructorRef']], on='constructorId', how='left')
merged_df = merged_df.drop(columns=['constructorId'])
merged_df = merged_df.rename(columns={'constructorRef': 'constructor'})

In [22]:
merged_df = pd.get_dummies(merged_df, columns=['driver', 'circuit', 'constructor'])
merged_df.head()

Unnamed: 0,grid,position,year,round,driver_age,driver_experience,driver_constructor_experience,driver_wins,constructor_wins,driver_points,...,constructor_mclaren,constructor_mercedes,constructor_racing_point,constructor_rb,constructor_red_bull,constructor_renault,constructor_sauber,constructor_toro_rosso,constructor_virgin,constructor_williams
0,4,3.0,2010,1,25,52,52,11.0,1.0,0.0,...,True,False,False,False,False,False,False,False,False,False
1,5,5.0,2010,1,24,70,0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,False,False,False
2,3,1.0,2010,1,28,140,0,21.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,21,15.0,2010,1,28,52,0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,9,11.0,2010,1,25,57,0,1.0,0.0,0.0,...,False,False,False,False,False,True,False,False,False,False


In [23]:
for col in merged_df.columns:
    if 'driver' in col and merged_df[col].sum() < 25:
        merged_df.drop(col, axis = 1, inplace = True)
        
    elif 'constructor' in col and merged_df[col].sum() < 50:
        merged_df.drop(col, axis = 1, inplace = True)

    elif 'circuit' in col and merged_df[col].sum() < 3:
        merged_df.drop(col, axis = 1, inplace = True)
    
    else:
        pass

merged_df.head()

Unnamed: 0,grid,position,year,round,driver_age,driver_experience,driver_constructor_experience,driver_wins,constructor_wins,driver_points,...,constructor_manor,constructor_marussia,constructor_mclaren,constructor_mercedes,constructor_racing_point,constructor_red_bull,constructor_renault,constructor_sauber,constructor_toro_rosso,constructor_williams
0,4,3.0,2010,1,25,52,52,11.0,1.0,0.0,...,False,False,True,False,False,False,False,False,False,False
1,5,5.0,2010,1,24,70,0,0.0,0.0,0.0,...,False,False,False,True,False,False,False,False,False,False
2,3,1.0,2010,1,28,140,0,21.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,21,15.0,2010,1,28,52,0,1.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,9,11.0,2010,1,25,57,0,1.0,0.0,0.0,...,False,False,False,False,False,False,True,False,False,False


In [24]:
merged_df.tail()

Unnamed: 0,grid,position,year,round,driver_age,driver_experience,driver_constructor_experience,driver_wins,constructor_wins,driver_points,...,constructor_manor,constructor_marussia,constructor_mclaren,constructor_mercedes,constructor_racing_point,constructor_red_bull,constructor_renault,constructor_sauber,constructor_toro_rosso,constructor_williams
5080,16,15.0,2024,23,28,103,65,0.0,0.0,12.0,...,False,False,False,False,False,False,False,False,False,True
5081,14,13.0,2024,23,24,88,22,0.0,0.0,30.0,...,False,False,False,False,False,False,False,False,False,False
5082,12,8.0,2024,23,25,66,22,0.0,0.0,0.0,...,False,False,False,False,False,False,False,True,False,False
5083,4,3.0,2024,23,23,44,44,2.0,3.0,268.0,...,False,False,True,False,False,False,False,False,False,False
5084,17,14.0,2024,23,22,9,4,0.0,0.0,4.0,...,False,False,False,False,False,False,False,False,False,False


#### Step 17

Export the data frame as `final_df.csv`

In [25]:
merged_df.to_csv('final_df.csv', index=False)

#### Step 18

Save a copy of the data frame filtered by the 2024 season for making predictions: `df_2024.csv`

In [None]:
df2024 = merged_df[(merged_df['year'] == 2024)].copy()
df2024.to_csv('../Streamlit/assets/df_2024.csv', index=False)