# Project - Formula 1 Analysis
## Machine Learning Predictions


  ***

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Kaggle API connection

import kagglehub

# Fetch the latest version of the Formula 1 dataset; the value returned by
# dataset_download is kept in `path` and used to locate the CSV files.
dataset_handle = "rohanrao/formula-1-world-championship-1950-2020"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

Path to dataset files: C:\Users\Joaquim Meruje\.cache\kagglehub\datasets\rohanrao\formula-1-world-championship-1950-2020\versions\24


In [3]:
# Read Datasets

def _csv_path(table_name):
    """Return the location of `<table_name>.csv` inside the downloaded dataset folder."""
    return f"{path}/{table_name}.csv"


def _read_table(csv_file):
    """Load one dataset CSV into a DataFrame."""
    # The two-character backslash-N sequence is read as a missing value.
    return pd.read_csv(csv_file, na_values='\\N')


# Paths for CSV files
races_file = _csv_path("races")
results_file = _csv_path("results")
circuits_file = _csv_path("circuits")
status_file = _csv_path("status")
driver_standings_file = _csv_path("driver_standings")
lap_times_file = _csv_path("lap_times")
drivers_file = _csv_path("drivers")
constructors_file = _csv_path("constructors")
constructors_standings_file = _csv_path("constructor_standings")

# One DataFrame per CSV file
races = _read_table(races_file)
results = _read_table(results_file)
circuits = _read_table(circuits_file)
status = _read_table(status_file)
driver_standings = _read_table(driver_standings_file)
lap_times = _read_table(lap_times_file)
drivers = _read_table(drivers_file)
constructors = _read_table(constructors_file)
constructors_standings = _read_table(constructors_standings_file)

## 1) Race Winners

In [4]:
# Display the raw results table as loaded from results.csv
results

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22.0,1,1.0,1,1,10.0,58,1:34:50.616,5690616.0,39.0,2.0,1:27.452,218.300,1
1,2,18,2,2,3.0,5,2.0,2,2,8.0,58,+5.478,5696094.0,41.0,3.0,1:27.739,217.586,1
2,3,18,3,3,7.0,7,3.0,3,3,6.0,58,+8.163,5698779.0,41.0,5.0,1:28.090,216.719,1
3,4,18,4,4,5.0,11,4.0,4,4,5.0,58,+17.181,5707797.0,58.0,7.0,1:28.603,215.464,1
4,5,18,5,1,23.0,3,5.0,5,5,4.0,58,+18.014,5708630.0,43.0,1.0,1:27.418,218.385,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26754,26760,1144,825,210,20.0,14,16.0,16,16,0.0,57,,,57.0,1.0,1:25.637,222.002,11
26755,26761,1144,859,215,30.0,12,17.0,17,17,0.0,55,,,52.0,12.0,1:28.751,214.212,5
26756,26762,1144,822,15,77.0,9,,R,18,0.0,30,,,14.0,19.0,1:29.482,212.462,130
26757,26763,1144,861,3,43.0,20,,R,19,0.0,26,,,5.0,17.0,1:29.411,212.631,5


In [5]:
# Derive the calendar month of every race from its date column.
race_dates = pd.to_datetime(races['date'])
races['month'] = race_dates.dt.month

# Result columns that are not carried over into the modelling table.
unused_result_columns = [
    'resultId', 'position', 'number', 'time',
    'milliseconds', 'fastestLap' + 'Time', 'positionOrder',
]
# Race-level attributes attached to every result row.
race_columns = ['raceId', 'year', 'month', 'round', 'circuitId']

# Right merge on raceId: the race table drives which races appear in the result.
race_winners = (
    results
    .drop(columns=unused_result_columns)
    .merge(races[race_columns], on='raceId', how='right')
)

In [6]:
# Inspect race_winners after attaching the race metadata (year, month, round, circuitId)
race_winners

Unnamed: 0,raceId,driverId,constructorId,grid,positionText,points,laps,fastestLap,rank,fastestLapSpeed,statusId,year,month,round,circuitId
0,1,18,23,1,1,10.0,58,17.0,3.0,216.891,1,2009,3,1,1
1,1,22,23,2,2,8.0,58,43.0,14.0,214.344,1,2009,3,1,1
2,1,15,7,20,3,6.0,58,50.0,10.0,214.706,1,2009,3,1,1
3,1,10,7,19,4,5.0,58,53.0,6.0,215.920,1,2009,3,1,1
4,1,4,4,10,5,4.0,58,53.0,9.0,215.199,1,2009,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26754,1144,825,210,14,16,0.0,57,57.0,1.0,222.002,11,2024,12,24,24
26755,1144,859,215,12,17,0.0,55,52.0,12.0,214.212,5,2024,12,24,24
26756,1144,822,15,9,R,0.0,30,14.0,19.0,212.462,130,2024,12,24,24
26757,1144,861,3,20,R,0.0,26,5.0,17.0,212.631,5,2024,12,24,24


In [7]:
# Report, for every column, the percentage of rows holding a missing value

row_count = len(race_winners)
missing_counts = race_winners.isna().sum()

for col, missing in zip(missing_counts.index, missing_counts.to_numpy()):
    print(f"Column {col} -> {missing/row_count*100}")

Column raceId -> 0.0
Column driverId -> 0.0
Column constructorId -> 0.0
Column grid -> 0.0
Column positionText -> 0.0
Column points -> 0.0
Column laps -> 0.0
Column fastestLap -> 69.16177734593968
Column rank -> 68.19761575544676
Column fastestLapSpeed -> 69.16177734593968
Column statusId -> 0.0
Column year -> 0.0
Column month -> 0.0
Column round -> 0.0
Column circuitId -> 0.0


In [8]:
# The NaN report shows these three columns are missing for roughly 68-69% of
# the rows, so they are removed from the modelling table.
sparse_columns = ['fastestLap', 'fastestLapSpeed', 'rank']
race_winners = race_winners.drop(columns=sparse_columns)

In [9]:
# positionText mixes digit strings with one-letter codes. Each code is encoded
# as a distinct negative integer so the whole column becomes numeric.
mapping = {'R': -1, 'D': -2, 'W': -3, 'N': -4, 'F': -5, 'E': -6}

position_text = race_winners['positionText']

# Collect the non-numeric codes actually present in the data.
is_text = position_text.apply(lambda x: isinstance(x, str))
unique_strings = [val for val in position_text[is_text].unique() if not val.isdigit()]
print(unique_strings)

# Fail loudly when the data contains a code the mapping does not cover.
# The previous astype(int, errors='ignore') silently left the column as mixed
# str/int objects in that case, which only surfaced later as a model error.
unmapped_codes = sorted(set(unique_strings) - set(mapping))
if unmapped_codes:
    raise ValueError(f"positionText codes missing from mapping: {unmapped_codes}")

# Replace the codes, keep every other value, then convert to a numeric dtype.
# to_numeric raises on anything it cannot parse instead of passing it through.
encoded_positions = position_text.map(lambda val: mapping.get(val, val))
race_winners['positionText'] = pd.to_numeric(encoded_positions, errors='raise')

['R', 'D', 'W', 'N', 'F', 'E']


In [10]:
# Inspect race_winners after dropping the sparse columns and encoding positionText
race_winners

Unnamed: 0,raceId,driverId,constructorId,grid,positionText,points,laps,statusId,year,month,round,circuitId
0,1,18,23,1,1,10.0,58,1,2009,3,1,1
1,1,22,23,2,2,8.0,58,1,2009,3,1,1
2,1,15,7,20,3,6.0,58,1,2009,3,1,1
3,1,10,7,19,4,5.0,58,1,2009,3,1,1
4,1,4,4,10,5,4.0,58,1,2009,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
26754,1144,825,210,14,16,0.0,57,11,2024,12,24,24
26755,1144,859,215,12,17,0.0,55,5,2024,12,24,24
26756,1144,822,15,9,-1,0.0,30,130,2024,12,24,24
26757,1144,861,3,20,-1,0.0,26,5,2024,12,24,24


In [None]:
# Goal: predict the finishing position of each driver in every race,
# and from those predictions obtain the final championship standings.
# Columns to add -> results of the last 5 races
#                -> results of the last 3 championships (drivers and constructors)


In [None]:
# Prepare variables and split data

# 'positionOrder' is removed from race_winners when the table is built, so
# selecting it here raised a KeyError. The finishing position that survives
# preprocessing is the numerically encoded 'positionText' column, which is
# therefore used as the prediction target.
TARGET_COLUMN = "positionText"

# These columns are only known once the race is over (points scored, laps
# completed, finishing status). Keeping them as features would leak the
# outcome into the model, so they are excluded from X.
POST_RACE_COLUMNS = ["points", "laps", "statusId"]

X = race_winners.drop(columns=[TARGET_COLUMN, *POST_RACE_COLUMNS])
y = race_winners[TARGET_COLUMN]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train a 100-tree random forest regressor (seeded for reproducibility) on the
# training split, then predict the target for the held-out rows.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)