## 1. Import Required Libraries

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import os
import glob
from sklearn.preprocessing import LabelEncoder

## 2. Load and Combine F1 Datasets
This cell loads all CSV files from the `F1_datasets` folder and combines them into a single DataFrame.

In [13]:
# --- Load and Combine F1 Datasets with Driver Features ---
import pandas as pd
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
data_folder = os.path.join(project_root, 'pipelines', 'data')

# Load each relevant CSV
results = pd.read_csv(os.path.join(data_folder, 'results.csv'))
sprints = pd.read_csv(os.path.join(data_folder, 'sprint_results.csv')) if os.path.exists(os.path.join(data_folder, 'sprint_results.csv')) else pd.DataFrame()
lap_times = pd.read_csv(os.path.join(data_folder, 'lap_times.csv'))
standings = pd.read_csv(os.path.join(data_folder, 'driver_standings.csv'))
qualifying = pd.read_csv(os.path.join(data_folder, 'qualifying.csv'))
drivers = pd.read_csv(os.path.join(data_folder, 'drivers.csv'))

# --- Feature engineering for each driver ---
# Average race position
avg_race_pos = results.groupby('driverId')['positionOrder'].mean().rename('avg_race_pos')
# Average sprint position (if available)
if not sprints.empty and 'positionOrder' in sprints.columns:
    avg_sprint_pos = sprints.groupby('driverId')['positionOrder'].mean().rename('avg_sprint_pos')
else:
    avg_sprint_pos = pd.Series(dtype=float)
# Average lap time (in seconds)
if 'milliseconds' in lap_times.columns:
    lap_times['lap_time_sec'] = lap_times['milliseconds'] / 1000
    avg_lap_time = lap_times.groupby('driverId')['lap_time_sec'].mean().rename('avg_lap_time')
else:
    avg_lap_time = pd.Series(dtype=float)
# Latest driver standing points
latest_race_id = standings['raceId'].max()
latest_standings = standings[standings['raceId'] == latest_race_id][['driverId', 'points']].set_index('driverId')['points']
# Average qualifying position
if 'position' in qualifying.columns:
    avg_qual_pos = qualifying.groupby('driverId')['position'].mean().rename('avg_qual_pos')
else:
    avg_qual_pos = pd.Series(dtype=float)

# Merge all features into a single DataFrame
features = pd.DataFrame(index=drivers['driverId'])
features = features.join(avg_race_pos)
features = features.join(avg_sprint_pos)
features = features.join(avg_lap_time)
features = features.join(latest_standings)
features = features.join(avg_qual_pos)

# Add driver name for reference
features = features.join(drivers.set_index('driverId')[['forename', 'surname']])
features = features.reset_index().rename(columns={'index': 'driverId'})

# Drop drivers with insufficient data
features = features.dropna(subset=['avg_race_pos', 'avg_lap_time', 'points', 'avg_qual_pos'], how='any')

features.head()

Unnamed: 0,driverId,avg_race_pos,avg_sprint_pos,avg_lap_time,points,avg_qual_pos,forename,surname
0,1,5.019663,6.777778,96.752708,223.0,4.073034,Lewis,Hamilton
3,4,8.492574,13.277778,96.208609,70.0,8.139535,Fernando,Alonso
452,842,11.25974,11.555556,96.090065,42.0,11.150327,Pierre,Gasly
807,807,11.926087,13.916667,98.356898,41.0,10.650655,Nico,Hülkenberg
814,815,9.332155,8.055556,98.009264,152.0,9.900709,Sergio,Pérez


In [7]:
# Export the features DataFrame (model input) to CSV
features.to_csv('f1_model_input.csv', index=False)
print("Model input CSV saved as f1_model_input.csv")

Model input CSV saved as f1_model_input.csv


## 3. Preprocess Data for Win Prediction
This step cleans the data, encodes categorical features, and prepares the target variable for win prediction.

In [8]:
# Use the 'features' DataFrame from the previous cell as the main dataset
df = features.copy()

# Ensure 'driver' column exists by combining 'forename' and 'surname' (as in cell 5)
if 'driver' not in df.columns:
    if 'forename' in df.columns and 'surname' in df.columns:
        df['driver'] = df['forename'].astype(str) + ' ' + df['surname'].astype(str)
    elif 'driverRef' in df.columns:
        df['driver'] = df['driverRef'].astype(str)
    elif 'driverId' in df.columns:
        df['driver'] = df['driverId'].astype(str)
    else:
        raise ValueError("No suitable columns found to create 'driver'.")

# Ensure required columns exist before preprocessing (as in cell 5)
required_columns = {'driverId', 'avg_race_pos', 'avg_lap_time', 'points', 'avg_qual_pos', 'forename', 'surname'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(f"Missing columns in the dataset: {missing_columns}")

# Drop rows with missing required features (as in cell 5)
df = df.dropna(subset=list(required_columns))

# Encode all categorical columns except the target and identifiers (as in cell 5)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_cols = [col for col in categorical_cols if col not in ['driver', 'driverId', 'forename', 'surname']]
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Encode driver column for later use (as in cell 5)
le_driver = LabelEncoder()
df['driver_encoded'] = le_driver.fit_transform(df['driver'])

# Target: 1 if avg_race_pos is in the top 3 (proxy for winner), else 0
top_n = 3
df['win'] = df['avg_race_pos'].rank(method='min').le(top_n).astype(int)

# Check class balance for stratification (as in cell 5)
if df['win'].sum() < 2 or (df['win'] == 0).sum() < 2:
    # Not enough samples in one class for stratification, fallback to no stratify
    stratify = None
else:
    stratify = df['win']

# Select features (exclude identifiers and target) (as in cell 5)
feature_cols = [col for col in df.columns if col not in ['driver', 'driverId', 'forename', 'surname', 'win']]
X = df[feature_cols]
y = df['win']

# Fill any remaining missing values with column means (for numeric columns) (as in cell 5)
X = X.fillna(X.mean())

# Split data (as in cell 5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=stratify)

## 4. Train a Model
We'll use a Random Forest Classifier with class balancing and a fixed random state.

In [9]:
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## 5. Evaluate the Model

In [10]:
# Evaluate model accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

import numpy as np  # Ensure numpy is imported for np.zeros

# Predict Win Probabilities for Next Race (using all available driver data)
next_race_drivers = df.copy()

# Prepare features for prediction (must match training features)
predict_features = list(model.feature_names_in_)
X_next = next_race_drivers[predict_features]
X_next = X_next.apply(pd.to_numeric, errors='coerce')

# Predict win probabilities
proba = model.predict_proba(X_next)
if proba.shape[1] == 2:
    win_probs = proba[:, 1]
else:
    # Only one class present in training, assign probability 1.0 to that class, 0.0 otherwise
    if model.classes_[0] == 1:
        win_probs = proba[:, 0]
    else:
        win_probs = np.zeros(X_next.shape[0])
next_race_drivers['win_probability'] = win_probs

# Display all columns, sorted by win probability descending
display(next_race_drivers.sort_values('win_probability', ascending=False))

Accuracy: 1.00


Unnamed: 0,driverId,avg_race_pos,avg_sprint_pos,avg_lap_time,points,avg_qual_pos,forename,surname,driver,driver_encoded,win,win_probability
0,1,5.019663,6.777778,96.752708,223.0,4.073034,Lewis,Hamilton,Lewis Hamilton,13,1,0.71
829,830,5.645933,2.0,95.693988,437.0,4.727273,Max,Verstappen,Max Verstappen,16,1,0.7
842,844,7.557047,5.055556,95.584539,356.0,6.208054,Charles,Leclerc,Charles Leclerc,2,1,0.63
844,846,7.851562,6.333333,96.429629,374.0,7.21875,Lando,Norris,Lando Norris,12,0,0.27
855,857,7.891304,6.0,96.402151,292.0,7.543478,Oscar,Piastri,Oscar Piastri,19,0,0.13
831,832,9.274038,4.888889,96.419344,290.0,8.512077,Carlos,Sainz,Carlos Sainz,1,0,0.1
845,847,10.53125,7.277778,95.362814,245.0,10.328125,George,Russell,George Russell,7,0,0.05
814,815,9.332155,8.055556,98.009264,152.0,9.900709,Sergio,Pérez,Sergio Pérez,21,0,0.02
846,848,11.914286,13.866667,94.918974,12.0,11.961538,Alexander,Albon,Alexander Albon,0,0,0.02
816,817,10.206226,9.083333,98.227582,12.0,9.792969,Daniel,Ricciardo,Daniel Ricciardo,3,0,0.01


In [14]:
# Save the trained model, label encoder, and feature columns as a dictionary
import pickle
import os

# Ensure the 'models' directory exists at the project root
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
models_dir = os.path.join(project_root, 'models')
os.makedirs(models_dir, exist_ok=True)

model_dict = {
    'model': model,
    'driver_encoder': le_driver,
    'feature_columns': list(X_train.columns)
}

pkl_path = os.path.join(models_dir, 'f1_race_position_model.pkl')
with open(pkl_path, 'wb') as f:
    pickle.dump(model_dict, f)

print(f"Model, encoder, and feature columns saved to {pkl_path}")

Model, encoder, and feature columns saved to c:\Users\regut003\Documents\proyectos\proyecto_ds_jueves\models\f1_race_position_model.pkl
