In [1]:
import pandas as pd
import numpy as np
import os

data_dir = '../data'
csv_files = [file for file in os.listdir(data_dir) if file.endswith('.csv')]

# Load each CSV into a dictionary
dataframes = {file.replace('.csv', ''): pd.read_csv(os.path.join(data_dir, file)) for file in csv_files}

In [10]:
# Unpack relevant dataframes
races = dataframes['races']
results = dataframes['results']
drivers = dataframes['drivers']
constructors = dataframes['constructors']
qualifying = dataframes['qualifying']
circuits = dataframes['circuits']

# Merge results with races (to get race metadata)
results_races = results.merge(races[['raceId', 'year', 'round', 'circuitId', 'name', 'date']], on='raceId', how='left')

# Merge with drivers to get driver info
results_races_drivers = results_races.merge(drivers[['driverId', 'driverRef', 'surname']], on='driverId', how='left')

# Merge with constructors to get team info
results_races_drivers_teams = results_races_drivers.merge(constructors[['constructorId', 'name']], on='constructorId', how='left', suffixes=('', '_constructor'))

# Merge with qualifying to get qualifying position
results_full = results_races_drivers_teams.merge(qualifying[['raceId', 'driverId', 'position']], on=['raceId', 'driverId'], how='left', suffixes=('', '_qualifying'))

# Merge with circuits for track info
results_full = results_full.merge(circuits[['circuitId', 'name']], on='circuitId', how='left', suffixes=('', '_circuit'))


# Target variable: 1 if the driver won that race
results_full['won'] = (results_full['positionOrder'] == 1).astype(int)

# Drop unnecessary columns and keep relevant features
model_df = results_full[[
    'raceId', 'year', 'round', 'date',
    'driverId', 'driverRef', 'surname',
    'constructorId', 'name_constructor',
    'grid', 'position', 'positionOrder', 'position_qualifying',
    'won'
]]

# Example: Drop rows with missing values (optional)
model_df = model_df.dropna()

# Show a sample
print(model_df.sample(5))


       raceId  year  round        date  driverId           driverRef  \
7614        4  2009      4  2009-04-26        20              vettel   
7672        6  2009      6  2009-05-24        67               buemi   
25650    1086  2022     13  2022-07-31       840              stroll   
5116      256  1995     17  1995-11-12        81          morbidelli   
1199       74  2005      4  2005-04-24        30  michael_schumacher   

          surname  constructorId name_constructor  grid position  \
7614       Vettel              9         Red Bull     3        2   
7672        Buemi              5       Toro Rosso    11       \N   
25650      Stroll            117     Aston Martin    14       11   
5116   Morbidelli             29         Footwork    13        3   
1199   Schumacher              6          Ferrari    13        2   

       positionOrder  position_qualifying  won  
7614               2                  3.0    0  
7672              20                 11.0    0  
25650      

Does this driver win?

In [9]:
from sklearn.preprocessing import LabelEncoder

model_df['year'] = pd.to_numeric(model_df['year'])

# Create encoders
driver_encoder = LabelEncoder()
constructor_encoder = LabelEncoder()

model_df['driver_encoded'] = driver_encoder.fit_transform(model_df['driverRef'])
model_df['constructor_encoded'] = constructor_encoder.fit_transform(model_df['name_constructor'])
model_df['top_3_grid'] = (model_df['grid'] <= 3).astype(int)
model_df['qual_grid_diff'] = model_df['position_qualifying'] - model_df['grid']


In [7]:
from sklearn.model_selection import train_test_split

features = ['driver_encoded', 'constructor_encoded', 'grid', 'position_qualifying',
            'top_3_grid', 'qual_grid_diff', 'year']
target = 'won'

X = model_df[features]
y = model_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Calculate scale_pos_weight: ratio of negative to positive classes
scale = (len(y_train) - sum(y_train)) / sum(y_train)
print(f"scale_pos_weight: {scale:.2f}")



scale_pos_weight: 20.25


In [8]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=scale
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.94      0.96      2000
           1       0.36      0.74      0.49        99

    accuracy                           0.93      2099
   macro avg       0.67      0.84      0.72      2099
weighted avg       0.96      0.93      0.94      2099



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
