In [1]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingRegressor
from imblearn.over_sampling import RandomOverSampler

# Read data
dir = r"C:\Users\jornv\OneDrive\Documenten\Projects\F1\Data\Formula1_ML.csv"
data = pd.read_csv(dir)

# Convert variables to factors
columns_to_factor = ['grid_t1', 
                     'grid', 
                     'position_driverstanding', 
                     'teammates_driverstanding',
                     'wins_driverstanding', 
                     'wins_constructorstanding', 
                     'quarter',
                     'grid_of_1_in_standings',
                     'grid_of_2_in_standings',
                     'grid_of_3_in_standings',
                     'grid_of_4_in_standings',
                     'grid_of_5_in_standings']

data[columns_to_factor] = data[columns_to_factor].astype('category')

# Split data into train and test sets
def train_test_split_year(dataset, year):
    train = dataset[(dataset['year'] < year) & (dataset['year'] > 1980) & (dataset['year'] != 2020) & (dataset['year'] != 2021)]
    test = dataset[dataset['year'] == year]
    return train, test



ImportError: DLL load failed while importing qhull: The specified module could not be found.

# Model trainer and tester SVM

In [None]:
for year in range(2010, 2023):
    if year != 2020 and year != 2021:
        # Split data into train and test sets
        train, test = train_test_split_year(data, year)

        # Select certain columns from test data
        selected_columns = ['end_race_position_t1', 
                            'drivers_takeover_chance', 
                            'quarter', 
                            'grid_t1', 
                            'grid',
                            'diff_grid_standing', 
                            'teammates_driverstanding', 
                            'overtakes_per_track_t1', 
                            'drivers_defense',
                            'wins_driverstanding', 
                            'wins_constructorstanding', 
                            'position_driverstanding', 
                            'raceId', 
                            'teammates_defense', 
                            'teammates_takeover_chance']

        test = test[selected_columns]
        # Remove rows with missing values
        test = test.dropna()

        variables = ['drivers_takeover_chance', 
                      'quarter', 
                      'grid_t1', 
                      'grid', 
                      'diff_grid_standing', 
                      'teammates_driverstanding', 
                      'overtakes_per_track_t1', 
                      'drivers_defense', 
                      'wins_driverstanding', 
                      'wins_constructorstanding', 
                      'position_driverstanding',
                      'end_race_position_t1',
                      'teammates_defense',
                      'teammates_takeover_chance'
                    ]

        train = train[variables].dropna()

        # Split the data into features (X) and target variable (y)
        X = train.drop('end_race_position_t1', axis=1)
        y = train['end_race_position_t1']

        # Oversample the minority class
        ros = RandomOverSampler()
        X_resampled, y_resampled = ros.fit_resample(X, y)

        # Train the SVM model
        svm_model = svm.SVC(kernel='rbf', gamma = 'scale', C = 30, probability=True)

        svm_model.fit(X_resampled, y_resampled)
        
        # Predict on the test set
        probabilities = svm_model.predict_proba(test.drop(['end_race_position_t1', 'raceId'], axis=1))[:, 1]

        # Create a data frame with the predictions
        values = test.copy()
        values['predicted_probability'] = probabilities

        # Group by 'raceId' and create 'top3' variable
        values['top3'] = values.groupby('raceId')['predicted_probability'].rank(method='min', ascending=False) <= 3


        # Create the confusion matrix
        cm = confusion_matrix(values['top3'], values['end_race_position_t1'])
        
        print(year)
        print(cm)

2010
[[334  20]
 [ 20  31]]
2011
[[331  17]
 [ 18  34]]
2012
[[346  29]
 [ 29  25]]
2013
[[318  23]
 [ 23  31]]


# Model trainer and tester: Gradient boosting regressor

In [2]:
for year in range(2010, 2024):
    if year != 2020 and year != 2021:
        # Split data into train and test sets
        train, test = train_test_split_year(data, year)

        # Select certain columns from test data
        selected_columns = ['end_race_position_t1', 
                            'drivers_takeover_chance', 
                            'quarter', 
                            'grid_t1', 
                            'diff_grid_standing', 
                            'teammates_driverstanding', 
                            'overtakes_per_track_t1', 
                            'drivers_defense',
                            'wins_driverstanding', 
                            'wins_constructorstanding', 
                            'position_driverstanding', 
                            'raceId', 
                            'teammates_defense', 
                            'teammates_takeover_chance']

        test = test[selected_columns]
        # Remove rows with missing values
        test = test.dropna()

        variables = ['drivers_takeover_chance', 
                      'quarter', 
                      'grid_t1', 
                      'diff_grid_standing', 
                      'teammates_driverstanding', 
                      'overtakes_per_track_t1', 
                      'drivers_defense', 
                      'wins_driverstanding', 
                      'wins_constructorstanding', 
                      'position_driverstanding',
                      'end_race_position_t1',
                      'teammates_defense',
                      'teammates_takeover_chance']

        train = train[variables].dropna()

        # Split the data into features (X) and target variable (y)
        X = train.drop('end_race_position_t1', axis=1)
        y = train['end_race_position_t1']

        # Oversample the minority class
        ros = RandomOverSampler()
        X_resampled, y_resampled = ros.fit_resample(X, y)
        
        # Create the gradient boosting regressor
        gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.01, max_depth=8, random_state=42)

        # Train the model
        gbr.fit(X_resampled, y_resampled)
        
        # Predict on the test set
        probabilities = gbr.predict(test.drop(['end_race_position_t1', 'raceId'], axis=1))

        # Create a data frame with the predictions
        values = test.copy()
        values['predicted_probability'] = probabilities

        # Group by 'raceId' and create 'top3' variable
        values['top3'] = values.groupby('raceId')['predicted_probability'].rank(method='min', ascending=False) <= 3


        # Create the confusion matrix
        cm = confusion_matrix(values['top3'], values['end_race_position_t1'])
        
        print(year)
        print(cm)

2010
[[336  18]
 [ 18  33]]
2011
[[330  18]
 [ 19  33]]
2012
[[350  25]
 [ 25  29]]
2013
[[319  22]
 [ 22  32]]
2014
[[292  19]
 [ 19  32]]
2015
[[286  16]
 [ 16  38]]
2016
[[339  20]
 [ 20  37]]
2017
[[300  18]
 [ 18  39]]
2018
[[320  20]
 [ 20  40]]
2019
[[324  16]
 [ 16  44]]
2022
[[312  26]
 [ 26  34]]
2023
[[252  17]
 [ 17  31]]


In [2]:
train, test = train_test_split_year(data, 2023)

# Select certain columns from test data
selected_columns = ['end_race_position_t1', 
                    'drivers_takeover_chance', 
                    'quarter', 
                    'grid_t1', 
                    'diff_grid_standing', 
                    'teammates_driverstanding', 
                    'overtakes_per_track_t1', 
                    'drivers_defense',
                    'wins_driverstanding', 
                    'wins_constructorstanding', 
                    'position_driverstanding', 
                    'raceId', 
                    'teammates_defense', 
                    'teammates_takeover_chance',
                    'driverRef',
                    'name_circuit']

test = test[selected_columns]
# Remove rows with missing values
test = test.dropna()

variables = ['drivers_takeover_chance', 
              'quarter', 
              'grid_t1', 
              'diff_grid_standing', 
              'teammates_driverstanding', 
              'overtakes_per_track_t1', 
              'drivers_defense', 
              'wins_driverstanding', 
              'wins_constructorstanding', 
              'position_driverstanding',
              'end_race_position_t1',
              'teammates_defense',
              'teammates_takeover_chance']

train = train[variables].dropna()

# Split the data into features (X) and target variable (y)
X = train.drop('end_race_position_t1', axis=1)
y = train['end_race_position_t1']

# Oversample the minority class
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)

# Create the gradient boosting regressor
gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.01, max_depth=8, random_state=42)

# Train the model
gbr.fit(X_resampled, y_resampled)

# Predict on the test set
probabilities = gbr.predict(test.drop(['end_race_position_t1', 'raceId', 'driverRef', 'name_circuit'], axis=1))

# Create a data frame with the predictions
values = test.copy()
values['predicted_probability'] = probabilities

# Group by 'raceId' and create 'top3' variable
values['top3'] = values.groupby('raceId')['predicted_probability'].rank(method='min', ascending=False) <= 2


# Create the confusion matrix
cm = confusion_matrix(values['top3'], values['end_race_position_t1'])
print(cm)

[[231  21]
 [  9  21]]


In [3]:
values.to_csv('C:\\Users\\jornv\\OneDrive\\Documenten\\Python data\\bets.csv')