In [255]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [256]:
df = pd.read_csv('../Data-collection/final_df.csv')
df.head()

Unnamed: 0,grid,position,year,round,driver_age,driver_experience,driver_constructor_experience,driver_points,driver_standing,constructor_points,...,constructorId_205,constructorId_206,constructorId_207,constructorId_208,constructorId_209,constructorId_210,constructorId_211,constructorId_213,constructorId_214,constructorId_215
0,3,1.0,2010,1,28,140,0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,2,2.0,2010,1,28,116,63,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,4,3.0,2010,1,25,52,52,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,1,4.0,2010,1,22,43,17,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,5,5.0,2010,1,24,70,0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [257]:
# Here the test data needs to be whole races, not random entries from different races
# Will do 3 races per season as test sets
test_races_per_year = 3

# Create a train-test split based on races
unique_races = df[['year', 'round']].drop_duplicates()
test_races = unique_races.groupby('year').sample(n=test_races_per_year, random_state=42)

# Mark test races in the dataset
df['is_test'] = df[['year', 'round']].apply(
    lambda x: tuple(x) in test_races.itertuples(index=False, name=None), axis=1
)

# Split the data
train_df = df[df['is_test'] == False]
test_df = df[df['is_test'] == True]

train_df = train_df.drop(columns=['is_test'])
test_df = test_df.drop(columns=['is_test'])

print(f"Training set size: {len(train_df)}, Testing set size: {len(test_df)}")

Training set size: 4473, Testing set size: 783


In [258]:
X_train = train_df.drop(columns=['position'])
X_test = test_df.drop(columns=['position'])

y_train = train_df['position']
y_test = test_df['position']

print(f"Training Features: {X_train.shape}, Testing Features: {X_test.shape}")
print(f"Training Target: {y_train.shape}, Testing Target: {y_test.shape}")

Training Features: (4473, 146), Testing Features: (783, 146)
Training Target: (4473,), Testing Target: (783,)


In [259]:
# Select only numerical columns for scaling
numerical_columns = [
    'grid', 'driver_age', 'driver_experience', 'driver_constructor_experience',
    'driver_points', 'driver_standing', 'constructor_points', 
    'constructor_standing', 'driver_wins', 'constructor_wins', 'circuit_danger', 
    'year', 'round'
]

scaler = StandardScaler()

X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# conver one-hot-encoding columns from True/False to 1/0
all_columns = X_train.columns.tolist()
one_hot_columns = [col for col in all_columns if col not in numerical_columns]

X_train[one_hot_columns] = X_train[one_hot_columns].astype(int)
X_test[one_hot_columns] = X_test[one_hot_columns].astype(int)

## Regression Approach

#### Linear Regression

In [260]:
from sklearn.linear_model import LinearRegression
reg_lr_model = LinearRegression()
reg_lr_model.fit(X_train, y_train)
reg_lr_y_test = reg_lr_model.predict(X_test)

#### Random Forest

In [261]:
from sklearn.ensemble import RandomForestRegressor
reg_rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
reg_rf_model.fit(X_train, y_train)
reg_rf_y_test = reg_rf_model.predict(X_test)

#### Support Vector Machines (SVM)

In [262]:
from sklearn.svm import SVR
reg_svm_model = SVR(kernel='linear') 
reg_svm_model.fit(X_train, y_train)
reg_svm_y_test = reg_svm_model.predict(X_test)

#### Decision Tree

In [263]:
from sklearn.tree import DecisionTreeRegressor
reg_dt_model = DecisionTreeRegressor()
reg_dt_model.fit(X_train, y_train)
reg_dt_y_test = reg_dt_model.predict(X_test)

#### K-Nearest Neighbor (KNN)

In [264]:
from sklearn.neighbors import KNeighborsRegressor
reg_knn_model = KNeighborsRegressor(n_neighbors=5)
reg_knn_model.fit(X_train, y_train)
reg_knn_y_test = reg_knn_model.predict(X_test)

#### Lasso

In [265]:
from sklearn.linear_model import Lasso
reg_lasso_model = Lasso(alpha=0.1) 
reg_lasso_model.fit(X_train, y_train)
reg_lasso_y_test = reg_lasso_model.predict(X_test)

### Evaluations

In [266]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

reg_models_info = [
    ("Linear Regression", "reg_lr_y_test"),
    ("Random Forest", "reg_rf_y_test"),
    ("SVM", "reg_svm_y_test"),
    ("Decision Tree", "reg_dt_y_test"),
    ("KNN", "reg_knn_y_test"),
    ("Lasso", "reg_lasso_y_test")
]

for model_name, y_test_var_name in reg_models_info:
    y_test_var = globals()[y_test_var_name]

    model_test_mae = mean_absolute_error(y_test, y_test_var)
    model_test_rmse = mean_squared_error(y_test, y_test_var)
    model_test_r2 = r2_score(y_test, y_test_var)

    print(f"---- {model_name} ----")
    print(f" MAE: {model_test_mae:.2f}")
    print(f" RMSE: {model_test_rmse:.2f}")
    print(f" R2: {model_test_r2:.2f}")
    print("")

---- Linear Regression ----
 MAE: 2.31
 RMSE: 9.25
 R2: 0.67

---- Random Forest ----
 MAE: 2.22
 RMSE: 8.82
 R2: 0.69

---- SVM ----
 MAE: 2.26
 RMSE: 9.51
 R2: 0.66

---- Decision Tree ----
 MAE: 2.98
 RMSE: 16.78
 R2: 0.41

---- KNN ----
 MAE: 2.43
 RMSE: 10.24
 R2: 0.64

---- Lasso ----
 MAE: 2.43
 RMSE: 10.07
 R2: 0.65



## Classification Approach

In [267]:
y_train_c = y_train.copy()
y_test_c = y_test.copy()
y_train_c = y_train_c.apply(lambda x: 1 if x == 1 else 0)
y_test_c = y_test_c.apply(lambda x: 1 if x == 1 else 0)

#### Logistic Regression 

In [268]:
from sklearn.linear_model import LogisticRegression
cla_lr_model = LogisticRegression()
cla_lr_model.fit(X_train, y_train_c)
cla_lr_y_test = cla_lr_model.predict(X_test)

#### Decision Tree

In [269]:
from sklearn.tree import DecisionTreeClassifier
cla_dt_model = DecisionTreeClassifier()
cla_dt_model.fit(X_train, y_train_c)
cla_dt_y_test = cla_dt_model.predict(X_test)

#### Support Vector Machines (SVM)

In [270]:
from sklearn.svm import SVC 
cla_svm_model = SVC(kernel='linear')
cla_svm_model.fit(X_train, y_train_c)
cla_svm_y_test = cla_svm_model.predict(X_test)

#### Random Forest

In [271]:
from sklearn.ensemble import RandomForestClassifier 
cla_rf_model = RandomForestClassifier(n_estimators=100)
cla_rf_model.fit(X_train, y_train_c)
cla_rf_y_test = cla_rf_model.predict(X_test)

#### K-Nearest Neighbor (KNN)

In [272]:
from sklearn.neighbors import KNeighborsClassifier
cla_knn_model = KNeighborsClassifier(n_neighbors=5)
cla_knn_model.fit(X_train, y_train_c)
cla_knn_y_test = cla_knn_model.predict(X_test)

#### Naïve Bayes

In [273]:
from sklearn.naive_bayes import GaussianNB
cla_nb_model = GaussianNB()
cla_nb_model.fit(X_train, y_train_c)
cla_nb_y_test = cla_nb_model.predict(X_test)

### Evaluations

In [274]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

cla_models_info = [
    ("Logistic Regression", "cla_lr_y_test"),
    ("Decision Tree", "cla_dt_y_test"),
    ("SVM", "cla_svm_y_test"),
    ("Random Forest", "cla_rf_y_test"),
    ("KNN", "cla_knn_y_test"),
    ("GaussianNB", "cla_nb_y_test")
]

for model_name, y_test_var_name in cla_models_info:
    y_test_var = globals()[y_test_var_name]

    accuracy = accuracy_score(y_test_c, y_test_var)
    conf_matrix = confusion_matrix(y_test_c, y_test_var)
    class_report = classification_report(y_test_c, y_test_var)

    print(f"---- {model_name} ----")
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    print("")

---- Logistic Regression ----
Accuracy: 0.9425287356321839
Confusion Matrix:
[[726  12]
 [ 33  12]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       738
           1       0.50      0.27      0.35        45

    accuracy                           0.94       783
   macro avg       0.73      0.63      0.66       783
weighted avg       0.93      0.94      0.93       783


---- Decision Tree ----
Accuracy: 0.9220945083014048
Confusion Matrix:
[[706  32]
 [ 29  16]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       738
           1       0.33      0.36      0.34        45

    accuracy                           0.92       783
   macro avg       0.65      0.66      0.65       783
weighted avg       0.92      0.92      0.92       783


---- SVM ----
Accuracy: 0.9412515964240102
Confusion Matrix:
[[724  14]
 [ 32  13]]
Classification Repor

## Deep Learning

### Regression

In [275]:
import tensorflow as tf

dl_reg_model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(146,)),  # Input layer with 146 features
    tf.keras.layers.Dense(16, activation='relu'),  # Hidden layer with 32 neurons
    tf.keras.layers.Dense(1, activation='linear')
])

dl_reg_model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mean_squared_error'])

dl_reg_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

Epoch 1/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 105.6073 - mean_squared_error: 105.6073 - val_loss: 78.3660 - val_mean_squared_error: 78.3660
Epoch 2/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 757us/step - loss: 64.0862 - mean_squared_error: 64.0862 - val_loss: 45.2039 - val_mean_squared_error: 45.2039
Epoch 3/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 753us/step - loss: 22.0494 - mean_squared_error: 22.0494 - val_loss: 26.7301 - val_mean_squared_error: 26.7301
Epoch 4/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 749us/step - loss: 13.8849 - mean_squared_error: 13.8849 - val_loss: 18.4198 - val_mean_squared_error: 18.4198
Epoch 5/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 725us/step - loss: 10.6568 - mean_squared_error: 10.6568 - val_loss: 14.9646 - val_mean_squared_error: 14.9646
Epoch 6/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x30693e870>

In [276]:
dl_reg_y_pred = dl_reg_model.predict(X_test)
dl_reg_r2 = r2_score(y_test, dl_reg_y_pred)
print(f'R² score on test set: {dl_reg_r2}')

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
R² score on test set: 0.6236976635221939


### Classification

In [277]:
import tensorflow as tf

# Define the model
dl_cl_model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(146,)),  # Input layer with 146 features
    tf.keras.layers.Dense(32, activation='relu'),  # Additional hidden layer with 64 neurons
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer for classification
])

dl_cl_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
dl_cl_model.fit(X_train, y_train_c, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
test_loss, test_acc = dl_cl_model.evaluate(X_test, y_test_c)

print(f'Test accuracy: {test_acc}')


Epoch 1/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8708 - loss: 0.4721 - val_accuracy: 0.9441 - val_loss: 0.2359
Epoch 2/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 761us/step - accuracy: 0.9414 - loss: 0.1811 - val_accuracy: 0.9464 - val_loss: 0.1629
Epoch 3/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 753us/step - accuracy: 0.9369 - loss: 0.1501 - val_accuracy: 0.9475 - val_loss: 0.1377
Epoch 4/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 742us/step - accuracy: 0.9454 - loss: 0.1297 - val_accuracy: 0.9609 - val_loss: 0.1242
Epoch 5/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 769us/step - accuracy: 0.9542 - loss: 0.1194 - val_accuracy: 0.9665 - val_loss: 0.1175
Epoch 6/100
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 765us/step - accuracy: 0.9529 - loss: 0.1151 - val_accuracy: 0.9687 - val_loss: 0.1124
Epoch 7/100


In [278]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

dl_cl_y_pred_prob = dl_cl_model.predict(X_test)

dl_cl_y_pred = (dl_cl_y_pred_prob > 0.6).astype("int32")

dl_cl_accuracy = accuracy_score(y_test_c, dl_cl_y_pred)
dl_cl_conf_matrix = confusion_matrix(y_test_c, dl_cl_y_pred)
dl_cl_class_report = classification_report(y_test_c, dl_cl_y_pred)

print(f"Accuracy: {dl_cl_accuracy}")
print("Confusion Matrix:")
print(dl_cl_conf_matrix)
print("Classification Report:")
print(dl_cl_class_report)

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Accuracy: 0.9323116219667944
Confusion Matrix:
[[722  16]
 [ 37   8]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       738
           1       0.33      0.18      0.23        45

    accuracy                           0.93       783
   macro avg       0.64      0.58      0.60       783
weighted avg       0.92      0.93      0.92       783

