In [36]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [37]:
# Load the historical race dataset produced by the data-collection step.
# NOTE(review): the frame carries an 'Unnamed: 0' column (visible in the preview below),
# which looks like a saved row index; only 'position' is dropped later, so it is kept
# as a model feature - confirm whether that is intended.
df = pd.read_csv('../Data-collection/final_df.csv')
df.head()

Unnamed: 0,grid,position,year,round,driver_age,driver_experience,driver_constructor_experience,driver_points,driver_standing,constructor_points,...,constructor_mclaren,constructor_mercedes,constructor_racing_point,constructor_rb,constructor_red_bull,constructor_renault,constructor_sauber,constructor_toro_rosso,constructor_virgin,constructor_williams
0,3,1.0,2010,1,28,140,0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,2,2.0,2010,1,28,116,63,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,4,3.0,2010,1,25,52,52,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False
3,1,4.0,2010,1,22,43,17,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
4,5,5.0,2010,1,24,70,0,0.0,0.0,0.0,...,False,True,False,False,False,False,False,False,False,False


In [38]:
# Disabled experiment kept as a bare string literal: a race-level train/test split that
# holds out 3 whole races per season. Nothing in it is executed; the cell only echoes the string.
'''
# Here the test data needs to be whole races, not random entries from different races
# Will do 3 races per season as test sets
test_races_per_year = 3

# Create a train-test split based on races
unique_races = df[['year', 'round']].drop_duplicates()
test_races = unique_races.groupby('year').sample(n=test_races_per_year, random_state=42)

# Mark test races in the dataset
df['is_test'] = df[['year', 'round']].apply(
    lambda x: tuple(x) in test_races.itertuples(index=False, name=None), axis=1
)

# Split the data
train_df = df[df['is_test'] == False]
test_df = df[df['is_test'] == True]

train_df = train_df.drop(columns=['is_test'])
test_df = test_df.drop(columns=['is_test'])

print(f"Training set size: {len(train_df)}, Testing set size: {len(test_df)}")
'''

'\n# Here the test data needs to be whole races, not random entries from different races\n# Will do 3 races per season as test sets\ntest_races_per_year = 3\n\n# Create a train-test split based on races\nunique_races = df[[\'year\', \'round\']].drop_duplicates()\ntest_races = unique_races.groupby(\'year\').sample(n=test_races_per_year, random_state=42)\n\n# Mark test races in the dataset\ndf[\'is_test\'] = df[[\'year\', \'round\']].apply(\n    lambda x: tuple(x) in test_races.itertuples(index=False, name=None), axis=1\n)\n\n# Split the data\ntrain_df = df[df[\'is_test\'] == False]\ntest_df = df[df[\'is_test\'] == True]\n\ntrain_df = train_df.drop(columns=[\'is_test\'])\ntest_df = test_df.drop(columns=[\'is_test\'])\n\nprint(f"Training set size: {len(train_df)}, Testing set size: {len(test_df)}")\n'

In [39]:
# Train on a copy of the full final_df.csv frame; evaluate on the separately loaded df_2024.csv file.
train_df = df.copy()
test_df = pd.read_csv('../Data-collection/df_2024.csv')

In [40]:
# Separate the target column ('position') from the feature columns for both splits.
X_train, y_train = train_df.drop(columns=['position']), train_df['position']
X_test, y_test = test_df.drop(columns=['position']), test_df['position']

# Print the shapes so a mismatch in feature count between train and test is visible at once.
print(f"Training Features: {X_train.shape}, Testing Features: {X_test.shape}")
print(f"Training Target: {y_train.shape}, Testing Target: {y_test.shape}")

Training Features: (4883, 146), Testing Features: (421, 146)
Training Target: (4883,), Testing Target: (421,)


In [41]:
# Columns that get standardised; every other feature column is cast to int further down.
numerical_columns = [
    'grid',
    'driver_age',
    'driver_experience',
    'driver_constructor_experience',
    'driver_points',
    'driver_standing',
    'constructor_points',
    'constructor_standing',
    'driver_wins',
    'constructor_wins',
    'circuit_danger',
    'year',
    'round',
]

# Fit the scaler on the training features only, then apply that same transform to both splits.
scaler = StandardScaler().fit(X_train[numerical_columns])
X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Convert the remaining (one-hot encoded) columns from True/False to 1/0.
all_columns = list(X_train.columns)
one_hot_columns = [col for col in all_columns if col not in numerical_columns]

for features in (X_train, X_test):
    features[one_hot_columns] = features[one_hot_columns].astype(int)

### Testing functions

In [42]:
# Expects the rows of the test DataFrame that belong to a single race (one year/round).

def _hot_driver_label(row, driver_columns):
    """Return the display surname of the driver whose column equals 1 in ``row``.

    The column name has its ``driver_`` prefix removed, underscores turned into spaces,
    is title-cased, and only the last word is kept. Returns ``"--"`` when no candidate
    column is set or the resulting name is empty.
    """
    hot_column = next((col for col in driver_columns if row[col] == 1), None)
    if hot_column is None:
        return "--"
    name_parts = hot_column.replace("driver_", "").replace("_", " ").title().split()
    return name_parts[-1] if name_parts else "--"


def process_round(df, scaler, trained_model, numerical_cols=None):
    """Predict one race and report the predicted and the actual winner.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single race, including the ``position`` target column. The frame is
        modified in place (scaled numerical columns, boolean columns cast to int), so
        callers should pass a copy.
    scaler : object exposing ``fit_transform``
        Used to scale the numerical columns of ``df``.
        NOTE(review): the scaler is re-fitted on this round's rows instead of reusing
        the statistics fitted on the training features, so values are standardised
        within the race - confirm that this is intended.
    trained_model : object exposing ``predict``
        Fitted model; the row with the lowest prediction is taken as the predicted winner.
    numerical_cols : list of str, optional
        Columns to scale. Defaults to the notebook-level ``numerical_columns`` list.

    Returns
    -------
    dict
        ``predicted_winner`` (surname or "--"), ``predicted_position`` (lowest predicted
        value) and ``actual_winner`` (surname of the row with ``position == 1.0`` or "--").
    """
    if numerical_cols is None:
        numerical_cols = numerical_columns  # notebook-level default

    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Convert one-hot encoding columns from boolean to int
    one_hot_columns = [col for col in df.columns if df[col].dtype == 'bool']
    if one_hot_columns:
        df[one_hot_columns] = df[one_hot_columns].astype(int)

    # Every row also has other one-hot columns set to 1 (e.g. constructor_*), so the winner
    # lookup must only look at driver columns; otherwise the result depends on column order.
    # Fall back to all one-hot columns if none carries the driver_ prefix.
    driver_columns = [col for col in one_hot_columns if col.startswith("driver_")] or one_hot_columns

    X_current_round = df.drop(columns=['position'])

    predictions_df = pd.DataFrame({'predicted_position': trained_model.predict(X_current_round)})  # Make predictions
    predictions_df.index = df.index  # Ensure the indices align for merging
    df = pd.concat([df, predictions_df], axis=1)  # Merge predictions back into the frame

    # Identify the predicted winner: the row with the smallest predicted value
    predicted_winner_row = df.loc[df['predicted_position'].idxmin()]
    predicted_winner_name = _hot_driver_label(predicted_winner_row, driver_columns)
    predicted_position = predicted_winner_row['predicted_position']

    # Identify the actual winner; "--" when no row has position 1.0
    actual_winner_rows = df[df['position'] == 1.0]
    if actual_winner_rows.empty:
        actual_winner_name = "--"
    else:
        actual_winner_name = _hot_driver_label(actual_winner_rows.iloc[0], driver_columns)

    return {
        'predicted_winner': predicted_winner_name,
        'predicted_position': predicted_position,
        'actual_winner': actual_winner_name
    }

In [43]:
# Expects the full test DataFrame (all years and rounds).

def process_all_rounds(df, trained_model):
    """Run ``process_round`` on every (year, round) slice of ``df`` and tabulate the results.

    Years, and rounds within each year, are visited in order of first appearance.
    A single new ``StandardScaler`` instance is created here and handed to every
    ``process_round`` call.

    Returns a DataFrame with one row per race whose first two columns are ``year`` and
    ``round``, followed by the keys returned by ``process_round``.
    """
    round_scaler = StandardScaler()
    rows = []

    for season in df['year'].unique():
        season_df = df[df['year'] == season].copy()
        for race in season_df['round'].unique():
            # Copy the slice so the in-place edits made by process_round never touch `df`.
            race_df = season_df[season_df['round'] == race].copy()
            outcome = process_round(race_df, round_scaler, trained_model)
            outcome['round'] = race
            outcome['year'] = season
            rows.append(outcome)

    results_df = pd.DataFrame(rows)

    # Move the race identifiers to the front, keeping the other columns in their order.
    leading = ['year', 'round']
    trailing = [col for col in results_df.columns if col not in leading]
    return results_df[leading + trailing]

In [44]:
# Expects the results DataFrame returned by process_all_rounds.

def model_accuracy(df, model_name):
    """Return the share of rows whose predicted winner equals the actual winner.

    ``df`` must contain the columns ``predicted_winner`` and ``actual_winner``.
    The result is a dict with the keys ``model_name`` and ``accuracy`` (in that order).
    """
    winner_matches = (df['predicted_winner'] == df['actual_winner'])
    return {
        'model_name': model_name,
        'accuracy': winner_matches.mean(),
    }

In [45]:
# Expects the full test DataFrame.

def test_all_models(df, trained_models):
    """Compute the winner-prediction accuracy of several fitted models.

    Parameters
    ----------
    df : pandas.DataFrame
        Test data passed unchanged to ``process_all_rounds`` for every model.
    trained_models : iterable of (str, str or model)
        Pairs of display name and model. The second item may be the name of a
        notebook-level variable holding the model (looked up through ``globals()``;
        an unknown name raises ``KeyError``) or the fitted model object itself.

    Returns
    -------
    pandas.DataFrame
        One row per model, built from the dicts returned by ``model_accuracy``.
    """
    accuracy = []

    for model_name, model_ref in trained_models:
        # Strings keep the original behaviour (variable-name lookup); anything else is
        # treated as the model itself, so callers are not forced to go through globals().
        model_var = globals()[model_ref] if isinstance(model_ref, str) else model_ref

        model_test_df = process_all_rounds(df, model_var)
        accuracy.append(model_accuracy(model_test_df, model_name))

    return pd.DataFrame(accuracy)

## Regression Approach

#### Linear Regression

In [46]:
from sklearn.linear_model import LinearRegression
# Linear regression on the finishing position; fit() returns the estimator, so it is chained.
reg_lr_model = LinearRegression().fit(X_train, y_train)
reg_lr_y_test = reg_lr_model.predict(X_test)

#### Random Forest

In [47]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest regressor with 100 trees and a fixed seed; fit() returns the estimator.
reg_rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
reg_rf_y_test = reg_rf_model.predict(X_test)

#### Support Vector Machines (SVM)

In [48]:
from sklearn.svm import SVR
# Support-vector regressor with a linear kernel; fit() returns the estimator.
reg_svm_model = SVR(kernel='linear').fit(X_train, y_train)
reg_svm_y_test = reg_svm_model.predict(X_test)

#### Decision Tree

In [49]:
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so repeated runs give the same model and the same reported scores;
# 42 matches the seed already used for the random-forest regressor in this notebook.
reg_dt_model = DecisionTreeRegressor(random_state=42)
reg_dt_model.fit(X_train, y_train)
reg_dt_y_test = reg_dt_model.predict(X_test)

#### K-Nearest Neighbor (KNN)

In [50]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regressor with k=5; fit() returns the estimator.
reg_knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
reg_knn_y_test = reg_knn_model.predict(X_test)

#### Lasso

In [51]:
from sklearn.linear_model import Lasso
# Lasso regression with alpha=0.1; fit() returns the estimator.
reg_lasso_model = Lasso(alpha=0.1).fit(X_train, y_train)
reg_lasso_y_test = reg_lasso_model.predict(X_test)

## Classification Approach

In [53]:
# Binary targets for the classifiers: 1 where position equals 1 (race winner), else 0.
y_train_c = (y_train == 1).astype('int64')
y_test_c = (y_test == 1).astype('int64')

#### Logistic Regression 

In [54]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings on the winner / non-winner target.
cla_lr_model = LogisticRegression().fit(X_train, y_train_c)
cla_lr_y_test = cla_lr_model.predict(X_test)

#### Decision Tree

In [55]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so repeated runs give the same model and the same reported metrics;
# 42 matches the seed already used for the random-forest regressor in this notebook.
cla_dt_model = DecisionTreeClassifier(random_state=42)
cla_dt_model.fit(X_train, y_train_c)
cla_dt_y_test = cla_dt_model.predict(X_test)

#### Support Vector Machines (SVM)

In [56]:
from sklearn.svm import SVC 
# Support-vector classifier with a linear kernel; fit() returns the estimator.
cla_svm_model = SVC(kernel='linear').fit(X_train, y_train_c)
cla_svm_y_test = cla_svm_model.predict(X_test)

#### Random Forest

In [57]:
from sklearn.ensemble import RandomForestClassifier 
# Seed the forest so repeated runs give the same model and the same reported metrics;
# 42 matches the seed already used for the random-forest regressor in this notebook.
cla_rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
cla_rf_model.fit(X_train, y_train_c)
cla_rf_y_test = cla_rf_model.predict(X_test)

#### K-Nearest Neighbor (KNN)

In [58]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with k=5; fit() returns the estimator.
cla_knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train_c)
cla_knn_y_test = cla_knn_model.predict(X_test)

#### Naïve Bayes

In [59]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings; fit() returns the estimator.
cla_nb_model = GaussianNB().fit(X_train, y_train_c)
cla_nb_y_test = cla_nb_model.predict(X_test)

### Evaluations

In [60]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# (display name, name of the notebook variable holding that model's test-set predictions)
cla_models_info = [
    ("Logistic Regression", "cla_lr_y_test"),
    ("Decision Tree", "cla_dt_y_test"),
    ("SVM", "cla_svm_y_test"),
    ("Random Forest", "cla_rf_y_test"),
    ("KNN", "cla_knn_y_test"),
    ("GaussianNB", "cla_nb_y_test")
]

for model_name, y_test_var_name in cla_models_info:
    # Predictions are fetched by variable name from the notebook namespace.
    y_test_var = globals()[y_test_var_name]

    accuracy = accuracy_score(y_test_c, y_test_var)
    conf_matrix = confusion_matrix(y_test_c, y_test_var)
    class_report = classification_report(y_test_c, y_test_var)

    # One print call, one item per line; the trailing "" yields the blank separator line.
    print(
        f"---- {model_name} ----",
        f"Accuracy: {accuracy}",
        "Confusion Matrix:",
        conf_matrix,
        "Classification Report:",
        class_report,
        "",
        sep="\n",
    )

---- Logistic Regression ----
Accuracy: 0.9453681710213777
Confusion Matrix:
[[390   8]
 [ 15   8]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       398
           1       0.50      0.35      0.41        23

    accuracy                           0.95       421
   macro avg       0.73      0.66      0.69       421
weighted avg       0.94      0.95      0.94       421


---- Decision Tree ----
Accuracy: 0.9263657957244655
Confusion Matrix:
[[379  19]
 [ 12  11]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       398
           1       0.37      0.48      0.42        23

    accuracy                           0.93       421
   macro avg       0.67      0.72      0.69       421
weighted avg       0.94      0.93      0.93       421


---- SVM ----
Accuracy: 0.9406175771971497
Confusion Matrix:
[[389   9]
 [ 16   7]]
Classification Repor

## Deep Learning

### Regression

In [61]:
import tensorflow as tf

# The input width is taken from the training matrix instead of a hard-coded 146, so the
# network still builds if feature columns are added or removed upstream.
dl_reg_model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),  # One input per feature column
    tf.keras.layers.Dense(16, activation='relu'),  # Hidden layer with 16 units
    tf.keras.layers.Dense(1, activation='linear')  # Single linear output
])

dl_reg_model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mean_squared_error'])

# The last 20% of the training rows are held out by Keras as validation data.
dl_reg_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

Epoch 1/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 103.9004 - mean_squared_error: 103.9004 - val_loss: 75.1477 - val_mean_squared_error: 75.1477
Epoch 2/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 49.1682 - mean_squared_error: 49.1682 - val_loss: 45.7265 - val_mean_squared_error: 45.7265
Epoch 3/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 22.1109 - mean_squared_error: 22.1109 - val_loss: 34.3231 - val_mean_squared_error: 34.3231
Epoch 4/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 16.4175 - mean_squared_error: 16.4175 - val_loss: 27.4625 - val_mean_squared_error: 27.4625
Epoch 5/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 13.4498 - mean_squared_error: 13.4498 - val_loss: 22.3087 - val_mean_squared_error: 22.3087
Epoch 6/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x11ce3b380>

In [62]:
# r2_score is imported here because no earlier visible cell imports it; without this the
# cell raises NameError on a fresh kernel (Restart & Run All).
from sklearn.metrics import r2_score

dl_reg_y_pred = dl_reg_model.predict(X_test)
dl_reg_r2 = r2_score(y_test, dl_reg_y_pred)
print(f'R² score on test set: {dl_reg_r2}')

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
R² score on test set: 0.30914355677548644


### Classification

In [63]:
import tensorflow as tf

# Define the model. The input width is taken from the training matrix instead of a
# hard-coded 146, so the network still builds if feature columns change upstream.
dl_cl_model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),  # One input per feature column
    tf.keras.layers.Dense(32, activation='relu'),  # Hidden layer with 32 units
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output layer for classification
])

dl_cl_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model (the last 20% of the training rows are held out as validation data)
dl_cl_model.fit(X_train, y_train_c, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
test_loss, test_acc = dl_cl_model.evaluate(X_test, y_test_c)

print(f'Test accuracy: {test_acc}')


Epoch 1/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6337 - loss: 0.6031 - val_accuracy: 0.9519 - val_loss: 0.3393
Epoch 2/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9417 - loss: 0.2036 - val_accuracy: 0.9488 - val_loss: 0.2105
Epoch 3/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9422 - loss: 0.1532 - val_accuracy: 0.9417 - val_loss: 0.1699
Epoch 4/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9507 - loss: 0.1181 - val_accuracy: 0.9509 - val_loss: 0.1480
Epoch 5/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9469 - loss: 0.1253 - val_accuracy: 0.9550 - val_loss: 0.1361
Epoch 6/100
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9518 - loss: 0.1192 - val_accuracy: 0.9652 - val_loss: 0.1194
Epoch 7/100
[1m123/12

In [64]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Sigmoid outputs of the network, turned into hard 0/1 labels with a 0.6 cut-off.
dl_cl_y_pred_prob = dl_cl_model.predict(X_test)
dl_cl_y_pred = (dl_cl_y_pred_prob > 0.6).astype("int32")

dl_cl_accuracy = accuracy_score(y_test_c, dl_cl_y_pred)
dl_cl_conf_matrix = confusion_matrix(y_test_c, dl_cl_y_pred)
dl_cl_class_report = classification_report(y_test_c, dl_cl_y_pred)

# One print call, one item per line.
print(
    f"Accuracy: {dl_cl_accuracy}",
    "Confusion Matrix:",
    dl_cl_conf_matrix,
    "Classification Report:",
    dl_cl_class_report,
    sep="\n",
)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Accuracy: 0.9334916864608076
Confusion Matrix:
[[389   9]
 [ 19   4]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97       398
           1       0.31      0.17      0.22        23

    accuracy                           0.93       421
   macro avg       0.63      0.58      0.59       421
weighted avg       0.92      0.93      0.92       421



# Metrics

### Regression models

In [None]:
# (display name, notebook variable name) for each fitted regressor; test_all_models
# resolves the variable names through globals().
reg_models = [
    ("Linear Regression", "reg_lr_model"),
    ("Random Forest", "reg_rf_model"),
    ("SVM", "reg_svm_model"),
    ("Decision Tree", "reg_dt_model"),
    ("KNN", "reg_knn_model"),
    ("Lasso", "reg_lasso_model") # TODO: Adapt Deep Learning: dl_reg_model
]

# Per model: share of rounds in test_df whose predicted winner equals the actual winner.
reg_models_accuracy = test_all_models(test_df, reg_models)

print(reg_models_accuracy)

          model_name  accuracy
0  Linear Regression  0.041667
1      Random Forest  0.375000
2                SVM  0.375000
3      Decision Tree  0.375000
4                KNN  0.208333
5              Lasso  0.416667


In [67]:
# Round-by-round predictions for the Lasso model, which has the highest winner accuracy
# in the table printed by the previous cell; to_string(index=False) hides the row index.
results_df = process_all_rounds(test_df, reg_lasso_model) # TODO: choose the better model dynamically
print(results_df.to_string(index=False))

 year  round predicted_winner  predicted_position actual_winner
 2024      1       Verstappen            5.738987    Verstappen
 2024      2       Verstappen            1.167036    Verstappen
 2024      3          Leclerc            2.600225            --
 2024      4       Verstappen            2.214198    Verstappen
 2024      5       Verstappen            1.891564    Verstappen
 2024      6       Verstappen            1.340561        Norris
 2024      7       Verstappen            1.395040    Verstappen
 2024      8          Leclerc            2.935386       Leclerc
 2024      9       Verstappen            2.114960    Verstappen
 2024     10       Verstappen            1.834022    Verstappen
 2024     11       Verstappen            1.050475       Russell
 2024     12       Verstappen            2.571257      Hamilton
 2024     13       Verstappen            2.012855       Piastri
 2024     14          Leclerc            3.396601      Hamilton
 2024     15       Verstappen           

### Classification models

In [None]:
# (display name, notebook variable name) for each fitted classifier.
cla_models = [
    ("Logistic Regression", "cla_lr_model"),
    ("Decision Tree", "cla_dt_model"),
    ("SVM", "cla_svm_model"),
    ("Random Forest", "cla_rf_model"),
    ("KNN", "cla_knn_model"),
    ("GaussianNB", "cla_nb_model"),
]

# Copy of the test frame whose target is binarised: 1 where position equals 1, else 0.
df_test_c = test_df.copy()
df_test_c['position'] = (df_test_c['position'] == 1).astype('int64')


# TODO

#### Old graphics

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

# (display name, name of the notebook variable holding that model's test-set predictions)
reg_models_info = [
    ("Linear Regression", "reg_lr_y_test"),
    ("Random Forest", "reg_rf_y_test"),
    ("SVM", "reg_svm_y_test"),
    ("Decision Tree", "reg_dt_y_test"),
    ("KNN", "reg_knn_y_test"),
    ("Lasso", "reg_lasso_y_test"),
    ("Deep Learning", "dl_reg_y_pred")
]

# Labels and R² scores in the same order; predictions are fetched by variable name.
reg_model_names = [label for label, _ in reg_models_info]
reg_r2_scores = [
    r2_score(y_test, globals()[predictions_name])
    for _, predictions_name in reg_models_info
]

# Bar chart comparing the test-set R² of every regression model.
plt.bar(reg_model_names, reg_r2_scores)
plt.title('Regression Models')
plt.xlabel('Models')
plt.ylabel('R² Score')
plt.xticks(rotation=45)
plt.show()

In [None]:
from sklearn.metrics import f1_score

# (display name, name of the notebook variable holding that model's test-set predictions)
cla_models_info = [
    ("Logistic Regression", "cla_lr_y_test"),
    ("Decision Tree", "cla_dt_y_test"),
    ("SVM", "cla_svm_y_test"),
    ("Random Forest", "cla_rf_y_test"),
    ("KNN", "cla_knn_y_test"),
    ("GaussianNB", "cla_nb_y_test"),
    ("Deep Learning", "dl_cl_y_pred")
]

# Labels and F1 scores (positive class = 1) in the same order; predictions are fetched by variable name.
cl_model_names = [label for label, _ in cla_models_info]
cl_f1_scores = [
    f1_score(y_test_c, globals()[predictions_name], pos_label=1)
    for _, predictions_name in cla_models_info
]

# Bar chart comparing the test-set F1 score of every classification model.
plt.bar(cl_model_names, cl_f1_scores)
plt.title('Classification Models')
plt.xlabel('Models')
plt.ylabel('F1 Score')
plt.xticks(rotation=45)
plt.show()

## Save the model

In [None]:
import joblib

# NOTE(review): this persists the random-forest regressor, while the winner-accuracy table
# printed earlier in this notebook ranks Lasso highest - resolve together with the TODO on this line.
joblib.dump(reg_rf_model, 'trained_model.pkl') # TODO: use the best model