In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [31]:
df = pd.read_csv('../Data-collection/final_df.csv')
df.head()

Unnamed: 0,grid,position,year,round,driver_age,driver_experience,driver_constructor_experience,driver_points,driver_standing,constructor_points,...,constructorId_205,constructorId_206,constructorId_207,constructorId_208,constructorId_209,constructorId_210,constructorId_211,constructorId_213,constructorId_214,constructorId_215
0,3,1.0,2010,1,28,140,0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,2,2.0,2010,1,28,116,63,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,4,3.0,2010,1,25,52,52,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,1,4.0,2010,1,22,43,17,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,5,5.0,2010,1,24,70,0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [32]:
# Number of races to include in the test set per season
test_races_per_year = 3

# Create a train-test split based on races
unique_races = df[['year', 'round']].drop_duplicates()
test_races = unique_races.groupby('year').sample(n=test_races_per_year, random_state=42)

# Mark test races in the dataset
df['is_test'] = df[['year', 'round']].apply(
    lambda x: tuple(x) in test_races.itertuples(index=False, name=None), axis=1
)

# Split the data
train_df = df[df['is_test'] == False]
test_df = df[df['is_test'] == True]

train_df = train_df.drop(columns=['is_test'])
test_df = test_df.drop(columns=['is_test'])

print(f"Training set size: {len(train_df)}, Testing set size: {len(test_df)}")

Training set size: 4473, Testing set size: 783


In [33]:
X_train = train_df.drop(columns=['position'])
X_test = test_df.drop(columns=['position'])

y_train = train_df['position']
y_test = test_df['position']

print(f"Training Features: {X_train.shape}, Testing Features: {X_test.shape}")
print(f"Training Target: {y_train.shape}, Testing Target: {y_test.shape}")

Training Features: (4473, 146), Testing Features: (783, 146)
Training Target: (4473,), Testing Target: (783,)


In [34]:
# Select only numerical columns for scaling
numerical_columns = [
    'grid', 'driver_age', 'driver_experience', 'driver_constructor_experience',
    'driver_points', 'driver_standing', 'constructor_points', 
    'constructor_standing', 'driver_wins', 'constructor_wins', 'circuit_danger'
]

scaler = StandardScaler()

X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

## Regression Approach

In [35]:
from sklearn.linear_model import LinearRegression
reg_lr_model = LinearRegression()
reg_lr_model.fit(X_train, y_train)
reg_lr_y_test = reg_lr_model.predict(X_test)

In [36]:
from sklearn.ensemble import RandomForestRegressor
reg_rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
reg_rf_model.fit(X_train, y_train)
reg_rf_y_test = reg_rf_model.predict(X_test)

In [37]:
from sklearn.svm import SVR
reg_svm_model = SVR(kernel='linear')  # Adjust `kernel` as needed ('linear', 'rbf', etc.)
reg_svm_model.fit(X_train, y_train)
reg_svm_y_test = reg_svm_model.predict(X_test)

In [38]:
from sklearn.tree import DecisionTreeRegressor
reg_dt_model = DecisionTreeRegressor()
reg_dt_model.fit(X_train, y_train)
reg_dt_y_test = reg_dt_model.predict(X_test)

In [39]:
from sklearn.neighbors import KNeighborsRegressor
reg_knn_model = KNeighborsRegressor(n_neighbors=5)  # Adjust `n_neighbors` as needed
reg_knn_model.fit(X_train, y_train)
reg_knn_y_test = reg_knn_model.predict(X_test)

In [40]:
from sklearn.linear_model import Lasso
reg_lasso_model = Lasso(alpha=0.1)  # alpha controls the regularization strength
reg_lasso_model.fit(X_train, y_train)
reg_lasso_y_test = reg_lasso_model.predict(X_test)

### Evaluations

In [61]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

reg_models_info = [
    ("Linear Regression", "reg_lr_y_test"),
    ("Random Forest", "reg_rf_y_test"),
    ("SVM", "reg_svm_y_test"),
    ("Decision Tree", "reg_dt_y_test"),
    ("KNN", "reg_knn_y_test"),
    ("Lasso", "reg_lasso_y_test")
]

for model_name, y_test_var_name in reg_models_info:
    y_test_var = globals()[y_test_var_name]

    # Evaluate the model
    model_test_mae = mean_absolute_error(y_test, y_test_var)
    model_test_rmse = mean_squared_error(y_test, y_test_var)
    model_test_r2 = r2_score(y_test, y_test_var)

    # Print evaluation metrics
    print(f"---- {model_name} ----")
    print(f" MAE: {model_test_mae:.2f}")
    print(f" RMSE: {model_test_rmse:.2f}")
    print(f" R2: {model_test_r2:.2f}")
    print("")

---- Linear Regression ----
 MAE: 9.35
 RMSE: 107.05
 R2: -1975.23

---- Random Forest ----
 MAE: 9.39
 RMSE: 108.59
 R2: -2003.68

---- SVM ----
 MAE: 9.44
 RMSE: 114.35
 R2: -2109.97

---- Decision Tree ----
 MAE: 9.38
 RMSE: 116.62
 R2: -2151.93

---- KNN ----
 MAE: 9.26
 RMSE: 106.53
 R2: -1965.59

---- Lasso ----
 MAE: 9.32
 RMSE: 104.24
 R2: -1923.38



## Classification Approach

In [51]:
y_train = y_train.apply(lambda x: 1 if x == 1 else 0)
y_test = y_test.apply(lambda x: 1 if x == 1 else 0)

In [52]:
from sklearn.linear_model import LogisticRegression
cla_lr_model = LogisticRegression()
cla_lr_model.fit(X_train, y_train)
cla_lr_y_test = cla_lr_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
from sklearn.tree import DecisionTreeClassifier
cla_dt_model = DecisionTreeClassifier()
cla_dt_model.fit(X_train, y_train)
cla_dt_y_test = cla_dt_model.predict(X_test)

In [54]:
from sklearn.svm import SVC 
cla_svm_model = SVC(kernel='linear')  # Adjust `kernel` as needed ('linear', 'rbf', etc.)
cla_svm_model.fit(X_train, y_train)
cla_svm_y_test = cla_svm_model.predict(X_test)

In [55]:
from sklearn.ensemble import RandomForestClassifier 
cla_rf_model = RandomForestClassifier(n_estimators=100)  # Adjust `n_estimators` as needed
cla_rf_model.fit(X_train, y_train)
cla_rf_y_test = cla_rf_model.predict(X_test)

In [56]:
from sklearn.neighbors import KNeighborsClassifier
cla_knn_model = KNeighborsClassifier(n_neighbors=5)  # Adjust `n_neighbors` as needed
cla_knn_model.fit(X_train, y_train)
cla_knn_y_test = cla_knn_model.predict(X_test)

In [57]:
from sklearn.naive_bayes import GaussianNB
cla_nb_model = GaussianNB()
cla_nb_model.fit(X_train, y_train)
cla_nb_y_test = cla_nb_model.predict(X_test)

### Evaluations

In [60]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

cla_models_info = [
    ("Logistic Regression", "cla_lr_y_test"),
    ("Decision Tree", "cla_dt_y_test"),
    ("SVM", "cla_svm_y_test"),
    ("Random Forest", "cla_rf_y_test"),
    ("KNN", "cla_knn_y_test"),
    ("GaussianNB", "cla_nb_y_test")
]

for model_name, y_test_var_name in cla_models_info:
    y_test_var = globals()[y_test_var_name]

    accuracy = accuracy_score(y_test, y_test_var)
    conf_matrix = confusion_matrix(y_test, y_test_var)
    class_report = classification_report(y_test, y_test_var)

    print(f"---- {model_name} ----")
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(class_report)
    print("")

---- Logistic Regression ----
Accuracy: 0.9386973180076629
Confusion Matrix:
[[724  14]
 [ 34  11]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       738
           1       0.44      0.24      0.31        45

    accuracy                           0.94       783
   macro avg       0.70      0.61      0.64       783
weighted avg       0.93      0.94      0.93       783


---- Decision Tree ----
Accuracy: 0.929757343550447
Confusion Matrix:
[[709  29]
 [ 26  19]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       738
           1       0.40      0.42      0.41        45

    accuracy                           0.93       783
   macro avg       0.68      0.69      0.69       783
weighted avg       0.93      0.93      0.93       783


---- SVM ----
Accuracy: 0.9399744572158365
Confusion Matrix:
[[726  12]
 [ 35  10]]
Classification Report