# Formula 1 Grand Prix result prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import cross_val_score,StratifiedKFold,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,precision_score,f1_score,recall_score
plt.style.use('seaborn')

In [2]:
n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]
max_features = ['auto','sqrt']
max_depth = [int(x) for x in np.linspace(10,110,num=11)]
min_samples_split = [2,5,8,10,15,20]
min_samples_leaf = [1,2,4,6,8,10]
bootstrap = [True,False]

random_parms = {
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'bootstrap':bootstrap
                }

In [5]:
# rf_rand = RandomForestClassifier()
# rf_random = RandomizedSearchCV(estimator=rf_rand,param_distributions=random_parms,n_iter=10,cv=10,verbose=2,n_jobs=-1 )
# rf_random.fit(X,y)
# rf_random.best_params_

In [4]:
rf = RandomForestClassifier(n_estimators=1600,min_samples_split=8,min_samples_leaf=4,max_features='auto',max_depth=70,bootstrap=True)
kf = StratifiedKFold(n_splits=10,random_state=None,shuffle=False)
# for train_index,test_index in kf.split(X,y):
#     X_train,X_test = X.iloc[train_index],X.iloc[test_index]
#     y_train,y_test = y.iloc[train_index],y.iloc[test_index]

In [180]:
X_test

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity,istest
7655,24,3,2,11,0.933798,0.824359,0
7669,24,4,5,14,0.940711,0.877805,0
7670,24,5,5,14,0.940711,0.877805,0
7671,24,1,5,14,0.940711,0.877805,0
7672,24,1,5,25,0.965035,0.877805,0
...,...,...,...,...,...,...,...
8122,8,1,7,23,0.934156,0.750865,0
8123,8,7,8,11,0.933798,0.601852,0
8124,8,6,8,11,0.933798,0.601852,0
8125,8,11,8,22,0.852071,0.601852,0


In [6]:
data = pd.read_csv('final_df.csv')


In [7]:
data.columns

Index(['season', 'round', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy', 'driver', 'grid', 'podium',
       'driver_points', 'driver_wins', 'driver_standings_pos',
       'constructor_points', 'constructor_wins', 'constructor_standings_pos',
       'qualifying_time', 'driver_age', 'circuit_id_adelaide',
       'circuit_id_albert_park', 'circuit_id_americas', 'circuit_id_bahrain',
       'circuit_id_brands_hatch', 'circuit_id_catalunya', 'circuit_id_detroit',
       'circuit_id_estoril', 'circuit_id_galvez', 'circuit_id_hockenheimring',
       'circuit_id_hungaroring', 'circuit_id_imola', 'circuit_id_indianapolis',
       'circuit_id_interlagos', 'circuit_id_istanbul',
       'circuit_id_jacarepagua', 'circuit_id_jerez', 'circuit_id_kyalami',
       'circuit_id_magny_cours', 'circuit_id_marina_bay', 'circuit_id_monaco',
       'circuit_id_monza', 'circuit_id_nurburgring', 'circuit_id_phoenix',
       'circuit_id_red_bull_ring', 'circuit_id_ricard'

In [9]:
data.drop(['podium','qualifying_time'],axis = 1)

Unnamed: 0,season,round,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,grid,driver_points,...,constructor_minardi,constructor_prost,constructor_red_bull,constructor_renault,constructor_sauber,constructor_team_lotus,constructor_toro_rosso,constructor_toyota,constructor_tyrrell,constructor_williams
0,1983,1,False,False,True,False,False,keke_rosberg,1,0,...,0,0,0,0,0,0,0,0,0,1
1,1983,1,False,False,True,False,False,prost,2,0,...,0,0,0,1,0,0,0,0,0,0
2,1983,1,False,False,True,False,False,tambay,3,0,...,0,0,0,0,0,0,0,0,0,0
3,1983,1,False,False,True,False,False,piquet,4,0,...,0,0,0,0,0,0,0,0,0,0
4,1983,1,False,False,True,False,False,warwick,5,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15013,2023,4,True,False,False,False,True,sargeant,14,0,...,0,0,0,0,0,0,0,0,0,1
15014,2023,4,True,False,False,False,True,zhou,15,0,...,0,0,0,0,0,0,0,0,0,0
15015,2023,4,True,False,False,False,True,kevin_magnussen,16,1,...,0,0,0,0,0,0,0,0,0,0
15016,2023,4,True,False,False,False,True,gasly,17,4,...,0,0,0,0,0,0,0,0,0,0


In [10]:

X = data.copy()

sc  = StandardScaler()
le = LabelEncoder()
X['driver'] = le.fit_transform(X['driver'])

X_train = X[(X['season'] < 2023) ].drop('grid',axis = 1)
y_train = X[X['season'] < 2023]['grid']

In [11]:
X_2023 = X[(X['season'] == 2023) & (X['round'] < 4)].drop('grid',axis = 1)
y_2023 = X[(X['season'] == 2023) & (X['round'] < 4)]['grid']

X_train = pd.concat([X_train,X_2023])
y_train = pd.concat([y_train,y_2023])

In [12]:
X_train

Unnamed: 0,season,round,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,podium,driver_points,...,constructor_minardi,constructor_prost,constructor_red_bull,constructor_renault,constructor_sauber,constructor_team_lotus,constructor_toro_rosso,constructor_toyota,constructor_tyrrell,constructor_williams
0,1983,1,False,False,True,False,False,109,15,0,...,0,0,0,0,0,0,0,0,0,1
1,1983,1,False,False,True,False,False,176,6,0,...,0,0,0,1,0,0,0,0,0,0
2,1983,1,False,False,True,False,False,212,4,0,...,0,0,0,0,0,0,0,0,0,0
3,1983,1,False,False,True,False,False,171,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1983,1,False,False,True,False,False,223,7,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,2023,3,True,False,False,False,False,110,17,1,...,0,0,0,0,0,0,0,0,0,0
14996,2023,3,True,False,False,False,False,58,15,0,...,0,0,0,0,0,0,0,0,0,0
14997,2023,3,True,False,False,False,False,168,8,0,...,0,0,0,0,0,0,0,0,0,0
14998,2023,3,True,False,False,False,False,233,9,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
y_train

0        15
1         6
2         4
3         1
4         7
         ..
14995    17
14996    15
14997     8
14998     9
14999    16
Name: podium, Length: 15000, dtype: int64

In [13]:
X_test = X[(X['season'] == 2023) & (X['round'] == 4)].drop('grid',axis = 1)
y_test = X[(X['season'] == 2023) & (X['round'] == 4)]['grid']

In [14]:
X_test

Unnamed: 0,season,round,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,podium,driver_points,...,constructor_minardi,constructor_prost,constructor_red_bull,constructor_renault,constructor_sauber,constructor_team_lotus,constructor_toro_rosso,constructor_toyota,constructor_tyrrell,constructor_williams
15000,2023,4,True,False,False,False,True,126,3,0,...,0,0,0,0,0,0,0,0,0,0
15001,2023,4,True,False,False,False,True,139,2,44,...,0,0,1,0,0,0,0,0,0,0
15002,2023,4,True,False,False,False,True,166,1,18,...,0,0,1,0,0,0,0,0,0,0
15003,2023,4,True,False,False,False,True,189,5,20,...,0,0,0,0,0,0,0,0,0,0
15004,2023,4,True,False,False,False,True,93,6,20,...,0,0,0,0,0,0,0,0,0,0
15005,2023,4,True,False,False,False,True,8,4,30,...,0,0,0,0,0,0,0,0,0,0
15006,2023,4,True,False,False,False,True,159,9,0,...,0,0,0,0,0,0,0,0,0,0
15007,2023,4,True,False,False,False,True,216,10,0,...,0,0,0,0,0,0,0,0,0,0
15008,2023,4,True,False,False,False,True,206,7,8,...,0,0,0,0,0,0,0,0,0,0
15009,2023,4,True,False,False,False,True,168,11,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
y_test

15000     1
15001     2
15002     3
15003     4
15004     5
15005     6
15006     7
15007     8
15008     9
15009    10
15010    11
15011    12
15012    13
15013    14
15014    15
15015    16
15016    17
15017    18
Name: grid, dtype: int64

In [16]:
rf.fit(X_train,y_train)
y_pred_rf = rf.predict(X_test)

  warn(


In [119]:
import pickle

filename = 'rf_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(rf, file)

In [17]:
print(y_pred_rf)

[ 1  2  2  4  6  5 13 12  6 12  6 16 14 18 17 16 17 20]


In [19]:
X_later = X_test.copy()
X_later = X_later[['season','round','driver']]

X_later['pred_pos'] = y_pred_rf
X_later['actual'] = y_test

In [20]:
X_later = X_later.sort_values(by = 'pred_pos')

In [21]:
X_later.shape

(18, 5)

In [22]:
X_later['pred_pos'] = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]

In [23]:
X_later['driver'] = le.inverse_transform(X_later['driver'])

In [24]:
X_later

Unnamed: 0,season,round,driver,pred_pos,actual
15000,2023,4,leclerc,1,1
15001,2023,4,max_verstappen,2,2
15002,2023,4,perez,3,3
15003,2023,4,sainz,4,4
15005,2023,4,alonso,5,6
15010,2023,4,russell,6,11
15008,2023,4,stroll,7,9
15004,2023,4,hamilton,8,5
15007,2023,4,tsunoda,9,8
15009,2023,4,piastri,10,10


In [226]:
new_data_tested = new_data.copy()
new_data_tested['pos'] = y_pred_new

In [227]:
new_data_tested = new_data_tested.sort_values(by='pos', ascending=True)
new_data_tested['pos'] = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]


In [228]:
new_data_tested

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest,pos
0,Baku City Circuit,1,Ferrari,Charles Leclerc,0.844444,0.824359,1,1,1,1,1
1,Baku City Circuit,2,Red Bull,Max Verstappen,0.914286,0.750865,1,1,1,1,2
2,Baku City Circuit,3,Red Bull,Sergio Pérez,0.933333,0.750865,1,1,1,1,3
3,Baku City Circuit,4,Ferrari,Carlos Sainz Jr.,0.903846,0.824359,1,1,1,1,4
4,Baku City Circuit,5,Mercedes,Lewis Hamilton,0.940711,0.877805,1,1,1,1,5
5,Baku City Circuit,6,Racing Point,Fernando Alonso,0.8,0.590234,1,1,1,1,6
6,Baku City Circuit,7,McLaren,Lando Norris,0.916667,0.634492,1,1,1,1,7
7,Baku City Circuit,8,AlphaTauri,Yuki Tsunoda,0.8,0.45539,1,1,1,1,8
8,Baku City Circuit,9,Racing Point,Lance Stroll,0.923077,0.590234,1,1,1,1,9
10,Baku City Circuit,11,Mercedes,George Russell,0.958333,0.877805,1,1,1,1,10


In [38]:
X.dtypes

GP_name                            int64
quali_pos                          int64
constructor                        int32
driver                             int32
driver_confidence                float64
constructor_relaiblity           float64
dob                       datetime64[ns]
dtype: object

In [79]:
X_d.dtypes

GP_name                int64
quali_pos              int64
driver                 int64
age_at_gp_in_days    float64
driver_confidence    float64
dtype: object

In [48]:
X.drop(["country", "constructor_nationality"], axis=1, inplace=True)


In [39]:
X = X.drop([ 'dob'], axis=1)


In [50]:
X = X.drop(['driver_nationality'],axis = 1)

In [51]:
X['age_at_gp_in_days'] = X['age_at_gp_in_days'].apply(lambda x: float(x.split()[0]))


In [25]:
svc = SVC()
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)
svc.fit(X_train,y_train)
y_pred_svc = svc.predict(X_test)
cnf_mat_svc = confusion_matrix(y_test,y_pred)
cnf_mat_svc = cnf_mat_svc/cnf_mat_svc.sum()

In [27]:
X_later = X_test.copy()
X_later = X_later[['season','round','driver',]]

X_later['pred_pos'] = y_pred_rf
X_later['actual'] = y_test

X_later = X_later.sort_values(by = 'pred_pos')

In [28]:
X_later.shape

(18, 5)

In [29]:
X_later['driver'] = le.inverse_transform(X_later['driver'])

In [30]:
X_later['pred_pos'] = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]

In [31]:
X_later

Unnamed: 0,season,round,driver,pred_pos,actual
15000,2023,4,leclerc,1,1
15001,2023,4,max_verstappen,2,2
15002,2023,4,perez,3,3
15003,2023,4,sainz,4,4
15005,2023,4,alonso,5,6
15010,2023,4,russell,6,11
15008,2023,4,stroll,7,9
15004,2023,4,hamilton,8,5
15007,2023,4,tsunoda,9,8
15009,2023,4,piastri,10,10


In [109]:
print(y_pred_svc)

[13  7  7 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13]


In [37]:
rf_pression = precision_score(y_test, y_pred_rf, average = 'micro' )
rf_f1 = f1_score(y_test,y_pred_rf,average='macro')
rf_recall = recall_score(y_test,y_pred_rf,average='macro')
svc_pression = precision_score(y_test, y_pred_svc, average='micro')
svc_f1 = f1_score(y_test,y_pred_svc,average='macro')
svc_recall = recall_score(y_test,y_pred_svc,average='macro')
metrics_dict ={
    'RandomForestClassifier':{'precision_score':rf_pression,'f1_score':rf_f1,'recall_score':rf_recall},
    'SVC':{'precision_score':svc_pression,'f1_score':svc_f1,'recall_score':svc_recall}
}
metrics_df = pd.DataFrame(metrics_dict)
metrics_df

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,RandomForestClassifier,SVC
precision_score,0.277778,0.055556
f1_score,0.210526,0.006536
recall_score,0.263158,0.055556
