# Formula 1 Grand Prix result prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import cross_val_score,StratifiedKFold,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,precision_score,f1_score,recall_score
plt.style.use('seaborn')

In [2]:
results = pd.read_csv(r'data\results.csv')
races = pd.read_csv(r'data\races.csv')
quali = pd.read_csv(r'data\qualifying.csv')
drivers = pd.read_csv(r'data\drivers.csv')
constructors = pd.read_csv(r'data\constructors.csv')
circuit = pd.read_csv(r'data\circuits.csv')

In [4]:
df1 = pd.merge(races,results,how='inner',on=['raceId'])
df2 = pd.merge(df1,quali,how='inner',on=['raceId','driverId','constructorId'])
df3 = pd.merge(df2,drivers,how='inner',on=['driverId'])
df4 = pd.merge(df3,constructors,how='inner',on=['constructorId'])
df5 = pd.merge(df4,circuit,how='inner',on=['circuitId'])

In [5]:
#drop the columns which are not important
data = df5.drop(['round','circuitId','time_x','url_x','resultId','driverId',
                 'constructorId','number_x','positionText','position_x',
                 'positionOrder','laps','time_y','rank',
                 'fastestLapTime','fastestLapSpeed','qualifyId','driverRef','number','code','url_y','circuitRef',
                 'location','lat','lng','alt','number_y','points','constructorRef','name_x','raceId','fastestLap','q2','q3','milliseconds','q1'],1)

In [6]:
#considering data points from 1980
data = data[data['year']>=1980]

In [9]:
#rename the columns
data.rename(columns={'name':'GP_name','position_y':'position','grid':'quali_pos','name_y':'constructor','nationality_x':'driver_nationality','nationality_y':'constructor_nationality'},inplace=True)
data['driver'] = data['forename']+' '+data['surname']
data['date'] = pd.to_datetime(data['date'])
data['dob'] = pd.to_datetime(data['dob'])

In [10]:
#creating a driver age parameter
data['age_at_gp_in_days'] = abs(data['dob']-data['date'])
data['age_at_gp_in_days'] = data['age_at_gp_in_days'].apply(lambda x: str(x).split(' ')[0])

In [11]:
#Some of the constructors changed their name over the year so replacing old names with current name
data['constructor'] = data['constructor'].apply(lambda x: 'Racing Point' if x=='Force India' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Racing Point' if x=='Aston Martin' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Alfa Romeo' if x=='Sauber' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Renault' if x=='Lotus F1' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Renault' if x=='Alpine' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'AlphaTauri' if x=='Toro Rosso' else x)

In [12]:
data['driver_nationality'] = data['driver_nationality'].apply(lambda x: str(x)[:3])
data['constructor_nationality'] = data['constructor_nationality'].apply(lambda x: str(x)[:3])
data['country'] = data['country'].apply(lambda x: 'Bri' if x=='UK' else x)
data['country'] = data['country'].apply(lambda x: 'Ame' if x=='USA' else x)
data['country'] = data['country'].apply(lambda x: 'Fre' if x=='Fra' else x)
data['country'] = data['country'].apply(lambda x: str(x)[:3])
data['driver_home'] = data['driver_nationality'] == data['country']
data['constructor_home'] = data['constructor_nationality'] == data['country']
data['driver_home'] = data['driver_home'].apply(lambda x: int(x))
data['constructor_home'] = data['constructor_home'].apply(lambda x: int(x))

In [13]:
#reasons for DNF(did not finish)
data['driver_dnf'] = data['statusId'].apply(lambda x: 1 if x in [3,4,20,29,31,41,68,73,81,97,82,104,107,130,137] else 0)
data['constructor_dnf'] = data['statusId'].apply(lambda x: 1 if x not in [3,4,20,29,31,41,68,73,81,97,82,104,107,130,137,1] else 0)
data.drop(['forename','surname'],1,inplace=True)

In [161]:
data.head()

Unnamed: 0,year,date,quali_pos,statusId,position,dob,driver_nationality,constructor,constructor_nationality,GP_name,country,driver,age_at_gp_in_days,driver_home,constructor_home,driver_dnf,constructor_dnf
26,2011,2011-03-27,18,11,18,1977-05-10,Ger,Renault,Fre,Albert Park Grand Prix Circuit,Aus,Nick Heidfeld,12374,0,0,0,1
30,2010,2010-03-28,9,1,9,1984-12-07,Pol,Renault,Fre,Albert Park Grand Prix Circuit,Aus,Robert Kubica,9242,0,0,0,0
34,2017,2017-03-26,11,11,12,1987-08-19,Ger,Renault,Fre,Albert Park Grand Prix Circuit,Aus,Nico Hülkenberg,10812,0,0,0,1
35,2018,2018-03-25,7,1,8,1987-08-19,Ger,Renault,Fre,Albert Park Grand Prix Circuit,Aus,Nico Hülkenberg,11176,0,0,0,0
36,2019,2019-03-17,11,11,11,1987-08-19,Ger,Renault,Fre,Albert Park Grand Prix Circuit,Aus,Nico Hülkenberg,11533,0,0,0,1


In [13]:
font = {
    'family':'serif',
    'color':'black',
    'weight':'bold',
    'size':10
}

In [14]:
dnf_by_driver = data.groupby('driver').sum()['driver_dnf']
driver_race_entered = data.groupby('driver').count()['driver_dnf']
driver_dnf_ratio = (dnf_by_driver/driver_race_entered)
driver_confidence = 1-driver_dnf_ratio
driver_confidence_dict = dict(zip(driver_confidence.index,driver_confidence))

In [15]:
dnf_by_constructor = data.groupby('constructor').sum()['constructor_dnf']
constructor_race_entered = data.groupby('constructor').count()['constructor_dnf']
constructor_dnf_ratio = (dnf_by_constructor/constructor_race_entered)
constructor_relaiblity = 1-constructor_dnf_ratio
constructor_relaiblity_dict = dict(zip(constructor_relaiblity.index,constructor_relaiblity))

In [16]:
data['driver_confidence'] = data['driver'].apply(lambda x:driver_confidence_dict[x])
data['constructor_relaiblity'] = data['constructor'].apply(lambda x:constructor_relaiblity_dict[x])
#removing retired drivers and constructors
active_constructors = ['Renault', 'Williams', 'McLaren', 'Ferrari', 'Mercedes',
                       'AlphaTauri', 'Racing Point', 'Alfa Romeo', 'Red Bull',
                       'Haas F1 Team']
active_drivers = ['Daniel Ricciardo', 'Kevin Magnussen', 'Carlos Sainz',
                  'Valtteri Bottas', 'Lance Stroll', 'George Russell',
                  'Lando Norris', 'Sebastian Vettel', 'Kimi Räikkönen',
                  'Charles Leclerc', 'Lewis Hamilton', 'Daniil Kvyat',
                  'Max Verstappen', 'Pierre Gasly', 'Alexander Albon',
                  'Sergio Pérez', 'Esteban Ocon', 'Antonio Giovinazzi',
                  'Romain Grosjean','Nicholas Latifi']
data['active_driver'] = data['driver'].apply(lambda x: int(x in active_drivers))
data['active_constructor'] = data['constructor'].apply(lambda x: int(x in active_constructors))

### Model considering both drivers and constructors

In [32]:
cleaned_data = data[['GP_name','quali_pos','constructor','driver','position','driver_confidence','constructor_relaiblity','active_driver','active_constructor','dob']]
cleaned_data = cleaned_data[(cleaned_data['active_driver']==1)&(cleaned_data['active_constructor']==1)]
#cleaned_data.to_csv('cleaned_data.csv',index=False)


In [33]:
cleaned_data

Unnamed: 0,GP_name,quali_pos,constructor,driver,position,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob
39,Albert Park Grand Prix Circuit,12,Renault,Daniel Ricciardo,12,0.942197,0.601852,1,1,1989-07-01
40,Albert Park Grand Prix Circuit,14,Renault,Kevin Magnussen,15,0.952381,0.601852,1,1,1992-10-05
41,Albert Park Grand Prix Circuit,9,Renault,Carlos Sainz,9,0.903846,0.601852,1,1,1994-09-01
90,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,16,0.965035,0.569961,1,1,1989-08-28
91,Albert Park Grand Prix Circuit,15,Williams,Valtteri Bottas,10,0.965035,0.569961,1,1,1989-08-28
...,...,...,...,...,...,...,...,...,...,...
8122,Buddh International Circuit,1,Red Bull,Sebastian Vettel,1,0.934156,0.750865,1,1,1987-07-03
8123,Buddh International Circuit,7,Renault,Kimi Räikkönen,7,0.933798,0.601852,1,1,1979-10-17
8124,Buddh International Circuit,6,Renault,Kimi Räikkönen,6,0.933798,0.601852,1,1,1979-10-17
8125,Buddh International Circuit,11,Renault,Romain Grosjean,11,0.852071,0.601852,1,1,1986-04-17


In [34]:
x = cleaned_data

In [20]:
def position_index(x):
    if x<4:
        return 1
    if x>10:
        return 3
    else :
        return 2

In [21]:
data.columns

Index(['year', 'date', 'quali_pos', 'statusId', 'position', 'dob',
       'driver_nationality', 'constructor', 'constructor_nationality',
       'GP_name', 'country', 'driver', 'age_at_gp_in_days', 'driver_home',
       'constructor_home', 'driver_dnf', 'constructor_dnf',
       'driver_confidence', 'constructor_relaiblity', 'active_driver',
       'active_constructor'],
      dtype='object')

In [22]:
#Some of the constructors changed their name over the year so replacing old names with current name
data['constructor'] = data['constructor'].apply(lambda x: 'Racing Point' if x=='Force India' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Racing Point' if x=='Aston Martin' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Alfa Romeo' if x=='Sauber' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Renault' if x=='Lotus F1' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Renault' if x=='Alpine' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'AlphaTauri' if x=='Toro Rosso' else x)

In [23]:
constructor_names = ['Ferrari', 'Red Bull', 'Mercedes', 'Racing Point', 'Williams', 'Alfa Romeo', 'AlphaTauri', 'McLaren', 'Renault', 'Haas F1 Team']

for name in constructor_names:
    reliability = cleaned_data.loc[cleaned_data['constructor'] == name, 'constructor_relaiblity'].values[0]
    print(f"{name}: {reliability}")


Ferrari: 0.8243589743589743
Red Bull: 0.7508650519031141
Mercedes: 0.8778054862842892
Racing Point: 0.5902335456475584
Williams: 0.5699614890885751
Alfa Romeo: 0.3952755905511811
AlphaTauri: 0.4553903345724907
McLaren: 0.6344916344916345
Renault: 0.6018518518518519
Haas F1 Team: 0.34302325581395354


In [24]:
drivers = [
    'Lewis Hamilton',
    'George Russell',
    'Max Verstappen',
    'Sergio Pérez',
    'Charles Leclerc',
    'Carlos Sainz',
    'Lando Norris',
    'Oscar Piastri',
    'Esteban Ocon',
    'Pierre Gasly',
    'Yuki Tsunoda',
    'Nyck de Vries',
    'Fernando Alonso',
    'Lance Stroll',
    'Valtteri Bottas',
    'Zhou Guanyu',
    'Alex Albon',
    'Logan Sargeant',
    'Kevin Magnussen',
    'Nico Hulkenberg'
]

driver_confidence_dict = {}

for driver in drivers:
    driver_data = cleaned_data[cleaned_data['driver'] == driver]
    if len(driver_data) > 0:
        driver_confidence_dict[driver] = driver_data.iloc[0]['driver_confidence']
    else:
        driver_confidence_dict[driver] = 0.8

for driver, confidence in driver_confidence_dict.items():
    print(f"{driver}: {confidence}")

Lewis Hamilton: 0.9407114624505929
George Russell: 0.9583333333333334
Max Verstappen: 0.9142857142857143
Sergio Pérez: 0.9333333333333333
Charles Leclerc: 0.8444444444444444
Carlos Sainz: 0.9038461538461539
Lando Norris: 0.9166666666666666
Oscar Piastri: 0.8
Esteban Ocon: 0.9230769230769231
Pierre Gasly: 0.9387755102040817
Yuki Tsunoda: 0.8
Nyck de Vries: 0.8
Fernando Alonso: 0.8
Lance Stroll: 0.9230769230769231
Valtteri Bottas: 0.965034965034965
Zhou Guanyu: 0.8
Alex Albon: 0.8
Logan Sargeant: 0.8
Kevin Magnussen: 0.9523809523809523
Nico Hulkenberg: 0.8


In [35]:
cleaned_data['istest'] = 0

In [36]:
cleaned_data

Unnamed: 0,GP_name,quali_pos,constructor,driver,position,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest
39,Albert Park Grand Prix Circuit,12,Renault,Daniel Ricciardo,12,0.942197,0.601852,1,1,1989-07-01,0
40,Albert Park Grand Prix Circuit,14,Renault,Kevin Magnussen,15,0.952381,0.601852,1,1,1992-10-05,0
41,Albert Park Grand Prix Circuit,9,Renault,Carlos Sainz,9,0.903846,0.601852,1,1,1994-09-01,0
90,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,16,0.965035,0.569961,1,1,1989-08-28,0
91,Albert Park Grand Prix Circuit,15,Williams,Valtteri Bottas,10,0.965035,0.569961,1,1,1989-08-28,0
...,...,...,...,...,...,...,...,...,...,...,...
8122,Buddh International Circuit,1,Red Bull,Sebastian Vettel,1,0.934156,0.750865,1,1,1987-07-03,0
8123,Buddh International Circuit,7,Renault,Kimi Räikkönen,7,0.933798,0.601852,1,1,1979-10-17,0
8124,Buddh International Circuit,6,Renault,Kimi Räikkönen,6,0.933798,0.601852,1,1,1979-10-17,0
8125,Buddh International Circuit,11,Renault,Romain Grosjean,11,0.852071,0.601852,1,1,1986-04-17,0


In [25]:
# 1 	16	Charles Leclerc	Ferrari	1'40.203	 
# 2 	1	Max Verstappen	Red Bull/Honda RBPT	1'40.391	0.188
# 3 	11	Sergio Pérez	Red Bull/Honda RBPT	1'40.495	0.292
# 4 	55	Carlos Sainz Jr.	Ferrari	1'41.016	0.813
# 5 	44	Lewis Hamilton	Mercedes	1'41.177	0.974
# 6 	14	Fernando Alonso	Aston Martin/Mercedes	1'41.253	1.050
# 7 	4	Lando Norris	McLaren/Mercedes	1'41.281	1.078
# 8 	22	Yuki Tsunoda	AlphaTauri/Honda RBPT	1'41.581	1.378
# 9 	18	Lance Stroll	Aston Martin/Mercedes	1'41.611	1.408
# 10 	81	Oscar Piastri	McLaren/Mercedes	1'41.611	1.408
# 11 	63	George Russell	Mercedes	1'41.654	1.451
# 12 	31	Esteban Ocon	Alpine/Renault	1'41.798	1.595
# 13 	23	Alexander Albon	Williams/Mercedes	1'41.818	1.615
# 14 	77	Valtteri Bottas	Alfa Romeo/Ferrari	1'42.259	2.056
# 15 	2	Logan Sargeant	Williams/Mercedes	1'42.395	2.192
# 16 	24	Zhou Guanyu	Alfa Romeo/Ferrari	1'42.642	2.439
# 17 	27	Nico Hülkenberg	Haas/Ferrari	1'42.755	2.552
# 18 	20	Kevin Magnussen	Haas/Ferrari	1'43.417	3.214
# 19 	10	Pierre Gasly	Alpine/Renault	1'44.853	4.650
# 20 	21	Nyck de Vries	AlphaTauri/Honda RBPT

import pandas as pd

# Driver names
# #drivers = ["Lewis Hamilton", "George Russell", "Max Verstappen", "Sergio Pérez", "Charles Leclerc", "Carlos Sainz",
#            "Lando Norris", "Oscar Piastri", "Esteban Ocon", "Pierre Gasly", "Yuki Tsunoda", "Nyck de Vries",
#            "Fernando Alonso", "Lance Stroll", "Valtteri Bottas", "Zhou Guanyu", "Alex Albon", "Logan Sargeant",
#            "Kevin Magnussen", "Nico Hülkenberg"]

# Constructor names
constructors = ["Ferrari", "Red Bull", "Mercedes", "Racing Point", "Williams", "Alfa Romeo", "AlphaTauri", "McLaren",
                "Renault", "Haas F1 Team"]

# Birthdates

# Driver confidences
driverconfidences = {"Lewis Hamilton": 0.9407114624505929, "George Russell": 0.9583333333333334,
                      "Max Verstappen": 0.9142857142857143, "Sergio Pérez": 0.9333333333333333,
                      "Charles Leclerc": 0.8444444444444444, "Carlos Sainz": 0.9038461538461539,
                      "Lando Norris": 0.9166666666666666, "Oscar Piastri": 0.8, "Esteban Ocon": 0.9230769230769231,
                      "Pierre Gasly": 0.9387755102040817, "Yuki Tsunoda": 0.8, "Nyck de Vries": 0.8,
                      "Fernando Alonso": 0.8, "Lance Stroll": 0.9230769230769231,
                      "Valtteri Bottas": 0.965034965034965, "Zhou Guanyu": 0.8, "Alex Albon": 0.8,
                      "Logan Sargeant": 0.8, "Kevin Magnussen": 0.9523809523809523, "Nico Hülkenberg": 0.8}

# Constructor reliabilities
# Ferrari: 0.8243589743589743
# Red Bull: 0.7508650519031141
# Mercedes: 0.8778054862842892
# Racing Point: 0.5902335456475584
# Williams: 0.5699614890885751
# Alfa Romeo: 0.3952755905511811
# AlphaTauri: 0.4553903345724907
# McLaren: 0.6344916344916345
# Renault: 0.6018518518518519
# Haas F1 Team: 0.34302325581395354
driver_confidences = [0.8444444444444444, 0.9142857142857143, 0.9333333333333333, 0.9038461538461539, 
                      0.9407114624505929, 0.8, 0.9166666666666666, 0.8, 0.9230769230769231, 0.8, 
                      0.9583333333333334, 0.9230769230769231, 0.8, 0.965034965034965, 0.8, 0.8, 
                      0.8, 0.9523809523809523, 0.9387755102040817, 0.8]


drivers = ["Charles Leclerc", "Max Verstappen", "Sergio Pérez", "Carlos Sainz Jr.", "Lewis Hamilton", "Fernando Alonso","Lando Norris", "Yuki Tsunoda", "Lance Stroll", "Oscar Piastri", "George Russell", "Esteban Ocon","Alexander Albon", "Valtteri Bottas", "Logan Sargeant", "Zhou Guanyu", "Nico Hülkenberg", "Kevin Magnussen","Pierre Gasly", "Nyck de Vries"]

import random
constructor_reliabilities = [0.8243589743589743, 0.7508650519031141, 0.7508650519031141, 0.8243589743589743,
0.8778054862842892, 0.5902335456475584, 0.6344916344916345, 0.4553903345724907,
0.5902335456475584, 0.6344916344916345, 0.8778054862842892, 0.6018518518518519,
0.5699614890885751, 0.3952755905511811, 0.5699614890885751, 0.3952755905511811,
0.34302325581395354, 0.34302325581395354, 0.6018518518518519, 0.4553903345724907]
# Create dictionary of data
newdata = {
    'GP_name': ['Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit'],
    'quali_pos': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
    'constructor': ["Ferrari", "Red Bull","Red Bull","Ferrari","Mercedes","Racing Point","McLaren","AlphaTauri","Racing Point","McLaren", "Mercedes","Renault","Williams", "Alfa Romeo","Williams","Alfa Romeo","Haas F1 Team","Haas F1 Team","Renault","AlphaTauri"],
    'driver': drivers,
    'position': [3,2,1,5,6,4,9,10,7,11,8,15,12,18,16,20,17,13,14,19],
    'driver_confidence': driver_confidences,
    'constructor_relaiblity':constructor_reliabilities,
    'active_driver': [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
    'active_constructor': [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
    'dob': [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
}

# Create dataframe

new_data = pd.DataFrame(newdata)
new_data['istest'] = 0

# Print dataframe
print(new_data)



              GP_name  quali_pos   constructor            driver  position  \
0   Baku City Circuit          1       Ferrari   Charles Leclerc         3   
1   Baku City Circuit          2      Red Bull    Max Verstappen         2   
2   Baku City Circuit          3      Red Bull      Sergio Pérez         1   
3   Baku City Circuit          4       Ferrari  Carlos Sainz Jr.         5   
4   Baku City Circuit          5      Mercedes    Lewis Hamilton         6   
5   Baku City Circuit          6  Racing Point   Fernando Alonso         4   
6   Baku City Circuit          7       McLaren      Lando Norris         9   
7   Baku City Circuit          8    AlphaTauri      Yuki Tsunoda        10   
8   Baku City Circuit          9  Racing Point      Lance Stroll         7   
9   Baku City Circuit         10       McLaren     Oscar Piastri        11   
10  Baku City Circuit         11      Mercedes    George Russell         8   
11  Baku City Circuit         12       Renault      Esteban Ocon

In [37]:
cleaned_data = pd.concat([cleaned_data,new_data])

In [38]:
cleaned_data

Unnamed: 0,GP_name,quali_pos,constructor,driver,position,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest
39,Albert Park Grand Prix Circuit,12,Renault,Daniel Ricciardo,12,0.942197,0.601852,1,1,1989-07-01 00:00:00,0
40,Albert Park Grand Prix Circuit,14,Renault,Kevin Magnussen,15,0.952381,0.601852,1,1,1992-10-05 00:00:00,0
41,Albert Park Grand Prix Circuit,9,Renault,Carlos Sainz,9,0.903846,0.601852,1,1,1994-09-01 00:00:00,0
90,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,16,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
91,Albert Park Grand Prix Circuit,15,Williams,Valtteri Bottas,10,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
...,...,...,...,...,...,...,...,...,...,...,...
15,Baku City Circuit,16,Alfa Romeo,Zhou Guanyu,20,0.800000,0.395276,1,1,1,0
16,Baku City Circuit,17,Haas F1 Team,Nico Hülkenberg,17,0.800000,0.343023,1,1,1,0
17,Baku City Circuit,18,Haas F1 Team,Kevin Magnussen,13,0.952381,0.343023,1,1,1,0
18,Baku City Circuit,19,Renault,Pierre Gasly,14,0.938776,0.601852,1,1,1,0


In [39]:
miami = pd.read_csv('miami.csv')
cleaned_data = pd.concat([cleaned_data,miami])

In [40]:
cleaned_data.tail()

Unnamed: 0,GP_name,quali_pos,constructor,driver,position,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest
15,Circuit Gilles Villeneuve,16,McLaren,Lando Norris,,0.916667,0.63449,1,1,1,1
16,Circuit Gilles Villeneuve,17,AlphaTauri,Yuki Tsunoda,,0.8,0.45539,1,1,1,1
17,Circuit Gilles Villeneuve,18,Racing Point,Lance Stroll,,0.923077,0.59023,1,1,1,1
18,Circuit Gilles Villeneuve,19,McLaren,Oscar Piastri,,0.8,0.63449,1,1,1,1
19,Circuit Gilles Villeneuve,20,Williams,Logan Sargeant,,0.8,0.56996,1,1,1,1


In [41]:
sc  = StandardScaler()
le = LabelEncoder()

x = cleaned_data
x['GP_name'] = le.fit_transform(x['GP_name'])
x['constructor'] = le.fit_transform(x['constructor'])
x['driver_names'] = x['driver']
x['driver'] = le.fit_transform(x['driver'])
x['GP_name'] = le.fit_transform(x['GP_name'])
print(x[x['istest'] == 1])

X = x.drop(['position','active_driver','active_constructor','driver_names'],1)
X_test_new = X[X['istest'] == 1]
X = X[X['istest']== 0]
y = x[x['istest'] == 0]['position'].apply(lambda x: position_index(x))
#y = x[x['istest'] == 0]['position']

    GP_name  quali_pos  constructor  driver  position  driver_confidence  \
0         9          1            7      24       NaN           0.933333   
1         9          2            6       8       NaN           0.932660   
2         9          3            2       2       NaN           0.903846   
3         9          4            3      10       NaN           0.952381   
4         9          5            8      21       NaN           0.938776   
5         9          6            5       9       NaN           0.958333   
6         9          7            2       4       NaN           0.844444   
7         9          8            8       7       NaN           0.923077   
8         9          9            7      16       NaN           0.914286   
9         9         10            0      25       NaN           0.965035   
10        9         11            9       0       NaN           0.956250   
11        9         12            3      18       NaN           0.800000   
12        9 

In [125]:
X.dtypes

GP_name                      int64
quali_pos                    int64
constructor                  int32
driver                       int32
driver_confidence          float64
constructor_relaiblity     float64
istest                       int64
constructor_reliability    float64
dtype: object

In [42]:
X = X.drop('dob', axis=1)
X_test_new =  X_test_new.drop('dob', axis=1)

In [43]:
y_test_new = x[x['istest'] == 1]['position'].apply(lambda x: position_index(x))

In [130]:
#cross validation for diffrent models
models = [LogisticRegression(),DecisionTreeClassifier(),RandomForestClassifier(),SVC(),GaussianNB(),KNeighborsClassifier()]
names = ['LogisticRegression','DecisionTreeClassifier','RandomForestClassifier','SVC','GaussianNB','KNeighborsClassifier']
model_dict = dict(zip(models,names))
mean_results = []
results = []
name = []
for model in models:
    cv = StratifiedKFold(n_splits=10,random_state=1, shuffle = True)
    result = cross_val_score(model,X,y,cv=cv,scoring='accuracy')
    mean_results.append(result.mean())
    results.append(result)
    name.append(model_dict[model])
    print(f'{model_dict[model]} : {result.mean()}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression : 0.08327345821572149
DecisionTreeClassifier : 0.5614104866991703
RandomForestClassifier : 0.5387514968779403
SVC : 0.060839000085535874
GaussianNB : 0.271799888803353
KNeighborsClassifier : 0.09599317851338637


### Model considering only drivers

In [44]:
n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]
max_features = ['auto','sqrt']
max_depth = [int(x) for x in np.linspace(10,110,num=11)]
min_samples_split = [2,5,8,10,15,20]
min_samples_leaf = [1,2,4,6,8,10]
bootstrap = [True,False]

random_parms = {
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'bootstrap':bootstrap
                }

In [64]:
X.shape

(4323, 18)

In [124]:
y = np.array(y).reshape(-1, 1)
y = pd.Series(y.flatten())

In [125]:
y.shape

(4323,)

In [79]:
rf_rand = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator=rf_rand,param_distributions=random_parms,n_iter=10,cv=10,verbose=2,n_jobs=-1 )
rf_random.fit(X,y)
rf_random.best_params_

Fitting 10 folds for each of 10 candidates, totalling 100 fits


KeyboardInterrupt: 

In [45]:
rf = RandomForestClassifier(n_estimators=1600,min_samples_split=8,min_samples_leaf=4,max_features='auto',max_depth=70,bootstrap=True)
rf.fit(X,y)
y_pred_rf = rf.predict(X_test_new)

  warn(


In [104]:
X_train

Unnamed: 0,season,round,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,grid,driver_points,...,constructor_minardi,constructor_prost,constructor_red_bull,constructor_renault,constructor_sauber,constructor_team_lotus,constructor_toro_rosso,constructor_toyota,constructor_tyrrell,constructor_williams
0,1983,1,False,False,True,False,False,109,1,0,...,0,0,0,0,0,0,0,0,0,1
1,1983,1,False,False,True,False,False,176,2,0,...,0,0,0,1,0,0,0,0,0,0
2,1983,1,False,False,True,False,False,212,3,0,...,0,0,0,0,0,0,0,0,0,0
3,1983,1,False,False,True,False,False,171,4,0,...,0,0,0,0,0,0,0,0,0,0
4,1983,1,False,False,True,False,False,223,5,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,2023,3,True,False,False,False,False,110,14,1,...,0,0,0,0,0,0,0,0,0,0
14996,2023,3,True,False,False,False,False,58,15,0,...,0,0,0,0,0,0,0,0,0,0
14997,2023,3,True,False,False,False,False,168,16,0,...,0,0,0,0,0,0,0,0,0,0
14998,2023,3,True,False,False,False,False,233,17,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
y_train

0        15
1         6
2         4
3         1
4         7
         ..
14995    17
14996    15
14997     8
14998     9
14999    16
Name: podium, Length: 15000, dtype: int64

In [119]:
import pickle

filename = 'rf_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(rf, file)

In [52]:
y_pred_new = rf.predict(X_test_new)

NameError: name 'X_test_new' is not defined

In [46]:
print(y_pred_rf)

[1 2 1 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3]


In [47]:
X_later = X_test_new.copy()

X_later['pred_pos'] = y_pred_rf
X_later['actual'] = y_test_new

In [85]:
X_later = X_later.sort_values(by = 'pred_pos')

In [86]:
X_later.shape

(20, 9)

In [98]:
X_later['pred_pos'] = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]

In [48]:
X_later['driver'] = le.inverse_transform(X_later['driver'])

In [49]:
X_later ## jeddah 2022

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity,istest,pred_pos,actual
0,9,1,7,24,0.933333,0.75086,1,1,2
1,9,2,6,8,0.93266,0.59023,1,2,2
2,9,3,2,2,0.903846,0.8243,1,1,2
3,9,4,3,10,0.952381,0.34302,1,2,2
4,9,5,8,21,0.938776,0.60185,1,2,2
5,9,6,5,9,0.958333,0.8778,1,2,2
6,9,7,2,4,0.844444,0.82435,1,2,2
7,9,8,8,7,0.923077,0.60185,1,2,2
8,9,9,7,16,0.914286,0.75086,1,2,2
9,9,10,0,25,0.965035,0.39527,1,2,2


In [101]:
X_later

Unnamed: 0,season,round,driver,grid,pred_pos,actual
15001,2023,4,max_verstappen,2,1,2
15002,2023,4,perez,3,2,1
15004,2023,4,hamilton,5,3,6
15005,2023,4,alonso,6,4,4
15000,2023,4,leclerc,1,5,3
15010,2023,4,russell,11,6,8
15008,2023,4,stroll,9,7,7
15009,2023,4,piastri,10,8,11
15003,2023,4,sainz,4,9,5
15006,2023,4,norris,7,10,9


In [226]:
new_data_tested = new_data.copy()
new_data_tested['pos'] = y_pred_new

In [227]:
new_data_tested = new_data_tested.sort_values(by='pos', ascending=True)
new_data_tested['pos'] = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]


In [228]:
new_data_tested

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest,pos
0,Baku City Circuit,1,Ferrari,Charles Leclerc,0.844444,0.824359,1,1,1,1,1
1,Baku City Circuit,2,Red Bull,Max Verstappen,0.914286,0.750865,1,1,1,1,2
2,Baku City Circuit,3,Red Bull,Sergio Pérez,0.933333,0.750865,1,1,1,1,3
3,Baku City Circuit,4,Ferrari,Carlos Sainz Jr.,0.903846,0.824359,1,1,1,1,4
4,Baku City Circuit,5,Mercedes,Lewis Hamilton,0.940711,0.877805,1,1,1,1,5
5,Baku City Circuit,6,Racing Point,Fernando Alonso,0.8,0.590234,1,1,1,1,6
6,Baku City Circuit,7,McLaren,Lando Norris,0.916667,0.634492,1,1,1,1,7
7,Baku City Circuit,8,AlphaTauri,Yuki Tsunoda,0.8,0.45539,1,1,1,1,8
8,Baku City Circuit,9,Racing Point,Lance Stroll,0.923077,0.590234,1,1,1,1,9
10,Baku City Circuit,11,Mercedes,George Russell,0.958333,0.877805,1,1,1,1,10


In [38]:
X.dtypes

GP_name                            int64
quali_pos                          int64
constructor                        int32
driver                             int32
driver_confidence                float64
constructor_relaiblity           float64
dob                       datetime64[ns]
dtype: object

In [79]:
X_d.dtypes

GP_name                int64
quali_pos              int64
driver                 int64
age_at_gp_in_days    float64
driver_confidence    float64
dtype: object

In [48]:
X.drop(["country", "constructor_nationality"], axis=1, inplace=True)


In [39]:
X = X.drop([ 'dob'], axis=1)


In [50]:
X = X.drop(['driver_nationality'],axis = 1)

In [51]:
X['age_at_gp_in_days'] = X['age_at_gp_in_days'].apply(lambda x: float(x.split()[0]))


In [59]:
svc = SVC()
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)
svc.fit(X_train,y_train)
y_pred_svc = svc.predict(X_test)
cnf_mat_svc = confusion_matrix(y_test,y_pred)
cnf_mat_svc = cnf_mat_svc/cnf_mat_svc.sum()

In [110]:
X_later = X_test.copy()
X_later = X_later[['season','round','driver','grid']]

X_later['pred_pos'] = y_pred_rf
X_later['actual'] = y_test

X_later = X_later.sort_values(by = 'pred_pos')

In [111]:
X_later.shape

(18, 6)

In [114]:
X_later['driver'] = le.inverse_transform(X_later['driver'])

In [112]:
X_later['pred_pos'] = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]

In [115]:
X_later

Unnamed: 0,season,round,driver,grid,pred_pos,actual
15001,2023,4,max_verstappen,2,1,2
15002,2023,4,perez,3,2,1
15004,2023,4,hamilton,5,3,6
15005,2023,4,alonso,6,4,4
15000,2023,4,leclerc,1,5,3
15010,2023,4,russell,11,6,8
15008,2023,4,stroll,9,7,7
15009,2023,4,piastri,10,8,11
15003,2023,4,sainz,4,9,5
15006,2023,4,norris,7,10,9


In [109]:
print(y_pred_svc)

[13  7  7 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13]


In [65]:
rf_pression = precision_score(y_test, y_pred_rf, average='macro')
rf_f1 = f1_score(y_test,y_pred_rf,average='macro')
rf_recall = recall_score(y_test,y_pred_rf,average='macro')
svc_pression = precision_score(y_test, y_pred_svc, average='micro')
svc_f1 = f1_score(y_test,y_pred_svc,average='macro')
svc_recall = recall_score(y_test,y_pred_svc,average='macro')
metrics_dict ={
    'RandomForestClassifier':{'precision_score':rf_pression,'f1_score':rf_f1,'recall_score':rf_recall},
    'SVC':{'precision_score':svc_pression,'f1_score':svc_f1,'recall_score':svc_recall}
}
metrics_df = pd.DataFrame(metrics_dict)
metrics_df

Unnamed: 0,RandomForestClassifier,SVC
precision_score,0.938202,0.901869
f1_score,0.936028,0.901417
recall_score,0.934206,0.899036
