# Formula 1 Grand Prix result prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import cross_val_score,StratifiedKFold,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,precision_score,f1_score,recall_score
plt.style.use('seaborn')

In [2]:
results = pd.read_csv(r'data\results.csv')
races = pd.read_csv(r'data\races.csv')
quali = pd.read_csv(r'data\qualifying.csv')
drivers = pd.read_csv(r'data\drivers.csv')
constructors = pd.read_csv(r'data\constructors.csv')
circuit = pd.read_csv(r'data\circuits.csv')

In [3]:
df1 = pd.merge(races,results,how='inner',on=['raceId'])
df2 = pd.merge(df1,quali,how='inner',on=['raceId','driverId','constructorId'])
df3 = pd.merge(df2,drivers,how='inner',on=['driverId'])
df4 = pd.merge(df3,constructors,how='inner',on=['constructorId'])
df5 = pd.merge(df4,circuit,how='inner',on=['circuitId'])

In [6]:
data.head()

Unnamed: 0,year,date,grid,statusId,position_y,forename,surname,dob,nationality_x,name_y,nationality_y,name,country
0,2009,2009-03-29,1,1,1,Jenson,Button,1980-01-19,British,Brawn,British,Albert Park Grand Prix Circuit,Australia
1,2009,2009-03-29,2,1,2,Rubens,Barrichello,1972-05-23,Brazilian,Brawn,British,Albert Park Grand Prix Circuit,Australia
2,2008,2008-03-16,12,4,13,Jenson,Button,1980-01-19,British,Honda,Japanese,Albert Park Grand Prix Circuit,Australia
3,2007,2007-03-18,14,11,14,Jenson,Button,1980-01-19,British,Honda,Japanese,Albert Park Grand Prix Circuit,Australia
4,2006,2006-04-02,1,11,1,Jenson,Button,1980-01-19,British,Honda,Japanese,Albert Park Grand Prix Circuit,Australia


In [5]:
#drop the columns which are not important
data = df5.drop(['round','circuitId','time_x','url_x','resultId','driverId',
                 'constructorId','number_x','positionText','position_x',
                 'positionOrder','laps','time_y','rank',
                 'fastestLapTime','fastestLapSpeed','qualifyId','driverRef','number','code','url_y','circuitRef',
                 'location','lat','lng','alt','number_y','points','constructorRef','name_x','raceId','fastestLap','q2','q3','milliseconds','q1'],1)

In [7]:
#considering data points from 1980
data = data[data['year']>=1980]

In [8]:
#rename the columns
data.rename(columns={'name':'GP_name','position_y':'position','grid':'quali_pos','name_y':'constructor','nationality_x':'driver_nationality','nationality_y':'constructor_nationality'},inplace=True)
data['driver'] = data['forename']+' '+data['surname']
data['date'] = pd.to_datetime(data['date'])
data['dob'] = pd.to_datetime(data['dob'])

In [9]:
#creating a driver age parameter
data['age_at_gp_in_days'] = abs(data['dob']-data['date'])
data['age_at_gp_in_days'] = data['age_at_gp_in_days'].apply(lambda x: str(x).split(' ')[0])

In [10]:
#Some of the constructors changed their name over the year so replacing old names with current name
data['constructor'] = data['constructor'].apply(lambda x: 'Racing Point' if x=='Force India' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Racing Point' if x=='Aston Martin' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Alfa Romeo' if x=='Sauber' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Renault' if x=='Lotus F1' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Renault' if x=='Alpine' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'AlphaTauri' if x=='Toro Rosso' else x)

In [11]:
data['driver_nationality'] = data['driver_nationality'].apply(lambda x: str(x)[:3])
data['constructor_nationality'] = data['constructor_nationality'].apply(lambda x: str(x)[:3])
data['country'] = data['country'].apply(lambda x: 'Bri' if x=='UK' else x)
data['country'] = data['country'].apply(lambda x: 'Ame' if x=='USA' else x)
data['country'] = data['country'].apply(lambda x: 'Fre' if x=='Fra' else x)
data['country'] = data['country'].apply(lambda x: str(x)[:3])
data['driver_home'] = data['driver_nationality'] == data['country']
data['constructor_home'] = data['constructor_nationality'] == data['country']
data['driver_home'] = data['driver_home'].apply(lambda x: int(x))
data['constructor_home'] = data['constructor_home'].apply(lambda x: int(x))

In [12]:
#reasons for DNF(did not finish)
data['driver_dnf'] = data['statusId'].apply(lambda x: 1 if x in [3,4,20,29,31,41,68,73,81,97,82,104,107,130,137] else 0)
data['constructor_dnf'] = data['statusId'].apply(lambda x: 1 if x not in [3,4,20,29,31,41,68,73,81,97,82,104,107,130,137,1] else 0)
data.drop(['forename','surname'],1,inplace=True)

In [13]:
data.head()

Unnamed: 0,year,date,quali_pos,statusId,position,dob,driver_nationality,constructor,constructor_nationality,GP_name,country,driver,age_at_gp_in_days,driver_home,constructor_home,driver_dnf,constructor_dnf
0,2009,2009-03-29,1,1,1,1980-01-19,Bri,Brawn,Bri,Albert Park Grand Prix Circuit,Aus,Jenson Button,10662,0,0,0,0
1,2009,2009-03-29,2,1,2,1972-05-23,Bra,Brawn,Bri,Albert Park Grand Prix Circuit,Aus,Rubens Barrichello,13459,0,0,0,0
2,2008,2008-03-16,12,4,13,1980-01-19,Bri,Honda,Jap,Albert Park Grand Prix Circuit,Aus,Jenson Button,10284,0,0,1,0
3,2007,2007-03-18,14,11,14,1980-01-19,Bri,Honda,Jap,Albert Park Grand Prix Circuit,Aus,Jenson Button,9920,0,0,0,1
4,2006,2006-04-02,1,11,1,1980-01-19,Bri,Honda,Jap,Albert Park Grand Prix Circuit,Aus,Jenson Button,9570,0,0,0,1


In [13]:
font = {
    'family':'serif',
    'color':'black',
    'weight':'bold',
    'size':10
}

In [14]:
dnf_by_driver = data.groupby('driver').sum()['driver_dnf']
driver_race_entered = data.groupby('driver').count()['driver_dnf']
driver_dnf_ratio = (dnf_by_driver/driver_race_entered)
driver_confidence = 1-driver_dnf_ratio
driver_confidence_dict = dict(zip(driver_confidence.index,driver_confidence))

In [15]:
dnf_by_constructor = data.groupby('constructor').sum()['constructor_dnf']
constructor_race_entered = data.groupby('constructor').count()['constructor_dnf']
constructor_dnf_ratio = (dnf_by_constructor/constructor_race_entered)
constructor_relaiblity = 1-constructor_dnf_ratio
constructor_relaiblity_dict = dict(zip(constructor_relaiblity.index,constructor_relaiblity))

In [16]:
data['driver_confidence'] = data['driver'].apply(lambda x:driver_confidence_dict[x])
data['constructor_relaiblity'] = data['constructor'].apply(lambda x:constructor_relaiblity_dict[x])
#removing retired drivers and constructors
active_constructors=['Renault', 'Williams', 'McLaren', 'Ferrari', 'Mercedes',
                  'AlphaTauri', 'Racing Point', 'Alfa Romeo', 'Red Bull',
                       'Haas F1 Team']

active_drivers= ['Kevin Magnussen', 'Carlos Sainz',
                  'Valtteri Bottas', 'Lance Stroll', 'George Russell',
                  'Lando Norris', 'Oscar Piastri', 'Logan Sargeant',
                  'Charles Leclerc', 'Lewis Hamilton', 'Fernando Alonso',
                  'Max Verstappen', 'Pierre Gasly', 'Alexander Albon',
                  'Sergio Pérez', 'Esteban Ocon', 'Zhou Guanyu',
                  'Nico Hulkenburg','Nyck De Vries', 'Yuki Tsunoda']


In [17]:
data.dtypes

year                                int64
date                       datetime64[ns]
quali_pos                           int64
statusId                            int64
position                            int64
dob                        datetime64[ns]
driver_nationality                 object
constructor                        object
constructor_nationality            object
GP_name                            object
country                            object
driver                             object
age_at_gp_in_days                  object
driver_home                         int64
constructor_home                    int64
driver_dnf                          int64
constructor_dnf                     int64
driver_confidence                 float64
constructor_relaiblity            float64
active_driver                       int64
active_constructor                  int64
dtype: object

In [75]:
cleaned_data['active_driver'] = data['driver'].apply(lambda x: int(x in active_drivers))
cleaned_data['active_constructor'] = data['constructor'].apply(lambda x: int(x in active_constructors))

### Model considering both drivers and constructors

In [73]:
cleaned_data = data[['GP_name','quali_pos','constructor','driver','position','driver_confidence','constructor_relaiblity','active_driver','active_constructor','dob']]
cleaned_data = cleaned_data[(cleaned_data['active_driver']==1)&(cleaned_data['active_constructor']==1)]
#cleaned_data.to_csv('cleaned_data.csv',index=False)


In [74]:
cleaned_data['istest'] = 0

In [76]:
# 1 	16	Charles Leclerc	Ferrari	1'40.203	 
# 2 	1	Max Verstappen	Red Bull/Honda RBPT	1'40.391	0.188
# 3 	11	Sergio Pérez	Red Bull/Honda RBPT	1'40.495	0.292
# 4 	55	Carlos Sainz Jr.	Ferrari	1'41.016	0.813
# 5 	44	Lewis Hamilton	Mercedes	1'41.177	0.974
# 6 	14	Fernando Alonso	Aston Martin/Mercedes	1'41.253	1.050
# 7 	4	Lando Norris	McLaren/Mercedes	1'41.281	1.078
# 8 	22	Yuki Tsunoda	AlphaTauri/Honda RBPT	1'41.581	1.378
# 9 	18	Lance Stroll	Aston Martin/Mercedes	1'41.611	1.408
# 10 	81	Oscar Piastri	McLaren/Mercedes	1'41.611	1.408
# 11 	63	George Russell	Mercedes	1'41.654	1.451
# 12 	31	Esteban Ocon	Alpine/Renault	1'41.798	1.595
# 13 	23	Alexander Albon	Williams/Mercedes	1'41.818	1.615
# 14 	77	Valtteri Bottas	Alfa Romeo/Ferrari	1'42.259	2.056
# 15 	2	Logan Sargeant	Williams/Mercedes	1'42.395	2.192
# 16 	24	Zhou Guanyu	Alfa Romeo/Ferrari	1'42.642	2.439
# 17 	27	Nico Hülkenberg	Haas/Ferrari	1'42.755	2.552
# 18 	20	Kevin Magnussen	Haas/Ferrari	1'43.417	3.214
# 19 	10	Pierre Gasly	Alpine/Renault	1'44.853	4.650
# 20 	21	Nyck de Vries	AlphaTauri/Honda RBPT

import pandas as pd

# Driver names
# #drivers = ["Lewis Hamilton", "George Russell", "Max Verstappen", "Sergio Pérez", "Charles Leclerc", "Carlos Sainz",
#            "Lando Norris", "Oscar Piastri", "Esteban Ocon", "Pierre Gasly", "Yuki Tsunoda", "Nyck de Vries",
#            "Fernando Alonso", "Lance Stroll", "Valtteri Bottas", "Zhou Guanyu", "Alex Albon", "Logan Sargeant",
#            "Kevin Magnussen", "Nico Hülkenberg"]

# Constructor names
constructors = ["Ferrari", "Red Bull", "Mercedes", "Racing Point", "Williams", "Alfa Romeo", "AlphaTauri", "McLaren",
                "Renault", "Haas F1 Team"]

# Birthdates

# Driver confidences
driverconfidences = {"Lewis Hamilton": 0.9407114624505929, "George Russell": 0.9583333333333334,
                      "Max Verstappen": 0.9142857142857143, "Sergio Pérez": 0.9333333333333333,
                      "Charles Leclerc": 0.8444444444444444, "Carlos Sainz": 0.9038461538461539,
                      "Lando Norris": 0.9166666666666666, "Oscar Piastri": 0.8, "Esteban Ocon": 0.9230769230769231,
                      "Pierre Gasly": 0.9387755102040817, "Yuki Tsunoda": 0.8, "Nyck de Vries": 0.8,
                      "Fernando Alonso": 0.8, "Lance Stroll": 0.9230769230769231,
                      "Valtteri Bottas": 0.965034965034965, "Zhou Guanyu": 0.8, "Alex Albon": 0.8,
                      "Logan Sargeant": 0.8, "Kevin Magnussen": 0.9523809523809523, "Nico Hülkenberg": 0.8}

# Constructor reliabilities
# Ferrari: 0.8243589743589743
# Red Bull: 0.7508650519031141
# Mercedes: 0.8778054862842892
# Racing Point: 0.5902335456475584
# Williams: 0.5699614890885751
# Alfa Romeo: 0.3952755905511811
# AlphaTauri: 0.4553903345724907
# McLaren: 0.6344916344916345
# Renault: 0.6018518518518519
# Haas F1 Team: 0.34302325581395354
driver_confidences = [0.8444444444444444, 0.9142857142857143, 0.9333333333333333, 0.9038461538461539, 
                      0.9407114624505929, 0.8, 0.9166666666666666, 0.8, 0.9230769230769231, 0.8, 
                      0.9583333333333334, 0.9230769230769231, 0.8, 0.965034965034965, 0.8, 0.8, 
                      0.8, 0.9523809523809523, 0.9387755102040817, 0.8]


drivers = ["Charles Leclerc", "Max Verstappen", "Sergio Pérez", "Carlos Sainz Jr.", "Lewis Hamilton", "Fernando Alonso","Lando Norris", "Yuki Tsunoda", "Lance Stroll", "Oscar Piastri", "George Russell", "Esteban Ocon","Alexander Albon", "Valtteri Bottas", "Logan Sargeant", "Zhou Guanyu", "Nico Hülkenberg", "Kevin Magnussen","Pierre Gasly", "Nyck de Vries"]

import random
constructor_reliabilities = [0.8243589743589743, 0.7508650519031141, 0.7508650519031141, 0.8243589743589743,
0.8778054862842892, 0.5902335456475584, 0.6344916344916345, 0.4553903345724907,
0.5902335456475584, 0.6344916344916345, 0.8778054862842892, 0.6018518518518519,
0.5699614890885751, 0.3952755905511811, 0.5699614890885751, 0.3952755905511811,
0.34302325581395354, 0.34302325581395354, 0.6018518518518519, 0.4553903345724907]
# Create dictionary of data
newdata = {
    'GP_name': ['Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit'],
    'quali_pos': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
    'constructor': ["Ferrari", "Red Bull","Red Bull","Ferrari","Mercedes","Racing Point","McLaren","AlphaTauri","Racing Point","McLaren", "Mercedes","Renault","Williams", "Alfa Romeo","Williams","Alfa Romeo","Haas F1 Team","Haas F1 Team","Renault","AlphaTauri"],
    'driver': drivers,
    'driver_confidence': driver_confidences,
    'constructor_relaiblity':constructor_reliabilities,
    'active_driver': [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
    'active_constructor': [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
    'dob': [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
}

# Create dataframe

new_data = pd.DataFrame(newdata)
new_data['istest'] = 1

# Print dataframe
print(new_data)



              GP_name  quali_pos   constructor            driver  \
0   Baku City Circuit          1       Ferrari   Charles Leclerc   
1   Baku City Circuit          2      Red Bull    Max Verstappen   
2   Baku City Circuit          3      Red Bull      Sergio Pérez   
3   Baku City Circuit          4       Ferrari  Carlos Sainz Jr.   
4   Baku City Circuit          5      Mercedes    Lewis Hamilton   
5   Baku City Circuit          6  Racing Point   Fernando Alonso   
6   Baku City Circuit          7       McLaren      Lando Norris   
7   Baku City Circuit          8    AlphaTauri      Yuki Tsunoda   
8   Baku City Circuit          9  Racing Point      Lance Stroll   
9   Baku City Circuit         10       McLaren     Oscar Piastri   
10  Baku City Circuit         11      Mercedes    George Russell   
11  Baku City Circuit         12       Renault      Esteban Ocon   
12  Baku City Circuit         13      Williams   Alexander Albon   
13  Baku City Circuit         14    Alfa Romeo  

In [77]:
cleaned_data = pd.concat([cleaned_data,new_data])

In [78]:
cleaned_data

Unnamed: 0,GP_name,quali_pos,constructor,driver,position,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest
90,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,16.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
91,Albert Park Grand Prix Circuit,15,Williams,Valtteri Bottas,10.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
92,Albert Park Grand Prix Circuit,0,Williams,Valtteri Bottas,6.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
93,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,11.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
94,Albert Park Grand Prix Circuit,20,Williams,Lance Stroll,19.0,0.923077,0.569961,1,1,1998-10-29 00:00:00,0
...,...,...,...,...,...,...,...,...,...,...,...
15,Baku City Circuit,16,Alfa Romeo,Zhou Guanyu,,0.800000,0.395276,1,1,1,1
16,Baku City Circuit,17,Haas F1 Team,Nico Hülkenberg,,0.800000,0.343023,1,1,1,1
17,Baku City Circuit,18,Haas F1 Team,Kevin Magnussen,,0.952381,0.343023,1,1,1,1
18,Baku City Circuit,19,Renault,Pierre Gasly,,0.938776,0.601852,1,1,1,1


In [79]:
x = cleaned_data.copy()

In [41]:
def position_index(x):
    if x<4:
        return 1
    if x>10:
        return 3
    else :
        return 2

In [26]:
data.columns

Index(['year', 'date', 'quali_pos', 'statusId', 'position', 'dob',
       'driver_nationality', 'constructor', 'constructor_nationality',
       'GP_name', 'country', 'driver', 'age_at_gp_in_days', 'driver_home',
       'constructor_home', 'driver_dnf', 'constructor_dnf',
       'driver_confidence', 'constructor_relaiblity', 'active_driver',
       'active_constructor'],
      dtype='object')

In [24]:
#Some of the constructors changed their name over the year so replacing old names with current name
data['constructor'] = data['constructor'].apply(lambda x: 'Racing Point' if x=='Force India' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Racing Point' if x=='Aston Martin' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Alfa Romeo' if x=='Sauber' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Renault' if x=='Lotus F1' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'Renault' if x=='Alpine' else x)
data['constructor'] = data['constructor'].apply(lambda x: 'AlphaTauri' if x=='Toro Rosso' else x)

In [25]:
constructor_names = ['Ferrari', 'Red Bull', 'Mercedes', 'Racing Point', 'Williams', 'Alfa Romeo', 'AlphaTauri', 'McLaren', 'Renault', 'Haas F1 Team']

for name in constructor_names:
    reliability = cleaned_data.loc[cleaned_data['constructor'] == name, 'constructor_relaiblity'].values[0]
    print(f"{name}: {reliability}")


Ferrari: 0.8243589743589743
Red Bull: 0.7508650519031141
Mercedes: 0.8778054862842892


IndexError: index 0 is out of bounds for axis 0 with size 0

In [68]:
drivers = [
    'Lewis Hamilton',
    'George Russell',
    'Max Verstappen',
    'Sergio Pérez',
    'Charles Leclerc',
    'Carlos Sainz',
    'Lando Norris',
    'Oscar Piastri',
    'Esteban Ocon',
    'Pierre Gasly',
    'Yuki Tsunoda',
    'Nyck de Vries',
    'Fernando Alonso',
    'Lance Stroll',
    'Valtteri Bottas',
    'Zhou Guanyu',
    'Alex Albon',
    'Logan Sargeant',
    'Kevin Magnussen',
    'Nico Hulkenberg'
]

driver_confidence_dict = {}

for driver in drivers:
    driver_data = cleaned_data[cleaned_data['driver'] == driver]
    if len(driver_data) > 0:
        driver_confidence_dict[driver] = driver_data.iloc[0]['driver_confidence']
    else:
        driver_confidence_dict[driver] = 0.8

for driver, confidence in driver_confidence_dict.items():
    print(f"{driver}: {confidence}")

Lewis Hamilton: 0.9407114624505929
George Russell: 0.9583333333333334
Max Verstappen: 0.9142857142857143
Sergio Pérez: 0.9333333333333333
Charles Leclerc: 0.8444444444444444
Carlos Sainz: 0.9038461538461539
Lando Norris: 0.9166666666666666
Oscar Piastri: 0.8
Esteban Ocon: 0.9230769230769231
Pierre Gasly: 0.9387755102040817
Yuki Tsunoda: 0.8
Nyck de Vries: 0.8
Fernando Alonso: 0.8
Lance Stroll: 0.9230769230769231
Valtteri Bottas: 0.965034965034965
Zhou Guanyu: 0.8
Alex Albon: 0.8
Logan Sargeant: 0.8
Kevin Magnussen: 0.9523809523809523
Nico Hulkenberg: 0.8


In [26]:
cleaned_data['istest'] = 0

In [27]:
cleaned_data

Unnamed: 0,GP_name,quali_pos,constructor,driver,position,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest
90,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,16,0.965035,0.569961,1,1,1989-08-28,0
91,Albert Park Grand Prix Circuit,15,Williams,Valtteri Bottas,10,0.965035,0.569961,1,1,1989-08-28,0
92,Albert Park Grand Prix Circuit,0,Williams,Valtteri Bottas,6,0.965035,0.569961,1,1,1989-08-28,0
93,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,11,0.965035,0.569961,1,1,1989-08-28,0
94,Albert Park Grand Prix Circuit,20,Williams,Lance Stroll,19,0.923077,0.569961,1,1,1998-10-29,0
...,...,...,...,...,...,...,...,...,...,...,...
8080,Buddh International Circuit,5,Ferrari,Fernando Alonso,5,0.932660,0.824359,1,1,1981-07-29,0
8081,Buddh International Circuit,8,Ferrari,Fernando Alonso,8,0.932660,0.824359,1,1,1981-07-29,0
8096,Buddh International Circuit,3,Mercedes,Lewis Hamilton,3,0.940711,0.877805,1,1,1985-01-07,0
8114,Buddh International Circuit,20,Alfa Romeo,Sergio Pérez,17,0.933333,0.395276,1,1,1990-01-26,0


In [80]:
cleaned_data['driver'].unique()

array(['Valtteri Bottas', 'Lance Stroll', 'George Russell',
       'Fernando Alonso', 'Lewis Hamilton', 'Sergio Pérez',
       'Kevin Magnussen', 'Carlos Sainz', 'Lando Norris',
       'Charles Leclerc', 'Max Verstappen', 'Pierre Gasly',
       'Alexander Albon', 'Carlos Sainz Jr.', 'Yuki Tsunoda',
       'Oscar Piastri', 'Esteban Ocon', 'Logan Sargeant', 'Zhou Guanyu',
       'Nico Hülkenberg', 'Nyck de Vries'], dtype=object)

In [29]:
# 1 	16	Charles Leclerc	Ferrari	1'40.203	 
# 2 	1	Max Verstappen	Red Bull/Honda RBPT	1'40.391	0.188
# 3 	11	Sergio Pérez	Red Bull/Honda RBPT	1'40.495	0.292
# 4 	55	Carlos Sainz Jr.	Ferrari	1'41.016	0.813
# 5 	44	Lewis Hamilton	Mercedes	1'41.177	0.974
# 6 	14	Fernando Alonso	Aston Martin/Mercedes	1'41.253	1.050
# 7 	4	Lando Norris	McLaren/Mercedes	1'41.281	1.078
# 8 	22	Yuki Tsunoda	AlphaTauri/Honda RBPT	1'41.581	1.378
# 9 	18	Lance Stroll	Aston Martin/Mercedes	1'41.611	1.408
# 10 	81	Oscar Piastri	McLaren/Mercedes	1'41.611	1.408
# 11 	63	George Russell	Mercedes	1'41.654	1.451
# 12 	31	Esteban Ocon	Alpine/Renault	1'41.798	1.595
# 13 	23	Alexander Albon	Williams/Mercedes	1'41.818	1.615
# 14 	77	Valtteri Bottas	Alfa Romeo/Ferrari	1'42.259	2.056
# 15 	2	Logan Sargeant	Williams/Mercedes	1'42.395	2.192
# 16 	24	Zhou Guanyu	Alfa Romeo/Ferrari	1'42.642	2.439
# 17 	27	Nico Hülkenberg	Haas/Ferrari	1'42.755	2.552
# 18 	20	Kevin Magnussen	Haas/Ferrari	1'43.417	3.214
# 19 	10	Pierre Gasly	Alpine/Renault	1'44.853	4.650
# 20 	21	Nyck de Vries	AlphaTauri/Honda RBPT

import pandas as pd

# Driver names
# #drivers = ["Lewis Hamilton", "George Russell", "Max Verstappen", "Sergio Pérez", "Charles Leclerc", "Carlos Sainz",
#            "Lando Norris", "Oscar Piastri", "Esteban Ocon", "Pierre Gasly", "Yuki Tsunoda", "Nyck de Vries",
#            "Fernando Alonso", "Lance Stroll", "Valtteri Bottas", "Zhou Guanyu", "Alex Albon", "Logan Sargeant",
#            "Kevin Magnussen", "Nico Hülkenberg"]

# Constructor names
constructors = ["Ferrari", "Red Bull", "Mercedes", "Racing Point", "Williams", "Alfa Romeo", "AlphaTauri", "McLaren",
                "Renault", "Haas F1 Team"]

# Birthdates

# Driver confidences
driverconfidences = {"Lewis Hamilton": 0.9407114624505929, "George Russell": 0.9583333333333334,
                      "Max Verstappen": 0.9142857142857143, "Sergio Pérez": 0.9333333333333333,
                      "Charles Leclerc": 0.8444444444444444, "Carlos Sainz": 0.9038461538461539,
                      "Lando Norris": 0.9166666666666666, "Oscar Piastri": 0.8, "Esteban Ocon": 0.9230769230769231,
                      "Pierre Gasly": 0.9387755102040817, "Yuki Tsunoda": 0.8, "Nyck de Vries": 0.8,
                      "Fernando Alonso": 0.8, "Lance Stroll": 0.9230769230769231,
                      "Valtteri Bottas": 0.965034965034965, "Zhou Guanyu": 0.8, "Alex Albon": 0.8,
                      "Logan Sargeant": 0.8, "Kevin Magnussen": 0.9523809523809523, "Nico Hülkenberg": 0.8}

# Constructor reliabilities
# Ferrari: 0.8243589743589743
# Red Bull: 0.7508650519031141
# Mercedes: 0.8778054862842892
# Racing Point: 0.5902335456475584
# Williams: 0.5699614890885751
# Alfa Romeo: 0.3952755905511811
# AlphaTauri: 0.4553903345724907
# McLaren: 0.6344916344916345
# Renault: 0.6018518518518519
# Haas F1 Team: 0.34302325581395354
driver_confidences = [0.8444444444444444, 0.9142857142857143, 0.9333333333333333, 0.9038461538461539, 
                      0.9407114624505929, 0.8, 0.9166666666666666, 0.8, 0.9230769230769231, 0.8, 
                      0.9583333333333334, 0.9230769230769231, 0.8, 0.965034965034965, 0.8, 0.8, 
                      0.8, 0.9523809523809523, 0.9387755102040817, 0.8]


drivers = ["Charles Leclerc", "Max Verstappen", "Sergio Pérez", "Carlos Sainz Jr.", "Lewis Hamilton", "Fernando Alonso","Lando Norris", "Yuki Tsunoda", "Lance Stroll", "Oscar Piastri", "George Russell", "Esteban Ocon","Alexander Albon", "Valtteri Bottas", "Logan Sargeant", "Zhou Guanyu", "Nico Hülkenberg", "Kevin Magnussen","Pierre Gasly", "Nyck de Vries"]

import random
constructor_reliabilities = [0.8243589743589743, 0.7508650519031141, 0.7508650519031141, 0.8243589743589743,
0.8778054862842892, 0.5902335456475584, 0.6344916344916345, 0.4553903345724907,
0.5902335456475584, 0.6344916344916345, 0.8778054862842892, 0.6018518518518519,
0.5699614890885751, 0.3952755905511811, 0.5699614890885751, 0.3952755905511811,
0.34302325581395354, 0.34302325581395354, 0.6018518518518519, 0.4553903345724907]
# Create dictionary of data
newdata = {
    'GP_name': ['Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit','Baku City Circuit'],
    'quali_pos': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20],
    'constructor': ["Ferrari", "Red Bull","Red Bull","Ferrari","Mercedes","Racing Point","McLaren","AlphaTauri","Racing Point","McLaren", "Mercedes","Renault","Williams", "Alfa Romeo","Williams","Alfa Romeo","Haas F1 Team","Haas F1 Team","Renault","AlphaTauri"],
    'driver': drivers,
    'driver_confidence': driver_confidences,
    'constructor_relaiblity':constructor_reliabilities,
    'active_driver': [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
    'active_constructor': [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
    'dob': [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
}

# Create dataframe

new_data = pd.DataFrame(newdata)
new_data['istest'] = 1

# Print dataframe
print(new_data)



              GP_name  quali_pos   constructor            driver  \
0   Baku City Circuit          1       Ferrari   Charles Leclerc   
1   Baku City Circuit          2      Red Bull    Max Verstappen   
2   Baku City Circuit          3      Red Bull      Sergio Pérez   
3   Baku City Circuit          4       Ferrari  Carlos Sainz Jr.   
4   Baku City Circuit          5      Mercedes    Lewis Hamilton   
5   Baku City Circuit          6  Racing Point   Fernando Alonso   
6   Baku City Circuit          7       McLaren      Lando Norris   
7   Baku City Circuit          8    AlphaTauri      Yuki Tsunoda   
8   Baku City Circuit          9  Racing Point      Lance Stroll   
9   Baku City Circuit         10       McLaren     Oscar Piastri   
10  Baku City Circuit         11      Mercedes    George Russell   
11  Baku City Circuit         12       Renault      Esteban Ocon   
12  Baku City Circuit         13      Williams   Alexander Albon   
13  Baku City Circuit         14    Alfa Romeo  

In [30]:
cleaned_data = pd.concat([cleaned_data,new_data])

In [43]:
cleaned_data

Unnamed: 0,GP_name,quali_pos,constructor,driver,position,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest
90,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,16.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
91,Albert Park Grand Prix Circuit,15,Williams,Valtteri Bottas,10.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
92,Albert Park Grand Prix Circuit,0,Williams,Valtteri Bottas,6.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
93,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,11.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
94,Albert Park Grand Prix Circuit,20,Williams,Lance Stroll,19.0,0.923077,0.569961,1,1,1998-10-29 00:00:00,0
...,...,...,...,...,...,...,...,...,...,...,...
15,Baku City Circuit,16,Alfa Romeo,Zhou Guanyu,,0.800000,0.395276,1,1,1,1
16,Baku City Circuit,17,Haas F1 Team,Nico Hülkenberg,,0.800000,0.343023,1,1,1,1
17,Baku City Circuit,18,Haas F1 Team,Kevin Magnussen,,0.952381,0.343023,1,1,1,1
18,Baku City Circuit,19,Renault,Pierre Gasly,,0.938776,0.601852,1,1,1,1


In [44]:
x

Unnamed: 0,GP_name,quali_pos,constructor,driver,position,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest
90,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,16.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
91,Albert Park Grand Prix Circuit,15,Williams,Valtteri Bottas,10.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
92,Albert Park Grand Prix Circuit,0,Williams,Valtteri Bottas,6.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
93,Albert Park Grand Prix Circuit,16,Williams,Valtteri Bottas,11.0,0.965035,0.569961,1,1,1989-08-28 00:00:00,0
94,Albert Park Grand Prix Circuit,20,Williams,Lance Stroll,19.0,0.923077,0.569961,1,1,1998-10-29 00:00:00,0
...,...,...,...,...,...,...,...,...,...,...,...
15,Baku City Circuit,16,Alfa Romeo,Zhou Guanyu,,0.800000,0.395276,1,1,1,1
16,Baku City Circuit,17,Haas F1 Team,Nico Hülkenberg,,0.800000,0.343023,1,1,1,1
17,Baku City Circuit,18,Haas F1 Team,Kevin Magnussen,,0.952381,0.343023,1,1,1,1
18,Baku City Circuit,19,Renault,Pierre Gasly,,0.938776,0.601852,1,1,1,1


In [102]:
sc  = StandardScaler()
le = LabelEncoder()

x = cleaned_data.copy()
x['GP_name_encoded'] = le.fit_transform(x['GP_name'])
GP_name_nums = x['GP_name_encoded'].unique()
GP_name_names = x['GP_name'].unique()
GP_name_dict = dict(zip(GP_name_nums, GP_name_names))
x['constructor_encoded'] = le.fit_transform(x['constructor'])
constructor_nums = x['constructor_encoded'].unique()
construtctor_names = x['constructor'].unique()
constructor_dict = dict(zip(constructor_nums, construtctor_names))
#x['driver'] = le.fit_transform(x['driver'])
x['driver_encoded'] = le.fit_transform(x['driver'])
driver_nums = x['driver_encoded'].unique()
driver_names = x['driver'].unique()
driver_dict = dict(zip(driver_nums, driver_names))
x['GP_name'] = x['GP_name_encoded']
x['constructor'] = x['constructor_encoded']
x['driver'] = x['driver_encoded']
x = x.drop(['driver_encoded','constructor_encoded','GP_name_encoded'],1)
X = x.drop(['position','active_driver','active_constructor'],1)
inter = x.drop(['active_driver','active_constructor',],1)

X = X[X['istest']== 0]
y = x[x['istest'] == 0]['position'].apply(lambda x: position_index(x))
#y = x[x['istest'] == 0]['position']

In [103]:
X_test = inter[inter['istest'] == 1].drop('position',axis = 1)
y_test = inter[inter['istest'] == 1]['position'].apply(lambda x: position_index(x))

In [104]:
driver_dict

{18: 'Valtteri Bottas',
 8: 'Lance Stroll',
 6: 'George Russell',
 5: 'Fernando Alonso',
 10: 'Lewis Hamilton',
 17: 'Sergio Pérez',
 7: 'Kevin Magnussen',
 1: 'Carlos Sainz',
 9: 'Lando Norris',
 3: 'Charles Leclerc',
 12: 'Max Verstappen',
 16: 'Pierre Gasly',
 0: 'Alexander Albon',
 2: 'Carlos Sainz Jr.',
 19: 'Yuki Tsunoda',
 15: 'Oscar Piastri',
 4: 'Esteban Ocon',
 11: 'Logan Sargeant',
 20: 'Zhou Guanyu',
 13: 'Nico Hülkenberg',
 14: 'Nyck de Vries'}

In [105]:
constructor_dict

{9: 'Williams',
 4: 'McLaren',
 2: 'Ferrari',
 5: 'Mercedes',
 1: 'AlphaTauri',
 0: 'Alfa Romeo',
 7: 'Red Bull',
 3: 'Haas F1 Team',
 6: 'Racing Point',
 8: 'Renault'}

In [106]:
GP_name_dict

{0: 'Albert Park Grand Prix Circuit',
 23: 'Sepang International Circuit',
 24: 'Shanghai International Circuit',
 4: 'Bahrain International Circuit',
 9: 'Circuit de Barcelona-Catalunya',
 10: 'Circuit de Monaco',
 18: 'Istanbul Park',
 25: 'Silverstone Circuit',
 21: 'Nürburgring',
 16: 'Hungaroring',
 28: 'Valencia Street Circuit',
 12: 'Circuit de Spa-Francorchamps',
 1: 'Autodromo Nazionale di Monza',
 20: 'Marina Bay Street Circuit',
 27: 'Suzuka Circuit',
 3: 'Autódromo José Carlos Pace',
 29: 'Yas Marina Circuit',
 7: 'Circuit Gilles Villeneuve',
 11: 'Circuit de Nevers Magny-Cours',
 15: 'Hockenheimring',
 14: 'Fuji Speedway',
 17: 'Indianapolis Motor Speedway',
 19: 'Korean International Circuit',
 26: 'Sochi Autodrom',
 5: 'Baku City Circuit',
 22: 'Red Bull Ring',
 13: 'Circuit of the Americas',
 2: 'Autódromo Hermanos Rodríguez',
 8: 'Circuit Paul Ricard',
 6: 'Buddh International Circuit'}

In [107]:
X_test

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity,dob,istest
0,5,1,2,3,0.844444,0.824359,1,1
1,5,2,7,12,0.914286,0.750865,1,1
2,5,3,7,17,0.933333,0.750865,1,1
3,5,4,2,2,0.903846,0.824359,1,1
4,5,5,5,10,0.940711,0.877805,1,1
5,5,6,6,5,0.8,0.590234,1,1
6,5,7,4,9,0.916667,0.634492,1,1
7,5,8,1,19,0.8,0.45539,1,1
8,5,9,6,8,0.923077,0.590234,1,1
9,5,10,4,15,0.8,0.634492,1,1


In [34]:
X

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity,dob,istest
90,0,16,9,18,0.965035,0.569961,1989-08-28 00:00:00,0
91,0,15,9,18,0.965035,0.569961,1989-08-28 00:00:00,0
92,0,0,9,18,0.965035,0.569961,1989-08-28 00:00:00,0
93,0,16,9,18,0.965035,0.569961,1989-08-28 00:00:00,0
94,0,20,9,8,0.923077,0.569961,1998-10-29 00:00:00,0
...,...,...,...,...,...,...,...,...
8080,6,5,2,5,0.932660,0.824359,1981-07-29 00:00:00,0
8081,6,8,2,5,0.932660,0.824359,1981-07-29 00:00:00,0
8096,6,3,5,10,0.940711,0.877805,1985-01-07 00:00:00,0
8114,6,20,0,17,0.933333,0.395276,1990-01-26 00:00:00,0


In [34]:
y = x[x['istest'] == 0]['position'].apply(lambda x: position_index(x))

In [35]:
y

39      3
40      3
41      2
90      3
91      2
       ..
8122    1
8123    2
8124    2
8125    3
8126    3
Name: position, Length: 2149, dtype: int64

In [32]:
X.dtypes

GP_name                     int64
quali_pos                   int64
constructor                 int32
driver                      int32
driver_confidence         float64
constructor_relaiblity    float64
istest                      int64
dtype: object

In [108]:
X = X.drop(['istest','dob'], axis=1)

In [109]:
X

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity
90,0,16,9,18,0.965035,0.569961
91,0,15,9,18,0.965035,0.569961
92,0,0,9,18,0.965035,0.569961
93,0,16,9,18,0.965035,0.569961
94,0,20,9,8,0.923077,0.569961
...,...,...,...,...,...,...
8080,6,5,2,5,0.932660,0.824359
8081,6,8,2,5,0.932660,0.824359
8096,6,3,5,10,0.940711,0.877805
8114,6,20,0,17,0.933333,0.395276


In [111]:
X_test = X_test.drop('dob',axis = 1)

In [112]:
X_test

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity
0,5,1,2,3,0.844444,0.824359
1,5,2,7,12,0.914286,0.750865
2,5,3,7,17,0.933333,0.750865
3,5,4,2,2,0.903846,0.824359
4,5,5,5,10,0.940711,0.877805
5,5,6,6,5,0.8,0.590234
6,5,7,4,9,0.916667,0.634492
7,5,8,1,19,0.8,0.45539
8,5,9,6,8,0.923077,0.590234
9,5,10,4,15,0.8,0.634492


In [55]:
rf_rand = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator=rf_rand,param_distributions=random_parms,n_iter=10,cv=10,verbose=2,n_jobs=-1 )
rf_random.fit(X,y)
rf_random.best_params_

NameError: name 'random_parms' is not defined

In [113]:
rf = RandomForestClassifier(n_estimators=1800,min_samples_split=2,min_samples_leaf=4,max_features='sqrt',max_depth=90,bootstrap=False)
kf = StratifiedKFold(n_splits=10,random_state=None,shuffle=False)
# for train_index,test_index in kf.split(X,y):
#     X_train,X_test = X.iloc[train_index],X.iloc[test_index]
#     y_train,y_test = y.iloc[train_index],y.iloc[test_index]

In [45]:
# create a dictionary to map encoded values to original values
encoded_dict = dict(zip(le.inverse_transform(le.classes_), le.classes_))

# print the dictionary
print(encoded_dict)

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31}


In [46]:
le.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [39]:
X_test

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity,istest
7670,24,5,5,14,0.940711,0.877805,0
7671,24,1,5,14,0.940711,0.877805,0
7672,24,1,5,25,0.965035,0.877805,0
7673,24,1,5,25,0.965035,0.877805,0
7675,24,1,5,25,0.965035,0.877805,0
...,...,...,...,...,...,...,...
8122,8,1,7,23,0.934156,0.750865,0
8123,8,7,8,11,0.933798,0.601852,0
8124,8,6,8,11,0.933798,0.601852,0
8125,8,11,8,22,0.852071,0.601852,0


In [40]:
data

Unnamed: 0,year,date,quali_pos,statusId,position,dob,driver_nationality,constructor,constructor_nationality,GP_name,...,driver,age_at_gp_in_days,driver_home,constructor_home,driver_dnf,constructor_dnf,driver_confidence,constructor_relaiblity,active_driver,active_constructor
0,2009,2009-03-29,1,1,1,1980-01-19,Bri,Brawn,Bri,Albert Park Grand Prix Circuit,...,Jenson Button,10662,0,0,0,0,0.923954,0.970588,0,0
1,2009,2009-03-29,2,1,2,1972-05-23,Bra,Brawn,Bri,Albert Park Grand Prix Circuit,...,Rubens Barrichello,13459,0,0,0,0,0.907080,0.970588,0,0
2,2008,2008-03-16,12,4,13,1980-01-19,Bri,Honda,Jap,Albert Park Grand Prix Circuit,...,Jenson Button,10284,0,0,1,0,0.923954,0.443396,0,0
3,2007,2007-03-18,14,11,14,1980-01-19,Bri,Honda,Jap,Albert Park Grand Prix Circuit,...,Jenson Button,9920,0,0,0,1,0.923954,0.443396,0,0
4,2006,2006-04-02,1,11,1,1980-01-19,Bri,Honda,Jap,Albert Park Grand Prix Circuit,...,Jenson Button,9570,0,0,0,1,0.923954,0.443396,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8398,1994,1994-10-16,25,5,25,1965-09-05,Aus,Simtek,Bri,Circuito de Jerez,...,David Brabham,10633,0,0,0,1,0.800000,0.135135,0,0
8399,1994,1994-10-16,21,12,21,1966-10-23,Ita,Team Lotus,Bri,Circuito de Jerez,...,Alessandro Zanardi,10220,0,0,0,1,0.692308,0.200000,0,0
8400,1994,1994-10-16,22,13,22,1964-08-24,Fre,Team Lotus,Bri,Circuito de Jerez,...,Éric Bernard,11010,0,0,0,1,0.923077,0.200000,0,0
8401,1994,1994-10-16,23,91,23,1963-09-28,Fre,Larrousse,Fre,Circuito de Jerez,...,Érik Comas,11341,0,0,0,1,1.000000,0.133333,0,0


In [7]:
data.dtypes

season                    int64
round                     int64
weather_warm               bool
weather_cold               bool
weather_dry                bool
                          ...  
constructor_team_lotus    int64
constructor_toro_rosso    int64
constructor_toyota        int64
constructor_tyrrell       int64
constructor_williams      int64
Length: 100, dtype: object

In [117]:
data.columns

Index(['season', 'round', 'weather_warm', 'weather_cold', 'weather_dry',
       'weather_wet', 'weather_cloudy', 'driver', 'grid', 'podium',
       'driver_points', 'driver_wins', 'driver_standings_pos',
       'constructor_points', 'constructor_wins', 'constructor_standings_pos',
       'qualifying_time', 'driver_age', 'circuit_id_adelaide',
       'circuit_id_albert_park', 'circuit_id_americas', 'circuit_id_bahrain',
       'circuit_id_brands_hatch', 'circuit_id_catalunya', 'circuit_id_detroit',
       'circuit_id_estoril', 'circuit_id_galvez', 'circuit_id_hockenheimring',
       'circuit_id_hungaroring', 'circuit_id_imola', 'circuit_id_indianapolis',
       'circuit_id_interlagos', 'circuit_id_istanbul',
       'circuit_id_jacarepagua', 'circuit_id_jerez', 'circuit_id_kyalami',
       'circuit_id_magny_cours', 'circuit_id_marina_bay', 'circuit_id_monaco',
       'circuit_id_monza', 'circuit_id_nurburgring', 'circuit_id_phoenix',
       'circuit_id_red_bull_ring', 'circuit_id_ricard'

In [102]:
data = pd.read_csv('final_df.csv')

X = data.copy()

sc  = StandardScaler()
le = LabelEncoder()
X['driver'] = le.fit_transform(X['driver'])

X_train = X[(X['season'] < 2023) ].drop('podium',axis = 1)
y_train = X[X['season'] < 2023]['podium']

In [103]:
X_2023 = X[(X['season'] == 2023) & (X['round'] < 4)].drop('podium',axis = 1)
y_2023 = X[(X['season'] == 2023) & (X['round'] < 4)]['podium']

X_train = pd.concat([X_train,X_2023])
y_train = pd.concat([y_train,y_2023])

In [104]:
X_train

Unnamed: 0,season,round,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,grid,driver_points,...,constructor_minardi,constructor_prost,constructor_red_bull,constructor_renault,constructor_sauber,constructor_team_lotus,constructor_toro_rosso,constructor_toyota,constructor_tyrrell,constructor_williams
0,1983,1,False,False,True,False,False,109,1,0,...,0,0,0,0,0,0,0,0,0,1
1,1983,1,False,False,True,False,False,176,2,0,...,0,0,0,1,0,0,0,0,0,0
2,1983,1,False,False,True,False,False,212,3,0,...,0,0,0,0,0,0,0,0,0,0
3,1983,1,False,False,True,False,False,171,4,0,...,0,0,0,0,0,0,0,0,0,0
4,1983,1,False,False,True,False,False,223,5,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,2023,3,True,False,False,False,False,110,14,1,...,0,0,0,0,0,0,0,0,0,0
14996,2023,3,True,False,False,False,False,58,15,0,...,0,0,0,0,0,0,0,0,0,0
14997,2023,3,True,False,False,False,False,168,16,0,...,0,0,0,0,0,0,0,0,0,0
14998,2023,3,True,False,False,False,False,233,17,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
y_train

0        15
1         6
2         4
3         1
4         7
         ..
14995    17
14996    15
14997     8
14998     9
14999    16
Name: podium, Length: 15000, dtype: int64

In [None]:
X_test['actualname'] = X_test[s]

In [106]:
X_test = X[(X['season'] == 2023) & (X['round'] == 4) & X['dr']].drop('podium',axis = 1)
y_test = X[(X['season'] == 2023) & (X['round'] == 4)]['podium']

In [91]:
X_test

Unnamed: 0,season,round,weather_warm,weather_cold,weather_dry,weather_wet,weather_cloudy,driver,grid,driver_points,...,constructor_minardi,constructor_prost,constructor_red_bull,constructor_renault,constructor_sauber,constructor_team_lotus,constructor_toro_rosso,constructor_toyota,constructor_tyrrell,constructor_williams
15000,2023,4,True,False,False,False,True,126,1,0,...,0,0,0,0,0,0,0,0,0,0
15001,2023,4,True,False,False,False,True,139,2,44,...,0,0,1,0,0,0,0,0,0,0
15002,2023,4,True,False,False,False,True,166,3,18,...,0,0,1,0,0,0,0,0,0,0
15003,2023,4,True,False,False,False,True,189,4,20,...,0,0,0,0,0,0,0,0,0,0
15004,2023,4,True,False,False,False,True,93,5,20,...,0,0,0,0,0,0,0,0,0,0
15005,2023,4,True,False,False,False,True,8,6,30,...,0,0,0,0,0,0,0,0,0,0
15006,2023,4,True,False,False,False,True,159,7,0,...,0,0,0,0,0,0,0,0,0,0
15007,2023,4,True,False,False,False,True,216,8,0,...,0,0,0,0,0,0,0,0,0,0
15008,2023,4,True,False,False,False,True,206,9,8,...,0,0,0,0,0,0,0,0,0,0
15009,2023,4,True,False,False,False,True,168,10,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
y_test

15000     3
15001     2
15002     1
15003     5
15004     6
15005     4
15006     9
15007    10
15008     7
15009    11
15010     8
15011    12
15012    18
15013    16
15014    19
15015    13
15016    14
15017    20
Name: podium, dtype: int64

In [114]:
rf.fit(X,y)
y_pred_rf = rf.predict(X_test)
cnf_mat_rf = confusion_matrix(y_test,y_pred_rf)
cnf_mat_rf = cnf_mat_rf/cnf_mat_rf.sum()

In [115]:
y_pred_rf

array([1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
      dtype=int64)

In [121]:
import pickle

filename = 'rffinal.pkl'
with open(filename, 'wb') as file:
    pickle.dump(rf, file)

In [52]:
y_pred_new = rf.predict(X_test_new)

NameError: name 'X_test_new' is not defined

In [94]:
print(y_pred_rf)

[ 4  1  1  6  3  3  6 10  5  6  5 19 14 16 14 17 13 15]


In [117]:
X_later = X_test.copy()

X_later['pred_pos'] = y_pred_rf
X_later['actual'] = y_test

In [118]:
X_later = X_later.sort_values(by = 'pred_pos')

In [63]:
X_later.shape

(20, 9)

In [98]:
X_later['pred_pos'] = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]

In [99]:
X_later['driver'] = le.inverse_transform(X_later['driver'])

In [100]:
X_later ## jeddah 2022

Unnamed: 0,season,round,driver,grid,pred_pos,actual
15001,2023,4,max_verstappen,2,1,2
15002,2023,4,perez,3,2,1
15004,2023,4,hamilton,5,3,6
15005,2023,4,alonso,6,4,4
15000,2023,4,leclerc,1,5,3
15010,2023,4,russell,11,6,8
15008,2023,4,stroll,9,7,7
15009,2023,4,piastri,10,8,11
15003,2023,4,sainz,4,9,5
15006,2023,4,norris,7,10,9


In [120]:
cleaned_data.to_csv('cleaned_data.csv')

In [119]:
X_later

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity,pred_pos,actual
0,5,1,2,3,0.844444,0.824359,1,2
1,5,2,7,12,0.914286,0.750865,1,2
2,5,3,7,17,0.933333,0.750865,1,2
8,5,9,6,8,0.923077,0.590234,2,2
7,5,8,1,19,0.8,0.45539,2,2
6,5,7,4,9,0.916667,0.634492,2,2
9,5,10,4,15,0.8,0.634492,2,2
4,5,5,5,10,0.940711,0.877805,2,2
3,5,4,2,2,0.903846,0.824359,2,2
5,5,6,6,5,0.8,0.590234,2,2


In [226]:
new_data_tested = new_data.copy()
new_data_tested['pos'] = y_pred_new

In [227]:
new_data_tested = new_data_tested.sort_values(by='pos', ascending=True)
new_data_tested['pos'] = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]


In [228]:
new_data_tested

Unnamed: 0,GP_name,quali_pos,constructor,driver,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest,pos
0,Baku City Circuit,1,Ferrari,Charles Leclerc,0.844444,0.824359,1,1,1,1,1
1,Baku City Circuit,2,Red Bull,Max Verstappen,0.914286,0.750865,1,1,1,1,2
2,Baku City Circuit,3,Red Bull,Sergio Pérez,0.933333,0.750865,1,1,1,1,3
3,Baku City Circuit,4,Ferrari,Carlos Sainz Jr.,0.903846,0.824359,1,1,1,1,4
4,Baku City Circuit,5,Mercedes,Lewis Hamilton,0.940711,0.877805,1,1,1,1,5
5,Baku City Circuit,6,Racing Point,Fernando Alonso,0.8,0.590234,1,1,1,1,6
6,Baku City Circuit,7,McLaren,Lando Norris,0.916667,0.634492,1,1,1,1,7
7,Baku City Circuit,8,AlphaTauri,Yuki Tsunoda,0.8,0.45539,1,1,1,1,8
8,Baku City Circuit,9,Racing Point,Lance Stroll,0.923077,0.590234,1,1,1,1,9
10,Baku City Circuit,11,Mercedes,George Russell,0.958333,0.877805,1,1,1,1,10


In [38]:
X.dtypes

GP_name                            int64
quali_pos                          int64
constructor                        int32
driver                             int32
driver_confidence                float64
constructor_relaiblity           float64
dob                       datetime64[ns]
dtype: object

In [79]:
X_d.dtypes

GP_name                int64
quali_pos              int64
driver                 int64
age_at_gp_in_days    float64
driver_confidence    float64
dtype: object

In [48]:
X.drop(["country", "constructor_nationality"], axis=1, inplace=True)


In [39]:
X = X.drop([ 'dob'], axis=1)


In [50]:
X = X.drop(['driver_nationality'],axis = 1)

In [51]:
X['age_at_gp_in_days'] = X['age_at_gp_in_days'].apply(lambda x: float(x.split()[0]))


In [67]:
svc = SVC()
svc.fit(X,y)
y_pred = svc.predict(X_test)
y_pred_svc = svc.predict(X_test)
cnf_mat_svc = confusion_matrix(y_test,y_pred)
cnf_mat_svc = cnf_mat_svc/cnf_mat_svc.sum()

In [69]:
X_later = X_test.copy()

X_later['pred_pos'] = y_pred_svc
X_later['actual'] = y_test

X_later = X_later.sort_values(by = 'pred_pos')

In [111]:
X_later.shape

(18, 6)

In [114]:
X_later['driver'] = le.inverse_transform(X_later['driver'])

In [112]:
X_later['pred_pos'] = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]

In [70]:
X_later

Unnamed: 0,GP_name,quali_pos,constructor,driver_confidence,constructor_relaiblity,istest,driver_encoded,pred_pos,actual
0,5,1,2,0.844444,0.824359,1,3,1,2
1,5,2,7,0.914286,0.750865,1,12,1,2
2,5,3,7,0.933333,0.750865,1,17,1,2
8,5,9,6,0.923077,0.590234,1,8,2,2
7,5,8,1,0.8,0.45539,1,19,2,2
6,5,7,4,0.916667,0.634492,1,9,2,2
9,5,10,4,0.8,0.634492,1,15,2,2
4,5,5,5,0.940711,0.877805,1,10,2,2
3,5,4,2,0.903846,0.824359,1,2,2,2
5,5,6,6,0.8,0.590234,1,5,2,2


In [109]:
print(y_pred_svc)

[13  7  7 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13]


In [51]:
rf_pression = precision_score(y_test, y_pred_rf, average='macro')
rf_f1 = f1_score(y_test,y_pred_rf,average='macro')
rf_recall = recall_score(y_test,y_pred_rf,average='macro')
svc_pression = precision_score(y_test, y_pred_svc, average='micro')
svc_f1 = f1_score(y_test,y_pred_svc,average='macro')
svc_recall = recall_score(y_test,y_pred_svc,average='macro')
metrics_dict ={
    'RandomForestClassifier':{'precision_score':rf_pression,'f1_score':rf_f1,'recall_score':rf_recall},
    'SVC':{'precision_score':svc_pression,'f1_score':svc_f1,'recall_score':svc_recall}
}
metrics_df = pd.DataFrame(metrics_dict)
metrics_df

Unnamed: 0,RandomForestClassifier,SVC
precision_score,0.938202,0.901869
f1_score,0.936028,0.901417
recall_score,0.934206,0.899036
