In [1]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
qual_df = pd.read_csv('Data/F1_2025_QualifyingResults.csv')
qual_df.head()

Unnamed: 0,Track,Position,No,Driver,Team,Q1,Q2,Q3,Laps
0,Australia,1,4,Lando Norris,McLaren Mercedes,1:15.912,1:15.415,1:15.096,20
1,Australia,2,81,Oscar Piastri,McLaren Mercedes,1:16.062,1:15.468,1:15.180,18
2,Australia,3,1,Max Verstappen,Red Bull Racing Honda RBPT,1:16.018,1:15.565,1:15.481,17
3,Australia,4,63,George Russell,Mercedes,1:15.971,1:15.798,1:15.546,21
4,Australia,5,22,Yuki Tsunoda,Racing Bulls Honda RBPT,1:16.225,1:16.009,1:15.670,18


In [43]:
qual_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Track     180 non-null    object
 1   Position  180 non-null    object
 2   No        180 non-null    int64 
 3   Driver    180 non-null    object
 4   Team      180 non-null    object
 5   Q1        180 non-null    object
 6   Q2        132 non-null    object
 7   Q3        90 non-null     object
 8   Laps      180 non-null    int64 
dtypes: int64(2), object(7)
memory usage: 12.8+ KB


In [44]:
qual_df.loc[qual_df['Position']=='NC',"Position"] = 0

In [45]:
qual_df['Position'] = pd.to_numeric(qual_df["Position"])

In [46]:
times_cols = ['Q1','Q2','Q3']
for cols in times_cols:
    qual_df[cols] = pd.to_timedelta('00:'+qual_df[cols],errors='coerce')

In [47]:
times_cols = ['Q1','Q2','Q3']
for cols in times_cols:
    qual_df[cols] = pd.to_numeric(qual_df[cols],errors='coerce')

In [48]:
qual_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Track     180 non-null    object
 1   Position  180 non-null    int64 
 2   No        180 non-null    int64 
 3   Driver    180 non-null    object
 4   Team      180 non-null    object
 5   Q1        180 non-null    int64 
 6   Q2        180 non-null    int64 
 7   Q3        180 non-null    int64 
 8   Laps      180 non-null    int64 
dtypes: int64(6), object(3)
memory usage: 12.8+ KB


In [49]:
# qual_df.set_index(['Track','No'])

In [50]:
# qual_df.fillna("DNF")

In [51]:
# qual_df.groupby('Driver')['Position'].value_counts()

In [52]:
race_df = pd.read_csv('Data/F1_2025_RaceResults.csv')
race_df.head()

Unnamed: 0,Track,Position,No,Driver,Team,Starting Grid,Laps,Time/Retired,Points,Set Fastest Lap,Fastest Lap Time
0,Australia,1,4,Lando Norris,McLaren Mercedes,1,57,1:42:06.304,25,Yes,1:22.167
1,Australia,2,1,Max Verstappen,Red Bull Racing Honda RBPT,3,57,+0.895,18,No,1:23.081
2,Australia,3,63,George Russell,Mercedes,4,57,+8.481,15,No,1:25.065
3,Australia,4,12,Kimi Antonelli,Mercedes,16,57,+10.135,12,No,1:24.901
4,Australia,5,23,Alexander Albon,Williams Mercedes,6,57,+12.773,10,No,1:24.597


In [53]:
# race_df.groupby('Driver').Points.sum().sort_values(ascending=False)

In [54]:
race_df['Fastest Lap Time'] = pd.to_timedelta("00:"+race_df['Fastest Lap Time'],errors='coerce')

In [55]:
race_df.loc[race_df['Position']=='DQ',"Position"] = '21'
race_df.loc[race_df['Position']=='NC',"Position"] = '21'

In [56]:
race_df.Position.unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '21', '15', '16', '17', '18', '19', '20'], dtype=object)

In [57]:
li = ['Position','Points']
for cols in li:
    race_df[cols] = pd.to_numeric(race_df[cols])

In [58]:
def comparing(race_pos,qual_pos):
    if qual_pos > race_pos:
        return 'Gain'
    elif qual_pos == race_pos:
        return 'Same'
    elif qual_pos < race_pos:
        return 'Loss'
    

race_df['Result'] = race_df.apply(lambda x: comparing(x['Position'],x['Starting Grid']),axis=1)

In [59]:
# new_df = race_df.merge(qual_df[['Q1','Q2','Q3','Laps','No','Track']],on=['No','Track'])
new_df = qual_df.merge(race_df[['No','Track']],on=['No','Track'])

In [60]:
new_df.head()

Unnamed: 0,Track,Position,No,Driver,Team,Q1,Q2,Q3,Laps
0,Australia,1,4,Lando Norris,McLaren Mercedes,75912000000,75415000000,75096000000,20
1,Australia,2,81,Oscar Piastri,McLaren Mercedes,76062000000,75468000000,75180000000,18
2,Australia,3,1,Max Verstappen,Red Bull Racing Honda RBPT,76018000000,75565000000,75481000000,17
3,Australia,4,63,George Russell,Mercedes,75971000000,75798000000,75546000000,21
4,Australia,5,22,Yuki Tsunoda,Racing Bulls Honda RBPT,76225000000,76009000000,75670000000,18


In [61]:
X_data = new_df
y_data = race_df['Result']

In [62]:
string_cols = ['Track','Driver','Team']

In [None]:
encoder = OrdinalEncoder()
Encoded_X = pd.DataFrame(encoder.fit_transform(new_df[string_cols]))


In [64]:
Encoded_X.columns = string_cols

In [65]:
X_data = X_data.drop(columns=string_cols)

In [66]:
X_data = pd.merge(X_data,Encoded_X,on=Encoded_X.index)

In [67]:
X_data.drop(columns=['key_0','Driver'])

Unnamed: 0,Position,No,Q1,Q2,Q3,Laps,Track,Team
0,1,4,75912000000,75415000000,75096000000,20,0.0,5.0
1,2,81,76062000000,75468000000,75180000000,18,0.0,5.0
2,3,1,76018000000,75565000000,75481000000,17,0.0,10.0
3,4,63,75971000000,75798000000,75546000000,21,0.0,6.0
4,5,22,76225000000,76009000000,75670000000,18,0.0,7.0
...,...,...,...,...,...,...,...,...
174,16,27,73190000000,-9223372036854775808,-9223372036854775808,6,8.0,4.0
175,17,31,73201000000,-9223372036854775808,-9223372036854775808,9,8.0,3.0
176,18,55,73203000000,-9223372036854775808,-9223372036854775808,6,8.0,12.0
177,19,43,73334000000,-9223372036854775808,-9223372036854775808,7,8.0,0.0


In [68]:
X_data.fillna(0)

Unnamed: 0,key_0,Position,No,Q1,Q2,Q3,Laps,Track,Driver,Team
0,0,1,4,75912000000,75415000000,75096000000,20,0.0,13.0,5.0
1,1,2,81,76062000000,75468000000,75180000000,18,0.0,19.0,5.0
2,2,3,1,76018000000,75565000000,75481000000,17,0.0,16.0,10.0
3,3,4,63,75971000000,75798000000,75546000000,21,0.0,8.0,6.0
4,4,5,22,76225000000,76009000000,75670000000,18,0.0,21.0,7.0
...,...,...,...,...,...,...,...,...,...,...
174,174,16,27,73190000000,-9223372036854775808,-9223372036854775808,6,8.0,17.0,4.0
175,175,17,31,73201000000,-9223372036854775808,-9223372036854775808,9,8.0,3.0,3.0
176,176,18,55,73203000000,-9223372036854775808,-9223372036854775808,6,8.0,1.0,12.0
177,177,19,43,73334000000,-9223372036854775808,-9223372036854775808,7,8.0,6.0,0.0


In [69]:
model = SGDClassifier()
model.fit(X_data,y_data)

In [70]:
model.predict(X_data.head(1))

array(['Gain'], dtype='<U4')

In [73]:
model.score(X_data,y_data)

0.3240223463687151