In [3]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
qual_df = pd.read_csv('Data/F1_2025_SprintQualifyingResults.csv')
qual_df.head()

Unnamed: 0,Track,Position,No,Driver,Team,Q1,Q2,Q3,Laps
0,China,1,44,Lewis Hamilton,Ferrari,1:31.212,1:31.484,1:30.849,15
1,China,2,1,Max Verstappen,Red Bull Racing Honda RBPT,1:31.916,1:31.521,1:30.867,12
2,China,3,81,Oscar Piastri,McLaren Mercedes,1:31.723,1:31.362,1:30.929,13
3,China,4,16,Charles Leclerc,Ferrari,1:31.518,1:31.561,1:31.057,15
4,China,5,63,George Russell,Mercedes,1:31.952,1:31.346,1:31.169,18


In [5]:
qual_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Track     40 non-null     object
 1   Position  40 non-null     int64 
 2   No        40 non-null     int64 
 3   Driver    40 non-null     object
 4   Team      40 non-null     object
 5   Q1        40 non-null     object
 6   Q2        30 non-null     object
 7   Q3        20 non-null     object
 8   Laps      40 non-null     int64 
dtypes: int64(3), object(6)
memory usage: 2.9+ KB


In [6]:
qual_df.loc[qual_df['Position']=='NC',"Position"] = 0

In [7]:
qual_df['Position'] = pd.to_numeric(qual_df["Position"])

In [8]:
times_cols = ['Q1','Q2','Q3']
for cols in times_cols:
    qual_df[cols] = pd.to_timedelta('00:'+qual_df[cols],errors='coerce')
    qual_df[cols] = pd.to_numeric(qual_df[cols],errors='coerce')

In [10]:
qual_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Track     40 non-null     object
 1   Position  40 non-null     int64 
 2   No        40 non-null     int64 
 3   Driver    40 non-null     object
 4   Team      40 non-null     object
 5   Q1        40 non-null     int64 
 6   Q2        40 non-null     int64 
 7   Q3        40 non-null     int64 
 8   Laps      40 non-null     int64 
dtypes: int64(6), object(3)
memory usage: 2.9+ KB


In [11]:
race_df = pd.read_csv('Data/F1_2025_SprintResults.csv')
race_df.head()

Unnamed: 0,Track,Position,No,Driver,Team,Starting Grid,Laps,Time/Retired,Points
0,China,1,44,Lewis Hamilton,Ferrari,1,19,30:39.965,8
1,China,2,81,Oscar Piastri,McLaren Mercedes,3,19,+6.889,7
2,China,3,1,Max Verstappen,Red Bull Racing Honda RBPT,2,19,+9.804,6
3,China,4,63,George Russell,Mercedes,5,19,+11.592,5
4,China,5,16,Charles Leclerc,Ferrari,4,19,+12.190,4


In [12]:
race_df['Fastest Lap Time'] = pd.to_timedelta("00:"+race_df['Fastest Lap Time'],errors='coerce')

KeyError: 'Fastest Lap Time'

In [13]:
race_df.loc[race_df['Position']=='DQ',"Position"] = '21'
race_df.loc[race_df['Position']=='NC',"Position"] = '21'
race_df.Position.unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21'], dtype=object)

In [14]:
li = ['Position','Points']
for cols in li:
    race_df[cols] = pd.to_numeric(race_df[cols])

In [15]:
def comparing(race_pos,qual_pos):
    if qual_pos > race_pos:
        return 'Gain'
    elif qual_pos == race_pos:
        return 'Same'
    elif qual_pos < race_pos:
        return 'Loss'
    

race_df['Result'] = race_df.apply(lambda x: comparing(x['Position'],x['Starting Grid']),axis=1)

In [16]:
# new_df = race_df.merge(qual_df[['Q1','Q2','Q3','Laps','No','Track']],on=['No','Track'])
new_df = qual_df.merge(race_df[['No','Track']],on=['No','Track'])

In [17]:
new_df.head()

Unnamed: 0,Track,Position,No,Driver,Team,Q1,Q2,Q3,Laps
0,China,1,44,Lewis Hamilton,Ferrari,91212000000,91484000000,90849000000,15
1,China,2,1,Max Verstappen,Red Bull Racing Honda RBPT,91916000000,91521000000,90867000000,12
2,China,3,81,Oscar Piastri,McLaren Mercedes,91723000000,91362000000,90929000000,13
3,China,4,16,Charles Leclerc,Ferrari,91518000000,91561000000,91057000000,15
4,China,5,63,George Russell,Mercedes,91952000000,91346000000,91169000000,18


In [18]:
X_data = new_df
y_data = race_df['Result']

In [21]:
string_cols = ['Track','Driver','Team']

In [22]:
encoder = OrdinalEncoder()
Encoded_X = pd.DataFrame(encoder.fit_transform(new_df[string_cols]))


In [23]:
Encoded_X.columns = string_cols
X_data = X_data.drop(columns=string_cols)
X_data = pd.merge(X_data,Encoded_X,on=Encoded_X.index)

In [24]:
X_data = X_data.drop(columns=['key_0','Driver'])
X_data = X_data.fillna(0)
X_data.head()

Unnamed: 0,Position,No,Q1,Q2,Q3,Laps,Track,Team
0,1,44,91212000000,91484000000,90849000000,15,0.0,2.0
1,2,1,91916000000,91521000000,90867000000,12,0.0,8.0
2,3,81,91723000000,91362000000,90929000000,13,0.0,5.0
3,4,16,91518000000,91561000000,91057000000,15,0.0,2.0
4,5,63,91952000000,91346000000,91169000000,18,0.0,6.0


In [42]:
model = SGDClassifier()
model.fit(X_data,y_data)

In [48]:
preds = model.predict(X_data)

In [49]:
model.score(X_data,y_data)

0.625