In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Assumptions: only the drivers who raced in 2024 are considered, and they are assumed to stay with the same teams for the 2025 season.
Each season consists of many races, so we aggregate the points earned in each race to obtain the season totals, assuming each driver's performance in 2025 is similar to 2024.

In [2]:
# Load the raw tables used throughout the notebook.
results = pd.read_csv("final_dataset/results.csv")
races = pd.read_csv("final_dataset/races.csv")
constructor = pd.read_csv("final_dataset/constructors.csv")
driver_standings = pd.read_csv("final_dataset/driver_standings.csv")

# Columns contributed by each lookup table.
race_info = races[['raceId', 'year', 'name']]
constructor_info = constructor[['constructorId', 'name']]
standing_wins = driver_standings[['raceId', 'driverId', 'wins']]

# Enrich every result row with season/race name, constructor name and the
# `wins` column of the standings table. Each generic `name` column is renamed
# immediately after its merge so the two never collide.
results = (
    results
    .merge(race_info, on=['raceId'], how='left')
    .rename(columns={'name': 'race_name'})
    .merge(constructor_info, on=['constructorId'], how='left')
    .rename(columns={'name': 'constructor_name'})
    # Inner join: result rows without a matching standings row are dropped.
    .merge(standing_wins, on=['raceId', 'driverId'], how='inner')
)

results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,positionText,positionOrder,points,laps,milliseconds,fastestLap,rank,fastestLapSpeed,status,fastestlaptime_ms,year,race_name,constructor_name,wins
0,1,18,1,1,22,1,1,1,10.0,58,5690616,39,2,218.3,Finished,87452,2008,Australian Grand Prix,mclaren,1
1,2,18,2,2,3,5,2,2,8.0,58,5696094,41,3,217.586,Finished,87739,2008,Australian Grand Prix,bmw sauber,0
2,3,18,3,3,7,7,3,3,6.0,58,5698779,41,5,216.719,Finished,88090,2008,Australian Grand Prix,williams,0
3,4,18,4,4,5,11,4,4,5.0,58,5707797,58,7,215.464,Finished,88603,2008,Australian Grand Prix,renault,0
4,5,18,5,1,23,3,5,5,4.0,58,5708630,43,1,218.385,Finished,87418,2008,Australian Grand Prix,mclaren,0


In [3]:
# Rows of the 2024 season. `.copy()` makes this an independent frame, so later
# cells can assign to its columns without pandas' SettingWithCopyWarning.
data_2024 = results[results['year'] == 2024].copy()
# Race names of that season, in order of first appearance.
races_season = list(data_2024['race_name'].unique())
races_season

['Bahrain Grand Prix',
 'Saudi Arabian Grand Prix',
 'Australian Grand Prix',
 'Japanese Grand Prix',
 'Chinese Grand Prix',
 'Miami Grand Prix',
 'Emilia Romagna Grand Prix',
 'Monaco Grand Prix',
 'Canadian Grand Prix',
 'Spanish Grand Prix',
 'Austrian Grand Prix',
 'British Grand Prix',
 'Hungarian Grand Prix',
 'Belgian Grand Prix',
 'Dutch Grand Prix',
 'Italian Grand Prix',
 'Azerbaijan Grand Prix',
 'Singapore Grand Prix',
 'United States Grand Prix',
 'Mexico City Grand Prix',
 'São Paulo Grand Prix',
 'Las Vegas Grand Prix',
 'Qatar Grand Prix',
 'Abu Dhabi Grand Prix']

In [4]:
# Modelling table: two identifiers, per-race features and the target (`points`).
# `.copy()` yields an independent frame so the cleaning cells that follow can
# assign to its columns without SettingWithCopyWarning.
# NOTE(review): `positionOrder`, `milliseconds` and `fastestLapSpeed` come from
# the same result row as the target `points`, so they are presumably only known
# after the race; the model may mostly learn a finishing-position -> points
# mapping. TODO confirm this is intended for a forecasting use case.
train_data = results[['driverId','race_name','wins','grid','milliseconds','fastestLapSpeed','positionOrder','points']].copy()
train_data.head()

Unnamed: 0,driverId,race_name,wins,grid,milliseconds,fastestLapSpeed,positionOrder,points
0,1,Australian Grand Prix,1,1,5690616,218.3,1,10.0
1,2,Australian Grand Prix,0,5,5696094,217.586,2,8.0
2,3,Australian Grand Prix,0,7,5698779,216.719,3,6.0
3,4,Australian Grand Prix,0,11,5707797,215.464,4,5.0
4,5,Australian Grand Prix,0,3,5708630,218.385,5,4.0


In [5]:
from sklearn.preprocessing import OneHotEncoder

# Convert the two text-encoded numeric columns to real floats.
# Whole-column replacement through `assign` (rather than `.loc[:, col] = ...`)
# guarantees the column dtype actually becomes float and never writes into a
# slice of `results`.
train_data = train_data.assign(
    # '\N' placeholders become the sentinel -1.
    milliseconds=train_data['milliseconds'].replace('\\N', -1).astype(float),
    # '\N' placeholders become NaN so they can be imputed below.
    fastestLapSpeed=train_data['fastestLapSpeed'].replace('\\N', np.nan).astype(float),
)

# Impute missing speeds: per-driver mean first, overall mean as a fallback for
# drivers whose speeds are all missing.
driver_avg_speed = train_data.groupby('driverId')['fastestLapSpeed'].transform(lambda x: x.fillna(x.mean()))
overall_avg_speed = train_data['fastestLapSpeed'].mean()
train_data = train_data.assign(fastestLapSpeed=driver_avg_speed.fillna(overall_avg_speed))


In [6]:
# One-hot encode the two categorical identifiers.
# `astype` returns a new frame with string columns; writing strings into the
# existing columns via `.loc[:, cols] = ...` relies on implicit dtype
# upcasting, which recent pandas versions deprecate.
train_data = train_data.astype({'driverId': str, 'race_name': str})
# handle_unknown='ignore': categories not seen during fit are encoded as
# all-zero columns at transform time instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)
encoded_cols = encoder.fit_transform(train_data[['driverId', 'race_name']])
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out())


In [7]:
# Swap the two raw categorical columns for their one-hot counterparts.
numeric_part = train_data.drop(columns=['driverId', 'race_name'])
# Give the encoded block the same row labels so the column-wise concat aligns.
encoded_df.index = train_data.index
train_data_encoded = pd.concat([numeric_part, encoded_df], axis=1)
train_data_encoded.head()

Unnamed: 0,wins,grid,milliseconds,fastestLapSpeed,positionOrder,points,driverId_1,driverId_10,driverId_100,driverId_101,...,race_name_South African Grand Prix,race_name_Spanish Grand Prix,race_name_Styrian Grand Prix,race_name_Swedish Grand Prix,race_name_Swiss Grand Prix,race_name_São Paulo Grand Prix,race_name_Turkish Grand Prix,race_name_Tuscan Grand Prix,race_name_United States Grand Prix,race_name_United States Grand Prix West
0,1,1,5690616.0,218.3,1,10.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,5,5696094.0,217.586,2,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,7,5698779.0,216.719,3,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,11,5707797.0,215.464,4,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,3,5708630.0,218.385,5,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Move the target to the last column under the name `point`; the following
# cell splits features and target by column position.
# Guarded so that re-running this cell is a no-op instead of a KeyError on the
# already-removed `points` column.
if 'points' in train_data_encoded.columns:
    train_data_encoded = (
        train_data_encoded
        .assign(point=train_data_encoded['points'])
        .drop(columns=['points'])
    )

In [9]:
from sklearn.model_selection import train_test_split

# Positional split: every column except the last is a feature, the last column
# is the target.
feature_frame = train_data_encoded.iloc[:, :-1]
target_series = train_data_encoded.iloc[:, -1]
X = feature_frame.values
Y = target_series.values

# 80/20 hold-out split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [10]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor: 70 trees, fixed seed.
rf_params = {"n_estimators": 70, "random_state": 42}
RF = RandomForestRegressor(**rf_params)
RF.fit(X_train, Y_train)

In [11]:
from sklearn.metrics import mean_absolute_error

# Score the fitted forest on the held-out rows using mean absolute error.
y_pred = RF.predict(X_test)
mae = mean_absolute_error(y_true=Y_test, y_pred=y_pred)
print(f"Model MAE: {mae:.2f}")

Model MAE: 0.26


Driver and constructor points for 2025 season prediction

In [12]:
# Preview the first five 2024 result rows.
data_2024.iloc[:5]

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,positionText,positionOrder,points,laps,milliseconds,fastestLap,rank,fastestLapSpeed,status,fastestlaptime_ms,year,race_name,constructor_name,wins
25811,26286,1121,830,9,1,1,1,1,26.0,57,5504742,39,1,210.383,Finished,92608,2024,Bahrain Grand Prix,red bull,1
25812,26287,1121,815,9,11,5,2,2,18.0,57,5527199,40,4,206.468,Finished,94364,2024,Bahrain Grand Prix,red bull,0
25813,26288,1121,832,6,55,4,3,3,15.0,57,5529852,44,6,206.156,Finished,94507,2024,Bahrain Grand Prix,ferrari,0
25814,26289,1121,844,6,16,2,4,4,12.0,57,5544411,36,2,207.069,Finished,94090,2024,Bahrain Grand Prix,ferrari,0
25815,26290,1121,847,131,63,3,5,5,10.0,57,5551530,40,12,204.946,Finished,95065,2024,Bahrain Grand Prix,mercedes,0


In [13]:
# Build the prediction matrix from the 2024 rows with the same cleaning steps
# applied to the training table.
# `.copy()` / `assign` / `astype` always produce new frames, so nothing is
# written into a slice of `data_2024` (no SettingWithCopyWarning) and the
# converted columns really get their new dtypes.
predict_data = data_2024[['driverId','race_name','wins','grid','milliseconds','fastestLapSpeed','positionOrder','points']].copy()
predict_data = predict_data.assign(
    # '\N' placeholders become the sentinel -1.
    milliseconds=predict_data['milliseconds'].replace('\\N', -1).astype(float),
    # '\N' placeholders become NaN so they can be imputed below.
    fastestLapSpeed=predict_data['fastestLapSpeed'].replace('\\N', np.nan).astype(float),
)

# Impute missing speeds from the 2024 rows only: per-driver mean first,
# overall mean as a fallback.
driver_avg_speed = predict_data.groupby('driverId')['fastestLapSpeed'].transform(lambda x: x.fillna(x.mean()))
overall_avg_speed = predict_data['fastestLapSpeed'].mean()
predict_data = predict_data.assign(fastestLapSpeed=driver_avg_speed.fillna(overall_avg_speed))
predict_data = predict_data.astype({'driverId': str, 'race_name': str})

# Reuse the already-fitted encoder so the one-hot columns match those produced
# for the training table.
encoded_cols = encoder.transform(predict_data[['driverId', 'race_name']])
encoded_pred_df = pd.DataFrame(
    encoded_cols,
    columns=encoder.get_feature_names_out(),
    index=predict_data.index,  # keep row labels aligned for the concat below
)
pred_data_encoded = pd.concat(
    [predict_data.drop(columns=['driverId', 'race_name']), encoded_pred_df],
    axis=1,
)
pred_data_encoded.head()

Unnamed: 0,wins,grid,milliseconds,fastestLapSpeed,positionOrder,points,driverId_1,driverId_10,driverId_100,driverId_101,...,race_name_South African Grand Prix,race_name_Spanish Grand Prix,race_name_Styrian Grand Prix,race_name_Swedish Grand Prix,race_name_Swiss Grand Prix,race_name_São Paulo Grand Prix,race_name_Turkish Grand Prix,race_name_Tuscan Grand Prix,race_name_United States Grand Prix,race_name_United States Grand Prix West
25811,1,1,5504742.0,210.383,1,26.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25812,0,5,5527199.0,206.468,2,18.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25813,0,4,5529852.0,206.156,3,15.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25814,0,2,5544411.0,207.069,4,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25815,0,3,5551530.0,204.946,5,10.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Remove the target so only feature columns are passed to the model.
# Rebinding with `errors='ignore'` keeps the cell idempotent: running it a
# second time no longer raises KeyError on the already-removed column.
pred_data_encoded = pred_data_encoded.drop(columns=['points'], errors='ignore')

In [15]:
# Replace the recorded points with the model's per-race predictions.
# `assign` builds a new frame instead of writing in place into `data_2024`,
# which was created by filtering `results` (avoids SettingWithCopyWarning).
# NOTE: after this cell `data_2024['points']` holds model output, not the
# recorded 2024 points.
data_2024 = data_2024.assign(points=RF.predict(pred_data_encoded.values))

In [16]:
# Snapshot the frame (whose `points` column now carries the predictions) under
# the name used for the 2025 projections.
data_2025_predict = data_2024.copy(deep=True)
data_2025_predict.iloc[:5]

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,positionText,positionOrder,points,laps,milliseconds,fastestLap,rank,fastestLapSpeed,status,fastestlaptime_ms,year,race_name,constructor_name,wins
25811,26286,1121,830,9,1,1,1,1,25.342857,57,5504742,39,1,210.383,Finished,92608,2024,Bahrain Grand Prix,red bull,1
25812,26287,1121,815,9,11,5,2,2,15.4,57,5527199,40,4,206.468,Finished,94364,2024,Bahrain Grand Prix,red bull,0
25813,26288,1121,832,6,55,4,3,3,13.9,57,5529852,44,6,206.156,Finished,94507,2024,Bahrain Grand Prix,ferrari,0
25814,26289,1121,844,6,16,2,4,4,11.642857,57,5544411,36,2,207.069,Finished,94090,2024,Bahrain Grand Prix,ferrari,0
25815,26290,1121,847,131,63,3,5,5,6.114286,57,5551530,40,12,204.946,Finished,95065,2024,Bahrain Grand Prix,mercedes,0


2025 rankings

In [17]:
drivers = pd.read_csv('final_dataset/drivers.csv')

# Attach a display name ("forename surname") to every predicted row; the two
# helper columns are removed again once `Name` has been built.
driver_names = drivers[['driverId', 'forename', 'surname']]
data_2025_predict = (
    data_2025_predict
    .merge(driver_names, on=['driverId'], how='left')
    .astype({'forename': 'str', 'surname': 'str'})
    .assign(Name=lambda frame: frame['forename'] + " " + frame['surname'])
    .drop(columns=['forename', 'surname'])
)

In [18]:
# Season total of predicted points per driver, highest first.
points_by_driver = data_2025_predict.groupby('Name')['points'].sum()
driver_rankings = (
    points_by_driver
    .rename('total_points')
    .reset_index()
    .sort_values(by='total_points', ascending=False)
)
# Persist the full ranking, then show the top ten.
driver_rankings.to_csv("Data_streamlit/q11_1.csv",index=False)
print("Predicted 2025 Driver Rankings:")
print(driver_rankings.head(10))

Predicted 2025 Driver Rankings:
               Name  total_points
16   max verstappen    388.485714
12     lando norris    320.600000
2   charles leclerc    313.985714
19    oscar piastri    255.342857
1      carlos sainz    254.514286
7    george russell    215.657143
13   lewis hamilton    192.507143
21     sergio pérez    133.328571
5   fernando alonso     66.942857
20     pierre gasly     37.500000


In [19]:
# Season total of predicted points per constructor, highest first.
points_by_constructor = data_2025_predict.groupby('constructor_name')['points'].sum()
constructor_rankings = (
    points_by_constructor
    .rename('total_points')
    .reset_index()
    .sort_values(by='total_points', ascending=False)
)
# Persist the full ranking, then show the top ten.
constructor_rankings.to_csv("Data_streamlit/q11_2.csv",index=False)
print("Predicted 2025 constructor Rankings:")
print(constructor_rankings.head(10))

Predicted 2025 constructor Rankings:
  constructor_name  total_points
4          mclaren    575.942857
2          ferrari    574.328571
7         red bull    521.814286
5         mercedes    408.164286
1     aston martin     90.028571
0   alpine f1 team     60.328571
3     haas f1 team     49.671429
6       rb f1 team     36.921429
9         williams     15.128571
8           sauber      4.000000


11.	Predictions for 2025 Season:<br>
○	Who will win the Drivers' and Constructors' Championship based on historical and current data?


From the data we can expect <b>Max Verstappen</b> to win the 2025 Drivers' Championship.<br>
As for constructors, <b>McLaren</b> is expected to win the Constructors' Championship.

12.	Struggling Teams Analysis:<br>
○	Predict which team is most likely to underperform in the upcoming 2025 season based on historical trends.

We will compare each team's actual 2024 points with its predicted 2025 points to identify the teams likely to struggle in the 2025 season.


In [20]:
# Rebuild the 2024 frame from `results` (so `points` is the recorded value
# again) and attach the same "forename surname" display name used for the
# predicted frame.
season_2024 = results[results['year']==2024]
data_2024 = (
    season_2024
    .merge(drivers[['driverId', 'forename', 'surname']], on=['driverId'], how='left')
    .astype({'forename': 'str', 'surname': 'str'})
    .assign(Name=lambda frame: frame['forename'] + " " + frame['surname'])
    .drop(columns=['forename', 'surname'])
)

In [21]:
# Per-driver season totals: recorded 2024 points vs. predicted points.
driver_2024 = data_2024.groupby('Name')['points'].sum().rename('total_points_2024').reset_index()
driver_2025 = data_2025_predict.groupby('Name')['points'].sum().rename('total_points_2025').reset_index()

# Left join keeps every 2024 driver; most negative change sorts first.
driver_trend = driver_2024.merge(driver_2025, on='Name', how='left')
driver_trend = (
    driver_trend
    .assign(point_change=driver_trend['total_points_2025'].sub(driver_trend['total_points_2024']))
    .sort_values(by='point_change', ascending=True)
)
driver_trend.to_csv("Data_streamlit/q12_1.csv",index=False)
print("\nDrivers Most Likely to Struggle in 2025:")
print(driver_trend.head(2))


Drivers Most Likely to Struggle in 2025:
              Name  total_points_2024  total_points_2025  point_change
12    lando norris              344.0         320.600000    -23.400000
13  lewis hamilton              207.0         192.507143    -14.492857


In [22]:
# Per-constructor season totals: recorded 2024 points vs. predicted points.
constructor_2024 = data_2024.groupby('constructor_name')['points'].sum().rename('total_points_2024').reset_index()
constructor_2025 = data_2025_predict.groupby('constructor_name')['points'].sum().rename('total_points_2025').reset_index()

# Left join keeps every 2024 constructor; most negative change sorts first.
constructor_trend = constructor_2024.merge(constructor_2025, on='constructor_name', how='left')
constructor_trend = (
    constructor_trend
    .assign(point_change=constructor_trend['total_points_2025'].sub(constructor_trend['total_points_2024']))
    .sort_values(by='point_change', ascending=True)
)
constructor_trend.to_csv("Data_streamlit/q12_2.csv",index=False)
print("\nTeams Most Likely to Struggle in 2025:")
# NOTE(review): the slice starts at position 1, so the constructor with the
# largest point drop (position 0 after sorting) is not printed, although it is
# written to the CSV above. Presumably intentional -- TODO confirm, or use
# `.head(5)` to include it.
print(constructor_trend.iloc[1:6])


Teams Most Likely to Struggle in 2025:
  constructor_name  total_points_2024  total_points_2025  point_change
5         mercedes              433.0         408.164286    -24.835714
2          ferrari              595.0         574.328571    -20.671429
7         red bull              537.0         521.814286    -15.185714
1     aston martin               94.0          90.028571     -3.971429
6       rb f1 team               40.0          36.921429     -3.078571


According to the data, <b>Lando Norris</b> is expected to struggle the most in the 2025 season (largest predicted drop in points compared with the 2024 season).<br>
Among the teams listed above, <b>Mercedes</b> is expected to struggle the most in the 2025 season (again compared with the 2024 season).