In [205]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the lap-level data set (2019 to 2022 seasons); it is used for training.
df = pd.read_csv('f1_2019_to_2022_all_drivers_all_data.csv', low_memory=False)

# LapTime is parsed with pd.to_timedelta and stored as float seconds.
# It is the response variable of the model built further down.
df['LapTime'] = pd.to_timedelta(df['LapTime']).dt.total_seconds()

# Discard every row that has a missing value in any of these columns.
columns_to_check = ['Position', 'IsPersonalBest', 'TrackStatus']
df = df.dropna(subset=columns_to_check)

# Sector1Time / Sector2Time / Sector3Time are deliberately NOT converted to
# seconds here: all three are listed in columns_to_drop below, so parsing
# them first was discarded work.

# Cast the flag columns to int so they can be used as numeric features.
for flag_column in ('Rainfall', 'FreshTyre', 'IsAccurate', 'IsPersonalBest'):
    df[flag_column] = df[flag_column].astype(int)

# Drop the columns that are not used as model features.
columns_to_drop = [
    'Time', 'LapStartTime',
    'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
    'PitOutTime', 'PitInTime',
    'LapStartDate', 'Deleted', 'DeletedReason',
    'FastF1Generated', 'Sector1Time', 'Sector2Time', 'Sector3Time',
]
df = df.drop(columns=columns_to_drop)

# Outlier removal on LapTime with the 1.5 * IQR rule.
# NOTE(review): the quartiles are computed over all rows at once, not per
# circuit -- confirm that a single global range is what is intended.
Q1 = df['LapTime'].quantile(0.25)
Q3 = df['LapTime'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows outside the accepted range, kept in their own frame for inspection.
outliers = df[(df['LapTime'] < lower_bound) | (df['LapTime'] > upper_bound)]

# Keep only rows inside [lower_bound, upper_bound]. Comparisons against NaN
# are False, so rows with a missing LapTime are removed by this filter too;
# no separate dropna on LapTime is needed afterwards.
df = df[(df['LapTime'] >= lower_bound) & (df['LapTime'] <= upper_bound)]

# One-hot encode the categorical columns.
df = pd.get_dummies(df, columns=['Driver', 'Circuit', 'Compound', 'Team', 'TrackStatus'])

# Columns that are standardized in a later cell.
numeric_features = [
    'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
    'AirTemp', 'Humidity', 'Pressure', 'TrackTemp',
    'WindDirection', 'WindSpeed',
]

# Feature matrix (everything except the response) and response vector.
X = df.drop(['LapTime'], axis=1)
y = df['LapTime']

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() -- that appended a stray "None" line to the output.
X.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79034 entries, 0 to 82417
Data columns (total 121 columns):
 #    Column                               Dtype  
---   ------                               -----  
 0    DriverNumber                         int64  
 1    LapNumber                            float64
 2    Stint                                float64
 3    SpeedI1                              float64
 4    SpeedI2                              float64
 5    SpeedFL                              float64
 6    SpeedST                              float64
 7    IsPersonalBest                       int64  
 8    TyreLife                             float64
 9    FreshTyre                            int64  
 10   Position                             float64
 11   IsAccurate                           int64  
 12   AirTemp                              float64
 13   Humidity                             float64
 14   Pressure                             float64
 15   Rainfall         

In [206]:
# Standardize the columns listed in numeric_features in place on X.
# Original author's note: this step could be deleted if RandomForest is used.
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split done later in the notebook -- confirm this is acceptable.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler().fit(X[numeric_features])
X[numeric_features] = scaler.transform(X[numeric_features])


In [207]:
# Disabled experiment: group-mean imputation per (driver, circuit).
# The whole cell is one bare triple-quoted string, so none of the statements
# inside it execute; the notebook only echoes the string as the cell output.
# The text refers to 'Original_Driver' / 'Original_Circuit', whose creation is
# commented out in the first cell, and to the Sector*Time columns, which the
# first cell drops -- so it would need those restored before it could run.
""" means_sector1 = X.groupby(['Original_Driver', 'Original_Circuit'])['Sector1Time'].mean()

# Map the means back to the original dataframe to fill missing values
X['Sector1Time'] = X.apply(lambda row: means_sector1[(row['Original_Driver'], row['Original_Circuit'])] if pd.isnull(row['Sector1Time']) else row['Sector1Time'], axis=1)

#  for Sector2Time
means_sector2 = X.groupby(['Original_Driver', 'Original_Circuit'])['Sector2Time'].mean()

X['Sector2Time'] = X.apply(lambda row: means_sector2[(row['Original_Driver'], row['Original_Circuit'])] if pd.isnull(row['Sector2Time']) else row['Sector2Time'], axis=1)

#  for Sector3Time
means_sector3 = X.groupby(['Original_Driver', 'Original_Circuit'])['Sector3Time'].mean()
X['Sector3Time'] = X.apply(lambda row: means_sector3[(row['Original_Driver'], row['Original_Circuit'])] if pd.isnull(row['Sector3Time']) else row['Sector3Time'], axis=1)

#  for SpeedI1
means_speedI1 = X.groupby(['Original_Driver', 'Original_Circuit'])['SpeedI1'].mean()
X['SpeedI1'] = X.apply(lambda row: means_speedI1[(row['Original_Driver'], row['Original_Circuit'])] if pd.isnull(row['SpeedI1']) else row['SpeedI1'], axis=1)


means_speedI2 = X.groupby(['Original_Driver', 'Original_Circuit'])['SpeedI2'].transform('mean')
X['SpeedI2'] = X['SpeedI2'].fillna(means_speedI2)

#  for SpeedFL
means_speedFL = X.groupby(['Original_Driver', 'Original_Circuit'])['SpeedFL'].transform('mean')
X['SpeedFL'] = X['SpeedFL'].fillna(means_speedFL)

#  for SpeedST
means_speedST = X.groupby(['Original_Driver', 'Original_Circuit'])['SpeedST'].transform('mean')
X['SpeedST'] = X['SpeedST'].fillna(means_speedST)"""


" means_sector1 = X.groupby(['Original_Driver', 'Original_Circuit'])['Sector1Time'].mean()\n\n# Map the means back to the original dataframe to fill missing values\nX['Sector1Time'] = X.apply(lambda row: means_sector1[(row['Original_Driver'], row['Original_Circuit'])] if pd.isnull(row['Sector1Time']) else row['Sector1Time'], axis=1)\n\n#  for Sector2Time\nmeans_sector2 = X.groupby(['Original_Driver', 'Original_Circuit'])['Sector2Time'].mean()\n\nX['Sector2Time'] = X.apply(lambda row: means_sector2[(row['Original_Driver'], row['Original_Circuit'])] if pd.isnull(row['Sector2Time']) else row['Sector2Time'], axis=1)\n\n#  for Sector3Time\nmeans_sector3 = X.groupby(['Original_Driver', 'Original_Circuit'])['Sector3Time'].mean()\nX['Sector3Time'] = X.apply(lambda row: means_sector3[(row['Original_Driver'], row['Original_Circuit'])] if pd.isnull(row['Sector3Time']) else row['Sector3Time'], axis=1)\n\n#  for SpeedI1\nmeans_speedI1 = X.groupby(['Original_Driver', 'Original_Circuit'])['SpeedI1'].

In [208]:
# Report the columns of X that contain missing values, fill the speed-trap
# columns with their column mean, then report again to confirm nothing is left.

# NaN count per column before imputation; only columns with at least one NaN
# are printed.
nan_counts = X.isna().sum()
print(nan_counts[nan_counts > 0])

# Mean-impute each speed column. The result is assigned back explicitly:
# `X[col].fillna(..., inplace=True)` operates on a column selection, which
# raises a FutureWarning in pandas 2.x and leaves X unchanged once
# Copy-on-Write is active.
# NOTE(review): the means are taken over every row of X, before the
# train/test split done later in the notebook -- confirm this is acceptable.
for speed_column in ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']:
    X[speed_column] = X[speed_column].fillna(X[speed_column].mean())

# NaN count per column after imputation; an empty Series means X is complete.
nan_counts = X.isna().sum()
print(nan_counts[nan_counts > 0])


SpeedI1    11633
SpeedI2       75
SpeedFL     2288
SpeedST     6169
dtype: int64
Series([], dtype: int64)


In [209]:
# Sanity check before fitting: X and y must have the same number of rows.
print(X.shape[0], y.shape[0])

79034 79034


In [210]:
# Disabled experiment: PCA keeping 95% of the variance.
# The whole cell is one bare triple-quoted string, so nothing inside it runs;
# the notebook only echoes the string as the cell output.
# NOTE(review): the text drops 'Original_Driver' / 'Original_Circuit', whose
# creation is commented out in the first cell -- that drop would presumably
# fail if this were re-enabled as is; verify before reviving it.
""" PCA

from sklearn.decomposition import PCA

# Assuming df_encoded already excludes 'LapTime' and has standardized numeric features and one-hot encoded categorical variables

#Deleted String Columns
X = X.drop(['Original_Driver', 'Original_Circuit'], axis= 1)

# Initialize PCA, choose the number of components e.g., 95% of variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

# Explained variance ratio can be helpful to understand the coverage
print(f"Explained Variance Ratio: {pca.explained_variance_ratio_.sum()}")

"""

' PCA\n\nfrom sklearn.decomposition import PCA\n\n# Assuming df_encoded already excludes \'LapTime\' and has standardized numeric features and one-hot encoded categorical variables\n\n#Deleted String Columns\nX = X.drop([\'Original_Driver\', \'Original_Circuit\'], axis= 1)\n\n# Initialize PCA, choose the number of components e.g., 95% of variance\npca = PCA(n_components=0.95)\nX_pca = pca.fit_transform(X)\n\n# Explained variance ratio can be helpful to understand the coverage\nprint(f"Explained Variance Ratio: {pca.explained_variance_ratio_.sum()}")\n\n'

In [211]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [212]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters for the forest.
# Recorded grid-search result:
# Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}
# NOTE(review): n_estimators is 100 here while the recorded best value is
# 300 -- confirm the smaller forest is intentional.
rf_params = dict(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    random_state=42,
)
model = RandomForestRegressor(**rf_params)

# Train on the training split; fit() is the last expression, so the notebook
# shows its return value as the cell output.
model.fit(X_train, y_train)

In [213]:
from sklearn.metrics import mean_squared_error

# Score the fitted model on the held-out split. RMSE is the square root of
# the MSE, so it is expressed in the same unit as y.
predictions = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")


RMSE: 1.8595151301505328


In [214]:
# Disabled experiment: 5-fold grid search over RandomForest hyper-parameters.
# The whole cell is one bare triple-quoted string, so nothing inside it runs;
# the notebook only echoes the string as the cell output.
# The "Best Parameters" comment in the model-fitting cell presumably records
# the result of an earlier run of this search -- verify before relying on it.
""" Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rfr = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_ """
    

' Grid Search\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.ensemble import RandomForestRegressor\n\nparam_grid = {\n    \'n_estimators\': [100, 200, 300],\n    \'max_depth\': [None, 10, 20],\n    \'min_samples_split\': [2, 5, 10]\n}\n\nrfr = RandomForestRegressor(random_state=42)\ngrid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=5, scoring=\'neg_mean_squared_error\', n_jobs=-1)\ngrid_search.fit(X_train, y_train)\n\nprint(f"Best Parameters: {grid_search.best_params_}")\nbest_model = grid_search.best_estimator_ '

In [215]:
# List every feature with its importance in the fitted forest, most
# important first.
importances = model.feature_importances_

# argsort is ascending, so reverse it to get a descending order.
sorted_indices = np.argsort(importances)[::-1]

ranked_names = X_train.columns[sorted_indices]
ranked_scores = importances[sorted_indices]
for feature_name, score in zip(ranked_names, ranked_scores):
    print(f"{feature_name}: {score}")


SpeedFL: 0.2722825651665258
Pressure: 0.19120753261418277
Circuit_Sakhir Grand Prix: 0.09266549153862938
IsAccurate: 0.07994760557821448
Circuit_Azerbaijan Grand Prix: 0.06665610804092746
SpeedI2: 0.04478567406522106
Circuit_Singapore Grand Prix: 0.043519588508653416
Circuit_Bahrain Grand Prix: 0.030367704151255774
Circuit_Japanese Grand Prix: 0.018905206964163923
SpeedST: 0.014717869163287952
Circuit_Chinese Grand Prix: 0.014502472300466198
TyreLife: 0.01316255386230876
SpeedI1: 0.0116293857290548
AirTemp: 0.0109374259144148
Humidity: 0.009397469879084607
Year: 0.008837864327269662
LapNumber: 0.007289445698328269
Circuit_British Grand Prix: 0.007137182332264707
Circuit_Saudi Arabian Grand Prix: 0.005963359442765602
Circuit_United States Grand Prix: 0.005389611906094267
Compound_INTERMEDIATE: 0.005192891479395603
TrackTemp: 0.005147675551048598
WindDirection: 0.004828894479414503
Circuit_Monaco Grand Prix: 0.004477142970936256
Position: 0.00334521614962975
Circuit_Dutch Grand Prix: 0.0