In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# --- Configuration ---
DATA_PATH = 'f1_2019_to_2022_all_drivers_all_data.csv'
IQR_MULTIPLIER = 1.5  # width of the IQR fences used by the LapTime outlier filter

# Load dataset
df = pd.read_csv(DATA_PATH, low_memory=False)

# Convert 'LapTime' and the sector times from timedelta strings to seconds
time_columns = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']
for col in time_columns:
    df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

# Keep the raw 'Driver' and 'Circuit' labels so rows can still be grouped
# after those two columns are one-hot encoded further down.
df['Original_Driver'] = df['Driver']
df['Original_Circuit'] = df['Circuit']

# Cast the Rainfall, FreshTyre and IsAccurate columns to integer.
# NOTE(review): 'IsPersonalBest' is NOT converted here and stays as loaded —
# confirm whether it should be cast as well.
for flag_col in ['Rainfall', 'FreshTyre', 'IsAccurate']:
    df[flag_col] = df[flag_col].astype(int)

# Drop columns not related to lap time prediction.
# NOTE(review): only the Sector*SessionTime columns are removed here; the
# Sector1Time/Sector2Time/Sector3Time columns are kept. If they add up to
# LapTime they leak the target into the features — confirm whether they
# should be dropped too.
columns_to_drop = ['Time', 'LapStartTime', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
                   'PitOutTime', 'PitInTime', 'LapStartDate', 'Deleted', 'DeletedReason', 'FastF1Generated']

# Reassign instead of inplace=True so the statement reads as a plain transformation
df = df.drop(columns=columns_to_drop)

# Remove outliers based on 'LapTime' using IQR fences.
# NOTE(review): the quartiles are computed over all rows together, not per
# circuit — confirm that a single global fence is what is wanted.
Q1 = df['LapTime'].quantile(0.25)
Q3 = df['LapTime'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - IQR_MULTIPLIER * IQR
upper_bound = Q3 + IQR_MULTIPLIER * IQR
lap_time_in_range = (df['LapTime'] >= lower_bound) & (df['LapTime'] <= upper_bound)
# .copy() makes the filtered frame independent, so later column assignments
# do not trigger chained-assignment warnings.
df = df[lap_time_in_range].copy()

# One-hot encoding
df = pd.get_dummies(df, columns=['Driver', 'Circuit', 'Compound', 'Team'])

# Feature standardization is currently disabled: no scaler is applied to the
# numeric columns (StandardScaler is imported above but not used in this cell).

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df.info(verbose=True)




In [None]:
# Columns whose missing values are replaced by the column-wide mean
mean_fill_columns = ['Sector1Time', 'Sector2Time', 'Sector3Time',
                     'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']

# Report every column that contains NaN before imputation
nan_counts = df.isna().sum()
print(nan_counts[nan_counts > 0])

# Fill NaNs with each column's overall mean (global mean, not a per-group mean).
# The fill is done on the frame and reassigned: calling
# df[col].fillna(..., inplace=True) operates on a temporary column selection,
# which is deprecated and leaves df unchanged under pandas Copy-on-Write.
# Passing a Series to fillna maps each mean to the column of the same name,
# so only the listed columns are touched.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split done later in the notebook.
df = df.fillna(df[mean_fill_columns].mean())

# Report the columns that still contain NaN after imputation
nan_counts = df.isna().sum()
print(nan_counts[nan_counts > 0])


In [ ]:

# Target vector: lap time in seconds; feature matrix: every remaining column.
# NOTE(review): X still contains Sector1Time/Sector2Time/Sector3Time; if they
# add up to LapTime they leak the target — confirm whether they belong in X.
y = df['LapTime']
X = df.drop(columns=['LapTime'])


In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# The raw string label columns cannot be fed to the model, so drop them
# from the training matrix.
X_train = X_train.drop(columns=['Original_Driver', 'Original_Circuit'])

# Random forest hyperparameters.
# Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}
rf_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'random_state': 42,
}
model = RandomForestRegressor(**rf_params)

# Fit the forest on the training split
model.fit(X_train, y_train)

NameError: name 'X' is not defined

In [None]:
from sklearn.metrics import mean_squared_error

# Remove the string label columns so the test matrix has the same columns the
# model was trained on. errors='ignore' keeps this cell re-runnable: on a
# second run the columns are already gone and a plain drop would raise KeyError.
X_test = X_test.drop(columns=['Original_Driver', 'Original_Circuit'], errors='ignore')

# Print RMSE value of the model (square root of the mean squared error
# between the held-out lap times and the predictions)
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")


In [None]:
# List every feature with its importance, most important first

importance = model.feature_importances_

# Indices that order the importances from largest to smallest
sorted_indices = np.argsort(importance)[::-1]

# Walk feature names and scores together in ranked order
ranked_names = X_train.columns[sorted_indices]
ranked_scores = importance[sorted_indices]
for feature_name, score in zip(ranked_names, ranked_scores):
    print(f"{feature_name}: {score}")
