In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the 2019-2022 lap data (used as the training set).
df = pd.read_csv('f1_2019_to_2022_all_drivers_all_data.csv', low_memory=False)

# Convert the response variable from a timedelta string to float seconds.
df['LapTime'] = pd.to_timedelta(df['LapTime']).dt.total_seconds()

# Discard laps that are missing any of these fields.
columns_to_check = ['Position', 'IsPersonalBest', 'TrackStatus']
df = df.dropna(subset=columns_to_check)

# Convert the sector times from timedelta strings to float seconds
# (missing values stay NaN and are imputed in a later cell).
# NOTE(review): presumably the three sector times add up to LapTime; keeping
# them as predictors would leak the target - confirm this is intended.
for sector_col in ['Sector1Time', 'Sector2Time', 'Sector3Time']:
    df[sector_col] = pd.to_timedelta(df[sector_col]).dt.total_seconds()

# Encode the flag columns as integers.
flag_columns = ['Rainfall', 'FreshTyre', 'IsAccurate', 'IsPersonalBest']
df = df.astype({flag_col: int for flag_col in flag_columns})

# Drop features that are not used for modelling.
columns_to_drop = [
    'Time', 'LapStartTime',
    'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
    'PitOutTime', 'PitInTime',
    'LapStartDate', 'Deleted', 'DeletedReason',
    'FastF1Generated'
]
df = df.drop(columns=columns_to_drop)

# Keep the raw labels: the group-mean imputation in a later cell groups by
# them, and one-hot encoding below replaces the original columns.
df['Original_Driver'] = df['Driver']
df['Original_Circuit'] = df['Circuit']

# One-hot encode the categorical features.
df = pd.get_dummies(df, columns=['Driver', 'Circuit', 'Compound', 'Team'])

# Columns that get standardized in a later cell.
numeric_features = ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'AirTemp', 'Humidity', 'Pressure', 'TrackTemp', 'WindDirection', 'WindSpeed',]

# Rows without a lap time cannot be used for training.
df = df.dropna(subset=['LapTime'])

X = df.drop(['LapTime'], axis=1)  # predictors: everything except the response variable
y = df['LapTime']                 # response variable

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
X.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80808 entries, 0 to 82417
Data columns (total 110 columns):
 #    Column                               Dtype  
---   ------                               -----  
 0    DriverNumber                         int64  
 1    LapNumber                            float64
 2    Stint                                float64
 3    Sector1Time                          float64
 4    Sector2Time                          float64
 5    Sector3Time                          float64
 6    SpeedI1                              float64
 7    SpeedI2                              float64
 8    SpeedFL                              float64
 9    SpeedST                              float64
 10   IsPersonalBest                       int64  
 11   TyreLife                             float64
 12   FreshTyre                            int64  
 13   TrackStatus                          float64
 14   Position                             float64
 15   IsAccurate       

In [3]:
# Standardize the continuous features listed in `numeric_features`.
# This step could be dropped if only RandomForest is used.
# NOTE(review): the scaler is fitted on every row, i.e. before the
# train/test split made in a later cell - confirm that is acceptable.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler()
scaler.fit(X[numeric_features])
X[numeric_features] = scaler.transform(X[numeric_features])


In [4]:
# Count the missing values per column ...
nan_counts = X.isnull().sum()

# ... and show only the columns that actually have some.
print(nan_counts.loc[nan_counts.gt(0)])


Sector1Time     1507
Sector2Time       28
Sector3Time       31
SpeedI1        11665
SpeedI2           77
SpeedFL         2510
SpeedST         6268
dtype: int64


In [5]:
# Fill missing sector times and speed-trap values with the mean of the same
# driver at the same circuit.
#
# groupby(...).transform('mean') returns a Series aligned with X that holds each
# row's group mean, so fillna() can consume it directly. This replaces the
# row-by-row X.apply(..., axis=1) lookups, which produced the same values but
# ran a Python function once per row for each column.
#
# Groups in which every value of a column is missing have a NaN mean, so a few
# gaps can remain after this step; a later cell falls back to the global mean.
group_keys = ['Original_Driver', 'Original_Circuit']
columns_to_impute = [
    'Sector1Time', 'Sector2Time', 'Sector3Time',
    'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
]

for column in columns_to_impute:
    group_means = X.groupby(group_keys)[column].transform('mean')
    X[column] = X[column].fillna(group_means)


In [6]:
# Re-count the missing values per column after the group-mean imputation.
nan_counts = X.isna().sum()

# Report only the columns that still contain NaN.
still_missing = nan_counts > 0
print(nan_counts[still_missing])


Sector1Time    3
SpeedFL        1
dtype: int64


In [7]:

# Predictors and response must have the same number of rows.
print(X.shape[0], y.shape[0])

80808 80808


In [8]:
# Fallback after the group-mean imputation: any value that is still missing
# (its whole driver/circuit group was NaN) is filled with the column mean.
#
# The result is assigned back explicitly. Calling
# X[col].fillna(..., inplace=True) operates on a selected column object, which
# triggers a chained-assignment warning on pandas 2.x and leaves X unchanged
# under Copy-on-Write.
columns_to_fill = [
    'Sector1Time', 'Sector2Time', 'Sector3Time',
    'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
]
for column in columns_to_fill:
    X[column] = X[column].fillna(X[column].mean())

# Check for NaN values in each column
nan_counts = X.isna().sum()

# Print columns with NaN count more than 0 (expected: none)
print(nan_counts[nan_counts > 0])

Series([], dtype: int64)


In [9]:
# Confirm X and y still have matching row counts before fitting a model.
print(len(X.index), len(y.index))

80808 80808


In [10]:
from sklearn.decomposition import PCA

# X excludes 'LapTime' and holds the one-hot encoded categorical variables;
# only the columns in `numeric_features` were standardized earlier.

# Remove the helper string columns that were kept only for the group-mean
# imputation. errors='ignore' keeps this cell re-runnable: on a second run the
# columns are already gone and a plain drop would raise KeyError.
X = X.drop(columns=['Original_Driver', 'Original_Circuit'], errors='ignore')

# NOTE(review): PCA is variance based, and columns such as TrackStatus,
# DriverNumber and LapNumber are not standardized. The loadings printed further
# down show the leading components are almost exactly those raw columns, so
# 95% of the variance is reached with very few components. Consider scaling
# every column before PCA - confirm the intended behaviour.

# Keep as many components as are needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

# Total share of variance captured by the retained components.
print(f"Explained Variance Ratio: {pca.explained_variance_ratio_.sum()}")



Explained Variance Ratio: 0.9713348251582974


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the PCA-transformed rows for evaluation; the fixed seed makes
# the split reproducible.
split_parts = train_test_split(X_pca, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters reported by the grid search:
# Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}
rf_params = {
    'n_estimators': 300,
    'max_depth': 20,
    'min_samples_split': 5,
    'random_state': 42,
}
model = RandomForestRegressor(**rf_params)

# Fit on the training part of the PCA-transformed data.
model.fit(X_train, y_train)

In [17]:
from sklearn.metrics import mean_squared_error

# Evaluate the fitted model on the held-out rows.
predictions = model.predict(X_test)

# Root of the mean squared error between held-out targets and predictions.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
rmse = mse ** 0.5
print(f"RMSE: {rmse}")


RMSE: 4.787357556404228


In [15]:
# Disabled hyperparameter search: the whole cell body is one triple-quoted
# string literal, so none of the code inside it runs and neither `grid_search`
# nor `best_model` is defined by this cell. The grid below covers n_estimators,
# max_depth and min_samples_split for a RandomForestRegressor with 5-fold CV
# scored by negative MSE.
""" from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rfr = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_ """
    



Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 300}


In [22]:
# `model` is the RandomForestRegressor trained on the PCA components, so each
# importance value belongs to one principal component (position k -> component k+1).
importances = model.feature_importances_

# Component indices ordered from most to least important.
sorted_indices = np.argsort(importances)[::-1]

# Label every value with the component it belongs to. Printing the loop counter
# instead would show the rank and mislabel components whenever the importance
# order differs from the component order.
for component_index in sorted_indices:
    print(f"Component {component_index + 1}: {importances[component_index]}")


Component 1: 0.6624785678121609
Component 2: 0.18169337455125978
Component 3: 0.15582805763657953


In [24]:
# Loadings of the fitted PCA: one row per component, one weight per input column.
loadings = pca.components_
feature_names = X.columns  # X must be the frame that was passed to the PCA

# For each of the first three components, list the five features with the
# largest absolute loading.
for pc_number, weights in enumerate(loadings[:3], start=1):
    print(f"Principal Component {pc_number}:")
    ranked = sorted(
        zip(feature_names, weights),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    for name, weight in ranked[:5]:
        print(f"{name}: {weight:.2f}")
    print()


Principal Component 1:
TrackStatus: 1.00
Sector1Time: 0.00
Sector3Time: 0.00
Sector2Time: 0.00
TyreLife: -0.00

Principal Component 2:
DriverNumber: 1.00
LapNumber: 0.01
Sector1Time: -0.01
TyreLife: 0.01
Sector2Time: -0.01

Principal Component 3:
LapNumber: 0.92
TyreLife: 0.35
Sector2Time: -0.10
Sector1Time: -0.09
Sector3Time: -0.08


In [27]:
# Column names of X, the frame that was passed to the PCA.
feature_names = X.columns

# Look at the N components the random forest considers most important.
N = 3
top_components = sorted_indices[:N]

for component_index in top_components:
    # Label with the real component number (index + 1), not the rank in the
    # importance ordering; the loadings row below is selected by the same index.
    print(f"\nImportant features for Component {component_index + 1} (Importance: {importances[component_index]:.4f}):")

    # Loading of every original feature on this component.
    loading_scores = pd.Series(loadings[component_index], index=feature_names)

    # Rank the features by absolute loading.
    sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)

    # Show the ten strongest contributors.
    top_features = sorted_loading_scores[:10]
    print(top_features)



Important features for Component 1 (Importance: 0.6625):
TrackStatus     0.999979
Sector1Time     0.003955
Sector3Time     0.003289
Sector2Time     0.002271
TyreLife        0.002076
DriverNumber    0.002054
LapNumber       0.001042
SpeedST         0.000577
SpeedFL         0.000463
SpeedI1         0.000370
dtype: float64

Important features for Component 2 (Importance: 0.1817):
DriverNumber     0.999755
LapNumber        0.013756
Sector1Time      0.008729
TyreLife         0.006112
Sector2Time      0.005238
Team_Mercedes    0.004530
Year             0.004370
Position         0.004182
Driver_GIO       0.003946
Sector3Time      0.003571
dtype: float64

Important features for Component 3 (Importance: 0.1558):
LapNumber        0.924735
TyreLife         0.345618
Sector2Time      0.095208
Sector1Time      0.086845
Sector3Time      0.082932
Position         0.028000
Stint            0.025791
DriverNumber     0.016281
Compound_HARD    0.006230
SpeedFL          0.005469
dtype: float64


In [30]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# Gradient-boosted trees trained on the same PCA features as the random forest.
xgb = XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=42)
xgb.fit(X_train, y_train)

# 5-fold cross-validation of the XGBoost configuration.
# The previous call passed `best_model`, which is only assigned inside the
# disabled grid-search string and is therefore undefined on a fresh kernel.
# It also passed the raw feature frame X, whereas every model in this notebook
# is trained on the PCA-transformed matrix X_pca.
# cross_val_score clones the estimator for each fold, so the fitted `xgb`
# above is left untouched.
scores = cross_val_score(xgb, X_pca, y, cv=5, scoring='neg_mean_squared_error')

# Scores are negative MSE per fold: negate, take the root, then average.
print(f"Mean CV RMSE: {np.mean(np.sqrt(-scores))}")

KeyboardInterrupt: 