In [23]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor


# Load dataset
df = pd.read_csv('f1_2019_to_2022_all_drivers_all_data.csv', low_memory=False)

# Convert time columns to seconds
time_columns = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time']
for col in time_columns:
    df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

# Convert binary columns to integer type
df['Rainfall'] = df['Rainfall'].astype(int)
df['FreshTyre'] = df['FreshTyre'].astype(int)
df['IsAccurate'] = df['IsAccurate'].astype(int)

# Function to categorize weather
def categorize_weather(row):
    if row['Rainfall'] > 0:
        return 'Rainy'
    elif row['AirTemp'] > 28:
        return 'high'
    elif row['AirTemp'] > 21:
        return 'medium'
    elif row['AirTemp'] > 12:
        return 'low'
    else:
        return 'very_low'

df['Weather_Category'] = df.apply(categorize_weather, axis=1)
df['Original_Weather_Category'] = df['Weather_Category']
df = pd.get_dummies(df, columns=['Weather_Category', 'Driver', 'Circuit', 'Compound', 'Team', 'TrackStatus'])


# Create Track temperature category based on the result of clustering 
df['TrackTemp_Cat'] = pd.cut(df['TrackTemp'], bins=[0, 18.96764999, 27.87457484, 35.04425766, 41.75142602, 50.51006013], labels=['VERY_LOW', 'Low', 'Medium', 'Warm', 'High'])
df['Original_TrackTemp_Cat'] = df['TrackTemp_Cat']
df = pd.get_dummies(df, columns=['TrackTemp_Cat'])



# Tyre Age Interaction with TrackTemp 
df['TyreAge_TrackTemp'] = df['TyreLife'] * df['TrackTemp']

# Calculate the TrackConditionIndex
df['TrackConditionIndex'] = (df['AirTemp'] + df['TrackTemp'] + df['Humidity'] + df['Pressure'] + df['WindSpeed'] + df['Rainfall']) / 6

# Interaction between TrackTemp and Rainfall
df['Temp_Rainfall_Interaction'] = df['TrackTemp'] * df['Rainfall']


# Drop unnecessary columns
columns_to_drop = ['Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'PitOutTime', 'PitInTime', 'LapStartDate', 'Deleted', 'DeletedReason', 'FastF1Generated', 'IsPersonalBest', 'Sector3Time', 'LapStartTime', 'Sector2Time', 'Sector1Time']
df.drop(columns=columns_to_drop, inplace=True)

# Impute missing values for numeric features
numeric_features = ['SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'Humidity', 'Pressure', 'WindDirection', 'WindSpeed', 'TrackTemp', 'AirTemp', 'TrackConditionIndex', 'TyreAge_TrackTemp', 'Temp_Rainfall_Interaction']
imputer = IterativeImputer(estimator=LinearRegression(), random_state=42)




df[numeric_features] = imputer.fit_transform(df[numeric_features])

# Address class imbalance
scale_pos_weight = df['Rainfall'].value_counts(normalize=True)[0] / df['Rainfall'].value_counts(normalize=True)[1]




In [24]:
numeric_features = ['Humidity', 'Pressure', 'WindDirection', 'WindSpeed','TrackTemp','AirTemp','TrackConditionIndex','TyreAge_TrackTemp','Rainfall','SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Use IterativeImputer for more sophisticated imputation
iterative_imputer = IterativeImputer(random_state=42)
df[numeric_features] = iterative_imputer.fit_transform(df[numeric_features])


In [25]:

# df['Sector1Time'].fillna(method='ffill', inplace=True)
# 1. Separate LapTime as dry or wet(rainy) condition ( since lapTime of rainy day would be recognized as outliers)
# 2. Remove Outliers for dry condition LapTime
# 3. Build Combined LapTime df (Outliers for dry days are deleted)


# Flag for rainy conditions
df['IsRainy'] = df['Rainfall'].apply(lambda x: 1 if x > 0 else 0)

# Separate dataframes for dry and wet conditions
df_dry = df[df['IsRainy'] == 0]
df_wet = df[df['IsRainy'] == 1]

# Remove Outliers for dry days using the IQR method
Q1_dry = df_dry['LapTime'].quantile(0.25)
Q3_dry = df_dry['LapTime'].quantile(0.75)
IQR_dry = Q3_dry - Q1_dry
lower_bound_dry = Q1_dry - 1.5 * IQR_dry
upper_bound_dry = Q3_dry + 1.5 * IQR_dry

df_dry_filtered = df_dry[(df_dry['LapTime'] >= lower_bound_dry) & (df_dry['LapTime'] <= upper_bound_dry)]

# Ensure a copy is made if modification is intended
df_dry_filtered = df_dry_filtered.copy()

## Scaling to the selected numeric features
robust_scaler = RobustScaler()
df_dry_filtered[numeric_features] = robust_scaler.fit_transform(df_dry_filtered[numeric_features])


# Build combined df for both dry-rainy days (Outliers are removed with rainy days lapTimes) 
df_combined = pd.concat([df_dry_filtered, df_wet], ignore_index=True)

In [26]:
## COMBINED

df_combined = df_combined.drop(['Original_Weather_Category','Original_TrackTemp_Cat',], axis= 1)


# Split the data
X = df_combined.drop(['LapTime'], axis=1)
y = df_combined['LapTime']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=df_combined['Rainfall'])


df_combined['LapTime'].fillna(method='ffill', inplace=True)

# Model training with class weight adjustment
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, scale_pos_weight=scale_pos_weight)
xgb_model.fit(X_train, y_train)

# Predict and evaluate
predictions = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"Baseline RMSE for Dry_filtered df: {rmse}")

# Optionally visualize the feature importance
importances = pd.Series(xgb_model.feature_importances_, index=X.columns)
importances_sorted = importances.sort_values(ascending=False)
plt.figure(figsize=(10, 8))
importances_sorted.plot(kind='bar')
plt.title('Feature Importances')
plt.show()




XGBoostError: [19:32:04] /Users/runner/work/xgboost/xgboost/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000143b50994 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x0000000143c0dc9c xgboost::MetaInfo::SetInfoFromHost(xgboost::Context const&, xgboost::StringView, xgboost::Json) + 3452
  [bt] (2) 3   libxgboost.dylib                    0x0000000143c0cdc4 xgboost::MetaInfo::SetInfo(xgboost::Context const&, xgboost::StringView, xgboost::StringView) + 164
  [bt] (3) 4   libxgboost.dylib                    0x0000000143b67688 XGDMatrixSetInfoFromInterface + 224
  [bt] (4) 5   libffi.8.dylib                      0x0000000100d3804c ffi_call_SYSV + 76
  [bt] (5) 6   libffi.8.dylib                      0x0000000100d35834 ffi_call_int + 1404
  [bt] (6) 7   _ctypes.cpython-311-darwin.so       0x0000000100df8140 _ctypes_callproc + 752
  [bt] (7) 8   _ctypes.cpython-311-darwin.so       0x0000000100df24a4 PyCFuncPtr_call + 228
  [bt] (8) 9   python3.11                          0x000000010050653c _PyEval_EvalFrameDefault + 195268

