In [1]:
#1 IQR both

import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

# Read the raw lap-level dataset. low_memory=False makes pandas parse the file
# in one pass instead of chunk by chunk.
df = pd.read_csv('f1_2019_to_2023_all_drivers_all_data.csv', low_memory=False)

# Duration columns are parsed as timedeltas and re-expressed as float seconds.
time_columns = ['LapTime', 'Sector1Time', 'Sector2Time', 'Sector3Time','Time']
df = df.assign(
    **{name: pd.to_timedelta(df[name]).dt.total_seconds() for name in time_columns}
)

# Flag columns are cast to integers.
# NOTE(review): astype(int) raises if any of these columns contains NaN - confirm the data is complete.
df = df.astype({'Rainfall': int, 'FreshTyre': int, 'IsAccurate': int})


# Categorize weather condition based on centroid values of Kmeans clustering
def categorize_weather(row):
    """Return the weather bucket label for a single lap record.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['Rainfall']``; ``row['AirTemp']`` is read only
        when ``Rainfall`` is not positive.

    Returns
    -------
    str
        ``'Rainy'`` when ``Rainfall > 0``; otherwise ``'high'``, ``'medium'``,
        ``'low'`` or ``'very_low'`` depending on which threshold ``AirTemp``
        strictly exceeds. A value that exceeds none of them (including NaN,
        for which every comparison is False) yields ``'very_low'``.
    """
    # Rain takes precedence over any temperature bucket.
    if row['Rainfall'] > 0:
        return 'Rainy'

    air_temp = row['AirTemp']
    # (exclusive lower edge, label) pairs, warmest first. The edges are the
    # K-means centroid values the notebook uses for air temperature.
    temperature_bands = (
        (28.43213126, 'high'),
        (21.31279265, 'medium'),
        (12.84901403, 'low'),
    )
    for lower_edge, label in temperature_bands:
        if air_temp > lower_edge:
            return label
    return 'very_low'
# Label every lap with its weather bucket (row-wise call to categorize_weather),
# then replace the label column with one indicator column per bucket.
df = df.assign(Weather_Category=df.apply(categorize_weather, axis=1))
df = pd.get_dummies(df, columns=['Weather_Category'])

# Track-temperature buckets. The bin edges come from the K-means clustering result.
# pd.cut uses right-closed intervals, so a TrackTemp outside (0, 53.02449646]
# gets no category and therefore all-zero indicator columns.
track_temp_edges = [0, 18.96764999, 27.87457484, 35.04425766, 41.75142602, 50.51006013, 53.02449646]
track_temp_labels = ['VERY_LOW', 'Low', 'Medium', 'Warm', 'High','VERY_High']
df = df.assign(
    TrackTemp_Cat=pd.cut(df['TrackTemp'], bins=track_temp_edges, labels=track_temp_labels)
)
df = pd.get_dummies(df, columns=['TrackTemp_Cat'])

# One-hot encode the remaining categorical columns.
df = pd.get_dummies(df, columns=['Driver', 'Compound', 'Team','TrackStatus','Circuit'])

# Discard columns that are not used from here on.
columns_to_drop = [
    'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
    'PitOutTime', 'PitInTime', 'LapStartDate', 'Deleted', 'DeletedReason', 'FastF1Generated',
    'IsPersonalBest', 'Sector3Time','LapStartTime','Sector2Time','Sector1Time',
]
df = df.drop(columns=columns_to_drop)



# Numeric columns that get forward-filled here and robust-scaled further down.
numeric_features = ['Humidity', 'Pressure', 'WindDirection', 'WindSpeed','TrackTemp','AirTemp','SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']

# Alternative column list kept for reference (not used):
#time_series_features = ['WindDirection', 'WindSpeed', 'TrackTemp', 'AirTemp', 'Rainfall', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST']

# Fill gaps with the last valid value above them. DataFrame.ffill() replaces
# fillna(method='ffill'), which is deprecated and emits a FutureWarning in
# pandas 2.x. Rows before the first valid value of a column stay NaN.
# NOTE(review): the fill runs over the whole frame in row order, so a value can
# be carried across a driver/session boundary - confirm that is acceptable.
df[numeric_features] = df[numeric_features].ffill()


## Separate rainy / dry laps ##
# 1. Split laps into dry and wet (rainy) conditions, since wet-lap times would
#    otherwise be flagged as outliers against the dry distribution.
# 2. Remove LapTime outliers within each condition.
# 3. Recombine the filtered dry and wet laps into one frame.

# 1 when any rainfall was recorded for the lap, else 0. A vectorised comparison
# replaces the row-by-row apply(lambda ...); values and int64 dtype are unchanged.
df['IsRainy'] = df['Rainfall'].gt(0).astype('int64')

# Boolean-mask selections of the dry and wet laps.
df_dry = df[df['IsRainy'] == 0]
df_wet = df[df['IsRainy'] == 1]

def remove_outliers(df, column_name, multiplier=1.5):
    """Keep only the rows whose ``column_name`` value lies inside the IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column_name : str
        Column whose quartiles define the fences.
    multiplier : float, default 1.5
        How many IQRs the fences extend below Q1 and above Q3.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` with ``Q1 - multiplier*IQR <= value <= Q3 + multiplier*IQR``
        (both ends inclusive), original index labels preserved. Rows whose
        value is NaN fail the comparison and are dropped.
    """
    values = df[column_name]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence_width = multiplier * (third_quartile - first_quartile)
    # Series.between is inclusive on both ends by default.
    inside_fences = values.between(first_quartile - fence_width, third_quartile + fence_width)
    return df[inside_fences]

# Dry laps: standard 1.5 x IQR fences on LapTime.
df_dry_filtered = remove_outliers(df_dry, column_name='LapTime', multiplier=1.5)

# Wet laps: more lenient 2.0 x IQR fences.
df_wet_filtered = remove_outliers(df_wet, column_name='LapTime', multiplier=2.0)

# Stack the filtered dry rows followed by the filtered wet rows and renumber
# the index from 0.
df_combined = pd.concat((df_dry_filtered, df_wet_filtered), ignore_index=True)

# Fit the scaler on the combined numeric columns, then overwrite them with
# their scaled values. LapTime is not in numeric_features and is left unscaled.
scaler = RobustScaler().fit(df_combined[numeric_features])
df_combined[numeric_features] = scaler.transform(df_combined[numeric_features])


  df[numeric_features] = df[numeric_features].fillna(method='ffill')


In [7]:
import pandas as pd
from torch.utils.data import DataLoader
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data.encoders import NaNLabelEncoder


# --- Temporal Fusion Transformer (TFT) on the combined dry + wet lap data ---
# Expects `df_combined` and `numeric_features` from the preprocessing cell.

# QuantileLoss lives in pytorch_forecasting.metrics; the cell used it without importing it.
from pytorch_forecasting.metrics import QuantileLoss

# Trainer and EarlyStopping are Lightning classes; pytorch_forecasting does not
# export them. Recent pytorch-forecasting releases build on `lightning.pytorch`,
# older ones on `pytorch_lightning`.
# NOTE(review): if both packages are installed, make sure the one picked here is
# the one your pytorch-forecasting version is built against.
try:
    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import EarlyStopping  # kept available as before; not wired into the Trainer
except ImportError:
    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import EarlyStopping

# Row position in df_combined serves as the integer time index.
# NOTE(review): this treats all dry laps (and all wet laps) as one continuous
# series in file order - confirm that this ordering is meaningful.
df_combined['time_idx'] = (df_combined.index - df_combined.index.min())

# Window sizes: 36 laps of history, 6 laps predicted.
max_encoder_length = 36
max_prediction_length = 6

# Categorical variables must be string (or pandas categorical) typed for
# TimeSeriesDataSet. Cast on a copy so the integer IsRainy column in
# df_combined is left untouched.
tft_data = df_combined.assign(IsRainy=df_combined['IsRainy'].astype(str))

# Hold out the last `max_prediction_length` steps of each series for validation
# instead of validating on the training data.
series_end = tft_data.groupby('IsRainy')['time_idx'].transform('max')
is_train_row = tft_data['time_idx'] <= series_end - max_prediction_length

training = TimeSeriesDataSet(
    tft_data[is_train_row],
    time_idx="time_idx",
    target="LapTime",
    group_ids=["IsRainy"],
    min_encoder_length=max_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length,
    static_categoricals=["IsRainy"],
    # Copy the list so the dataset cannot modify the notebook-level variable.
    time_varying_known_reals=list(numeric_features),
    # The target itself is only known for past steps.
    time_varying_unknown_reals=["LapTime"],
    # The original referenced lag_1..lag_24 columns that were never created.
    # Let the dataset build the 24 target lags itself: it trims the start of
    # each series to avoid NaNs and does not expose lags shorter than the
    # prediction horizon as "known" future inputs.
    lags={"LapTime": list(range(1, 25))},
    # NOTE(review): LapTime is NOT among the columns scaled by RobustScaler, so
    # the target is in raw seconds here; consider dropping this argument to use
    # the library's default normalizer.
    target_normalizer=None,
    add_relative_time_idx=True,
    add_target_scales=True,
)

# Validation set reuses the training set's encoders and settings; predict=True
# yields one window per series covering its final prediction horizon.
validation = TimeSeriesDataSet.from_dataset(
    training, tft_data, predict=True, stop_randomization=True
)

# Create dataloaders through the dataset so its own collate function is used
# (a plain torch DataLoader around the dataset, or around another dataloader,
# does not produce the batches the model expects).
batch_size = 16
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

# Define the Temporal Fusion Transformer. With a quantile loss the network
# needs one output per quantile, so output_size is derived from the loss.
quantile_loss = QuantileLoss()
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=len(quantile_loss.quantiles),
    loss=quantile_loss,
    log_interval=10,
    reduce_on_plateau_patience=4,
)

# Train the model.
trainer = Trainer(
    max_epochs=20,
    accelerator="cpu",  # replaces the removed `gpus=0`; switch to "gpu"/"auto" if available
    limit_train_batches=30,  # only 30 batches per epoch for quick training
)

trainer.fit(
    tft,
    train_dataloaders=train_dataloader,  # keyword is plural in current Lightning
    val_dataloaders=val_dataloader,
)

# Note: You can adjust the model parameters and training setup based on your dataset size and complexity.


Baseline RMSE for Combined df: 3.5400348933251977
