In [1]:
import os 
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import math
import re
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco, ReportDecoUtilized
from lightautoml.addons.tabular_interpretation import SSWARM
import torch

# Notebook-wide matplotlib defaults: base font size, default figure size and
# the grey figure background used by every plot below.
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': "#949494",
})

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parallelism budget: forwarded to TabularAutoML as `cpu_limit` and to its
# reader as `n_jobs`.
N_THREADS = 16
# Number of cross-validation folds handed to the LightAutoML reader (`cv`).
N_FOLDS = 30
# Seed forwarded to the LightAutoML reader as `random_state`.
RANDOM_STATE = 42
# NOTE(review): not referenced by any cell visible in this notebook; the
# train/validation split is done by season instead — confirm whether this is
# still needed.
TEST_SIZE = 0.2
# Passed as `timeout` to TabularAutoML; presumably seconds (i.e. one hour) —
# confirm against the LightAutoML documentation.
TIMEOUT = 3600

In [3]:
def test_params(ModelClass, **params):
    """Fit ``ModelClass(**params)`` and score it on the train and validation splits.

    The data is not passed in: the function reads the notebook-level
    ``X_train``/``Y_train`` and ``X_val``/``Y_val`` as they are bound at call
    time.

    Args:
        ModelClass: Estimator class whose instances expose ``fit`` (returning
            the fitted estimator) and ``predict``.
        **params: Keyword arguments forwarded to the ``ModelClass`` constructor.

    Returns:
        A ``(train_rmse, val_rmse)`` tuple.
    """
    fitted = ModelClass(**params).fit(X_train, Y_train)
    train_rmse, val_rmse = (
        rmse(fitted.predict(features), targets)
        for features, targets in ((X_train, Y_train), (X_val, Y_val))
    )
    return train_rmse, val_rmse

def test_param_and_plot(ModelClass, param_name, param_values, **other_params):
    """Sweep one hyperparameter and plot train/validation RMSE against it.

    For every entry of ``param_values`` a model is fitted through
    ``test_params`` with ``other_params`` plus ``param_name`` set to that
    entry; the two resulting error curves are drawn on a new figure.

    Args:
        ModelClass: Estimator class forwarded to ``test_params``.
        param_name: Name of the constructor argument being varied.
        param_values: Values to try; also used as the x-axis of the plot.
        **other_params: Fixed constructor arguments shared by every fit. A key
            equal to ``param_name`` is overridden by the swept value.
    """
    scores = [
        test_params(ModelClass, **{**other_params, param_name: candidate})
        for candidate in param_values
    ]
    train_errors = [train_score for train_score, _ in scores]
    val_errors = [val_score for _, val_score in scores]

    plt.figure(figsize=(10, 6))
    plt.plot(param_values, train_errors, 'b-o')  # blue: training error
    plt.plot(param_values, val_errors, 'r-o')    # red: validation error
    plt.title('Overfitting curve: ' + param_name)
    plt.xlabel(param_name)
    plt.ylabel('RMSE')
    plt.legend(['Training', 'Validation'])

In [4]:
# One CSV of practice-session data per season.
df_2019 = pd.read_csv('practice_data_2019.csv')
df_2020 = pd.read_csv('practice_data_2020.csv')
df_2021 = pd.read_csv('practice_data_2021.csv')
df_2022 = pd.read_csv('practice_data_2022.csv')
df_2023 = pd.read_csv('practice_data_2023.csv')
df_2024 = pd.read_csv('practice_data_2024.csv')

# Seasons 2019-2023 form the training set; 2024 is held out for validation.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it
# every season restarts at 0 and the duplicated labels make any later
# index-aligned assignment ambiguous.
train = pd.concat(
    [df_2019, df_2020, df_2021, df_2022, df_2023],
    ignore_index=True,
)

# 'Unnamed: 0' is dropped from both splits. `drop` already returns a new
# frame, so no extra copy is needed.
train_df = train.drop(columns='Unnamed: 0')
val_df = df_2024.drop(columns='Unnamed: 0')

In [5]:
# Columns that are parsed with pd.to_timedelta and converted to float seconds.
LAPTIME_COLS = ['fastest_laptime', 'avg_quick_laptime', 'q1', 'q2', 'q3']


def laptimes_to_seconds(df, columns=LAPTIME_COLS):
    """Return a copy of ``df`` with the given duration columns as float seconds.

    Each column is parsed with ``pd.to_timedelta`` and replaced by its
    ``total_seconds()`` value; values that parse to ``NaT`` become ``NaN``.

    Args:
        df: Frame containing every column listed in ``columns``.
        columns: Names of the columns to convert.

    Returns:
        A new DataFrame; ``df`` itself is left untouched.

    Note:
        Meant to be applied once to the raw columns: re-applying it to the
        already numeric result would re-interpret the numbers as timedeltas.
    """
    as_seconds = {
        col: pd.to_timedelta(df[col]).dt.total_seconds()
        for col in columns
    }
    return df.assign(**as_seconds)


train_df = laptimes_to_seconds(train_df)
val_df = laptimes_to_seconds(val_df)

In [6]:
# Column predicted by the model.
TARGET_NAME = 'q1'

# Columns used as model inputs.
INPUT_COLS = [
    'Stint', 'Compound', 'LapCount', 'FirstLap', 'LastLap',
    'race_number', 'fastest_laptime', 'number_quick_laptime',
    'avg_quick_laptime', 'fp_session', 'SpeedTotal', 'high_engine_mode',
    'low_engine_mode', 'quali_sim', 'race_sim',
]

# Independent copies, so later in-place preprocessing of the X/Y objects does
# not write back into train_df / val_df.
X_train, Y_train = train_df[INPUT_COLS].copy(), train_df[TARGET_NAME].copy()
X_val, Y_val = val_df[INPUT_COLS].copy(), val_df[TARGET_NAME].copy()

# Column groups are derived from the training frame's dtypes only.
numeric_cols = list(X_train.select_dtypes(include='number').columns)
categorical_cols = list(X_train.select_dtypes(include='object').columns)

In [7]:
imputer = SimpleImputer(strategy='mean').fit(X_train[numeric_cols])

X_train[numeric_cols] = imputer.transform(X_train[numeric_cols])
Y_train = pd.DataFrame(Y_train)
imputer = SimpleImputer(strategy='constant', fill_value=120).fit(Y_train)

Y_train = imputer.transform(Y_train)

scaler = MinMaxScaler().fit(X_train[numeric_cols])

X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(X_train[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

X_train[encoded_cols] = encoder.transform(X_train[categorical_cols])

train_df = X_train[numeric_cols + encoded_cols]

train_df['q1'] = Y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['q1'] = Y_train


In [8]:
imputer = SimpleImputer(strategy='mean').fit(X_val[numeric_cols])

X_val[numeric_cols] = imputer.transform(X_val[numeric_cols])
Y_val = pd.DataFrame(Y_val)

imputer = SimpleImputer(strategy='constant', fill_value=120).fit(Y_val)

Y_val = imputer.transform(Y_val)

scaler = MinMaxScaler().fit(X_val[numeric_cols])

X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(X_val[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

X_val[encoded_cols] = encoder.transform(X_val[categorical_cols])

val_df = X_val[numeric_cols + encoded_cols]


In [9]:
# Regression task that uses RMSLE both as the training loss and as the metric.
task = Task('reg', metric='rmsle', loss='rmsle')

# Column roles for LightAutoML: only the target column is declared explicitly.
roles = dict(target=TARGET_NAME)

In [10]:
# Build the AutoML preset with the notebook-level time, CPU and CV settings.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)

# Fit on the preprocessed training frame and keep the predictions it returns
# (presumably out-of-fold predictions — confirm against the LightAutoML docs).
train_preds = automl.fit_predict(train_df, roles=roles, verbose=0)


In [11]:
# Predict on the training features with the fitted model and keep only the
# `.data` payload of the returned object. This replaces the value returned by
# fit_predict.
train_preds = automl.predict(X_train).data

In [12]:
# RMSE between the predictions for X_train and the imputed training targets.
# These are the same rows the model was fitted on, so this is an in-sample figure.
rmse(train_preds, Y_train)

1.4466488941518774

In [13]:
# Predict on the 2024 validation features and keep only the `.data` payload
# of the returned object.
val_preds = automl.predict(X_val).data

In [14]:
# RMSE on the 2024 validation rows. Y_val has had its missing targets replaced
# by the constant 120, so those rows contribute to this figure as well.
rmse(val_preds, Y_val)

19.297406584536727