In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [9]:
df = pd.read_csv('df_regression.csv')

In [None]:
df

In [10]:
df.drop(['Unnamed: 0', 'driver_number', 'date_start', 'duration_sector_1', 'duration_sector_2', 'duration_sector_3',
         'meeting_key', 'session_key', 'segments_sector_1', 'segments_sector_2', 'segments_sector_3'], axis=1, inplace=True)

In [None]:
df

In [11]:
df.corr()

Unnamed: 0,lap_number,brake_count,n_gear,drs_count,speed,throttle,rpm,i1_speed,i2_speed,is_pit_out_lap,lap_duration,st_speed,yellow_flag
lap_number,1.0,0.482828,-0.284263,-0.262035,-0.601266,-0.573314,-0.479236,-0.616488,-0.605357,-0.077558,0.596446,-0.375307,0.093143
brake_count,0.482828,1.0,-0.228027,-0.030748,-0.812325,-0.736232,-0.728288,-0.777503,-0.768561,0.112149,0.8303,-0.586937,0.344027
n_gear,-0.284263,-0.228027,1.0,0.116791,0.379607,0.305765,0.218538,0.393093,0.355831,-0.093369,-0.276551,0.202004,-0.170152
drs_count,-0.262035,-0.030748,0.116791,1.0,0.181326,0.093337,0.096551,0.161027,0.172564,-0.075774,-0.187009,0.214456,-0.091069
speed,-0.601266,-0.812325,0.379607,0.181326,1.0,0.788852,0.868849,0.898277,0.919241,-0.16505,-0.979217,0.687289,-0.417256
throttle,-0.573314,-0.736232,0.305765,0.093337,0.788852,1.0,0.581208,0.904826,0.913125,-0.081941,-0.902132,0.658753,-0.416368
rpm,-0.479236,-0.728288,0.218538,0.096551,0.868849,0.581208,1.0,0.755304,0.780812,-0.076203,-0.870598,0.639699,-0.309729
i1_speed,-0.616488,-0.777503,0.393093,0.161027,0.898277,0.904826,0.755304,1.0,0.909137,-0.156227,-0.880982,0.655116,-0.430427
i2_speed,-0.605357,-0.768561,0.355831,0.172564,0.919241,0.913125,0.780812,0.909137,1.0,-0.087779,-0.911515,0.657308,-0.432869
is_pit_out_lap,-0.077558,0.112149,-0.093369,-0.075774,-0.16505,-0.081941,-0.076203,-0.156227,-0.087779,1.0,0.158444,-0.130261,0.27819


In [14]:
df.drop(['speed', 'i1_speed', 'i2_speed', 'st_speed'], axis=1, inplace=True)

In [15]:
df.isna().sum()

lap_number         0
brake_count        0
n_gear             0
drs_count          0
throttle           0
rpm                0
is_pit_out_lap     0
lap_duration      24
yellow_flag        0
dtype: int64

In [16]:
df.dtypes

lap_number          int64
brake_count         int64
n_gear            float64
drs_count           int64
throttle          float64
rpm               float64
is_pit_out_lap       bool
lap_duration      float64
yellow_flag         int64
dtype: object

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Assuming your DataFrame is named 'df' as implied by 'df.isna().sum()'
# If your DataFrame has a different name, replace 'df' accordingly.

# 1. Handle the 'is_pit_out_lap' boolean column
df['is_pit_out_lap'] = df['is_pit_out_lap'].astype(int)


# Strategy for handling nulls:
# - For 'i1_speed', 'lap_duration': Fill with the mean (assuming these are missing somewhat randomly)
# - For 'i2_speed', 'st_speed': Fill with the mean as well, given the low number of missing values

#df['i1_speed'].fillna(df['i1_speed'].mean(), inplace=True)
#df['i2_speed'].fillna(df['i2_speed'].mean(), inplace=True)
df['lap_duration'].fillna(df['lap_duration'].mean(), inplace=True)
#df['st_speed'].fillna(df['st_speed'].mean(), inplace=True)


# 3. Separate features (X) and target (y)
X = df.drop('lap_duration', axis=1)
y = df['lap_duration']

# 4. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Scale the numerical features
# Identify numerical columns (excluding the target variable and potentially binary/categorical)
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled = scaler.transform(X_test[numerical_cols])

# Convert the scaled arrays back to DataFrames (optional, but can be helpful for inspection)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=numerical_cols, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=numerical_cols, index=X_test.index)

print("\nScaled Training Data:")
print(X_train_scaled.head())

print("\nScaled Testing Data:")
print(X_test_scaled.head())

# 6. Model the lap duration using Linear Regression
model = LinearRegression()

# Train the model on the scaled training data
model.fit(X_train_scaled, y_train)

# Make predictions on the scaled testing data
y_pred = model.predict(X_test_scaled)

# 7. Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)


print(f"\nMean Squared Error on the test set: {mse:.2f}")
print(f"\nMean Absolute Error on the test set: {mae:.2f}")
print(f"R-squared on the test set: {r_squared:.2f}")

# You can further analyze the model coefficients if needed
print("\nModel Coefficients:")
print(pd.DataFrame(model.coef_, index=numerical_cols, columns=['Coefficient']))


Scaled Training Data:
      lap_number  brake_count    n_gear  drs_count  throttle       rpm  \
869     0.549264     0.892617 -0.365358   2.009731 -1.271687 -0.433201   
1280    1.671117     0.576993  0.012737  -0.457052 -0.004591 -0.403560   
1334    0.594138    -0.001649  0.269287  -0.457052  0.082063 -0.063829   
601    -1.425198     0.103558  0.421215  -0.148704  0.268194  0.310677   
590     0.504390    -0.527688  0.538757   2.626427  0.259202 -0.051885   

      is_pit_out_lap  yellow_flag  
869        -0.194091    -0.385766  
1280       -0.194091    -0.385766  
1334       -0.194091    -0.385766  
601        -0.194091    -0.385766  
590        -0.194091    -0.385766  

Scaled Testing Data:
      lap_number  brake_count    n_gear  drs_count  throttle       rpm  \
51      0.594138    -0.212065 -0.014653  -0.457052  0.021302  0.247477   
168    -1.066205    -1.053727  0.003932  -0.457052  0.295862  0.788798   
1468   -0.348219    -1.106331  0.234408  -0.457052  0.577132  0.641625  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['lap_duration'].fillna(df['lap_duration'].mean(), inplace=True)
