In [2]:
import pandas as pd
df = pd.read_csv('Data3001Modelling.csv')

In [4]:
# Select only numeric columns
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Calculate correlations with the target variable
correlations = numeric_df.corr()['SECTION_TIME_MS'].sort_values(ascending=False)

# View the top and bottom correlations
print(correlations.head(15))  # High positive correlations
print(correlations.tail(15))  # High negative correlations


SECTION_TIME_MS        1.000000
522_CURR_LAPTIME       0.791883
495_CURR_LAPTIME       0.689533
MS_CURR_LAPTIME        0.654839
MT_CURR_LAPTIME        0.613664
468_CURR_LAPTIME       0.599119
MB_CURR_LAPTIME        0.552249
414_CURR_LAPTIME       0.543696
441_CURR_LAPTIME       0.536246
MS_DIST_APEX_1         0.522796
MS_LAP_DIST            0.432225
441_DIST_FROM_LEFT     0.425526
387_CURR_LAPTIME       0.404153
414_DIST_APEX_1        0.390600
414_DIST_FROM_RIGHT    0.364361
Name: SECTION_TIME_MS, dtype: float64
TOTAL_THROTTLE_495_522   -0.420383
MT_SPEED                 -0.421027
468_ENGINE_RPM           -0.444573
522_ENGINE_RPM           -0.450485
495_ENGINE_RPM           -0.454844
MS_GEAR                  -0.460889
MS_YPOS                  -0.472515
441_SPEED                -0.486042
468_GEAR                 -0.501002
MS_SPEED                 -0.519708
495_GEAR                 -0.547411
468_SPEED                -0.554445
495_SPEED                -0.578654
522_GEAR                 -0

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Prepare data
X = df.drop(['SECTION_TIME_MS', 'SESSION_IDENTIFIER', 'LAP_NUM'], axis=1)
y = df['SECTION_TIME_MS']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Get feature importances
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Display top features
print(feature_importances.head(15))


                   Feature  Importance
57         MS_CURR_LAPTIME    0.376648
201       522_CURR_LAPTIME    0.262919
196              522_SPEED    0.037590
60          MS_DIST_APEX_1    0.032487
185       495_CURR_LAPTIME    0.016800
164              468_SPEED    0.014424
165           468_LAP_DIST    0.013875
89         MT_CURR_LAPTIME    0.013414
213  TOTAL_BRAKING_414_441    0.013254
101           360_LAP_DIST    0.007608
81      MT_DIST_FROM_RIGHT    0.007576
51                 MS_YPOS    0.006793
54             MS_THROTTLE    0.006488
120           387_STEERING    0.006407
72             FT_STEERING    0.005694


In [6]:
from sklearn.feature_selection import VarianceThreshold

# Set a variance threshold
selector = VarianceThreshold(threshold=0.01)
selector.fit(X)

# Keep only high-variance features
high_variance_features = X.columns[selector.get_support()]
print("High-variance features:", high_variance_features)


High-variance features: Index(['FB_DIST_FROM_LEFT', 'FB_DIST_FROM_RIGHT', 'FB_XPOS', 'FB_YPOS',
       'FB_SPEED', 'FB_LAP_DIST', 'FB_THROTTLE', 'FB_BRAKE', 'FB_CURR_LAPTIME',
       'FB_GEAR',
       ...
       'TOTAL_THROTTLE_387_414', 'TOTAL_BRAKING_414_441',
       'TOTAL_THROTTLE_414_441', 'TOTAL_BRAKING_441_468',
       'TOTAL_THROTTLE_441_468', 'TOTAL_BRAKING_468_495',
       'TOTAL_THROTTLE_468_495', 'TOTAL_BRAKING_495_522',
       'TOTAL_THROTTLE_495_522', 'YEAR'],
      dtype='object', length=216)


In [7]:
from scipy.stats import f_oneway

# Apply ANOVA for selected groups of features based on specific key moments or points
anova_results = {}
for feature in X.columns:
    anova_results[feature] = f_oneway(X[feature], y).pvalue

# Filter features with p-values below a threshold (e.g., 0.05)
significant_features = [feature for feature, p_val in anova_results.items() if p_val < 0.05]
print("Significant features from ANOVA:", significant_features)


Significant features from ANOVA: ['FB_DIST_FROM_LEFT', 'FB_DIST_FROM_RIGHT', 'FB_XPOS', 'FB_YPOS', 'FB_SPEED', 'FB_LAP_DIST', 'FB_THROTTLE', 'FB_BRAKE', 'FB_STEERING', 'FB_CURR_LAPTIME', 'FB_GEAR', 'FB_ENGINE_RPM', 'FB_DIST_APEX_1', 'FB_DIST_APEX_2', 'FB_ANGLE_APEX_1', 'FB_ANGLE_APEX_2', 'MB_DIST_FROM_LEFT', 'MB_DIST_FROM_RIGHT', 'MB_XPOS', 'MB_YPOS', 'MB_SPEED', 'MB_LAP_DIST', 'MB_THROTTLE', 'MB_BRAKE', 'MB_STEERING', 'MB_CURR_LAPTIME', 'MB_GEAR', 'MB_ENGINE_RPM', 'MB_DIST_APEX_1', 'MB_DIST_APEX_2', 'MB_ANGLE_APEX_1', 'MB_ANGLE_APEX_2', 'FS_DIST_FROM_LEFT', 'FS_DIST_FROM_RIGHT', 'FS_XPOS', 'FS_YPOS', 'FS_SPEED', 'FS_LAP_DIST', 'FS_THROTTLE', 'FS_BRAKE', 'FS_STEERING', 'FS_CURR_LAPTIME', 'FS_GEAR', 'FS_ENGINE_RPM', 'FS_DIST_APEX_1', 'FS_DIST_APEX_2', 'FS_ANGLE_APEX_1', 'FS_ANGLE_APEX_2', 'MS_DIST_FROM_LEFT', 'MS_DIST_FROM_RIGHT', 'MS_XPOS', 'MS_YPOS', 'MS_SPEED', 'MS_LAP_DIST', 'MS_THROTTLE', 'MS_BRAKE', 'MS_STEERING', 'MS_CURR_LAPTIME', 'MS_GEAR', 'MS_ENGINE_RPM', 'MS_DIST_APEX_1', 'M