In [105]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from imblearn.over_sampling import RandomOverSampler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping
import os

In [102]:
# Read data
current_dir = os.getcwd()
prepared_path = os.path.normpath(os.path.join(current_dir, '..', '..', 'Data', 'Prepared'))
data = pd.read_csv(os.path.join(prepared_path, 'F1_prepared.csv'))

# Convert variables to categorical types
columns_to_factor = [
    'grid_t1', 'grid', 'driverstandings_position', 'teammates_driverstanding',
    'driverstandings_wins', 'constructorstandings_wins', 'quarter'
]
data[columns_to_factor] = data[columns_to_factor].astype('category')

# Select relevant columns
selected_columns = [
    'results_position_t1', 'drivers_takeover_chance', 'grid_t1', 'grid',
    'diff_grid_standing', 'teammates_driverstanding', 'overtakes_per_track_t1', 'drivers_defense',
    'driverstandings_wins', 'constructorstandings_wins', 'driverstandings_position', 
    'teammates_defense', 'teammates_takeover_chance', 'date_diff', 'is_round_1', 'year', 'date', 'raceId', 'driverId', 'round'
]

data = data[selected_columns].dropna()

# Label encode categorical columns
label_encoders = {}
for col in data.select_dtypes(['category']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Sort data by date and raceId
data = data.sort_values(by=['date'])

# Split data into train and test sets based on year
train_data = data[(data['year'] > 1990) & (data['year'] < 2023)]
test_data = data[data['year'] >= 2023]

X_train = train_data.drop(['results_position_t1', 'year', 'raceId', 'date'], axis=1)
y_train = train_data['results_position_t1']

X_test = test_data.drop(['results_position_t1', 'year', 'raceId', 'date'], axis=1)
y_test = test_data['results_position_t1']

# Check the distribution of the target variable
print("Training target distribution before oversampling:")
print(y_train.value_counts())

# Oversample the minority class
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Check the distribution of the target variable after oversampling
print("Training target distribution after oversampling:")
print(pd.Series(y_resampled).value_counts())

# Scale the features
scaler = MinMaxScaler()
X_resampled = scaler.fit_transform(X_resampled)
X_test = scaler.transform(X_test)

# Reshape for LSTM
# The reshaping should consider the sequences based on the 'date_diff'
def reshape_for_lstm(X, sequence_length=20):
    X_lstm = []
    for i in range(0, len(X) - sequence_length + 1):
        X_lstm.append(X[i:i + sequence_length])
    return np.array(X_lstm)

X_train_reshaped = reshape_for_lstm(X_resampled)
X_test_reshaped = reshape_for_lstm(X_test)

# Adjust y_resampled and y_test for LSTM input
def reshape_target(y, sequence_length=20):
    y_lstm = []
    for i in range(sequence_length - 1, len(y)):
        y_lstm.append(y[i])
    return np.array(y_lstm)

y_train_reshaped = reshape_target(y_resampled)
y_test_reshaped = reshape_target(y_test.values)

# Verify target variable distribution in the test set
print("Target variable distribution in the test set:")
print(y_test.value_counts())

Training target distribution before oversampling:
results_position_t1
0    10122
1     1643
Name: count, dtype: int64
Training target distribution after oversampling:
results_position_t1
0    10122
1    10122
Name: count, dtype: int64
Target variable distribution in the test set:
results_position_t1
0    381
1     69
Name: count, dtype: int64


  data = pd.read_csv(os.path.join(prepared_path, 'F1_prepared.csv'))


# Model trainer and tester LSTM

In [103]:
# Function to create LSTM model with hyperparameters
def create_model(neurons, dropout_rate, optimizer):
    model = Sequential()
    model.add(LSTM(neurons, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]), return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(neurons))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap Keras model for use in scikit-learn
model = KerasClassifier(build_fn=create_model, verbose=1)

# Define hyperparameters grid
param_grid = {
    'neurons': [20],
    'dropout_rate': [0.3],
    'optimizer': ['rmsprop'],
    'batch_size': [8],
    'epochs': [5]
}

# Implement grid search with cross-validation
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='precision')
grid_result = grid.fit(X_train_reshaped, y_train_reshaped, callbacks=[EarlyStopping(monitor='loss', patience=3)])

# Output best parameters and best score
print(f"Best parameters: {grid_result.best_params_}")
print(f"Best precision score: {grid_result.best_score_}")

# Predict on the test set with the best model
best_model = grid_result.best_estimator_
simple_probabilities = best_model.predict_proba(X_test_reshaped)[:, 1]

  model = KerasClassifier(build_fn=create_model, verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Best parameters: {'batch_size': 8, 'dropout_rate': 0.3, 'epochs': 5, 'neurons': 20, 'optimizer': 'rmsprop'}
Best precision score: 0.8486522217910005


In [104]:
# Ensure that y_test_reshaped matches the length of X_test_reshaped
y_test_reshaped = y_test_reshaped[:len(X_test_reshaped)]

# Create a data frame with the predictions and X_test values
simple_values = pd.DataFrame(X_test[:len(simple_probabilities)])
simple_values['predicted_probability'] = simple_probabilities
simple_values['results_position_t1'] = y_test_reshaped

# Group by 'raceId' and create 'top3' variable
simple_values['raceId'] = test_data.reset_index().loc[simple_values.index, 'raceId']
simple_values['top3'] = simple_values.groupby('raceId')['predicted_probability'].rank(method='min', ascending=False) <= 3

# Create the confusion matrix
simple_cm = confusion_matrix(simple_values['results_position_t1'], simple_values['top3'])

print("Confusion Matrix with Best Model:")
print(simple_cm)

# Calculate and print precision
precision = precision_score(simple_values['results_position_t1'], simple_values['top3'])
print(f"Precision: {precision}")

# Check some of the predictions
print("Predicted probabilities:", simple_probabilities[:10])
print("Actual test labels:", y_test_reshaped[:10])

Confusion Matrix with Best Model:
[[343  22]
 [ 21  45]]
Precision: 0.6716417910447762
Predicted probabilities: [0.0400831  0.00360181 0.00239264 0.00624136 0.0528387  0.08432904
 0.04975397 0.28000182 0.00140125 0.08636601]
Actual test labels: [0 0 0 0 0 1 0 1 0 0]


# XGBoost