In [26]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

# Load the dataset
final_dataset = pd.read_csv('cleaned_data/final_dataset.csv')

# Create additional features
def add_features(df):
    df['grid_squared'] = df['grid'] ** 2
    df['laps_per_grid'] = df['laps'] / (df['grid'] + 1)
    df['points_per_grid'] = df['points'] / (df['grid'] + 1)
    return df

# Add features to the dataset
final_dataset_encoded = pd.get_dummies(final_dataset, columns=['circuit_name', 'location', 'country', 'surname', 'nationality', 'constructor_name', 'status'])
final_dataset_encoded = add_features(final_dataset_encoded)

# Split data into features and target
X = final_dataset_encoded.drop(columns=['positionOrder'])
y = final_dataset_encoded['positionOrder']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Train the model with the best parameters
model = RandomForestClassifier(**best_params, random_state=42)
model.fit(X_train, y_train)

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean Cross-Validation Score: {cv_scores.mean()}')

# Evaluate the model on the test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Model Accuracy: {accuracy}')
print(f'Model F1 Score: {f1}')


Fitting 5 folds for each of 256 candidates, totalling 1280 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END max_d

In [29]:
# Create a template with all possible features
def create_features_template(encoded_data):
    template = pd.DataFrame(columns=encoded_data.columns)
    return template

# Create a template based on the training data
input_template = create_features_template(X_train)

# Create features based on inputs
def create_features(race_id, driver_id, grid_position, laps, weather_condition, template):
    features = template.copy()
    features.loc[0] = 0  # Initialize all features to zero
    features.at[0, 'raceId'] = race_id
    features.at[0, 'driverId'] = driver_id
    features.at[0, 'grid'] = grid_position
    features.at[0, 'grid_squared'] = grid_position ** 2
    features.at[0, 'laps_per_grid'] = laps / (grid_position + 1)
    features.at[0, 'points_per_grid'] = 0  # Placeholder, needs actual points data if available

    # Handle one-hot encoding for weather condition
    if f'weather_condition_{weather_condition}' in features.columns:
        features.at[0, f'weather_condition_{weather_condition}'] = 1
    
    return features.fillna(0)

# Example user input
race_id = 1077  # Example race ID
driver_id = 830  # Example driver ID
grid_position = 1  # Example grid position
laps = 50  # Example number of laps
weather_condition = 'Neutral'  # Example weather condition

input_features = create_features(race_id, driver_id, grid_position, laps, weather_condition, input_template)

# Predict the grid order
predicted_position = model.predict(input_features)
print(f'Predicted Position for Driver {driver_id} in Race {race_id} starting at Grid {grid_position}: {predicted_position[0]}')


Predicted Position for Driver 830 in Race 1077 starting at Grid 1: 20
