# F1 Race Prediction Model

This notebook builds a machine learning model to predict F1 race results using the results.csv fact table and related CSV files. The final model will be saved as a .pkl file and wrapped in a FastAPI application.

## Project Overview
- **Data Source**: F1 CSV files (results.csv as fact table)
- **Goal**: Predict race finishing positions
- **Output**: Trained model (.pkl) + FastAPI application

## 1. Import Required Libraries

In [None]:
# Standard library
import os
import pickle
import warnings
warnings.filterwarnings('ignore')

# Data processing and ML libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# API libraries
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

print("All libraries imported successfully!")

## 2. Load and Explore Data

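Load the CSV files and explore the data structure. `results.csv` is our fact table: it links to the dimension tables (races, drivers, constructors, circuits) through their ID columns, and to `qualifying.csv` through `raceId` and `driverId`.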

In [None]:
# Data file paths
data_path = '../data/'

# Raw marker used for missing values in the source CSVs: a literal backslash
# followed by N (the cleaning step later in this notebook mentions it for the
# `position` column). Parsing it as NaN at load time lets numeric columns get a
# numeric dtype and makes isnull()-based quality checks count these cells.
NA_PLACEHOLDERS = ['\\N']


def load_table(filename):
    """Read one CSV from `data_path`, treating the missing-value marker as NaN.

    Parameters
    ----------
    filename : str
        File name relative to `data_path`, e.g. 'results.csv'.

    Returns
    -------
    pandas.DataFrame
        The parsed table. `na_values` extends pandas' default NA markers
        rather than replacing them.
    """
    return pd.read_csv(data_path + filename, na_values=NA_PLACEHOLDERS)


# Load CSV files
print("Loading F1 data...")
results_df = load_table('results.csv')
races_df = load_table('races.csv')
drivers_df = load_table('drivers.csv')
constructors_df = load_table('constructors.csv')
circuits_df = load_table('circuits.csv')
qualifying_df = load_table('qualifying.csv')

# Display basic info about the fact table (results)
print("Results DataFrame Info:")
print(f"Shape: {results_df.shape}")
print(f"Columns: {list(results_df.columns)}")
print("\nFirst few rows:")
results_df.head()

In [None]:
# Explore data quality and relationships

# Raw missing-value marker (a literal backslash followed by N). When the CSVs are
# read without treating it as NA, these cells are ordinary strings and isnull()
# does not see them, so they are counted explicitly here.
MISSING_PLACEHOLDER = '\\N'


def count_missing(frame):
    """Count cells that are NaN or still hold the raw missing-value marker string.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    int
        Number of cells that are null or equal to `MISSING_PLACEHOLDER`.
    """
    # eq() against a string is simply False for non-string cells.
    is_missing = frame.isnull() | frame.eq(MISSING_PLACEHOLDER)
    return int(is_missing.sum().sum())


print("Data Quality Check:")
print(f"Results missing values: {count_missing(results_df)}")
print(f"Races missing values: {count_missing(races_df)}")
print(f"Drivers missing values: {count_missing(drivers_df)}")

# Check target variable distribution (position).
# Coerce to numbers first: on a string column sort_index() orders the labels
# lexicographically ('1', '10', '11', ...), so head(10) would not show P1-P10.
position_numeric = pd.to_numeric(results_df['position'], errors='coerce')
print("\nPosition distribution:")
print(position_numeric.dropna().astype(int).value_counts().sort_index().head(10))

# Check for key relationships
print(f"\nUnique races in results: {results_df['raceId'].nunique()}")
print(f"Unique drivers in results: {results_df['driverId'].nunique()}")
print(f"Unique constructors in results: {results_df['constructorId'].nunique()}")

## 3. Data Preprocessing and Feature Engineering

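Clean the results table: convert `position` and `grid` to numeric values, drop rows without a numeric finishing position, and fill missing grid slots. The features used for prediction are selected after the tables are merged in the next section.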

In [None]:
# Clean and prepare the results data.
# Non-numeric position entries are coerced to NaN and those rows are removed:
# the model is trained on finishing positions only, not on DNF/DSQ outcomes.
finish_position = pd.to_numeric(results_df['position'], errors='coerce')
results_clean = (
    results_df
    .assign(position=finish_position)
    .dropna(subset=['position'])
)

# Grid slots: coerce to numeric, then place unknown starts one slot behind the
# largest grid value seen among the remaining rows.
grid_numeric = pd.to_numeric(results_clean['grid'], errors='coerce')
max_grid = grid_numeric.max()
results_clean = results_clean.assign(grid=grid_numeric.fillna(max_grid + 1))

print(f"Cleaned results shape: {results_clean.shape}")
print(f"Position range: {results_clean['position'].min()} to {results_clean['position'].max()}")
print(f"Grid range: {results_clean['grid'].min()} to {results_clean['grid'].max()}")

## 4. Merge Tables and Create Dataset

Join the results table with other dimension tables to create a comprehensive dataset.

In [None]:
# Join the cleaned results (fact table) with each dimension table.
#
# Every merge gets an explicit suffix for the right-hand table and leaves the
# left-hand (results) column names untouched. With pandas' default '_x'/'_y'
# suffixes, a column name shared by several tables is renamed differently at
# each step, and a second overlap on the same name can produce duplicate
# column labels, which recent pandas versions reject with a MergeError.
# NOTE(review): the overlapping names (e.g. name/url/time/number) are assumed
# from the usual schema of these CSVs - confirm against the actual files.
dataset = (
    results_clean
    # race info (year, round, circuitId, ...)
    .merge(races_df, on='raceId', how='left', suffixes=('', '_race'))
    # driver info
    .merge(drivers_df, on='driverId', how='left', suffixes=('', '_driver'))
    # constructor info
    .merge(constructors_df, on='constructorId', how='left', suffixes=('', '_constructor'))
    # circuit info (circuitId comes from the races merge above)
    .merge(circuits_df, on='circuitId', how='left', suffixes=('', '_circuit'))
)

# Add qualifying data (qualifying position as a feature)
def lap_times_to_seconds(times):
    """Convert a Series of lap times to float seconds.

    Values that are already numeric (or numeric strings) are kept exactly as
    `pd.to_numeric(..., errors='coerce')` would return them. Values written as
    a clock string "M:SS" or "M:SS.mmm" are converted to minutes * 60 + seconds.
    Anything else (missing values, placeholder strings) becomes NaN.

    NOTE(review): the clock format is assumed from how qualifying times are
    usually stored in this dataset - confirm against qualifying.csv. Plain
    numeric coercion alone would turn every such value into NaN.

    Parameters
    ----------
    times : pandas.Series
        Raw lap time column.

    Returns
    -------
    pandas.Series
        Lap times in seconds, indexed like `times`.
    """
    numeric = pd.to_numeric(times, errors='coerce')
    # astype(str) turns NaN into the text 'nan', which the pattern rejects.
    clock_parts = times.astype(str).str.strip().str.extract(
        r'^(?P<minutes>\d+):(?P<seconds>\d+(?:\.\d+)?)$'
    )
    clock_seconds = (
        pd.to_numeric(clock_parts['minutes'], errors='coerce') * 60
        + pd.to_numeric(clock_parts['seconds'], errors='coerce')
    )
    # Prefer the plain numeric reading; fall back to the clock reading.
    return numeric.fillna(clock_seconds)


qualifying_clean = qualifying_df.copy()
for session in ('q1', 'q2', 'q3'):
    qualifying_clean[session] = lap_times_to_seconds(qualifying_clean[session])

# Create a best qualifying time feature
def get_best_qualifying_time(row):
    """Return the smallest non-missing value among row['q1'], row['q2'], row['q3'].

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'q1', 'q2' and 'q3'.

    Returns
    -------
    The minimum of the values for which `pd.notna` is true, or None when all
    three are missing.
    """
    recorded = [
        row[session]
        for session in ('q1', 'q2', 'q3')
        if pd.notna(row[session])
    ]
    if not recorded:
        return None
    return min(recorded)

# Per-row best qualifying time (smallest available of q1/q2/q3).
best_times = qualifying_clean.apply(get_best_qualifying_time, axis=1)
qualifying_clean['best_qualifying_time'] = best_times

# Attach the qualifying features per (race, driver). The qualifying `position`
# clashes with the results `position` already in `dataset`, so the right-hand
# copy is suffixed and arrives as `position_qualifying`.
qualifying_features = qualifying_clean[['raceId', 'driverId', 'position', 'best_qualifying_time']]
dataset = pd.merge(
    dataset,
    qualifying_features,
    how='left',
    on=['raceId', 'driverId'],
    suffixes=('', '_qualifying'),
)

print(f"Final dataset shape: {dataset.shape}")
print(f"Columns: {list(dataset.columns)}")

In [None]:
# Feature engineering - create meaningful features for prediction
# Select key features for the model
feature_columns = [
    'grid',  # Starting grid position
    'driverId',  # Driver ID (will be encoded)
    'constructorId',  # Constructor ID (will be encoded)
    'circuitId',  # Circuit ID (will be encoded)
    'year',  # Year of race
    'round',  # Round number in season
    'position_qualifying',  # Qualifying position
    'best_qualifying_time'  # Best qualifying time
]

# Create the feature dataset
ml_dataset = dataset[feature_columns + ['position']].copy()

# Handle missing values.
# Missing qualifying position -> one slot behind the worst recorded qualifying
# position, the same "back of grid" rule the cleaning step applies to `grid`.
# A fixed 20 can rank a driver with no qualifying result ahead of drivers who
# actually qualified lower; 20 is kept only as a fallback when no qualifying
# position is available at all.
DEFAULT_BACK_OF_GRID = 20
worst_qualifying_position = ml_dataset['position_qualifying'].max()
back_of_grid = (
    worst_qualifying_position + 1
    if pd.notna(worst_qualifying_position)
    else DEFAULT_BACK_OF_GRID
)
ml_dataset['position_qualifying'] = ml_dataset['position_qualifying'].fillna(back_of_grid)

# Missing best time -> slowest recorded time. The column is coerced first so a
# column holding only None values still ends up as float NaN.
ml_dataset['best_qualifying_time'] = pd.to_numeric(
    ml_dataset['best_qualifying_time'], errors='coerce'
)
ml_dataset['best_qualifying_time'] = ml_dataset['best_qualifying_time'].fillna(
    ml_dataset['best_qualifying_time'].max()  # Slowest time for missing
)

# Remove any remaining NaN values
rows_before_dropna = len(ml_dataset)
ml_dataset = ml_dataset.dropna()

# Fail here with a clear message instead of later inside train_test_split: if a
# feature column is entirely NaN, the max-based fill above cannot repair it and
# dropna() removes every row.
if ml_dataset.empty:
    raise ValueError(
        f"All {rows_before_dropna} rows were dropped because of missing feature values; "
        f"check that every column in {feature_columns} was parsed correctly."
    )

print(f"ML dataset shape: {ml_dataset.shape}")
print(f"Features: {feature_columns}")
print(f"Target: position")
print(f"\nDataset info:")
ml_dataset.info()

## 5. Train-Test Split

Label-encode the categorical ID columns, then split the dataset into training (80%) and testing (20%) sets for model evaluation.

In [None]:
# Feature matrix and target vector
X = ml_dataset.loc[:, feature_columns].copy()
y = ml_dataset.loc[:, 'position'].copy()

# Label-encode the ID columns. One fitted encoder is kept per column so the
# same mapping can be reused when the model is served.
categorical_features = ['driverId', 'constructorId', 'circuitId']
encoders = {
    feature: LabelEncoder().fit(X[feature])
    for feature in categorical_features
}
for feature, encoder in encoders.items():
    X[feature] = encoder.transform(X[feature])

# Random 80/20 hold-out split with a fixed seed (no stratification)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training target distribution: min={y_train.min()}, max={y_train.max()}, mean={y_train.mean():.2f}")
print(f"Test target distribution: min={y_test.min()}, max={y_test.max()}, mean={y_test.mean():.2f}")

## 6. Model Training and Evaluation

Train multiple models and select the best performing one.

In [None]:
# Candidate regressors, keyed by display name
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}


def _regression_scores(y_true, y_pred):
    """Return (MAE, RMSE, R²) for one set of predictions."""
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )


# Fit each candidate and record its train/test metrics
model_results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    model.fit(X_train, y_train)

    # Predictions on both splits, so train and test scores can be compared
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_mae, train_rmse, train_r2 = _regression_scores(y_train, y_pred_train)
    test_mae, test_rmse, test_r2 = _regression_scores(y_test, y_pred_test)

    model_results[name] = dict(
        model=model,
        train_mae=train_mae,
        test_mae=test_mae,
        train_rmse=train_rmse,
        test_rmse=test_rmse,
        train_r2=train_r2,
        test_r2=test_r2,
    )

    print(f"  Train MAE: {train_mae:.3f}, Test MAE: {test_mae:.3f}")
    print(f"  Train RMSE: {train_rmse:.3f}, Test RMSE: {test_rmse:.3f}")
    print(f"  Train R²: {train_r2:.3f}, Test R²: {test_r2:.3f}")

# The winner is the candidate with the lowest test MAE
best_model_name = min(model_results, key=lambda candidate: model_results[candidate]['test_mae'])
best_entry = model_results[best_model_name]
best_model = best_entry['model']

print(f"\n🏆 Best model: {best_model_name}")
print(f"Test MAE: {best_entry['test_mae']:.3f}")
print(f"Test R²: {best_entry['test_r2']:.3f}")

In [None]:
# Feature importance analysis (skipped for estimators without `feature_importances_`)
if hasattr(best_model, 'feature_importances_'):
    importance_table = pd.DataFrame(
        {'feature': feature_columns, 'importance': best_model.feature_importances_}
    )
    feature_importance = importance_table.sort_values('importance', ascending=False)

    print("\n📊 Feature Importance:")
    print(feature_importance)

    # Horizontal bar chart, most important feature first
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
    ax.set_title(f'{best_model_name} - Feature Importance')
    ax.set_xlabel('Importance')
    fig.tight_layout()
    plt.show()

## 7. Save Model as Pickle File

Save the trained model and encoders for use in the API.

In [None]:
# Create model package with everything needed for prediction.
# Note: model_results[best_model_name] also holds the fitted estimator under its
# 'model' key, so 'model_metrics' is not a metrics-only dict.
model_package = {
    'model': best_model,
    'encoders': encoders,
    'feature_columns': feature_columns,
    'model_name': best_model_name,
    'model_metrics': model_results[best_model_name]
}

# Save the model package
model_path = '../models/f1_race_prediction_model.pkl'

# Create the target directory if it doesn't exist. It is derived from
# model_path so the directory and the file path cannot drift apart.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump(model_package, f)

print(f"✅ Model saved to: {model_path}")
print(f"Model type: {best_model_name}")
print(f"Test MAE: {model_results[best_model_name]['test_mae']:.3f}")
print(f"Features: {feature_columns}")

# Verify the saved model by loading it.
# pickle.load can execute arbitrary code from the file; it is used here only on
# the file this cell has just written. Do not load untrusted .pkl files.
with open(model_path, 'rb') as f:
    loaded_model_package = pickle.load(f)

# Round-trip check: the reloaded estimator must reproduce the in-memory model's
# predictions on a held-out sample before success is reported.
verification_sample = X_test.head(100)
if not np.allclose(
    loaded_model_package['model'].predict(verification_sample),
    best_model.predict(verification_sample),
):
    raise RuntimeError(
        f"Model reloaded from {model_path} does not reproduce the in-memory model's predictions"
    )

print(f"\n✅ Model verification successful!")
print(f"Loaded model type: {type(loaded_model_package['model'])}")