In [1]:
# ========================================
# F1 Race Winner Predictor V2
# Section 1: Setup & Configuration
# ========================================

import os
import warnings
warnings.filterwarnings('ignore')

# Core Data Science
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# F1 Data
import fastf1
from fastf1.ergast import Ergast

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import lightgbm as lgb

# MLflow for experiment tracking
import mlflow
import mlflow.sklearn

# Model Explainability
import shap

# Utils
import time
from datetime import datetime
from dotenv import load_dotenv
import requests

# Load environment variables
load_dotenv()

# Configure FastF1
fastf1.ergast.interface.BASE_URL = "https://api.jolpi.ca/ergast/f1"

# Create cache directory if it doesn't exist
cache_dir = '../data/raw/f1_cache'
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)
fastf1.Cache.enable_cache(cache_dir)

# Configure MLflow
mlflow.set_tracking_uri("../mlruns")
mlflow.set_experiment("F1_Race_Prediction_V2")

# Plot styling
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")
print(f"📦 FastF1 Version: {fastf1.__version__}")
print(f"📦 XGBoost Version: {xgb.__version__}")
print(f"📦 Pandas Version: {pd.__version__}")
print(f"📊 MLflow Tracking URI: {mlflow.get_tracking_uri()}")
print(f"💾 Cache Directory: {cache_dir}")
print("\n🏎️ F1 Race Predictor V2 - Ready to start!")


2025/10/25 11:33:47 INFO mlflow.tracking.fluent: Experiment with name 'F1_Race_Prediction_V2' does not exist. Creating a new experiment.


✅ All libraries imported successfully!
📦 FastF1 Version: 3.6.1
📦 XGBoost Version: 3.1.1
📦 Pandas Version: 2.3.3
📊 MLflow Tracking URI: ../mlruns
💾 Cache Directory: ../data/raw/f1_cache

🏎️ F1 Race Predictor V2 - Ready to start!


In [5]:
# ========================================
# Section 2A: Data Collection - Race Results
# ========================================

def collect_race_results(start_year=2020, end_year=2025):
    """
    Collect race results from Jolpica F1 API

    Parameters:
    - start_year: First season to collect (default: 2020)
    - end_year: Last season to collect (default: 2025)

    Returns:
    - DataFrame with all race results
    """
    ergast = Ergast()
    all_race_results = []

    print(f"📊 Collecting race results ({start_year}-{end_year})...\n")

    for year in range(start_year, end_year + 1):
        print(f"Fetching {year} season data...")

        # Get race schedule for this year
        try:
            races = ergast.get_race_schedule(year)
            total_races = len(races)
            print(f"   Total races in {year}: {total_races}")
        except Exception as e:
            print(f"   ❌ Could not fetch {year} schedule: {e}")
            continue

        # Collect each race
        for race_round in range(1, total_races + 1):
            try:
                race_results = ergast.get_race_results(season=year, round=race_round)

                if race_results.content and len(race_results.content) > 0:
                    race_df = race_results.content[0]
                    race_df['season'] = year
                    race_df['round'] = race_round
                    all_race_results.append(race_df)
                    print(f"   ✅ Round {race_round}: {len(race_df)} drivers")
                else:
                    print(f"   ⚠️  Round {race_round}: Not completed yet")

                # Rate limiting
                time.sleep(2)

            except Exception as e:
                if "Too Many Requests" in str(e):
                    print(f"   ⏳ Rate limited at Round {race_round}. Waiting 10 seconds...")
                    time.sleep(10)
                else:
                    print(f"   ❌ Round {race_round}: {e}")
                continue

        print()

    # Combine all results
    if len(all_race_results) == 0:
        print("❌ No race data collected")
        return None

    full_dataset = pd.concat(all_race_results, ignore_index=True)

    print(f"✅ Race data collection complete!")
    print(f"📊 Total races: {len(all_race_results)}")
    print(f"📊 Total records: {len(full_dataset)}")
    print(f"📊 Unique drivers: {full_dataset['driverCode'].nunique()}")
    print(f"📊 Unique constructors: {full_dataset['constructorName'].nunique()}")

    # Save to CSV
    output_path = '../data/raw/race_results_2020_2025.csv'
    full_dataset.to_csv(output_path, index=False)
    print(f"💾 Saved to: {output_path}")

    return full_dataset

# Test the function
print("Testing race results collection...\n")
race_data = collect_race_results(start_year=2020, end_year=2025)

# Display sample
if race_data is not None:
    print("\n📋 Sample Data (first 10 rows):")
    display(race_data[['season', 'round', 'driverCode', 'constructorName', 'position', 'points']].head(10))


Testing race results collection...

📊 Collecting race results (2020-2025)...

Fetching 2020 season data...
   Total races in 2020: 17
   ✅ Round 1: 20 drivers
   ✅ Round 2: 20 drivers
   ✅ Round 3: 20 drivers
   ✅ Round 4: 20 drivers
   ✅ Round 5: 20 drivers
   ✅ Round 6: 20 drivers
   ✅ Round 7: 20 drivers
   ✅ Round 8: 20 drivers
   ✅ Round 9: 20 drivers
   ✅ Round 10: 20 drivers
   ✅ Round 11: 20 drivers
   ✅ Round 12: 20 drivers
   ✅ Round 13: 20 drivers
   ✅ Round 14: 20 drivers
   ✅ Round 15: 20 drivers
   ✅ Round 16: 20 drivers
   ✅ Round 17: 20 drivers

Fetching 2021 season data...
   Total races in 2021: 22
   ✅ Round 1: 20 drivers
   ✅ Round 2: 20 drivers
   ✅ Round 3: 20 drivers
   ✅ Round 4: 20 drivers
   ✅ Round 5: 20 drivers
   ✅ Round 6: 20 drivers
   ✅ Round 7: 20 drivers
   ✅ Round 8: 20 drivers
   ✅ Round 9: 20 drivers
   ✅ Round 10: 20 drivers
   ✅ Round 11: 20 drivers
   ✅ Round 12: 20 drivers
   ✅ Round 13: 20 drivers
   ✅ Round 14: 20 drivers
   ✅ Round 15: 20 dri

Unnamed: 0,season,round,driverCode,constructorName,position,points
0,2020,1,BOT,Mercedes,1,25.0
1,2020,1,LEC,Ferrari,2,18.0
2,2020,1,NOR,McLaren,3,16.0
3,2020,1,HAM,Mercedes,4,12.0
4,2020,1,SAI,McLaren,5,10.0
5,2020,1,PER,Racing Point,6,8.0
6,2020,1,GAS,AlphaTauri,7,6.0
7,2020,1,OCO,Renault,8,4.0
8,2020,1,GIO,Alfa Romeo,9,2.0
9,2020,1,VET,Ferrari,10,1.0


In [4]:
# ========================================
# Section 2B: Data Collection - Qualifying Results
# ========================================

def collect_qualifying_results(start_year=2020, end_year=2025):
    """
    Collect qualifying results from Jolpica F1 API

    Parameters:
    - start_year: First season to collect (default: 2020)
    - end_year: Last season to collect (default: 2025)

    Returns:
    - DataFrame with all qualifying results
    """
    ergast = Ergast()
    all_qualifying_results = []

    print(f"📊 Collecting qualifying results ({start_year}-{end_year})...\n")

    for year in range(start_year, end_year + 1):
        print(f"Fetching {year} qualifying data...")

        try:
            races = ergast.get_race_schedule(year)
            total_races = len(races)
        except Exception as e:
            print(f"   ❌ Could not fetch {year} schedule: {e}")
            continue

        for race_round in range(1, total_races + 1):
            try:
                quali_results = ergast.get_qualifying_results(season=year, round=race_round)

                if quali_results.content and len(quali_results.content) > 0:
                    quali_df = quali_results.content[0]
                    quali_df['season'] = year
                    quali_df['round'] = race_round
                    all_qualifying_results.append(quali_df)
                    print(f"   ✅ Round {race_round}: {len(quali_df)} drivers")
                else:
                    print(f"   ⚠️  Round {race_round}: Not available yet")

                time.sleep(2)

            except Exception as e:
                if "Too Many Requests" in str(e):
                    print(f"   ⏳ Rate limited at Round {race_round}. Waiting 10 seconds...")
                    time.sleep(10)
                else:
                    print(f"   ❌ Round {race_round}: {e}")
                continue

        print()

    if len(all_qualifying_results) == 0:
        print("❌ No qualifying data collected")
        return None

    full_dataset = pd.concat(all_qualifying_results, ignore_index=True)

    print(f"✅ Qualifying data collection complete!")
    print(f"📊 Total qualifying sessions: {len(all_qualifying_results)}")
    print(f"📊 Total records: {len(full_dataset)}")

    # Save to CSV
    output_path = '../data/raw/qualifying_results_2020_2025.csv'
    full_dataset.to_csv(output_path, index=False)
    print(f"💾 Saved to: {output_path}")

    return full_dataset

# Collect qualifying data
print("Collecting qualifying data...\n")
quali_data = collect_qualifying_results(start_year=2020, end_year=2025)

# Display sample
if quali_data is not None:
    print("\n📋 Sample Qualifying Data (first 10 rows):")
    display(quali_data[['season', 'round', 'driverCode', 'constructorName', 'position', 'Q1', 'Q2', 'Q3']].head(10))


Collecting qualifying data...

📊 Collecting qualifying results (2020-2025)...

Fetching 2020 qualifying data...
   ✅ Round 1: 20 drivers
   ✅ Round 2: 20 drivers
   ✅ Round 3: 20 drivers
   ✅ Round 4: 20 drivers
   ✅ Round 5: 20 drivers
   ✅ Round 6: 20 drivers
   ✅ Round 7: 20 drivers
   ✅ Round 8: 20 drivers
   ✅ Round 9: 20 drivers
   ✅ Round 10: 20 drivers
   ✅ Round 11: 20 drivers
   ✅ Round 12: 20 drivers
   ✅ Round 13: 20 drivers
   ✅ Round 14: 20 drivers
   ✅ Round 15: 20 drivers
   ✅ Round 16: 20 drivers
   ✅ Round 17: 20 drivers

Fetching 2021 qualifying data...
   ✅ Round 1: 20 drivers
   ✅ Round 2: 20 drivers
   ✅ Round 3: 20 drivers
   ✅ Round 4: 20 drivers
   ✅ Round 5: 19 drivers
   ✅ Round 6: 20 drivers
   ✅ Round 7: 20 drivers
   ✅ Round 8: 20 drivers
   ✅ Round 9: 20 drivers
   ✅ Round 10: 20 drivers
   ✅ Round 11: 20 drivers
   ✅ Round 12: 20 drivers
   ✅ Round 13: 20 drivers
   ✅ Round 14: 20 drivers
   ✅ Round 15: 20 drivers
   ✅ Round 16: 20 drivers
   ✅ Round 17:

Unnamed: 0,season,round,driverCode,constructorName,position,Q1,Q2,Q3
0,2020,1,BOT,Mercedes,1,0 days 00:01:04.111000,0 days 00:01:03.015000,0 days 00:01:02.939000
1,2020,1,HAM,Mercedes,2,0 days 00:01:04.198000,0 days 00:01:03.096000,0 days 00:01:02.951000
2,2020,1,VER,Red Bull,3,0 days 00:01:04.024000,0 days 00:01:04,0 days 00:01:03.477000
3,2020,1,NOR,McLaren,4,0 days 00:01:04.606000,0 days 00:01:03.819000,0 days 00:01:03.626000
4,2020,1,ALB,Red Bull,5,0 days 00:01:04.661000,0 days 00:01:03.746000,0 days 00:01:03.868000
5,2020,1,PER,Racing Point,6,0 days 00:01:04.543000,0 days 00:01:03.860000,0 days 00:01:03.868000
6,2020,1,LEC,Ferrari,7,0 days 00:01:04.500000,0 days 00:01:04.041000,0 days 00:01:03.923000
7,2020,1,SAI,McLaren,8,0 days 00:01:04.537000,0 days 00:01:03.971000,0 days 00:01:03.971000
8,2020,1,STR,Racing Point,9,0 days 00:01:04.309000,0 days 00:01:03.955000,0 days 00:01:04.029000
9,2020,1,RIC,Renault,10,0 days 00:01:04.556000,0 days 00:01:04.023000,0 days 00:01:04.239000


In [8]:
# ========================================
# Section 2C: Complete Circuit Metadata (2020-2025)
# ========================================

def create_complete_circuit_metadata():
    """
    Complete circuit characteristics for ALL tracks used in F1 (2020-2025)
    Covers all 24+ circuits from the past 6 seasons
    """

    circuits = {
        # Traditional Permanent Circuits
        'Albert Park Grand Prix Circuit': {
            'country': 'Australia', 'city': 'Melbourne', 'circuit_type': 'street',
            'length_km': 5.278, 'corners': 14, 'straights_length_m': 850,
            'elevation_change_m': 16, 'avg_speed_kmh': 223, 'overtaking_difficulty': 'medium',
            'tire_wear': 'medium', 'downforce_level': 'medium'
        },
        'Shanghai International Circuit': {
            'country': 'China', 'city': 'Shanghai', 'circuit_type': 'permanent',
            'length_km': 5.451, 'corners': 16, 'straights_length_m': 1170,
            'elevation_change_m': 7, 'avg_speed_kmh': 207, 'overtaking_difficulty': 'medium',
            'tire_wear': 'high', 'downforce_level': 'medium'
        },
        'Bahrain International Circuit': {
            'country': 'Bahrain', 'city': 'Sakhir', 'circuit_type': 'permanent',
            'length_km': 5.412, 'corners': 15, 'straights_length_m': 1090,
            'elevation_change_m': 31, 'avg_speed_kmh': 202, 'overtaking_difficulty': 'easy',
            'tire_wear': 'high', 'downforce_level': 'low'
        },
        'Jeddah Corniche Circuit': {
            'country': 'Saudi Arabia', 'city': 'Jeddah', 'circuit_type': 'street',
            'length_km': 6.174, 'corners': 27, 'straights_length_m': 1000,
            'elevation_change_m': 18, 'avg_speed_kmh': 252, 'overtaking_difficulty': 'medium',
            'tire_wear': 'medium', 'downforce_level': 'low'
        },
        'Suzuka Circuit': {
            'country': 'Japan', 'city': 'Suzuka', 'circuit_type': 'permanent',
            'length_km': 5.807, 'corners': 18, 'straights_length_m': 800,
            'elevation_change_m': 44, 'avg_speed_kmh': 223, 'overtaking_difficulty': 'medium',
            'tire_wear': 'high', 'downforce_level': 'high'
        },
        'Miami International Autodrome': {
            'country': 'USA', 'city': 'Miami', 'circuit_type': 'street',
            'length_km': 5.412, 'corners': 19, 'straights_length_m': 1070,
            'elevation_change_m': 8, 'avg_speed_kmh': 225, 'overtaking_difficulty': 'medium',
            'tire_wear': 'high', 'downforce_level': 'low'
        },
        'Autodromo Enzo e Dino Ferrari': {
            'country': 'Italy', 'city': 'Imola', 'circuit_type': 'permanent',
            'length_km': 4.909, 'corners': 19, 'straights_length_m': 700,
            'elevation_change_m': 37, 'avg_speed_kmh': 211, 'overtaking_difficulty': 'hard',
            'tire_wear': 'medium', 'downforce_level': 'medium'
        },
        'Circuit de Monaco': {
            'country': 'Monaco', 'city': 'Monte Carlo', 'circuit_type': 'street',
            'length_km': 3.337, 'corners': 19, 'straights_length_m': 550,
            'elevation_change_m': 42, 'avg_speed_kmh': 162, 'overtaking_difficulty': 'very_hard',
            'tire_wear': 'low', 'downforce_level': 'very_high'
        },
        'Circuit de Barcelona-Catalunya': {
            'country': 'Spain', 'city': 'Barcelona', 'circuit_type': 'permanent',
            'length_km': 4.657, 'corners': 14, 'straights_length_m': 1047,
            'elevation_change_m': 38, 'avg_speed_kmh': 195, 'overtaking_difficulty': 'hard',
            'tire_wear': 'high', 'downforce_level': 'high'
        },
        'Circuit Gilles Villeneuve': {
            'country': 'Canada', 'city': 'Montreal', 'circuit_type': 'semi-permanent',
            'length_km': 4.361, 'corners': 14, 'straights_length_m': 980,
            'elevation_change_m': 12, 'avg_speed_kmh': 220, 'overtaking_difficulty': 'easy',
            'tire_wear': 'low', 'downforce_level': 'low'
        },
        'Red Bull Ring': {
            'country': 'Austria', 'city': 'Spielberg', 'circuit_type': 'permanent',
            'length_km': 4.318, 'corners': 10, 'straights_length_m': 700,
            'elevation_change_m': 65, 'avg_speed_kmh': 237, 'overtaking_difficulty': 'medium',
            'tire_wear': 'medium', 'downforce_level': 'low'
        },
        'Silverstone Circuit': {
            'country': 'UK', 'city': 'Silverstone', 'circuit_type': 'permanent',
            'length_km': 5.891, 'corners': 18, 'straights_length_m': 870,
            'elevation_change_m': 17, 'avg_speed_kmh': 235, 'overtaking_difficulty': 'medium',
            'tire_wear': 'high', 'downforce_level': 'high'
        },
        'Hungaroring': {
            'country': 'Hungary', 'city': 'Budapest', 'circuit_type': 'permanent',
            'length_km': 4.381, 'corners': 14, 'straights_length_m': 650,
            'elevation_change_m': 43, 'avg_speed_kmh': 195, 'overtaking_difficulty': 'very_hard',
            'tire_wear': 'medium', 'downforce_level': 'very_high'
        },
        'Circuit de Spa-Francorchamps': {
            'country': 'Belgium', 'city': 'Spa', 'circuit_type': 'permanent',
            'length_km': 7.004, 'corners': 20, 'straights_length_m': 730,
            'elevation_change_m': 104, 'avg_speed_kmh': 237, 'overtaking_difficulty': 'medium',
            'tire_wear': 'medium', 'downforce_level': 'medium'
        },
        'Circuit Park Zandvoort': {
            'country': 'Netherlands', 'city': 'Zandvoort', 'circuit_type': 'permanent',
            'length_km': 4.259, 'corners': 14, 'straights_length_m': 580,
            'elevation_change_m': 4, 'avg_speed_kmh': 222, 'overtaking_difficulty': 'hard',
            'tire_wear': 'high', 'downforce_level': 'high'
        },
        'Autodromo Nazionale di Monza': {
            'country': 'Italy', 'city': 'Monza', 'circuit_type': 'permanent',
            'length_km': 5.793, 'corners': 11, 'straights_length_m': 1200,
            'elevation_change_m': 25, 'avg_speed_kmh': 264, 'overtaking_difficulty': 'easy',
            'tire_wear': 'low', 'downforce_level': 'very_low'
        },
        'Baku City Circuit': {
            'country': 'Azerbaijan', 'city': 'Baku', 'circuit_type': 'street',
            'length_km': 6.003, 'corners': 20, 'straights_length_m': 2200,
            'elevation_change_m': 21, 'avg_speed_kmh': 221, 'overtaking_difficulty': 'medium',
            'tire_wear': 'medium', 'downforce_level': 'low'
        },
        'Marina Bay Street Circuit': {
            'country': 'Singapore', 'city': 'Singapore', 'circuit_type': 'street',
            'length_km': 4.94, 'corners': 23, 'straights_length_m': 820,
            'elevation_change_m': 15, 'avg_speed_kmh': 173, 'overtaking_difficulty': 'hard',
            'tire_wear': 'medium', 'downforce_level': 'very_high'
        },
        'Circuit of the Americas': {
            'country': 'USA', 'city': 'Austin', 'circuit_type': 'permanent',
            'length_km': 5.513, 'corners': 20, 'straights_length_m': 1200,
            'elevation_change_m': 41, 'avg_speed_kmh': 210, 'overtaking_difficulty': 'medium',
            'tire_wear': 'medium', 'downforce_level': 'medium'
        },
        'Autódromo Hermanos Rodríguez': {
            'country': 'Mexico', 'city': 'Mexico City', 'circuit_type': 'permanent',
            'length_km': 4.304, 'corners': 17, 'straights_length_m': 1200,
            'elevation_change_m': 14, 'avg_speed_kmh': 217, 'overtaking_difficulty': 'medium',
            'tire_wear': 'medium', 'downforce_level': 'low'
        },
        'Autódromo José Carlos Pace': {
            'country': 'Brazil', 'city': 'São Paulo', 'circuit_type': 'permanent',
            'length_km': 4.309, 'corners': 15, 'straights_length_m': 820,
            'elevation_change_m': 41, 'avg_speed_kmh': 211, 'overtaking_difficulty': 'medium',
            'tire_wear': 'medium', 'downforce_level': 'medium'
        },
        'Las Vegas Street Circuit': {
            'country': 'USA', 'city': 'Las Vegas', 'circuit_type': 'street',
            'length_km': 6.120, 'corners': 17, 'straights_length_m': 1900,
            'elevation_change_m': 12, 'avg_speed_kmh': 234, 'overtaking_difficulty': 'medium',
            'tire_wear': 'low', 'downforce_level': 'very_low'
        },
        'Losail International Circuit': {
            'country': 'Qatar', 'city': 'Doha', 'circuit_type': 'permanent',
            'length_km': 5.380, 'corners': 16, 'straights_length_m': 1068,
            'elevation_change_m': 10, 'avg_speed_kmh': 220, 'overtaking_difficulty': 'medium',
            'tire_wear': 'high', 'downforce_level': 'medium'
        },
        'Yas Marina Circuit': {
            'country': 'UAE', 'city': 'Abu Dhabi', 'circuit_type': 'permanent',
            'length_km': 5.281, 'corners': 16, 'straights_length_m': 1170,
            'elevation_change_m': 26, 'avg_speed_kmh': 196, 'overtaking_difficulty': 'medium',
            'tire_wear': 'medium', 'downforce_level': 'medium'
        }
    }

    # Convert to DataFrame
    circuits_df = pd.DataFrame.from_dict(circuits, orient='index')
    circuits_df.reset_index(inplace=True)
    circuits_df.rename(columns={'index': 'circuit_name'}, inplace=True)

    # Encode categorical variables
    circuit_type_map = {'street': 0, 'semi-permanent': 1, 'permanent': 2}
    overtaking_map = {'very_hard': 0, 'hard': 1, 'medium': 2, 'easy': 3}
    tire_wear_map = {'low': 0, 'medium': 1, 'high': 2}
    downforce_map = {'very_low': 0, 'low': 1, 'medium': 2, 'high': 3, 'very_high': 4}

    circuits_df['circuit_type_encoded'] = circuits_df['circuit_type'].map(circuit_type_map)
    circuits_df['overtaking_difficulty_encoded'] = circuits_df['overtaking_difficulty'].map(overtaking_map)
    circuits_df['tire_wear_encoded'] = circuits_df['tire_wear'].map(tire_wear_map)
    circuits_df['downforce_level_encoded'] = circuits_df['downforce_level'].map(downforce_map)

    return circuits_df

# Create complete circuit metadata
print("Creating COMPLETE circuit metadata database (2020-2025)...\n")
circuits_df = create_complete_circuit_metadata()

# Save to CSV
circuits_df.to_csv('../data/circuits/circuit_characteristics.csv', index=False)
print(f"✅ Complete circuit metadata created!")
print(f"📊 Total circuits: {len(circuits_df)}")
print(f"💾 Saved to: ../data/circuits/circuit_characteristics.csv\n")

# Display circuits
print("🏁 All Circuit Characteristics (2020-2025):")
display(circuits_df[['circuit_name', 'country', 'circuit_type', 'length_km', 'corners',
                      'avg_speed_kmh', 'overtaking_difficulty', 'downforce_level']])


Creating COMPLETE circuit metadata database (2020-2025)...

✅ Complete circuit metadata created!
📊 Total circuits: 24
💾 Saved to: ../data/circuits/circuit_characteristics.csv

🏁 All Circuit Characteristics (2020-2025):


Unnamed: 0,circuit_name,country,circuit_type,length_km,corners,avg_speed_kmh,overtaking_difficulty,downforce_level
0,Albert Park Grand Prix Circuit,Australia,street,5.278,14,223,medium,medium
1,Shanghai International Circuit,China,permanent,5.451,16,207,medium,medium
2,Bahrain International Circuit,Bahrain,permanent,5.412,15,202,easy,low
3,Jeddah Corniche Circuit,Saudi Arabia,street,6.174,27,252,medium,low
4,Suzuka Circuit,Japan,permanent,5.807,18,223,medium,high
5,Miami International Autodrome,USA,street,5.412,19,225,medium,low
6,Autodromo Enzo e Dino Ferrari,Italy,permanent,4.909,19,211,hard,medium
7,Circuit de Monaco,Monaco,street,3.337,19,162,very_hard,very_high
8,Circuit de Barcelona-Catalunya,Spain,permanent,4.657,14,195,hard,high
9,Circuit Gilles Villeneuve,Canada,semi-permanent,4.361,14,220,easy,low


In [9]:
# ========================================
# Section 3A: Merge Datasets
# ========================================

print("Loading collected datasets...\n")

# Load race results
race_data = pd.read_csv('../data/raw/race_results_2020_2025.csv')
print(f"✅ Loaded race data: {len(race_data)} records")

# Load qualifying results
quali_data = pd.read_csv('../data/raw/qualifying_results_2020_2025.csv')
print(f"✅ Loaded qualifying data: {len(quali_data)} records")

# Load circuit metadata
circuits_data = pd.read_csv('../data/circuits/circuit_characteristics.csv')
print(f"✅ Loaded circuit data: {len(circuits_data)} circuits\n")

# Sort data by season and round
race_data = race_data.sort_values(['season', 'round', 'position']).reset_index(drop=True)
quali_data = quali_data.sort_values(['season', 'round', 'position']).reset_index(drop=True)

# Create unique race identifier
race_data['race_id'] = race_data['season'].astype(str) + '_' + race_data['round'].astype(str)
quali_data['race_id'] = quali_data['season'].astype(str) + '_' + quali_data['round'].astype(str)

print("Merging datasets...\n")

# Step 1: Merge race data with qualifying data
# Keep qualifying grid position and Q times
quali_merge = quali_data[['race_id', 'driverCode', 'position', 'Q1', 'Q2', 'Q3']].copy()
quali_merge.rename(columns={'position': 'grid_position'}, inplace=True)

merged_data = race_data.merge(
    quali_merge,
    on=['race_id', 'driverCode'],
    how='left'
)

print(f"✅ Merged race + qualifying data: {len(merged_data)} records")

# Step 2: Get circuit names from race data
# We need to extract circuit name from raceName or use circuitId
# For now, let's check what columns we have
print(f"\n📋 Available columns in race data:")
print([col for col in race_data.columns if 'circuit' in col.lower() or 'race' in col.lower()])

# Check if we have circuit information
if 'circuitId' in race_data.columns:
    print(f"\n✅ Found circuitId column")
    circuit_col = 'circuitId'
elif 'circuitName' in race_data.columns:
    print(f"\n✅ Found circuitName column")
    circuit_col = 'circuitName'
else:
    print(f"\n⚠️  No direct circuit column found. Will use raceName as proxy.")
    # Create a mapping from race names to circuits
    # This is approximate - you might need to refine this
    merged_data['circuit_name'] = merged_data.get('raceName', 'Unknown')

# Display sample of merged data
print(f"\n📊 Merged Dataset Sample:")
display(merged_data[['season', 'round', 'driverCode', 'constructorName', 'grid_position',
                      'position', 'points']].head(10))

print(f"\n✅ Base dataset ready with {len(merged_data)} records!")
print(f"📊 Columns: {len(merged_data.columns)}")
print(f"📊 Seasons: {merged_data['season'].unique()}")


Loading collected datasets...

✅ Loaded race data: 2518 records
✅ Loaded qualifying data: 2518 records
✅ Loaded circuit data: 24 circuits

Merging datasets...

✅ Merged race + qualifying data: 2518 records

📋 Available columns in race data:
['totalRaceTimeMillis', 'totalRaceTime', 'race_id']

⚠️  No direct circuit column found. Will use raceName as proxy.

📊 Merged Dataset Sample:


Unnamed: 0,season,round,driverCode,constructorName,grid_position,position,points
0,2020,1,BOT,Mercedes,1.0,1,25.0
1,2020,1,LEC,Ferrari,7.0,2,18.0
2,2020,1,NOR,McLaren,4.0,3,16.0
3,2020,1,HAM,Mercedes,2.0,4,12.0
4,2020,1,SAI,McLaren,8.0,5,10.0
5,2020,1,PER,Racing Point,6.0,6,8.0
6,2020,1,GAS,AlphaTauri,12.0,7,6.0
7,2020,1,OCO,Renault,14.0,8,4.0
8,2020,1,GIO,Alfa Romeo,18.0,9,2.0
9,2020,1,VET,Ferrari,11.0,10,1.0



✅ Base dataset ready with 2518 records!
📊 Columns: 34
📊 Seasons: [2020 2021 2022 2023 2024 2025]


In [10]:
# ========================================
# Section 3B: Driver Performance Features
# ========================================

print("Creating driver performance features...")
print("⏳ This may take 5-10 minutes for 2,518 records\n")

# Initialize new feature columns
merged_data['driver_last3_avg_points'] = 0.0
merged_data['driver_last5_avg_points'] = 0.0
merged_data['driver_last5_avg_position'] = 0.0
merged_data['driver_last5_podiums'] = 0
merged_data['driver_last5_dnf_count'] = 0
merged_data['driver_season_points'] = 0.0
merged_data['driver_season_races'] = 0
merged_data['driver_season_podiums'] = 0
merged_data['driver_season_wins'] = 0

print("Processing each race record...")
start_time = time.time()

# Sort data chronologically for proper historical calculation
merged_data = merged_data.sort_values(['season', 'round', 'position']).reset_index(drop=True)

# Process each record
for idx in range(len(merged_data)):
    if idx % 500 == 0:
        elapsed = time.time() - start_time
        print(f"  Progress: {idx}/{len(merged_data)} records ({idx/len(merged_data)*100:.1f}%) - {elapsed:.1f}s elapsed")

    current = merged_data.iloc[idx]
    driver = current['driverCode']
    season = current['season']
    round_num = current['round']

    # Get all previous races for this driver (before current race)
    prev_races = merged_data[
        (merged_data['driverCode'] == driver) &
        ((merged_data['season'] < season) |
         ((merged_data['season'] == season) & (merged_data['round'] < round_num)))
    ]

    if len(prev_races) > 0:
        # Last 3 races average points
        last3 = prev_races.tail(3)
        merged_data.at[idx, 'driver_last3_avg_points'] = last3['points'].mean()

        # Last 5 races statistics
        last5 = prev_races.tail(5)
        merged_data.at[idx, 'driver_last5_avg_points'] = last5['points'].mean()
        merged_data.at[idx, 'driver_last5_avg_position'] = last5['position'].mean()
        merged_data.at[idx, 'driver_last5_podiums'] = (last5['position'] <= 3).sum()

        # DNF count in last 5 (DNF is when position is NaN or status indicates retirement)
        dnf_count = last5['status'].str.contains('Retired|Accident|Collision|Damage',
                                                  case=False, na=False).sum()
        merged_data.at[idx, 'driver_last5_dnf_count'] = dnf_count

    # Season statistics (before this race)
    season_races = merged_data[
        (merged_data['driverCode'] == driver) &
        (merged_data['season'] == season) &
        (merged_data['round'] < round_num)
    ]

    if len(season_races) > 0:
        merged_data.at[idx, 'driver_season_points'] = season_races['points'].sum()
        merged_data.at[idx, 'driver_season_races'] = len(season_races)
        merged_data.at[idx, 'driver_season_podiums'] = (season_races['position'] <= 3).sum()
        merged_data.at[idx, 'driver_season_wins'] = (season_races['position'] == 1).sum()

# Calculate derived features
merged_data['driver_last5_dnf_rate'] = merged_data['driver_last5_dnf_count'] / 5.0
merged_data['driver_season_podium_rate'] = np.where(
    merged_data['driver_season_races'] > 0,
    merged_data['driver_season_podiums'] / merged_data['driver_season_races'],
    0
)
merged_data['driver_season_win_rate'] = np.where(
    merged_data['driver_season_races'] > 0,
    merged_data['driver_season_wins'] / merged_data['driver_season_races'],
    0
)

total_time = time.time() - start_time
print(f"\n✅ Driver performance features created in {total_time:.1f} seconds!")

# Show statistics
print(f"\n📊 Feature Statistics:")
feature_cols = ['driver_last3_avg_points', 'driver_last5_avg_points',
                'driver_last5_avg_position', 'driver_season_points']
print(merged_data[feature_cols].describe())

# Show sample
print(f"\n📋 Sample with new features (2025 Round 10):")
sample = merged_data[(merged_data['season'] == 2025) & (merged_data['round'] == 10)].head(5)
display(sample[['driverCode', 'grid_position', 'driver_last5_avg_points',
                'driver_season_points', 'position']])


Creating driver performance features...
⏳ This may take 5-10 minutes for 2,518 records

Processing each race record...
  Progress: 0/2518 records (0.0%) - 0.0s elapsed
  Progress: 500/2518 records (19.9%) - 0.3s elapsed
  Progress: 1000/2518 records (39.7%) - 0.6s elapsed
  Progress: 1500/2518 records (59.6%) - 1.0s elapsed
  Progress: 2000/2518 records (79.4%) - 1.4s elapsed
  Progress: 2500/2518 records (99.3%) - 1.7s elapsed

✅ Driver performance features created in 1.8 seconds!

📊 Feature Statistics:
       driver_last3_avg_points  driver_last5_avg_points  \
count              2518.000000              2518.000000   
mean                  5.036272                 5.040667   
std                   6.152751                 5.888105   
min                   0.000000                 0.000000   
25%                   0.000000                 0.250000   
50%                   2.666667                 2.500000   
75%                   8.000000                 8.200000   
max               

Unnamed: 0,driverCode,grid_position,driver_last5_avg_points,driver_season_points,position
2318,RUS,1.0,8.6,101.0,1
2319,VER,2.0,13.6,131.0,2
2320,ANT,4.0,3.2,44.0,3
2321,PIA,3.0,21.0,172.0,4
2322,LEC,8.0,12.4,90.0,5


In [11]:
# ========================================
# Section 3C: Constructor/Team Features
# ========================================

print("Creating constructor performance features...\n")

# Initialize constructor feature columns
merged_data['constructor_season_points'] = 0.0
merged_data['constructor_last5_avg_points'] = 0.0
merged_data['constructor_season_races'] = 0

print("Processing constructor statistics...")
start_time = time.time()

for idx in range(len(merged_data)):
    if idx % 500 == 0:
        print(f"  Progress: {idx}/{len(merged_data)} ({idx/len(merged_data)*100:.1f}%)")

    current = merged_data.iloc[idx]
    constructor = current['constructorName']
    season = current['season']
    round_num = current['round']

    # Get all races for this constructor in this season (before current race)
    constructor_season = merged_data[
        (merged_data['constructorName'] == constructor) &
        (merged_data['season'] == season) &
        (merged_data['round'] < round_num)
    ]

    if len(constructor_season) > 0:
        merged_data.at[idx, 'constructor_season_points'] = constructor_season['points'].sum()
        merged_data.at[idx, 'constructor_season_races'] = len(constructor_season)

    # Constructor's last 5 races (across all seasons before this race)
    constructor_prev = merged_data[
        (merged_data['constructorName'] == constructor) &
        ((merged_data['season'] < season) |
         ((merged_data['season'] == season) & (merged_data['round'] < round_num)))
    ]

    if len(constructor_prev) > 0:
        last5 = constructor_prev.tail(5)
        merged_data.at[idx, 'constructor_last5_avg_points'] = last5['points'].mean()

total_time = time.time() - start_time
print(f"\n✅ Constructor features created in {total_time:.1f} seconds!")

# Show statistics
print(f"\n📊 Constructor Feature Statistics:")
constructor_cols = ['constructor_season_points', 'constructor_last5_avg_points']
print(merged_data[constructor_cols].describe())

# Show 2025 constructor standings (accumulated)
print(f"\n🏆 2025 Constructor Standings (after Round 10):")
constructor_standings = merged_data[
    (merged_data['season'] == 2025) & (merged_data['round'] == 10)
].groupby('constructorName')['constructor_season_points'].max().sort_values(ascending=False)
print(constructor_standings.head(10))


Creating constructor performance features...

Processing constructor statistics...
  Progress: 0/2518 (0.0%)
  Progress: 500/2518 (19.9%)
  Progress: 1000/2518 (39.7%)
  Progress: 1500/2518 (59.6%)
  Progress: 2000/2518 (79.4%)
  Progress: 2500/2518 (99.3%)

✅ Constructor features created in 1.3 seconds!

📊 Constructor Feature Statistics:
       constructor_season_points  constructor_last5_avg_points
count                2518.000000                   2518.000000
mean                  102.739476                      4.560763
std                   140.797163                      5.282632
min                     0.000000                      0.000000
25%                     7.000000                      0.200000
50%                    40.000000                      2.400000
75%                   143.750000                      7.600000
max                   752.000000                     20.800000

🏆 2025 Constructor Standings (after Round 10):
constructorName
McLaren           339.0
Ferr

In [12]:
# ========================================
# Section 3D: Target Variables & Final Features
# ========================================

print("Creating target variables and final features...\n")

# ===== TARGET VARIABLES =====
print("1️⃣ Creating target variables...")

# Podium finish (Top 3)
merged_data['podium_finish'] = (merged_data['position'] <= 3).astype(int)

# Race winner (Position 1)
merged_data['race_winner'] = (merged_data['position'] == 1).astype(int)

# Top 5 finish
merged_data['top5_finish'] = (merged_data['position'] <= 5).astype(int)

# Top 10 finish (points)
merged_data['points_finish'] = (merged_data['position'] <= 10).astype(int)

print(f"   ✅ Target variable distribution:")
print(f"      - Podium finishes: {merged_data['podium_finish'].sum()} ({merged_data['podium_finish'].mean():.1%})")
print(f"      - Race winners: {merged_data['race_winner'].sum()} ({merged_data['race_winner'].mean():.1%})")
print(f"      - Top 5 finishes: {merged_data['top5_finish'].sum()} ({merged_data['top5_finish'].mean():.1%})")
print(f"      - Points finishes: {merged_data['points_finish'].sum()} ({merged_data['points_finish'].mean():.1%})")

# ===== QUALIFYING FEATURES =====
print("\n2️⃣ Creating qualifying-based features...")

# Qualified in top 10
merged_data['qualified_top10'] = (merged_data['grid_position'] <= 10).astype(int)

# Qualified in top 5
merged_data['qualified_top5'] = (merged_data['grid_position'] <= 5).astype(int)

# Grid position improvement potential (lower is better starting position)
merged_data['grid_advantage'] = 21 - merged_data['grid_position'].fillna(20)

print(f"   ✅ Qualifying features created")

# ===== FEATURE COUNT =====
print("\n📊 Feature Summary:")
feature_columns = [col for col in merged_data.columns if col not in [
    'number', 'positionText', 'laps', 'status', 'driverId', 'driverNumber',
    'driverUrl', 'givenName', 'familyName', 'dateOfBirth', 'driverNationality',
    'constructorId', 'constructorUrl', 'constructorNationality', 'totalRaceTimeMillis',
    'totalRaceTime', 'fastestLapRank', 'fastestLapNumber', 'fastestLapTime',
    'fastestLapAvgSpeedUnits', 'fastestLapAvgSpeed', 'Q1', 'Q2', 'Q3', 'race_id',
    'position', 'points', 'podium_finish', 'race_winner', 'top5_finish', 'points_finish'
]]

print(f"   Total columns: {len(merged_data.columns)}")
print(f"   Predictive features: {len(feature_columns)}")
print(f"   Target variables: 4 (podium_finish, race_winner, top5_finish, points_finish)")

# ===== SAVE PROCESSED DATASET =====
print("\n💾 Saving processed dataset...")

output_path = '../data/processed/f1_dataset_features_v2.csv'
merged_data.to_csv(output_path, index=False)
print(f"   ✅ Saved to: {output_path}")
print(f"   📊 Dataset shape: {merged_data.shape}")

# ===== DISPLAY SAMPLE WITH ALL FEATURES =====
print("\n📋 Sample record with key features (2025 Round 10 - Russell's Win):")
sample_cols = ['season', 'round', 'driverCode', 'constructorName', 'grid_position',
               'driver_last5_avg_points', 'driver_season_points', 'constructor_season_points',
               'position', 'podium_finish', 'race_winner']
sample = merged_data[(merged_data['season'] == 2025) &
                     (merged_data['round'] == 10) &
                     (merged_data['driverCode'] == 'RUS')]
display(sample[sample_cols])

print("\n✅ Feature engineering complete!")
print("\n🎯 Ready for model training with:")
print(f"   • {len(feature_columns)} predictive features")
print(f"   • {len(merged_data)} training examples")
print(f"   • 4 target variables")


Creating target variables and final features...

1️⃣ Creating target variables...
   ✅ Target variable distribution:
      - Podium finishes: 378 (15.0%)
      - Race winners: 126 (5.0%)
      - Top 5 finishes: 630 (25.0%)
      - Points finishes: 1260 (50.0%)

2️⃣ Creating qualifying-based features...
   ✅ Qualifying features created

📊 Feature Summary:
   Total columns: 56
   Predictive features: 25
   Target variables: 4 (podium_finish, race_winner, top5_finish, points_finish)

💾 Saving processed dataset...
   ✅ Saved to: ../data/processed/f1_dataset_features_v2.csv
   📊 Dataset shape: (2518, 56)

📋 Sample record with key features (2025 Round 10 - Russell's Win):


Unnamed: 0,season,round,driverCode,constructorName,grid_position,driver_last5_avg_points,driver_season_points,constructor_season_points,position,podium_finish,race_winner
2318,2025,10,RUS,Mercedes,1.0,8.6,101.0,145.0,1,1,1



✅ Feature engineering complete!

🎯 Ready for model training with:
   • 25 predictive features
   • 2518 training examples
   • 4 target variables


In [13]:
# ========================================
# Section 3E: Add Circuit-Specific Features
# ========================================

print("🏁 Adding circuit-specific features to complete V2...\n")

# Reload data (in case we need fresh copy)
merged_data = pd.read_csv('../data/processed/f1_dataset_features_v2.csv')
circuits_data = pd.read_csv('../data/circuits/circuit_characteristics.csv')

print(f"✅ Loaded {len(merged_data)} race records")
print(f"✅ Loaded {len(circuits_data)} circuit profiles\n")

# We need to map race names to circuit names
# First, let's see what circuit info we have in race data
print("📋 Checking available circuit information...")
if 'circuitName' in merged_data.columns:
    print("   Found 'circuitName' column")
    circuit_col = 'circuitName'
elif 'circuitId' in merged_data.columns:
    print("   Found 'circuitId' column")
    circuit_col = 'circuitId'
else:
    print("   No direct circuit column - will create from race patterns")
    # Create a simple circuit mapping from known race patterns
    # This is a simplified version - in production you'd use proper API data
    merged_data['circuit_name'] = 'Unknown Circuit'

print("\n1️⃣ Creating circuit-specific driver history features...")
print("   (This will take 3-5 minutes)\n")

# Initialize circuit-specific features
merged_data['driver_wins_at_circuit'] = 0
merged_data['driver_podiums_at_circuit'] = 0
merged_data['driver_races_at_circuit'] = 0
merged_data['driver_avg_finish_at_circuit'] = 0.0
merged_data['driver_best_finish_at_circuit'] = 20
merged_data['driver_last_finish_at_circuit'] = 0

start_time = time.time()

for idx in range(len(merged_data)):
    if idx % 500 == 0:
        print(f"   Progress: {idx}/{len(merged_data)} ({idx/len(merged_data)*100:.1f}%)")

    current = merged_data.iloc[idx]
    driver = current['driverCode']
    season = current['season']
    round_num = current['round']

    # For now, use season+round as circuit proxy (same round = same circuit usually)
    # In production, you'd use actual circuit names
    circuit_proxy = current['round']  # Simplified: Round number as circuit identifier

    # Get driver's previous races at this circuit (same round number in previous years)
    circuit_history = merged_data[
        (merged_data['driverCode'] == driver) &
        (merged_data['round'] == circuit_proxy) &
        ((merged_data['season'] < season) |
         ((merged_data['season'] == season) & (merged_data['round'] < round_num)))
    ]

    if len(circuit_history) > 0:
        merged_data.at[idx, 'driver_wins_at_circuit'] = (circuit_history['position'] == 1).sum()
        merged_data.at[idx, 'driver_podiums_at_circuit'] = (circuit_history['position'] <= 3).sum()
        merged_data.at[idx, 'driver_races_at_circuit'] = len(circuit_history)
        merged_data.at[idx, 'driver_avg_finish_at_circuit'] = circuit_history['position'].mean()
        merged_data.at[idx, 'driver_best_finish_at_circuit'] = circuit_history['position'].min()

        # Last year's finish at this circuit
        last_year = circuit_history[circuit_history['season'] == season - 1]
        if len(last_year) > 0:
            merged_data.at[idx, 'driver_last_finish_at_circuit'] = last_year.iloc[0]['position']

# Circuit expertise rate
merged_data['driver_circuit_win_rate'] = np.where(
    merged_data['driver_races_at_circuit'] > 0,
    merged_data['driver_wins_at_circuit'] / merged_data['driver_races_at_circuit'],
    0
)

total_time = time.time() - start_time
print(f"\n✅ Circuit-specific driver features created in {total_time:.1f} seconds!")

print("\n2️⃣ Adding circuit characteristics features...")

# Add circuit characteristics (using round as proxy)
# Create a simple mapping for major circuits
circuit_char_map = {
    1: 'Bahrain International Circuit',
    2: 'Jeddah Corniche Circuit',
    3: 'Albert Park Grand Prix Circuit',
    4: 'Suzuka Circuit',
    5: 'Shanghai International Circuit',
    6: 'Miami International Autodrome',
    7: 'Autodromo Enzo e Dino Ferrari',  # Imola
    8: 'Circuit de Monaco',
    9: 'Circuit de Barcelona-Catalunya',
    10: 'Circuit Gilles Villeneuve',
    11: 'Red Bull Ring',
    12: 'Silverstone Circuit',
    13: 'Hungaroring',
    14: 'Circuit de Spa-Francorchamps',
    15: 'Circuit Park Zandvoort',
    16: 'Autodromo Nazionale di Monza',
    17: 'Baku City Circuit',
    18: 'Marina Bay Street Circuit',
    19: 'Circuit of the Americas',
    20: 'Autódromo Hermanos Rodríguez',  # Mexico
    21: 'Autódromo José Carlos Pace',  # Brazil
    22: 'Las Vegas Street Circuit',
    23: 'Losail International Circuit',  # Qatar
    24: 'Yas Marina Circuit'
}

# Map rounds to circuit names
merged_data['circuit_mapped'] = merged_data['round'].map(circuit_char_map)

# Merge circuit characteristics
merged_data = merged_data.merge(
    circuits_data[['circuit_name', 'length_km', 'corners', 'circuit_type_encoded',
                   'avg_speed_kmh', 'overtaking_difficulty_encoded', 'tire_wear_encoded',
                   'downforce_level_encoded']],
    left_on='circuit_mapped',
    right_on='circuit_name',
    how='left'
)

# Fill missing circuit data with median values
merged_data['length_km'].fillna(merged_data['length_km'].median(), inplace=True)
merged_data['corners'].fillna(merged_data['corners'].median(), inplace=True)
merged_data['circuit_type_encoded'].fillna(2, inplace=True)  # Default to permanent
merged_data['avg_speed_kmh'].fillna(merged_data['avg_speed_kmh'].median(), inplace=True)
merged_data['overtaking_difficulty_encoded'].fillna(2, inplace=True)  # Default to medium
merged_data['tire_wear_encoded'].fillna(1, inplace=True)  # Default to medium
merged_data['downforce_level_encoded'].fillna(2, inplace=True)  # Default to medium

print("✅ Circuit characteristic features added!")

# Save updated dataset
output_path = '../data/processed/f1_dataset_features_v2_complete.csv'
merged_data.to_csv(output_path, index=False)

print(f"\n💾 Complete dataset saved to: {output_path}")
print(f"📊 Dataset shape: {merged_data.shape}")

# Count total features
feature_cols = [col for col in merged_data.columns if col not in [
    'number', 'positionText', 'laps', 'status', 'driverId', 'driverNumber',
    'driverUrl', 'givenName', 'familyName', 'dateOfBirth', 'driverNationality',
    'constructorId', 'constructorUrl', 'constructorNationality', 'totalRaceTimeMillis',
    'totalRaceTime', 'fastestLapRank', 'fastestLapNumber', 'fastestLapTime',
    'fastestLapAvgSpeedUnits', 'fastestLapAvgSpeed', 'Q1', 'Q2', 'Q3', 'race_id',
    'position', 'points', 'podium_finish', 'race_winner', 'top5_finish', 'points_finish',
    'driverCode', 'constructorName', 'season', 'round', 'circuit_mapped', 'circuit_name'
]]

print(f"\n🎯 Feature Engineering Complete!")
print(f"   • Total predictive features: {len(feature_cols)}")
print(f"   • Training examples: {len(merged_data)}")
print(f"   • Target variables: 4")

# Show sample with circuit features
print(f"\n📋 Sample with circuit features (2025 Round 10):")
sample_cols = ['driverCode', 'grid_position', 'driver_last5_avg_points',
               'driver_wins_at_circuit', 'driver_avg_finish_at_circuit',
               'length_km', 'downforce_level_encoded', 'position']
sample = merged_data[(merged_data['season'] == 2025) & (merged_data['round'] == 10)]
display(sample[sample_cols].head(5))


🏁 Adding circuit-specific features to complete V2...

✅ Loaded 2518 race records
✅ Loaded 24 circuit profiles

📋 Checking available circuit information...
   No direct circuit column - will create from race patterns

1️⃣ Creating circuit-specific driver history features...
   (This will take 3-5 minutes)

   Progress: 0/2518 (0.0%)
   Progress: 500/2518 (19.9%)
   Progress: 1000/2518 (39.7%)
   Progress: 1500/2518 (59.6%)
   Progress: 2000/2518 (79.4%)
   Progress: 2500/2518 (99.3%)

✅ Circuit-specific driver features created in 1.1 seconds!

2️⃣ Adding circuit characteristics features...
✅ Circuit characteristic features added!

💾 Complete dataset saved to: ../data/processed/f1_dataset_features_v2_complete.csv
📊 Dataset shape: (2518, 72)

🎯 Feature Engineering Complete!
   • Total predictive features: 36
   • Training examples: 2518
   • Target variables: 4

📋 Sample with circuit features (2025 Round 10):


Unnamed: 0,driverCode,grid_position,driver_last5_avg_points,driver_wins_at_circuit,driver_avg_finish_at_circuit,length_km,downforce_level_encoded,position
2318,RUS,1.0,8.6,0,11.4,4.361,1,1
2319,VER,2.0,13.6,2,6.2,4.361,1,2
2320,ANT,4.0,3.2,0,0.0,4.361,1,3
2321,PIA,3.0,21.0,0,5.5,4.361,1,4
2322,LEC,8.0,12.4,0,5.2,4.361,1,5


In [15]:
# ========================================
# Section 4: Train Complete V2 Model
# ========================================

print("🤖 Training F1 Predictor V2 - Complete Model\n")

# Load complete dataset
df_complete = pd.read_csv('../data/processed/f1_dataset_features_v2_complete.csv')

print(f"📊 Dataset loaded: {df_complete.shape}")
print(f"   Columns: {len(df_complete.columns)}\n")

# Select ONLY numeric features (exclude text identifiers and targets)
exclude_cols = ['number', 'positionText', 'laps', 'status', 'driverId', 'driverNumber',
                'driverUrl', 'givenName', 'familyName', 'dateOfBirth', 'driverNationality',
                'constructorId', 'constructorUrl', 'constructorNationality', 'totalRaceTimeMillis',
                'totalRaceTime', 'fastestLapRank', 'fastestLapNumber', 'fastestLapTime',
                'fastestLapAvgSpeedUnits', 'fastestLapAvgSpeed', 'Q1', 'Q2', 'Q3', 'race_id',
                'position', 'points', 'podium_finish', 'race_winner', 'top5_finish', 'points_finish',
                'driverCode', 'constructorName', 'season', 'round',
                'circuit_mapped', 'circuit_name', 'circuit_name_x', 'circuit_name_y',  # Text columns
                'grid']  # Duplicate of grid_position

# Get all numeric columns
feature_cols = [col for col in df_complete.columns if col not in exclude_cols]

# Further filter to only numeric types
feature_cols = [col for col in feature_cols if df_complete[col].dtype in ['int64', 'float64']]

print(f"✅ Selected {len(feature_cols)} numeric features for training\n")

# Prepare data
print("📊 Creating train/test split...")
data_clean = df_complete[feature_cols + ['podium_finish', 'season']].dropna()
X = data_clean[feature_cols]
y = data_clean['podium_finish']
seasons = data_clean['season']

# Time-based split
X_train = X[seasons < 2025]
X_test = X[seasons == 2025]
y_train = y[seasons < 2025]
y_test = y[seasons == 2025]

print(f"   ✅ Training: {len(X_train)} records (2020-2024)")
print(f"   ✅ Testing: {len(X_test)} records (2025)")
print(f"   ✅ Podium rate - Train: {y_train.mean():.1%} | Test: {y_test.mean():.1%}\n")

print(f"🎯 Final feature list ({len(feature_cols)} features):")
for i, feat in enumerate(feature_cols, 1):
    print(f"   {i}. {feat}")

# Train with MLflow
print("\n🔥 Training V2 XGBoost model...\n")

with mlflow.start_run(run_name="F1_V2_Complete_XGBoost"):

    params = {
        'n_estimators': 200,
        'max_depth': 7,
        'learning_rate': 0.08,
        'min_child_weight': 2,
        'subsample': 0.85,
        'colsample_bytree': 0.85,
        'gamma': 0.1,
        'reg_alpha': 0.1,
        'reg_lambda': 1.0,
        'random_state': 42
    }

    mlflow.log_params(params)
    mlflow.log_param("model_version", "V2_Complete")
    mlflow.log_param("features_count", len(feature_cols))
    mlflow.log_param("train_size", len(X_train))
    mlflow.log_param("test_size", len(X_test))

    # Train
    model_v2 = xgb.XGBClassifier(**params, eval_metric='logloss')
    model_v2.fit(X_train, y_train)

    # Predict
    y_pred = model_v2.predict(X_test)
    y_pred_proba = model_v2.predict_proba(X_test)[:, 1]

    # Metrics
    from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("roc_auc", roc_auc)

    mlflow.sklearn.log_model(model_v2, "v2_xgboost_complete")

    # Save locally
    import pickle
    with open('../models/f1_predictor_v2_complete.pkl', 'wb') as f:
        pickle.dump(model_v2, f)

    print("✅ Model training complete!\n")

# Results
print("=" * 70)
print("📈 F1 PREDICTOR V2 - FINAL RESULTS")
print("=" * 70)
print(f"\n🎯 Performance (2025 Season):")
print(f"   • Accuracy:  {accuracy:.2%}")
print(f"   • Precision: {precision:.2%}")
print(f"   • Recall:    {recall:.2%}")
print(f"   • F1 Score:  {f1:.3f}")
print(f"   • ROC AUC:   {roc_auc:.3f}")

print(f"\n📊 Classification Report:")
print(classification_report(y_test, y_pred, target_names=['No Podium', 'Podium']))

cm = confusion_matrix(y_test, y_pred)
print(f"\n📊 Confusion Matrix:")
print(f"   True Negatives:  {cm[0][0]} | False Positives: {cm[0][1]}")
print(f"   False Negatives: {cm[1][0]} | True Positives:  {cm[1][1]}")

# Feature Importance
print(f"\n🎯 Top 15 Most Important Features:")
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model_v2.feature_importances_
}).sort_values('importance', ascending=False)

display(feature_importance.head(15))

# V1 vs V2 Comparison
print(f"\n" + "=" * 70)
print("📊 V1 vs V2 COMPARISON")
print("=" * 70)
print(f"\n   V1 (Basic Model):")
print(f"      - Features: 5")
print(f"      - Accuracy: 91.56%")
print(f"\n   V2 (Complete Model):")
print(f"      - Features: {len(feature_cols)}")
print(f"      - Accuracy: {accuracy:.2%}")
print(f"\n   📈 Improvement:")
print(f"      - Accuracy gain: {(accuracy - 0.9156) * 100:+.2f} percentage points")
print(f"      - Features added: {len(feature_cols) - 5}")
print(f"      - Feature increase: {((len(feature_cols) - 5) / 5 * 100):.0f}%")

print("\n✅ V2 Model Complete!")
print(f"💾 Saved to: ../models/f1_predictor_v2_complete.pkl")


🤖 Training F1 Predictor V2 - Complete Model

📊 Dataset loaded: (2518, 72)
   Columns: 72

✅ Selected 33 numeric features for training

📊 Creating train/test split...
   ✅ Training: 2138 records (2020-2024)
   ✅ Testing: 379 records (2025)
   ✅ Podium rate - Train: 15.0% | Test: 15.0%

🎯 Final feature list (33 features):
   1. grid_position
   2. driver_last3_avg_points
   3. driver_last5_avg_points
   4. driver_last5_avg_position
   5. driver_last5_podiums
   6. driver_last5_dnf_count
   7. driver_season_points
   8. driver_season_races
   9. driver_season_podiums
   10. driver_season_wins
   11. driver_last5_dnf_rate
   12. driver_season_podium_rate
   13. driver_season_win_rate
   14. constructor_season_points
   15. constructor_last5_avg_points
   16. constructor_season_races
   17. qualified_top10
   18. qualified_top5
   19. grid_advantage
   20. driver_wins_at_circuit
   21. driver_podiums_at_circuit
   22. driver_races_at_circuit
   23. driver_avg_finish_at_circuit
   24. driver



✅ Model training complete!

📈 F1 PREDICTOR V2 - FINAL RESULTS

🎯 Performance (2025 Season):
   • Accuracy:  90.50%
   • Precision: 69.09%
   • Recall:    66.67%
   • F1 Score:  0.679
   • ROC AUC:   0.947

📊 Classification Report:
              precision    recall  f1-score   support

   No Podium       0.94      0.95      0.94       322
      Podium       0.69      0.67      0.68        57

    accuracy                           0.91       379
   macro avg       0.82      0.81      0.81       379
weighted avg       0.90      0.91      0.90       379


📊 Confusion Matrix:
   True Negatives:  305 | False Positives: 17
   False Negatives: 19 | True Positives:  38

🎯 Top 15 Most Important Features:


Unnamed: 0,feature,importance
17,qualified_top5,0.4253
18,grid_advantage,0.109279
0,grid_position,0.092712
16,qualified_top10,0.090213
25,driver_circuit_win_rate,0.017374
19,driver_wins_at_circuit,0.014823
11,driver_season_podium_rate,0.014788
12,driver_season_win_rate,0.013716
1,driver_last3_avg_points,0.011669
14,constructor_last5_avg_points,0.011584



📊 V1 vs V2 COMPARISON

   V1 (Basic Model):
      - Features: 5
      - Accuracy: 91.56%

   V2 (Complete Model):
      - Features: 33
      - Accuracy: 90.50%

   📈 Improvement:
      - Accuracy gain: -1.06 percentage points
      - Features added: 28
      - Feature increase: 560%

✅ V2 Model Complete!
💾 Saved to: ../models/f1_predictor_v2_complete.pkl


In [16]:
# ========================================
# Section 5: Hyperparameter Tuning (V2-Optimized)
# ========================================

print("🔧 Optimizing V2 Model with Hyperparameter Tuning\n")

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Load data
df_complete = pd.read_csv('../data/processed/f1_dataset_features_v2_complete.csv')

# Feature selection (same as before)
exclude_cols = ['number', 'positionText', 'laps', 'status', 'driverId', 'driverNumber',
                'driverUrl', 'givenName', 'familyName', 'dateOfBirth', 'driverNationality',
                'constructorId', 'constructorUrl', 'constructorNationality', 'totalRaceTimeMillis',
                'totalRaceTime', 'fastestLapRank', 'fastestLapNumber', 'fastestLapTime',
                'fastestLapAvgSpeedUnits', 'fastestLapAvgSpeed', 'Q1', 'Q2', 'Q3', 'race_id',
                'position', 'points', 'podium_finish', 'race_winner', 'top5_finish', 'points_finish',
                'driverCode', 'constructorName', 'season', 'round',
                'circuit_mapped', 'circuit_name', 'circuit_name_x', 'circuit_name_y', 'grid']

feature_cols = [col for col in df_complete.columns if col not in exclude_cols]
feature_cols = [col for col in feature_cols if df_complete[col].dtype in ['int64', 'float64']]

# Prepare data
data_clean = df_complete[feature_cols + ['podium_finish', 'season']].dropna()
X = data_clean[feature_cols]
y = data_clean['podium_finish']
seasons = data_clean['season']

X_train = X[seasons < 2025]
X_test = X[seasons == 2025]
y_train = y[seasons < 2025]
y_test = y[seasons == 2025]

print(f"✅ Training: {len(X_train)} | Testing: {len(X_test)}\n")

# Define hyperparameter search space
param_distributions = {
    'n_estimators': randint(150, 300),
    'max_depth': randint(5, 10),
    'learning_rate': uniform(0.05, 0.15),
    'min_child_weight': randint(1, 5),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'gamma': uniform(0, 0.3),
    'reg_alpha': uniform(0, 0.5),
    'reg_lambda': uniform(0.5, 1.5)
}

print("🔍 Searching for optimal hyperparameters...")
print("   (Testing 20 random combinations)\n")

# Randomized search with cross-validation
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=20,  # Test 20 random combinations
    cv=3,       # 3-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,  # Use all CPU cores
    verbose=1,
    random_state=42
)

# Fit with timing
import time
start_time = time.time()
random_search.fit(X_train, y_train)
elapsed = time.time() - start_time

print(f"\n✅ Search complete in {elapsed:.1f} seconds!")
print(f"\n🎯 Best Parameters Found:")
for param, value in random_search.best_params_.items():
    print(f"   • {param}: {value:.4f}" if isinstance(value, float) else f"   • {param}: {value}")

# Train final optimized model
print(f"\n🔥 Training V2-Optimized model with best parameters...\n")

with mlflow.start_run(run_name="F1_V2_Optimized"):

    best_params = random_search.best_params_

    mlflow.log_params(best_params)
    mlflow.log_param("model_version", "V2_Optimized")
    mlflow.log_param("features_count", len(feature_cols))
    mlflow.log_param("cv_score", random_search.best_score_)

    # Train with best params
    model_optimized = xgb.XGBClassifier(**best_params, random_state=42, eval_metric='logloss')
    model_optimized.fit(X_train, y_train)

    # Predict
    y_pred_opt = model_optimized.predict(X_test)
    y_pred_proba_opt = model_optimized.predict_proba(X_test)[:, 1]

    # Metrics
    from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

    accuracy_opt = accuracy_score(y_test, y_pred_opt)
    precision_opt = precision_score(y_test, y_pred_opt)
    recall_opt = recall_score(y_test, y_pred_opt)
    f1_opt = f1_score(y_test, y_pred_opt)
    roc_auc_opt = roc_auc_score(y_test, y_pred_proba_opt)

    mlflow.log_metric("accuracy", accuracy_opt)
    mlflow.log_metric("precision", precision_opt)
    mlflow.log_metric("recall", recall_opt)
    mlflow.log_metric("f1_score", f1_opt)
    mlflow.log_metric("roc_auc", roc_auc_opt)

    mlflow.sklearn.log_model(model_optimized, "v2_optimized")

    # Save locally
    import pickle
    with open('../models/f1_predictor_v2_optimized.pkl', 'wb') as f:
        pickle.dump(model_optimized, f)

# Results
print("=" * 70)
print("📈 V2-OPTIMIZED RESULTS")
print("=" * 70)
print(f"\n🎯 Performance (2025 Season):")
print(f"   • Accuracy:  {accuracy_opt:.2%}")
print(f"   • Precision: {precision_opt:.2%}")
print(f"   • Recall:    {recall_opt:.2%}")
print(f"   • F1 Score:  {f1_opt:.3f}")
print(f"   • ROC AUC:   {roc_auc_opt:.3f}")

# Compare all versions
print(f"\n" + "=" * 70)
print("📊 MODEL EVOLUTION COMPARISON")
print("=" * 70)
print(f"\n   V1 (Baseline):        91.56% accuracy | 5 features")
print(f"   V2 (Initial):         90.50% accuracy | 33 features")
print(f"   V2-Optimized (Final): {accuracy_opt:.2%} accuracy | 33 features")
print(f"\n   🎯 Best Model: {'V2-Optimized' if accuracy_opt > 0.9156 else 'V1'}")
print(f"   📈 Improvement over V1: {(accuracy_opt - 0.9156) * 100:+.2f} percentage points")

print(f"\n✅ V2-Optimized Complete!")
print(f"💾 Saved to: ../models/f1_predictor_v2_optimized.pkl")


🔧 Optimizing V2 Model with Hyperparameter Tuning

✅ Training: 2138 | Testing: 379

🔍 Searching for optimal hyperparameters...
   (Testing 20 random combinations)

Fitting 3 folds for each of 20 candidates, totalling 60 fits

✅ Search complete in 2.8 seconds!

🎯 Best Parameters Found:
   • colsample_bytree: 0.7550
   • gamma: 0.0913
   • learning_rate: 0.1287
   • max_depth: 8
   • min_child_weight: 1
   • n_estimators: 198
   • reg_alpha: 0.2624
   • reg_lambda: 1.0998
   • subsample: 0.7140

🔥 Training V2-Optimized model with best parameters...





📈 V2-OPTIMIZED RESULTS

🎯 Performance (2025 Season):
   • Accuracy:  91.29%
   • Precision: 74.00%
   • Recall:    64.91%
   • F1 Score:  0.692
   • ROC AUC:   0.947

📊 MODEL EVOLUTION COMPARISON

   V1 (Baseline):        91.56% accuracy | 5 features
   V2 (Initial):         90.50% accuracy | 33 features
   V2-Optimized (Final): 91.29% accuracy | 33 features

   🎯 Best Model: V1
   📈 Improvement over V1: -0.27 percentage points

✅ V2-Optimized Complete!
💾 Saved to: ../models/f1_predictor_v2_optimized.pkl
