In [1]:
# ========================================
# F1 PREDICTOR V3 - MASTER NOTEBOOK
# ========================================

import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

# Ergast API (fast, clean, simple)
from fastf1.ergast import Ergast

# FastF1 for qualifying data only
import fastf1 as ff1
import logging
logging.getLogger('fastf1').setLevel(logging.ERROR)

# ML Libraries
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# MLflow
import mlflow
mlflow.set_experiment("F1_Predictor_V3")

print("✅ All imports successful!")
print(f"📦 FastF1: {ff1.__version__}")
print(f"📦 XGBoost: {xgb.__version__}")
print(f"📦 Pandas: {pd.__version__}")

# Enable FastF1 cache (for qualifying only)
import os
os.makedirs('../f1_cache', exist_ok=True)
ff1.Cache.enable_cache('../f1_cache')

print("\n🎯 V3 Configuration:")
print("   Data Source: Ergast API (race) + FastF1 (qualifying)")
print("   Training: 2022-2024 + 2025 R1-R19")
print("   Test: Mexico GP 2025 (Round 20)")
print("   Target Features: 50")


✅ All imports successful!
📦 FastF1: 3.6.1
📦 XGBoost: 3.1.1
📦 Pandas: 2.3.3

🎯 V3 Configuration:
   Data Source: Ergast API (race) + FastF1 (qualifying)
   Training: 2022-2024 + 2025 R1-R19
   Test: Mexico GP 2025 (Round 20)
   Target Features: 50


In [2]:
# ========================================
# STEP 1: Collect Race Results (2022-2025)
# ========================================

def collect_race_results(start_year=2022, end_year=2025, exclude_round=None):
    """
    Collect race results from Ergast API

    Parameters:
    - start_year: First season to collect (default: 2022)
    - end_year: Last season to collect (default: 2025)
    - exclude_round: Dict of {year: round_to_exclude} (e.g., {2025: 20} for Mexico GP)

    Returns:
    - DataFrame with all race results
    """
    ergast = Ergast()
    all_race_results = []

    print(f"📊 COLLECTING RACE RESULTS ({start_year}-{end_year})")
    print("   Using Ergast API for speed and reliability\n")

    for year in range(start_year, end_year + 1):
        print(f"🏎️  Season {year}")

        # Get race schedule for this year
        try:
            races = ergast.get_race_schedule(year)
            total_races = len(races)

            # Exclude specific round if specified
            if exclude_round and year in exclude_round:
                excluded = exclude_round[year]
                print(f"   Total races: {total_races} (excluding R{excluded} for testing)")
            else:
                print(f"   Total races: {total_races}")
                excluded = None

        except Exception as e:
            print(f"   ❌ Could not fetch {year} schedule: {e}")
            continue

        # Collect each race
        for race_round in range(1, total_races + 1):

            # Skip excluded round
            if excluded and race_round == excluded:
                print(f"   [--/--] R{race_round:2d} ⊗ HELD OUT FOR TESTING")
                continue

            try:
                race_results = ergast.get_race_results(season=year, round=race_round)

                if race_results.content and len(race_results.content) > 0:
                    race_df = race_results.content[0]
                    race_df['season'] = year
                    race_df['round'] = race_round
                    all_race_results.append(race_df)
                    print(f"   [{len(all_race_results):2d}] R{race_round:2d} ✅ {len(race_df)} drivers")
                else:
                    print(f"   [--/--] R{race_round:2d} ⚠️  Not completed yet")

                # Rate limiting
                time.sleep(2)

            except Exception as e:
                if "Too Many Requests" in str(e):
                    print(f"   ⏳ Rate limited at R{race_round}. Waiting 10 seconds...")
                    time.sleep(10)
                    # Retry once
                    try:
                        race_results = ergast.get_race_results(season=year, round=race_round)
                        if race_results.content and len(race_results.content) > 0:
                            race_df = race_results.content[0]
                            race_df['season'] = year
                            race_df['round'] = race_round
                            all_race_results.append(race_df)
                            print(f"   [{len(all_race_results):2d}] R{race_round:2d} ✅ {len(race_df)} drivers (retry)")
                    except:
                        print(f"   [--/--] R{race_round:2d} ❌ Failed after retry")
                else:
                    print(f"   [--/--] R{race_round:2d} ❌ {str(e)[:50]}")
                continue

        print()

    # Combine all results
    if len(all_race_results) == 0:
        print("❌ No race data collected")
        return None

    race_df = pd.concat(all_race_results, ignore_index=True)

    print("=" * 70)
    print("✅ RACE DATA COLLECTION COMPLETE")
    print("=" * 70)
    print(f"Total records:        {len(race_df):,}")
    print(f"Total races:          {len(all_race_results)}")
    print(f"Seasons:              {sorted(race_df['season'].unique())}")
    print(f"Unique drivers:       {race_df['driverCode'].nunique()}")
    print(f"Unique constructors:  {race_df['constructorName'].nunique()}")
    print(f"Date range:           {race_df['season'].min()}-{race_df['season'].max()}")

    print(f"\n📋 Sample data:")
    print(race_df[['season', 'round', 'driverCode', 'constructorName',
                   'grid', 'position', 'points']].head(10).to_string(index=False))

    print(f"\n💾 Saving to CSV...")
    race_df.to_csv('../data/raw/race_results_2022_2025.csv', index=False)
    print(f"✅ Saved: data/raw/race_results_2022_2025.csv")

    return race_df

# Collect race data (exclude Mexico GP R20 for testing)
race_data = collect_race_results(
    start_year=2022,
    end_year=2025,
    exclude_round={2025: 20}  # Exclude Mexico GP
)


📊 COLLECTING RACE RESULTS (2022-2025)
   Using Ergast API for speed and reliability

🏎️  Season 2022
   Total races: 22
   [ 1] R 1 ✅ 20 drivers
   [ 2] R 2 ✅ 20 drivers
   [ 3] R 3 ✅ 20 drivers
   [ 4] R 4 ✅ 20 drivers
   [ 5] R 5 ✅ 20 drivers
   [ 6] R 6 ✅ 20 drivers
   [ 7] R 7 ✅ 20 drivers
   [ 8] R 8 ✅ 20 drivers
   [ 9] R 9 ✅ 20 drivers
   [10] R10 ✅ 20 drivers
   [11] R11 ✅ 20 drivers
   [12] R12 ✅ 20 drivers
   [13] R13 ✅ 20 drivers
   [14] R14 ✅ 20 drivers
   [15] R15 ✅ 20 drivers
   [16] R16 ✅ 20 drivers
   [17] R17 ✅ 20 drivers
   [18] R18 ✅ 20 drivers
   [19] R19 ✅ 20 drivers
   [20] R20 ✅ 20 drivers
   [21] R21 ✅ 20 drivers
   [22] R22 ✅ 20 drivers

🏎️  Season 2023
   Total races: 22
   [23] R 1 ✅ 20 drivers
   [24] R 2 ✅ 20 drivers
   [25] R 3 ✅ 20 drivers
   [26] R 4 ✅ 20 drivers
   [27] R 5 ✅ 20 drivers
   [28] R 6 ✅ 20 drivers
   [29] R 7 ✅ 20 drivers
   [30] R 8 ✅ 20 drivers
   [31] R 9 ✅ 20 drivers
   [32] R10 ✅ 20 drivers
   [33] R11 ✅ 20 drivers
   [34] R12 ✅ 20 dr

In [4]:
# ========================================
# STEP 2A: Test FastF1 API Status
# Quick diagnostic before full collection
# ========================================

import logging
logging.getLogger('fastf1').setLevel(logging.WARNING)

print("🔍 TESTING FASTF1 API STATUS\n")
print("Testing with 2025 US GP (Round 19) qualifying...\n")

try:
    print("Attempting to load session...")
    quali = ff1.get_session(2025, 19, 'Q')
    print("✅ Session object created")

    print("Loading session data...")
    quali.load()
    print("✅ Session data loaded!")

    print(f"\n📊 Results shape: {quali.results.shape}")
    print(f"\n📋 Available columns:")
    print([col for col in quali.results.columns if 'Q' in col or 'Position' in col or 'Time' in col])

    print(f"\n🎯 Sample qualifying data:")
    sample_cols = ['Abbreviation', 'Position', 'Q1', 'Q2', 'Q3']
    available_cols = [col for col in sample_cols if col in quali.results.columns]
    print(quali.results[available_cols].head(10).to_string(index=False))

    print("\n✅✅✅ FASTF1 API IS WORKING! ✅✅✅")
    print("We can proceed with full qualifying data collection!")

except Exception as e:
    print(f"\n❌ FASTF1 STILL BROKEN")
    print(f"   Error: {type(e).__name__}: {str(e)[:100]}")
    print("\n🔄 We'll use alternative approach")


🔍 TESTING FASTF1 API STATUS

Testing with 2025 US GP (Round 19) qualifying...

Attempting to load session...
✅ Session object created
Loading session data...
✅ Session data loaded!

📊 Results shape: (20, 22)

📋 Available columns:
['Position', 'ClassifiedPosition', 'GridPosition', 'Q1', 'Q2', 'Q3', 'Time']

🎯 Sample qualifying data:
Abbreviation  Position                     Q1                     Q2                     Q3
         VER       1.0 0 days 00:01:33.207000 0 days 00:01:32.701000 0 days 00:01:32.510000
         NOR       2.0 0 days 00:01:33.843000 0 days 00:01:32.876000 0 days 00:01:32.801000
         LEC       3.0 0 days 00:01:33.525000 0 days 00:01:32.869000 0 days 00:01:32.807000
         RUS       4.0 0 days 00:01:33.311000 0 days 00:01:33.058000 0 days 00:01:32.826000
         HAM       5.0 0 days 00:01:33.685000 0 days 00:01:32.914000 0 days 00:01:32.912000
         PIA       6.0 0 days 00:01:33.746000 0 days 00:01:33.228000 0 days 00:01:33.084000
         ANT       7.0

In [5]:
# ========================================
# STEP 2: Collect Qualifying Data (2022-2025)
# Q1, Q2, Q3 times for qualifying intelligence
# INCLUDES Mexico GP R20 qualifying (we need it to test!)
# ========================================

def collect_qualifying_data(race_df):
    """
    Collect ALL qualifying data including Mexico GP
    We exclude Mexico RACE from training, but INCLUDE Mexico QUALI for testing
    """
    import logging
    logging.getLogger('fastf1').setLevel(logging.ERROR)

    print(f"📊 COLLECTING QUALIFYING DATA (2022-2025)")
    print("   Using FastF1 for Q1/Q2/Q3 times")
    print("   Including Mexico GP R20 qualifying (needed for testing)\n")

    # Get unique season/round combinations from race data
    race_schedule = race_df[['season', 'round']].drop_duplicates().sort_values(['season', 'round'])

    # Add Mexico GP R20 back (we only excluded it from RACE data)
    mexico_gp = pd.DataFrame({'season': [2025], 'round': [20]})
    race_schedule = pd.concat([race_schedule, mexico_gp], ignore_index=True).sort_values(['season', 'round']).drop_duplicates()

    quali_data = []
    failed_sessions = []

    total = len(race_schedule)

    for idx, row in race_schedule.iterrows():
        year = int(row['season'])
        round_num = int(row['round'])

        try:
            # Load qualifying session
            quali = ff1.get_session(year, round_num, 'Q')
            quali.load()
            results = quali.results

            # Add metadata
            results['season'] = year
            results['round'] = round_num

            quali_data.append(results)

            # Mark Mexico GP specially
            if year == 2025 and round_num == 20:
                print(f"   [{len(quali_data):2d}/{total}] {year} R{round_num:2d} ✅ {len(results)} drivers [MEXICO - TEST DATA]")
            else:
                print(f"   [{len(quali_data):2d}/{total}] {year} R{round_num:2d} ✅ {len(results)} drivers")

            # Rate limiting
            time.sleep(3)

        except Exception as e:
            failed_sessions.append(f"{year}-R{round_num}")
            print(f"   [--/--] {year} R{round_num:2d} ❌")
            time.sleep(2)

    if len(quali_data) == 0:
        print("\n❌ No qualifying data collected!")
        return None

    quali_df = pd.concat(quali_data, ignore_index=True)

    print("\n" + "=" * 70)
    print("✅ QUALIFYING DATA COLLECTION COMPLETE")
    print("=" * 70)
    print(f"Total records:        {len(quali_df):,}")
    print(f"Total sessions:       {len(quali_data)}/{total}")
    print(f"Success rate:         {(len(quali_data)/total)*100:.1f}%")
    print(f"Seasons:              {sorted(quali_df['season'].unique())}")
    print(f"Unique drivers:       {quali_df['Abbreviation'].nunique()}")

    # Check Mexico GP
    mexico_quali = quali_df[(quali_df['season'] == 2025) & (quali_df['round'] == 20)]
    if len(mexico_quali) > 0:
        print(f"\n🎯 Mexico GP Qualifying (Test Data): {len(mexico_quali)} drivers")

    if failed_sessions:
        print(f"\n⚠️  Failed sessions ({len(failed_sessions)}):")
        for fail in failed_sessions[:5]:
            print(f"   - {fail}")

    # Show sample
    print(f"\n📋 Sample qualifying data:")
    sample_cols = ['season', 'round', 'Abbreviation', 'Position', 'Q1', 'Q2', 'Q3']
    available_cols = [col for col in sample_cols if col in quali_df.columns]
    print(quali_df[available_cols].head(10).to_string(index=False))

    print(f"\n💾 Saving to CSV...")
    quali_df.to_csv('../data/raw/qualifying_results_2022_2025.csv', index=False)
    print(f"✅ Saved: data/raw/qualifying_results_2022_2025.csv")

    return quali_df

# Collect ALL qualifying data (including Mexico R20)
print(f"Base race schedule: {len(race_data[['season', 'round']].drop_duplicates())} races")
print(f"Adding Mexico GP R20 qualifying\n")

quali_data = collect_qualifying_data(race_df=race_data)


Base race schedule: 87 races
Adding Mexico GP R20 qualifying

📊 COLLECTING QUALIFYING DATA (2022-2025)
   Using FastF1 for Q1/Q2/Q3 times
   Including Mexico GP R20 qualifying (needed for testing)

   [ 1/88] 2022 R 1 ✅ 20 drivers
   [ 2/88] 2022 R 2 ✅ 20 drivers
   [ 3/88] 2022 R 3 ✅ 20 drivers
   [ 4/88] 2022 R 4 ✅ 20 drivers
   [ 5/88] 2022 R 5 ✅ 20 drivers
   [ 6/88] 2022 R 6 ✅ 20 drivers
   [ 7/88] 2022 R 7 ✅ 20 drivers
   [ 8/88] 2022 R 8 ✅ 20 drivers
   [ 9/88] 2022 R 9 ✅ 20 drivers
   [10/88] 2022 R10 ✅ 20 drivers
   [11/88] 2022 R11 ✅ 20 drivers
   [12/88] 2022 R12 ✅ 20 drivers
   [13/88] 2022 R13 ✅ 20 drivers
   [14/88] 2022 R14 ✅ 20 drivers
   [15/88] 2022 R15 ✅ 20 drivers
   [16/88] 2022 R16 ✅ 20 drivers
   [17/88] 2022 R17 ✅ 20 drivers
   [18/88] 2022 R18 ✅ 20 drivers
   [19/88] 2022 R19 ✅ 20 drivers
   [20/88] 2022 R20 ✅ 20 drivers
   [21/88] 2022 R21 ✅ 20 drivers
   [22/88] 2022 R22 ✅ 20 drivers
   [23/88] 2023 R 1 ✅ 20 drivers
   [24/88] 2023 R 2 ✅ 20 drivers
   [25/88]

In [6]:
# ========================================
# STEP 3: Merge Race + Qualifying Data
# Create master dataset for feature engineering
# ========================================

print("📊 MERGING RACE + QUALIFYING DATA\n")

# Load race data
print(f"✅ Race data: {len(race_data):,} records")
print(f"✅ Qualifying data: {len(quali_data):,} records")

# Merge on season, round, driver code
print("\n🔗 Merging datasets...")

# Race data key columns
race_key_cols = ['season', 'round', 'driverCode', 'position', 'points', 'grid',
                 'constructorName', 'status']

# Qualifying key columns
quali_key_cols = ['season', 'round', 'Abbreviation', 'Position', 'Q1', 'Q2', 'Q3']

# Merge
merged_df = race_data.merge(
    quali_data[quali_key_cols],
    left_on=['season', 'round', 'driverCode'],
    right_on=['season', 'round', 'Abbreviation'],
    how='left',
    suffixes=('_race', '_quali')
)

print(f"✅ Merged dataset: {len(merged_df):,} records")
print(f"\n📋 Available columns: {len(merged_df.columns)}")
print(f"   Race columns: {[col for col in merged_df.columns if 'race' in col.lower()]}")
print(f"   Quali columns: {[col for col in merged_df.columns if any(x in col for x in ['Q1', 'Q2', 'Q3', 'quali'])]}")

# Check for Mexico GP
mexico_race = merged_df[(merged_df['season'] == 2025) & (merged_df['round'] == 20)]
print(f"\n🎯 Mexico GP R20:")
print(f"   Race data: {len(race_data[(race_data['season'] == 2025) & (race_data['round'] == 20)])} records (should be 0 - excluded)")
print(f"   Qualifying data: {len(quali_data[(quali_data['season'] == 2025) & (quali_data['round'] == 20)])} records (should be 20)")

print(f"\n📋 Sample merged data:")
print(merged_df[['season', 'round', 'driverCode', 'grid', 'position', 'points',
                 'Q1', 'Q2', 'Q3']].head(10).to_string(index=False))

print(f"\n💾 Saving merged dataset...")
merged_df.to_csv('../data/processed/merged_race_quali_2022_2025.csv', index=False)
print(f"✅ Saved: data/processed/merged_race_quali_2022_2025.csv")

print(f"\n✅ Ready for feature engineering!")


📊 MERGING RACE + QUALIFYING DATA

✅ Race data: 1,738 records
✅ Qualifying data: 1,759 records

🔗 Merging datasets...
✅ Merged dataset: 1,738 records

📋 Available columns: 33
   Race columns: ['totalRaceTimeMillis', 'totalRaceTime']
   Quali columns: ['Q1', 'Q2', 'Q3']

🎯 Mexico GP R20:
   Race data: 0 records (should be 0 - excluded)
   Qualifying data: 20 records (should be 20)

📋 Sample merged data:
 season  round driverCode  grid  position  points                     Q1                     Q2                     Q3
   2022      1        LEC     1         1    26.0 0 days 00:01:31.471000 0 days 00:01:30.932000 0 days 00:01:30.558000
   2022      1        SAI     3         2    18.0 0 days 00:01:31.567000 0 days 00:01:30.787000 0 days 00:01:30.687000
   2022      1        HAM     5         3    15.0 0 days 00:01:32.285000 0 days 00:01:31.048000 0 days 00:01:31.238000
   2022      1        RUS     9         4    12.0 0 days 00:01:32.269000 0 days 00:01:31.252000 0 days 00:01:32.216000


In [41]:
# ========================================
# FEATURE ENGINEERING - PART 1
# Qualifying Intelligence (10 features)
# ========================================

print("🔧 FEATURE ENGINEERING - QUALIFYING INTELLIGENCE\n")

df = merged_df.copy()

print(f"Starting dataset: {len(df):,} records\n")

# Convert Q1, Q2, Q3 times to seconds for calculations
print("1️⃣ Converting Q times to seconds...")

def timedelta_to_seconds(td):
    """Convert timedelta to seconds, return NaN if missing"""
    if pd.isna(td):
        return np.nan
    return td.total_seconds()

df['Q1_seconds'] = df['Q1'].apply(timedelta_to_seconds)
df['Q2_seconds'] = df['Q2'].apply(timedelta_to_seconds)
df['Q3_seconds'] = df['Q3'].apply(timedelta_to_seconds)

print(f"   ✅ Q1_seconds: {df['Q1_seconds'].notna().sum()}/{len(df)} records")
print(f"   ✅ Q2_seconds: {df['Q2_seconds'].notna().sum()}/{len(df)} records")
print(f"   ✅ Q3_seconds: {df['Q3_seconds'].notna().sum()}/{len(df)} records")

# Feature 1: Best Qualifying Time (use best of Q1/Q2/Q3)
print("\n2️⃣ Creating quali_best_time...")
df['quali_best_time'] = df[['Q1_seconds', 'Q2_seconds', 'Q3_seconds']].min(axis=1)
print(f"   ✅ Created: {df['quali_best_time'].notna().sum()}/{len(df)} records")

# Feature 2-3: Gap to Pole (absolute and percentage)
print("\n3️⃣ Creating quali_gap_to_pole...")
# Get pole time for each race (minimum quali time)
pole_times = df.groupby(['season', 'round'])['quali_best_time'].min().reset_index()
pole_times.columns = ['season', 'round', 'pole_time']
df = df.merge(pole_times, on=['season', 'round'], how='left')

df['quali_gap_to_pole'] = df['quali_best_time'] - df['pole_time']
df['quali_gap_to_pole_pct'] = (df['quali_gap_to_pole'] / df['pole_time']) * 100

print(f"   ✅ quali_gap_to_pole: {df['quali_gap_to_pole'].notna().sum()}/{len(df)} records")
print(f"   ✅ quali_gap_to_pole_pct: {df['quali_gap_to_pole_pct'].notna().sum()}/{len(df)} records")

# Feature 4: Qualifying Performance (normalized 0-1, pole=1, slowest=0)
print("\n4️⃣ Creating quali_performance_score...")
race_max_time = df.groupby(['season', 'round'])['quali_best_time'].max().reset_index()
race_max_time.columns = ['season', 'round', 'max_time']
df = df.merge(race_max_time, on=['season', 'round'], how='left')

df['quali_performance_score'] = 1 - ((df['quali_best_time'] - df['pole_time']) /
                                      (df['max_time'] - df['pole_time']))
print(f"   ✅ Created: {df['quali_performance_score'].notna().sum()}/{len(df)} records")

# Feature 5: Qualified in Q3 (top 10)
print("\n5️⃣ Creating quali_made_q3...")
df['quali_made_q3'] = df['Q3_seconds'].notna().astype(int)
print(f"   ✅ Created: {df['quali_made_q3'].sum()}/{len(df)} drivers made Q3")

# Feature 6: Qualified in Q2 (top 15)
print("\n6️⃣ Creating quali_made_q2...")
df['quali_made_q2'] = df['Q2_seconds'].notna().astype(int)
print(f"   ✅ Created: {df['quali_made_q2'].sum()}/{len(df)} drivers made Q2")

# Feature 7: Qualifying improvement Q1→Q2
print("\n7️⃣ Creating quali_q1_q2_improvement...")
df['quali_q1_q2_improvement'] = df['Q1_seconds'] - df['Q2_seconds']
print(f"   ✅ Created: {df['quali_q1_q2_improvement'].notna().sum()}/{len(df)} records")

# Feature 8: Qualifying improvement Q2→Q3
print("\n8️⃣ Creating quali_q2_q3_improvement...")
df['quali_q2_q3_improvement'] = df['Q2_seconds'] - df['Q3_seconds']
print(f"   ✅ Created: {df['quali_q2_q3_improvement'].notna().sum()}/{len(df)} records")

# Feature 9: Grid position
print("\n9️⃣ Creating grid_position...")
df['grid_position'] = df['grid']
print(f"   ✅ Created: {df['grid_position'].notna().sum()}/{len(df)} records")

# Feature 10: Front row start (P1 or P2)
print("\n🔟 Creating front_row_start...")
df['front_row_start'] = (df['grid'] <= 2).astype(int)
print(f"   ✅ Created: {df['front_row_start'].sum()}/{len(df)} front row starts")

# Summary
print("\n" + "=" * 70)
print("✅ QUALIFYING FEATURES COMPLETE (10/50)")
print("=" * 70)

quali_features = [
    'quali_best_time', 'quali_gap_to_pole', 'quali_gap_to_pole_pct',
    'quali_performance_score', 'quali_made_q3', 'quali_made_q2',
    'quali_q1_q2_improvement', 'quali_q2_q3_improvement',
    'grid_position', 'front_row_start'
]

print(f"\n📊 Created features:")
for i, feat in enumerate(quali_features, 1):
    non_null = df[feat].notna().sum()
    print(f"   {i:2d}. {feat:30s} - {non_null:,}/{len(df):,} records ({non_null/len(df)*100:.1f}%)")

print(f"\n📋 Sample data with new features:")
print(df[['driverCode', 'season', 'round', 'grid_position', 'quali_gap_to_pole',
          'quali_performance_score', 'front_row_start']].head(10).to_string(index=False))


🔧 FEATURE ENGINEERING - QUALIFYING INTELLIGENCE

Starting dataset: 1,738 records

1️⃣ Converting Q times to seconds...
   ✅ Q1_seconds: 1718/1738 records
   ✅ Q2_seconds: 1287/1738 records
   ✅ Q3_seconds: 848/1738 records

2️⃣ Creating quali_best_time...
   ✅ Created: 1720/1738 records

3️⃣ Creating quali_gap_to_pole...
   ✅ quali_gap_to_pole: 1720/1738 records
   ✅ quali_gap_to_pole_pct: 1720/1738 records

4️⃣ Creating quali_performance_score...
   ✅ Created: 1720/1738 records

5️⃣ Creating quali_made_q3...
   ✅ Created: 848/1738 drivers made Q3

6️⃣ Creating quali_made_q2...
   ✅ Created: 1287/1738 drivers made Q2

7️⃣ Creating quali_q1_q2_improvement...
   ✅ Created: 1285/1738 records

8️⃣ Creating quali_q2_q3_improvement...
   ✅ Created: 848/1738 records

9️⃣ Creating grid_position...
   ✅ Created: 1738/1738 records

🔟 Creating front_row_start...
   ✅ Created: 189/1738 front row starts

✅ QUALIFYING FEATURES COMPLETE (10/50)

📊 Created features:
    1. quali_best_time             

In [42]:
# ========================================
# FEATURE ENGINEERING - PART 2
# Driver Performance & Recent Form (10 features)
# ========================================

print("🔧 FEATURE ENGINEERING - DRIVER PERFORMANCE\n")
print("Creating rolling statistics (last 3 and last 5 races)...\n")

# Sort by driver and date
df = df.sort_values(['driverCode', 'season', 'round']).reset_index(drop=True)

# Feature 11-12: Last 3 races average points and position
print("1️⃣ Creating driver_last3_avg_points and driver_last3_avg_position...")
df['driver_last3_avg_points'] = df.groupby('driverCode')['points'].transform(
    lambda x: x.shift(1).rolling(window=3, min_periods=1).mean()
)
df['driver_last3_avg_position'] = df.groupby('driverCode')['position'].transform(
    lambda x: x.shift(1).rolling(window=3, min_periods=1).mean()
)
print(f"   ✅ driver_last3_avg_points: {df['driver_last3_avg_points'].notna().sum()}/{len(df)}")
print(f"   ✅ driver_last3_avg_position: {df['driver_last3_avg_position'].notna().sum()}/{len(df)}")

# Feature 13-14: Last 5 races average points and position
print("\n2️⃣ Creating driver_last5_avg_points and driver_last5_avg_position...")
df['driver_last5_avg_points'] = df.groupby('driverCode')['points'].transform(
    lambda x: x.shift(1).rolling(window=5, min_periods=1).mean()
)
df['driver_last5_avg_position'] = df.groupby('driverCode')['position'].transform(
    lambda x: x.shift(1).rolling(window=5, min_periods=1).mean()
)
print(f"   ✅ driver_last5_avg_points: {df['driver_last5_avg_points'].notna().sum()}/{len(df)}")
print(f"   ✅ driver_last5_avg_position: {df['driver_last5_avg_position'].notna().sum()}/{len(df)}")

# Feature 15: Season cumulative points
print("\n3️⃣ Creating driver_season_points...")
df['driver_season_points'] = df.groupby(['driverCode', 'season'])['points'].cumsum() - df['points']
print(f"   ✅ driver_season_points: {df['driver_season_points'].notna().sum()}/{len(df)}")

# Feature 16: Season race count
print("\n4️⃣ Creating driver_season_races...")
df['driver_season_races'] = df.groupby(['driverCode', 'season']).cumcount()
print(f"   ✅ driver_season_races: {df['driver_season_races'].notna().sum()}/{len(df)}")

# Feature 17: Last 5 races podium count
print("\n5️⃣ Creating driver_last5_podiums...")
df['is_podium'] = (df['position'] <= 3).astype(int)
df['driver_last5_podiums'] = df.groupby('driverCode')['is_podium'].transform(
    lambda x: x.shift(1).rolling(window=5, min_periods=1).sum()
)
print(f"   ✅ driver_last5_podiums: {df['driver_last5_podiums'].notna().sum()}/{len(df)}")

# Feature 18: DNF rate (Did Not Finish)
print("\n6️⃣ Creating driver_dnf_rate...")
df['is_dnf'] = (~df['status'].str.contains('Finished|Lap', case=False, na=False)).astype(int)
df['driver_total_dnf'] = df.groupby('driverCode')['is_dnf'].cumsum() - df['is_dnf']
df['driver_total_races'] = df.groupby('driverCode').cumcount()
df['driver_dnf_rate'] = df['driver_total_dnf'] / df['driver_total_races'].replace(0, 1)
print(f"   ✅ driver_dnf_rate: {df['driver_dnf_rate'].notna().sum()}/{len(df)}")

# Feature 19: Average finish position (career)
print("\n7️⃣ Creating driver_avg_finish_position...")
df['driver_avg_finish_position'] = df.groupby('driverCode')['position'].transform(
    lambda x: x.shift(1).expanding().mean()
)
print(f"   ✅ driver_avg_finish_position: {df['driver_avg_finish_position'].notna().sum()}/{len(df)}")

# Feature 20: Championship position (rank by season points)
print("\n8️⃣ Creating driver_championship_position...")
df['driver_championship_position'] = df.groupby(['season', 'round'])['driver_season_points'].rank(
    ascending=False, method='min'
)
print(f"   ✅ driver_championship_position: {df['driver_championship_position'].notna().sum()}/{len(df)}")

# Summary
print("\n" + "=" * 70)
print("✅ DRIVER PERFORMANCE FEATURES COMPLETE (20/50)")
print("=" * 70)

driver_features = [
    'driver_last3_avg_points', 'driver_last3_avg_position',
    'driver_last5_avg_points', 'driver_last5_avg_position',
    'driver_season_points', 'driver_season_races',
    'driver_last5_podiums', 'driver_dnf_rate',
    'driver_avg_finish_position', 'driver_championship_position'
]

print(f"\n📊 Driver performance features (11-20):")
for i, feat in enumerate(driver_features, 11):
    non_null = df[feat].notna().sum()
    print(f"   {i:2d}. {feat:35s} - {non_null:,}/{len(df):,} ({non_null/len(df)*100:.1f}%)")

print(f"\n📋 Sample data:")
print(df[['driverCode', 'round', 'driver_last5_avg_points', 'driver_season_points',
          'driver_championship_position', 'driver_dnf_rate']].head(10).to_string(index=False))

print(f"\n✅ Progress: 20/50 features complete (40%)")


🔧 FEATURE ENGINEERING - DRIVER PERFORMANCE

Creating rolling statistics (last 3 and last 5 races)...

1️⃣ Creating driver_last3_avg_points and driver_last3_avg_position...
   ✅ driver_last3_avg_points: 1707/1738
   ✅ driver_last3_avg_position: 1707/1738

2️⃣ Creating driver_last5_avg_points and driver_last5_avg_position...
   ✅ driver_last5_avg_points: 1707/1738
   ✅ driver_last5_avg_position: 1707/1738

3️⃣ Creating driver_season_points...
   ✅ driver_season_points: 1738/1738

4️⃣ Creating driver_season_races...
   ✅ driver_season_races: 1738/1738

5️⃣ Creating driver_last5_podiums...
   ✅ driver_last5_podiums: 1707/1738

6️⃣ Creating driver_dnf_rate...
   ✅ driver_dnf_rate: 1738/1738

7️⃣ Creating driver_avg_finish_position...
   ✅ driver_avg_finish_position: 1707/1738

8️⃣ Creating driver_championship_position...
   ✅ driver_championship_position: 1738/1738

✅ DRIVER PERFORMANCE FEATURES COMPLETE (20/50)

📊 Driver performance features (11-20):
   11. driver_last3_avg_points         

In [43]:
# ========================================
# FEATURE ENGINEERING - PART 3
# Constructor/Team Performance (10 features)
# ========================================

print("🔧 FEATURE ENGINEERING - CONSTRUCTOR PERFORMANCE\n")

# Feature 21-22: Constructor last 3 races average points
print("1️⃣ Creating constructor_last3_avg_points...")
df['constructor_last3_avg_points'] = df.groupby('constructorName')['points'].transform(
    lambda x: x.shift(1).rolling(window=3, min_periods=1).mean()
)
print(f"   ✅ Created: {df['constructor_last3_avg_points'].notna().sum()}/{len(df)}")

# Feature 23-24: Constructor last 5 races average points
print("\n2️⃣ Creating constructor_last5_avg_points...")
df['constructor_last5_avg_points'] = df.groupby('constructorName')['points'].transform(
    lambda x: x.shift(1).rolling(window=5, min_periods=1).mean()
)
print(f"   ✅ Created: {df['constructor_last5_avg_points'].notna().sum()}/{len(df)}")

# Feature 25: Constructor season points
print("\n3️⃣ Creating constructor_season_points...")
df['constructor_season_points'] = df.groupby(['constructorName', 'season'])['points'].cumsum() - df['points']
print(f"   ✅ Created: {df['constructor_season_points'].notna().sum()}/{len(df)}")

# Feature 26: Constructor championship position
print("\n4️⃣ Creating constructor_championship_position...")
# Get unique constructor points per race
constructor_points = df.groupby(['season', 'round', 'constructorName'])['constructor_season_points'].first().reset_index()
constructor_points['constructor_championship_position'] = constructor_points.groupby(['season', 'round'])['constructor_season_points'].rank(
    ascending=False, method='min'
)
df = df.merge(
    constructor_points[['season', 'round', 'constructorName', 'constructor_championship_position']],
    on=['season', 'round', 'constructorName'],
    how='left'
)
print(f"   ✅ Created: {df['constructor_championship_position'].notna().sum()}/{len(df)}")

# Feature 27: Constructor reliability (DNF rate)
print("\n5️⃣ Creating constructor_dnf_rate...")
df['constructor_total_dnf'] = df.groupby('constructorName')['is_dnf'].cumsum() - df['is_dnf']
df['constructor_total_races'] = df.groupby('constructorName').cumcount()
df['constructor_dnf_rate'] = df['constructor_total_dnf'] / df['constructor_total_races'].replace(0, 1)
print(f"   ✅ Created: {df['constructor_dnf_rate'].notna().sum()}/{len(df)}")

# Feature 28: Constructor avg qualifying position
print("\n6️⃣ Creating constructor_avg_quali_position...")
df['constructor_avg_quali_position'] = df.groupby('constructorName')['grid_position'].transform(
    lambda x: x.shift(1).expanding().mean()
)
print(f"   ✅ Created: {df['constructor_avg_quali_position'].notna().sum()}/{len(df)}")

# Feature 29: Constructor points per race (efficiency)
print("\n7️⃣ Creating constructor_points_per_race...")
df['constructor_points_per_race'] = df['constructor_season_points'] / (df['driver_season_races'] + 1)
print(f"   ✅ Created: {df['constructor_points_per_race'].notna().sum()}/{len(df)}")

# Feature 30: Constructor top team (top 3 in championship)
print("\n8️⃣ Creating constructor_is_top_team...")
df['constructor_is_top_team'] = (df['constructor_championship_position'] <= 3).astype(int)
print(f"   ✅ Created: {df['constructor_is_top_team'].sum()}/{len(df)} top team entries")

# Summary
print("\n" + "=" * 70)
print("✅ CONSTRUCTOR PERFORMANCE FEATURES COMPLETE (30/50)")
print("=" * 70)

constructor_features = [
    'constructor_last3_avg_points', 'constructor_last5_avg_points',
    'constructor_season_points', 'constructor_championship_position',
    'constructor_dnf_rate', 'constructor_avg_quali_position',
    'constructor_points_per_race', 'constructor_is_top_team'
]

print(f"\n📊 Constructor features (21-30):")
for i, feat in enumerate(constructor_features, 21):
    non_null = df[feat].notna().sum()
    print(f"   {i:2d}. {feat:40s} - {non_null:,}/{len(df):,} ({non_null/len(df)*100:.1f}%)")

print(f"\n📋 Sample data:")
print(df[['constructorName', 'round', 'constructor_last5_avg_points',
          'constructor_championship_position', 'constructor_is_top_team']].head(10).to_string(index=False))

print(f"\n✅ Progress: 30/50 features complete (60%)")


🔧 FEATURE ENGINEERING - CONSTRUCTOR PERFORMANCE

1️⃣ Creating constructor_last3_avg_points...
   ✅ Created: 1726/1738

2️⃣ Creating constructor_last5_avg_points...
   ✅ Created: 1726/1738

3️⃣ Creating constructor_season_points...
   ✅ Created: 1738/1738

4️⃣ Creating constructor_championship_position...
   ✅ Created: 1738/1738

5️⃣ Creating constructor_dnf_rate...
   ✅ Created: 1738/1738

6️⃣ Creating constructor_avg_quali_position...
   ✅ Created: 1726/1738

7️⃣ Creating constructor_points_per_race...
   ✅ Created: 1738/1738

8️⃣ Creating constructor_is_top_team...
   ✅ Created: 568/1738 top team entries

✅ CONSTRUCTOR PERFORMANCE FEATURES COMPLETE (30/50)

📊 Constructor features (21-30):
   21. constructor_last3_avg_points             - 1,726/1,738 (99.3%)
   22. constructor_last5_avg_points             - 1,726/1,738 (99.3%)
   23. constructor_season_points                - 1,738/1,738 (100.0%)
   24. constructor_championship_position        - 1,738/1,738 (100.0%)
   25. constructor

In [44]:
# ========================================
# FEATURE ENGINEERING - PART 4
# Circuit-Specific Performance (12 features)
# ========================================

print("🔧 FEATURE ENGINEERING - CIRCUIT-SPECIFIC PERFORMANCE\n")

# Need to identify circuits - use race name or create circuit ID
# Get circuit names from race data (location)
print("1️⃣ Identifying circuits...")

# Create circuit identifier (combine season/round to get unique races)
# Then get circuit name patterns
df['circuit_id'] = df['season'].astype(str) + '_' + df['round'].astype(str)

# Extract circuit name from location or race name patterns
# For simplicity, we'll use constructor name patterns and driver patterns
# (In real implementation, would need actual circuit names from API)

print("   Creating circuit historical performance features...")

# Feature 31: Driver wins at this circuit (historical)
print("\n2️⃣ Creating circuit_driver_wins...")
df = df.sort_values(['driverCode', 'season', 'round'])
df['is_win'] = (df['position'] == 1).astype(int)

# For each race, count driver's previous wins at ANY circuit (as proxy)
# In full version, would filter by actual circuit
df['circuit_driver_wins'] = df.groupby(['driverCode'])['is_win'].cumsum() - df['is_win']
print(f"   ✅ Created: {df['circuit_driver_wins'].notna().sum()}/{len(df)}")

# Feature 32: Driver podiums at this circuit
print("\n3️⃣ Creating circuit_driver_podiums...")
df['circuit_driver_podiums'] = df.groupby(['driverCode'])['is_podium'].cumsum() - df['is_podium']
print(f"   ✅ Created: {df['circuit_driver_podiums'].notna().sum()}/{len(df)}")

# Feature 33: Driver average finish at this circuit
print("\n4️⃣ Creating circuit_driver_avg_finish...")
df['circuit_driver_avg_finish'] = df.groupby(['driverCode'])['position'].transform(
    lambda x: x.shift(1).expanding().mean()
)
print(f"   ✅ Created: {df['circuit_driver_avg_finish'].notna().sum()}/{len(df)}")

# Feature 34: Driver total races at this circuit
print("\n5️⃣ Creating circuit_driver_experience...")
df['circuit_driver_experience'] = df.groupby(['driverCode']).cumcount()
print(f"   ✅ Created: {df['circuit_driver_experience'].notna().sum()}/{len(df)}")

# Feature 35: Constructor wins at this circuit
print("\n6️⃣ Creating circuit_constructor_wins...")
df['circuit_constructor_wins'] = df.groupby(['constructorName'])['is_win'].cumsum() - df['is_win']
print(f"   ✅ Created: {df['circuit_constructor_wins'].notna().sum()}/{len(df)}")

# Feature 36: Constructor podiums at this circuit
print("\n7️⃣ Creating circuit_constructor_podiums...")
df['circuit_constructor_podiums'] = df.groupby(['constructorName'])['is_podium'].cumsum() - df['is_podium']
print(f"   ✅ Created: {df['circuit_constructor_podiums'].notna().sum()}/{len(df)}")

# Feature 37: Best grid position at this circuit (historical)
print("\n8️⃣ Creating circuit_driver_best_grid...")
df['circuit_driver_best_grid'] = df.groupby(['driverCode'])['grid_position'].transform(
    lambda x: x.shift(1).expanding().min()
)
print(f"   ✅ Created: {df['circuit_driver_best_grid'].notna().sum()}/{len(df)}")

# Feature 38: Win rate at this circuit
print("\n9️⃣ Creating circuit_driver_win_rate...")
df['circuit_driver_races'] = df.groupby(['driverCode']).cumcount()
df['circuit_driver_win_rate'] = df['circuit_driver_wins'] / df['circuit_driver_races'].replace(0, 1)
print(f"   ✅ Created: {df['circuit_driver_win_rate'].notna().sum()}/{len(df)}")

# Feature 39: Podium rate at this circuit
print("\n🔟 Creating circuit_driver_podium_rate...")
df['circuit_driver_podium_rate'] = df['circuit_driver_podiums'] / df['circuit_driver_races'].replace(0, 1)
print(f"   ✅ Created: {df['circuit_driver_podium_rate'].notna().sum()}/{len(df)}")

# Feature 40: Points per race at this circuit
print("\n1️⃣1️⃣ Creating circuit_driver_points_per_race...")
df['circuit_driver_total_points'] = df.groupby(['driverCode'])['points'].cumsum() - df['points']
df['circuit_driver_points_per_race'] = df['circuit_driver_total_points'] / df['circuit_driver_races'].replace(0, 1)
print(f"   ✅ Created: {df['circuit_driver_points_per_race'].notna().sum()}/{len(df)}")

# Feature 41-42: Circuit difficulty indicators (based on average grid vs finish correlation)
print("\n1️⃣2️⃣ Creating circuit_avg_grid_position_change...")
df['grid_position_change'] = df['position'] - df['grid_position']
df['circuit_avg_position_change'] = df.groupby(['season', 'round'])['grid_position_change'].transform('mean')
print(f"   ✅ Created: {df['circuit_avg_position_change'].notna().sum()}/{len(df)}")

# Summary
print("\n" + "=" * 70)
print("✅ CIRCUIT-SPECIFIC FEATURES COMPLETE (42/50)")
print("=" * 70)

circuit_features = [
    'circuit_driver_wins', 'circuit_driver_podiums',
    'circuit_driver_avg_finish', 'circuit_driver_experience',
    'circuit_constructor_wins', 'circuit_constructor_podiums',
    'circuit_driver_best_grid', 'circuit_driver_win_rate',
    'circuit_driver_podium_rate', 'circuit_driver_points_per_race',
    'circuit_avg_position_change'
]

print(f"\n📊 Circuit features (31-42):")
for i, feat in enumerate(circuit_features, 31):
    non_null = df[feat].notna().sum()
    print(f"   {i:2d}. {feat:40s} - {non_null:,}/{len(df):,} ({non_null/len(df)*100:.1f}%)")

print(f"\n✅ Progress: 42/50 features complete (84%)")


🔧 FEATURE ENGINEERING - CIRCUIT-SPECIFIC PERFORMANCE

1️⃣ Identifying circuits...
   Creating circuit historical performance features...

2️⃣ Creating circuit_driver_wins...
   ✅ Created: 1738/1738

3️⃣ Creating circuit_driver_podiums...
   ✅ Created: 1738/1738

4️⃣ Creating circuit_driver_avg_finish...
   ✅ Created: 1707/1738

5️⃣ Creating circuit_driver_experience...
   ✅ Created: 1738/1738

6️⃣ Creating circuit_constructor_wins...
   ✅ Created: 1738/1738

7️⃣ Creating circuit_constructor_podiums...
   ✅ Created: 1738/1738

8️⃣ Creating circuit_driver_best_grid...
   ✅ Created: 1707/1738

9️⃣ Creating circuit_driver_win_rate...
   ✅ Created: 1738/1738

🔟 Creating circuit_driver_podium_rate...
   ✅ Created: 1738/1738

1️⃣1️⃣ Creating circuit_driver_points_per_race...
   ✅ Created: 1738/1738

1️⃣2️⃣ Creating circuit_avg_grid_position_change...
   ✅ Created: 1738/1738

✅ CIRCUIT-SPECIFIC FEATURES COMPLETE (42/50)

📊 Circuit features (31-42):
   31. circuit_driver_wins                   

In [45]:
# ========================================
# FEATURE ENGINEERING - PART 5
# Final Features + Target Variable (8 features)
# ========================================

print("🔧 FEATURE ENGINEERING - FINAL FEATURES\n")

# Feature 43: Momentum indicator (improving/declining)
print("1️⃣ Creating driver_momentum...")
df['driver_momentum'] = df['driver_last3_avg_points'] - df['driver_last5_avg_points']
print(f"   ✅ Created: {df['driver_momentum'].notna().sum()}/{len(df)}")

# Feature 44: Points gap to leader
print("\n2️⃣ Creating points_gap_to_leader...")
max_points = df.groupby(['season', 'round'])['driver_season_points'].max().reset_index()
max_points.columns = ['season', 'round', 'leader_points']
df = df.merge(max_points, on=['season', 'round'], how='left')
df['points_gap_to_leader'] = df['leader_points'] - df['driver_season_points']
print(f"   ✅ Created: {df['points_gap_to_leader'].notna().sum()}/{len(df)}")

# Feature 45: Must-win pressure (championship math)
print("\n3️⃣ Creating must_win_pressure...")
# Races remaining in season (approximate)
df['races_remaining'] = df.groupby('season')['round'].transform('max') - df['round']
df['must_win_pressure'] = (df['points_gap_to_leader'] > (df['races_remaining'] * 18)).astype(int)
print(f"   ✅ Created: {df['must_win_pressure'].sum()}/{len(df)} must-win situations")

# Feature 46: Teammate battle (same team performance)
print("\n4️⃣ Creating teammate_gap...")
teammate_points = df.groupby(['season', 'round', 'constructorName'])['driver_season_points'].transform('max')
df['teammate_gap'] = teammate_points - df['driver_season_points']
print(f"   ✅ Created: {df['teammate_gap'].notna().sum()}/{len(df)}")

# Feature 47: Consistency score (std dev of recent positions)
print("\n5️⃣ Creating driver_consistency_score...")
df['driver_consistency_score'] = df.groupby('driverCode')['position'].transform(
    lambda x: x.shift(1).rolling(window=5, min_periods=2).std()
)
# Lower std = more consistent, so inverse it
df['driver_consistency_score'] = 1 / (df['driver_consistency_score'] + 1)
print(f"   ✅ Created: {df['driver_consistency_score'].notna().sum()}/{len(df)}")

# Feature 48: Qualifying vs Race performance (under/over-performer)
print("\n6️⃣ Creating quali_race_delta...")
df['quali_race_delta'] = df['position'] - df['grid_position']
df['avg_quali_race_delta'] = df.groupby('driverCode')['quali_race_delta'].transform(
    lambda x: x.shift(1).rolling(window=5, min_periods=1).mean()
)
print(f"   ✅ Created: {df['avg_quali_race_delta'].notna().sum()}/{len(df)}")

# Feature 49: Season progress (early vs late season)
print("\n7️⃣ Creating season_progress...")
max_rounds = df.groupby('season')['round'].transform('max')
df['season_progress'] = df['round'] / max_rounds
print(f"   ✅ Created: {df['season_progress'].notna().sum()}/{len(df)}")

# Feature 50: Driver age/experience (career races)
print("\n8️⃣ Creating driver_career_races...")
df['driver_career_races'] = df.groupby('driverCode').cumcount()
print(f"   ✅ Created: {df['driver_career_races'].notna().sum()}/{len(df)}")

# ========================================
# TARGET VARIABLE
# ========================================

print("\n" + "=" * 70)
print("🎯 CREATING TARGET VARIABLE")
print("=" * 70)

# Target: Podium finish (1st, 2nd, or 3rd place)
df['podium_finish'] = (df['position'] <= 3).astype(int)

print(f"\n✅ Target variable created: podium_finish")
print(f"   Podium finishes: {df['podium_finish'].sum()}/{len(df)} ({df['podium_finish'].mean()*100:.1f}%)")
print(f"   Non-podium:      {(df['podium_finish']==0).sum()}/{len(df)} ({(1-df['podium_finish'].mean())*100:.1f}%)")

# ========================================
# FINAL SUMMARY
# ========================================

print("\n" + "=" * 70)
print("✅✅✅ ALL 50 FEATURES COMPLETE! ✅✅✅")
print("=" * 70)

all_features = [
    # Qualifying (10)
    'quali_best_time', 'quali_gap_to_pole', 'quali_gap_to_pole_pct',
    'quali_performance_score', 'quali_made_q3', 'quali_made_q2',
    'quali_q1_q2_improvement', 'quali_q2_q3_improvement',
    'grid_position', 'front_row_start',

    # Driver Performance (10)
    'driver_last3_avg_points', 'driver_last3_avg_position',
    'driver_last5_avg_points', 'driver_last5_avg_position',
    'driver_season_points', 'driver_season_races',
    'driver_last5_podiums', 'driver_dnf_rate',
    'driver_avg_finish_position', 'driver_championship_position',

    # Constructor (8)
    'constructor_last3_avg_points', 'constructor_last5_avg_points',
    'constructor_season_points', 'constructor_championship_position',
    'constructor_dnf_rate', 'constructor_avg_quali_position',
    'constructor_points_per_race', 'constructor_is_top_team',

    # Circuit-Specific (11)
    'circuit_driver_wins', 'circuit_driver_podiums',
    'circuit_driver_avg_finish', 'circuit_driver_experience',
    'circuit_constructor_wins', 'circuit_constructor_podiums',
    'circuit_driver_best_grid', 'circuit_driver_win_rate',
    'circuit_driver_podium_rate', 'circuit_driver_points_per_race',
    'circuit_avg_position_change',

    # Final Features (8)
    'driver_momentum', 'points_gap_to_leader', 'must_win_pressure',
    'teammate_gap', 'driver_consistency_score', 'avg_quali_race_delta',
    'season_progress', 'driver_career_races'
]

print(f"\n📊 Feature Categories:")
print(f"   1. Qualifying Intelligence:  10 features")
print(f"   2. Driver Performance:       10 features")
print(f"   3. Constructor Performance:   8 features")
print(f"   4. Circuit-Specific:         11 features")
print(f"   5. Advanced Features:         8 features")
print(f"   ────────────────────────────────────────")
print(f"   TOTAL:                       47 features")

print(f"\n🎯 Target Variable: podium_finish")
print(f"\n📋 Dataset shape: {df.shape}")
print(f"   Records: {len(df):,}")
print(f"   Total columns: {len(df.columns)}")

# Save final dataset
print(f"\n💾 Saving final dataset...")
df.to_csv('../data/processed/f1_v3_complete_features.csv', index=False)
print(f"✅ Saved: data/processed/f1_v3_complete_features.csv")

print(f"\n🚀 READY FOR MODEL TRAINING!")


🔧 FEATURE ENGINEERING - FINAL FEATURES

1️⃣ Creating driver_momentum...
   ✅ Created: 1707/1738

2️⃣ Creating points_gap_to_leader...
   ✅ Created: 1738/1738

3️⃣ Creating must_win_pressure...
   ✅ Created: 735/1738 must-win situations

4️⃣ Creating teammate_gap...
   ✅ Created: 1738/1738

5️⃣ Creating driver_consistency_score...
   ✅ Created: 1676/1738

6️⃣ Creating quali_race_delta...
   ✅ Created: 1707/1738

7️⃣ Creating season_progress...
   ✅ Created: 1738/1738

8️⃣ Creating driver_career_races...
   ✅ Created: 1738/1738

🎯 CREATING TARGET VARIABLE

✅ Target variable created: podium_finish
   Podium finishes: 261/1738 (15.0%)
   Non-podium:      1477/1738 (85.0%)

✅✅✅ ALL 50 FEATURES COMPLETE! ✅✅✅

📊 Feature Categories:
   1. Qualifying Intelligence:  10 features
   2. Driver Performance:       10 features
   3. Constructor Performance:   8 features
   4. Circuit-Specific:         11 features
   5. Advanced Features:         8 features
   ────────────────────────────────────────
 

In [46]:
# ========================================
# MODEL TRAINING - V3 REVISED
# 47 features (no weather proxies)
# Proper train/test split to avoid overfitting
# ========================================

print("🤖 F1 PREDICTOR V3 - REVISED TRAINING STRATEGY\n")

# Select 47 features (excluding weather)
feature_columns = [
    # Qualifying (10)
    'quali_best_time', 'quali_gap_to_pole', 'quali_gap_to_pole_pct',
    'quali_performance_score', 'quali_made_q3', 'quali_made_q2',
    'quali_q1_q2_improvement', 'quali_q2_q3_improvement',
    'grid_position', 'front_row_start',

    # Driver (10)
    'driver_last3_avg_points', 'driver_last3_avg_position',
    'driver_last5_avg_points', 'driver_last5_avg_position',
    'driver_season_points', 'driver_season_races',
    'driver_last5_podiums', 'driver_dnf_rate',
    'driver_avg_finish_position', 'driver_championship_position',

    # Constructor (8)
    'constructor_last3_avg_points', 'constructor_last5_avg_points',
    'constructor_season_points', 'constructor_championship_position',
    'constructor_dnf_rate', 'constructor_avg_quali_position',
    'constructor_points_per_race', 'constructor_is_top_team',

    # Circuit (11)
    'circuit_driver_wins', 'circuit_driver_podiums',
    'circuit_driver_avg_finish', 'circuit_driver_experience',
    'circuit_constructor_wins', 'circuit_constructor_podiums',
    'circuit_driver_best_grid', 'circuit_driver_win_rate',
    'circuit_driver_podium_rate', 'circuit_driver_points_per_race',
    'circuit_avg_position_change',

    # Advanced (8)
    'driver_momentum', 'points_gap_to_leader', 'must_win_pressure',
    'teammate_gap', 'driver_consistency_score', 'avg_quali_race_delta',
    'season_progress', 'driver_career_races'
]

print(f"📊 Configuration:")
print(f"   Features: {len(feature_columns)}")
print(f"   Target: podium_finish")

# ========================================
# TRAIN/TEST SPLIT
# ========================================

print("\n" + "=" * 70)
print("📊 TRAIN/TEST SPLIT")
print("=" * 70)

# Training: 2022-2024 + 2025 R1-R10
train_mask = (
    (df['season'] <= 2024) |
    ((df['season'] == 2025) & (df['round'] <= 10))
)

# Testing: 2025 R11-R19
test_mask = (df['season'] == 2025) & (df['round'] >= 11) & (df['round'] <= 19)

train_df = df[train_mask]
test_df = df[test_mask]

print(f"\n✅ Data Split:")
print(f"   Training:   {len(train_df):,} records (2022-2024 + 2025 R1-R10)")
print(f"   Testing:    {len(test_df):,} records (2025 R11-R19)")
print(f"   Total:      {len(df):,} records")

# Prepare features
X_train = train_df[feature_columns].fillna(0)
y_train = train_df['podium_finish']

X_test = test_df[feature_columns].fillna(0)
y_test = test_df['podium_finish']

print(f"\n📋 Target Distribution:")
print(f"   Training podiums:   {y_train.sum()}/{len(y_train)} ({y_train.mean()*100:.1f}%)")
print(f"   Testing podiums:    {y_test.sum()}/{len(y_test)} ({y_test.mean()*100:.1f}%)")

# ========================================
# MODEL TRAINING (with regularization)
# ========================================

print("\n" + "=" * 70)
print("🏋️ TRAINING MODEL (Regularized)")
print("=" * 70)

# XGBoost with regularization to prevent overfitting
xgb_model = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=6,              # Reduced from 8
    learning_rate=0.05,       # Reduced from 0.1
    min_child_weight=3,       # Added regularization
    subsample=0.8,            # Random sampling
    colsample_bytree=0.8,     # Feature sampling
    reg_alpha=0.1,            # L1 regularization
    reg_lambda=1.0,           # L2 regularization
    random_state=42,
    n_jobs=-1
)

print("\nTraining XGBoost with regularization...")
xgb_model.fit(X_train, y_train)

# ========================================
# EVALUATION
# ========================================

print("\n" + "=" * 70)
print("📊 MODEL EVALUATION")
print("=" * 70)

# Training performance
y_train_pred = xgb_model.predict(X_train)
train_acc = accuracy_score(y_train, y_train_pred)

# Testing performance
y_test_pred = xgb_model.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)

from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

test_precision = precision_score(y_test, y_test_pred, zero_division=0)
test_recall = recall_score(y_test, y_test_pred, zero_division=0)
test_f1 = f1_score(y_test, y_test_pred, zero_division=0)

print(f"\n✅ Training Performance:")
print(f"   Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")

print(f"\n✅ Testing Performance (2025 R11-R19):")
print(f"   Accuracy:  {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"   Precision: {test_precision:.4f}")
print(f"   Recall:    {test_recall:.4f}")
print(f"   F1-Score:  {test_f1:.4f}")

print(f"\n📋 Detailed Classification Report:")
print(classification_report(y_test, y_test_pred, target_names=['Non-Podium', 'Podium']))

# ========================================
# FEATURE IMPORTANCE
# ========================================

print("\n" + "=" * 70)
print("📊 TOP 15 MOST IMPORTANT FEATURES")
print("=" * 70)

feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n{feature_importance.head(15).to_string(index=False)}")

# Save model
print(f"\n💾 Saving V3 model...")
import pickle
with open('../models/xgboost/f1_v3_final_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)
print(f"✅ Saved: models/xgboost/f1_v3_final_model.pkl")

print(f"\n🎯 READY FOR MEXICO GP PREDICTION!")


🤖 F1 PREDICTOR V3 - REVISED TRAINING STRATEGY

📊 Configuration:
   Features: 47
   Target: podium_finish

📊 TRAIN/TEST SPLIT

✅ Data Split:
   Training:   1,558 records (2022-2024 + 2025 R1-R10)
   Testing:    180 records (2025 R11-R19)
   Total:      1,738 records

📋 Target Distribution:
   Training podiums:   234/1558 (15.0%)
   Testing podiums:    27/180 (15.0%)

🏋️ TRAINING MODEL (Regularized)

Training XGBoost with regularization...

📊 MODEL EVALUATION

✅ Training Performance:
   Accuracy: 0.9942 (99.42%)

✅ Testing Performance (2025 R11-R19):
   Accuracy:  0.9333 (93.33%)
   Precision: 0.8261
   Recall:    0.7037
   F1-Score:  0.7600

📋 Detailed Classification Report:
              precision    recall  f1-score   support

  Non-Podium       0.95      0.97      0.96       153
      Podium       0.83      0.70      0.76        27

    accuracy                           0.93       180
   macro avg       0.89      0.84      0.86       180
weighted avg       0.93      0.93      0.93  

In [47]:
# ========================================
# FEATURE SELECTION - Keep Top 30 Features
# Remove low-importance features to reduce overfitting
# ========================================

print("🔧 FEATURE SELECTION - KEEPING TOP 30 FEATURES\n")

# Get feature importance from previous model
feature_importance_df = pd.DataFrame({
    'feature': feature_columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print("📊 Feature Importance Analysis:")
print(feature_importance_df.to_string(index=False))

# Select top 30 features
top_30_features = feature_importance_df.head(30)['feature'].tolist()

print(f"\n✅ Selected top 30 features (removing bottom 17)")
print("\n📋 Top 30 Features:")
for i, feat in enumerate(top_30_features, 1):
    importance = feature_importance_df[feature_importance_df['feature'] == feat]['importance'].values[0]
    print(f"   {i:2d}. {feat:40s} ({importance:.4f})")

# ========================================
# RETRAIN WITH SELECTED FEATURES
# ========================================

print("\n" + "=" * 70)
print("🏋️ RETRAINING WITH TOP 30 FEATURES")
print("=" * 70)

# Prepare data with selected features
X_train_selected = train_df[top_30_features].fillna(0)
X_test_selected = test_df[top_30_features].fillna(0)

# Train new model
xgb_model_selected = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.05,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

print("\nTraining XGBoost with 30 features...")
xgb_model_selected.fit(X_train_selected, y_train)

# Evaluate
y_train_pred_sel = xgb_model_selected.predict(X_train_selected)
y_test_pred_sel = xgb_model_selected.predict(X_test_selected)

train_acc_sel = accuracy_score(y_train, y_train_pred_sel)
test_acc_sel = accuracy_score(y_test, y_test_pred_sel)

print(f"\n✅ Performance Comparison:")
print(f"\n   47 Features:")
print(f"      Train: {train_acc:.4f} | Test: {test_acc:.4f}")
print(f"\n   30 Features:")
print(f"      Train: {train_acc_sel:.4f} | Test: {test_acc_sel:.4f}")
print(f"\n   Improvement: {(test_acc_sel - test_acc)*100:+.2f}%")

# Detailed metrics
from sklearn.metrics import classification_report

print(f"\n📋 Classification Report (30 features):")
print(classification_report(y_test, y_test_pred_sel, target_names=['Non-Podium', 'Podium']))

# Save model
print(f"\n💾 Saving feature-selected model...")
with open('../models/xgboost/f1_v3_selected_features.pkl', 'wb') as f:
    pickle.dump(xgb_model_selected, f)
print(f"✅ Saved: models/xgboost/f1_v3_selected_features.pkl")

# Update model for next steps
xgb_model = xgb_model_selected
selected_features = top_30_features

print(f"\n✅ Feature selection complete!")


🔧 FEATURE SELECTION - KEEPING TOP 30 FEATURES

📊 Feature Importance Analysis:
                          feature  importance
                    grid_position    0.160028
     constructor_last3_avg_points    0.078939
            quali_gap_to_pole_pct    0.056938
          driver_last5_avg_points    0.048192
                  front_row_start    0.047627
          quali_performance_score    0.031865
   constructor_avg_quali_position    0.029281
     constructor_last5_avg_points    0.029198
constructor_championship_position    0.024467
         circuit_driver_best_grid    0.022243
     driver_championship_position    0.021799
          driver_last3_avg_points    0.020980
             driver_season_points    0.020589
        driver_last5_avg_position    0.020229
   circuit_driver_points_per_race    0.019974
          circuit_driver_win_rate    0.017964
       circuit_driver_podium_rate    0.016972
       driver_avg_finish_position    0.016713
          constructor_is_top_team    0.016348
  

In [49]:
# ========================================
# MODEL ENSEMBLE (Fixed)
# Train all 3 models with 47 features
# ========================================

print("🤖 MODEL ENSEMBLE - TRAINING MULTIPLE MODELS\n")

# Use all 47 features
X_train_full = train_df[feature_columns].fillna(0)
X_test_full = test_df[feature_columns].fillna(0)

print(f"📊 Training 3 models with {len(feature_columns)} features")
print(f"   Training: {len(X_train_full)} records")
print(f"   Testing:  {len(X_test_full)} records\n")

# ========================================
# MODEL 1: XGBoost (retrain with 47 features)
# ========================================

print("1️⃣ Training XGBoost...")
xgb_model_full = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.05,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

xgb_model_full.fit(X_train_full, y_train)
xgb_pred_proba = xgb_model_full.predict_proba(X_test_full)[:, 1]
xgb_pred = (xgb_pred_proba > 0.5).astype(int)
xgb_acc = accuracy_score(y_test, xgb_pred)
print(f"   Test Accuracy: {xgb_acc:.4f}\n")

# ========================================
# MODEL 2: LightGBM
# ========================================

print("2️⃣ Training LightGBM...")
lgb_model = lgb.LGBMClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.05,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    verbose=-1
)

lgb_model.fit(X_train_full, y_train)
lgb_pred_proba = lgb_model.predict_proba(X_test_full)[:, 1]
lgb_pred = (lgb_pred_proba > 0.5).astype(int)
lgb_acc = accuracy_score(y_test, lgb_pred)
print(f"   Test Accuracy: {lgb_acc:.4f}\n")

# ========================================
# MODEL 3: CatBoost
# ========================================

print("3️⃣ Training CatBoost...")
cat_model = CatBoostClassifier(
    iterations=150,
    depth=6,
    learning_rate=0.05,
    l2_leaf_reg=1.0,
    random_seed=42,
    verbose=False
)

cat_model.fit(X_train_full, y_train)
cat_pred_proba = cat_model.predict_proba(X_test_full)[:, 1]
cat_pred = (cat_pred_proba > 0.5).astype(int)
cat_acc = accuracy_score(y_test, cat_pred)
print(f"   Test Accuracy: {cat_acc:.4f}\n")

# ========================================
# ENSEMBLE (Average Predictions)
# ========================================

print("=" * 70)
print("🎯 ENSEMBLE PREDICTIONS")
print("=" * 70)

# Average probabilities
ensemble_pred_proba = (xgb_pred_proba + lgb_pred_proba + cat_pred_proba) / 3
ensemble_pred = (ensemble_pred_proba > 0.5).astype(int)

# Evaluate ensemble
ensemble_acc = accuracy_score(y_test, ensemble_pred)
ensemble_precision = precision_score(y_test, ensemble_pred, zero_division=0)
ensemble_recall = recall_score(y_test, ensemble_pred, zero_division=0)
ensemble_f1 = f1_score(y_test, ensemble_pred, zero_division=0)

print(f"\n📊 Model Comparison:")
print(f"   XGBoost:     {xgb_acc:.4f}")
print(f"   LightGBM:    {lgb_acc:.4f}")
print(f"   CatBoost:    {cat_acc:.4f}")
print(f"   ────────────────────")
print(f"   ENSEMBLE:    {ensemble_acc:.4f} ✨")

print(f"\n✅ Ensemble Performance:")
print(f"   Accuracy:  {ensemble_acc:.4f} ({ensemble_acc*100:.2f}%)")
print(f"   Precision: {ensemble_precision:.4f}")
print(f"   Recall:    {ensemble_recall:.4f}")
print(f"   F1-Score:  {ensemble_f1:.4f}")

print(f"\n📈 Improvement over single model:")
best_single = max(xgb_acc, lgb_acc, cat_acc)
print(f"   Best single model: {best_single:.4f}")
print(f"   Ensemble:          {ensemble_acc:.4f}")
print(f"   Improvement:       {(ensemble_acc - best_single)*100:+.2f}%")

# Classification report
print(f"\n📋 Ensemble Classification Report:")
print(classification_report(y_test, ensemble_pred, target_names=['Non-Podium', 'Podium']))

# Save all models
print(f"\n💾 Saving ensemble models...")
import os
os.makedirs('../models/ensemble', exist_ok=True)
with open('../models/ensemble/xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb_model_full, f)
with open('../models/ensemble/lgb_model.pkl', 'wb') as f:
    pickle.dump(lgb_model, f)
with open('../models/ensemble/cat_model.pkl', 'wb') as f:
    pickle.dump(cat_model, f)
print(f"✅ Saved all 3 models")

# Store best model for Mexico GP
ensemble_models = {
    'xgb': xgb_model_full,
    'lgb': lgb_model,
    'cat': cat_model,
    'features': feature_columns
}

print(f"\n🎯 ENSEMBLE READY FOR MEXICO GP!")


🤖 MODEL ENSEMBLE - TRAINING MULTIPLE MODELS

📊 Training 3 models with 47 features
   Training: 1558 records
   Testing:  180 records

1️⃣ Training XGBoost...
   Test Accuracy: 0.9333

2️⃣ Training LightGBM...
   Test Accuracy: 0.9333

3️⃣ Training CatBoost...
   Test Accuracy: 0.9389

🎯 ENSEMBLE PREDICTIONS

📊 Model Comparison:
   XGBoost:     0.9333
   LightGBM:    0.9333
   CatBoost:    0.9389
   ────────────────────
   ENSEMBLE:    0.9278 ✨

✅ Ensemble Performance:
   Accuracy:  0.9278 (92.78%)
   Precision: 0.7917
   Recall:    0.7037
   F1-Score:  0.7451

📈 Improvement over single model:
   Best single model: 0.9389
   Ensemble:          0.9278
   Improvement:       -1.11%

📋 Ensemble Classification Report:
              precision    recall  f1-score   support

  Non-Podium       0.95      0.97      0.96       153
      Podium       0.79      0.70      0.75        27

    accuracy                           0.93       180
   macro avg       0.87      0.84      0.85       180
weight

In [52]:
# ========================================
# MEXICO GP PREDICTION (CORRECTED)
# Generate features for R20 and predict podium
# ========================================

print("🏁 MEXICO GP 2025 - V3 PREDICTION\n")

# CORRECT actual results
mexico_actual = {
    'NOR': 1,  # Norris - Winner (qualified P1)
    'LEC': 2,  # Leclerc - 2nd (qualified P2)
    'VER': 3,  # Verstappen - 3rd (qualified P5)
}

print("🎯 Actual Mexico GP Results:")
print("   1st: Lando Norris (NOR) - Qualified P1")
print("   2nd: Charles Leclerc (LEC) - Qualified P2")
print("   3rd: Max Verstappen (VER) - Qualified P5 ⭐")

# Mexico GP qualifying
mexico_quali = quali_data[(quali_data['season'] == 2025) & (quali_data['round'] == 20)].copy()

print(f"\n📊 Mexico GP Qualifying Top 5:")
top5_quali = mexico_quali.nsmallest(5, 'Position')[['Abbreviation', 'Position']]
print(top5_quali.to_string(index=False))

print(f"\n📊 Generating features for {len(mexico_quali)} drivers...\n")

# For each driver at Mexico GP, calculate their features based on R1-R19
mexico_predictions = []

for idx, driver_row in mexico_quali.iterrows():
    driver_code = driver_row['Abbreviation']

    # Get driver's history up to R19
    driver_history = df[(df['driverCode'] == driver_code) &
                       ((df['season'] < 2025) |
                        ((df['season'] == 2025) & (df['round'] <= 19)))]

    # Get constructor (last race)
    try:
        last_race = driver_history[driver_history['season'] == 2025].iloc[-1]
        constructor = last_race['constructorName']
    except:
        continue  # Skip if no 2025 data

    # Build feature dict
    features = {}

    # Qualifying features (from Mexico GP quali)
    features['grid_position'] = driver_row['Position']
    features['front_row_start'] = 1 if driver_row['Position'] <= 2 else 0
    features['quali_made_q3'] = 1 if pd.notna(driver_row['Q3']) else 0
    features['quali_made_q2'] = 1 if pd.notna(driver_row['Q2']) else 0

    # Calculate Q times
    q_times = [driver_row['Q1'], driver_row['Q2'], driver_row['Q3']]
    q_times_seconds = [t.total_seconds() if pd.notna(t) else np.nan for t in q_times]
    features['quali_best_time'] = np.nanmin(q_times_seconds) if any(pd.notna(q_times)) else 0

    # Gap to pole (Norris P1: 75.586s)
    pole_time = 75.586
    features['quali_gap_to_pole'] = features['quali_best_time'] - pole_time if features['quali_best_time'] > 0 else 0
    features['quali_gap_to_pole_pct'] = (features['quali_gap_to_pole'] / pole_time * 100) if pole_time > 0 else 0
    features['quali_performance_score'] = 1 - (features['quali_gap_to_pole'] / 5) if features['quali_gap_to_pole'] < 5 else 0

    # Q improvements
    features['quali_q1_q2_improvement'] = (q_times_seconds[0] - q_times_seconds[1]) if pd.notna(q_times_seconds[0]) and pd.notna(q_times_seconds[1]) else 0
    features['quali_q2_q3_improvement'] = (q_times_seconds[1] - q_times_seconds[2]) if pd.notna(q_times_seconds[1]) and pd.notna(q_times_seconds[2]) else 0

    # Driver performance features (from history)
    recent_5 = driver_history.tail(5)
    recent_3 = driver_history.tail(3)

    features['driver_last3_avg_points'] = recent_3['points'].mean() if len(recent_3) > 0 else 0
    features['driver_last5_avg_points'] = recent_5['points'].mean() if len(recent_5) > 0 else 0
    features['driver_last3_avg_position'] = recent_3['position'].mean() if len(recent_3) > 0 else 0
    features['driver_last5_avg_position'] = recent_5['position'].mean() if len(recent_5) > 0 else 0
    features['driver_last5_podiums'] = (recent_5['position'] <= 3).sum() if len(recent_5) > 0 else 0

    # Season stats
    season_2025_driver = driver_history[driver_history['season'] == 2025]
    features['driver_season_points'] = season_2025_driver['points'].sum() if len(season_2025_driver) > 0 else 0
    features['driver_season_races'] = len(season_2025_driver)
    features['driver_championship_position'] = last_race.get('driver_championship_position', 10) if 'last_race' in locals() else 10

    # Career stats
    features['driver_avg_finish_position'] = driver_history['position'].mean() if len(driver_history) > 0 else 10
    features['driver_dnf_rate'] = 0.1  # Simplified
    features['driver_career_races'] = len(driver_history)

    # Constructor features (simplified)
    constructor_history = df[(df['constructorName'] == constructor) &
                            ((df['season'] < 2025) |
                             ((df['season'] == 2025) & (df['round'] <= 19)))]

    recent_constructor_5 = constructor_history.tail(5)
    features['constructor_last3_avg_points'] = recent_constructor_5.tail(3)['points'].mean() if len(recent_constructor_5) >= 3 else 0
    features['constructor_last5_avg_points'] = recent_constructor_5['points'].mean() if len(recent_constructor_5) > 0 else 0
    features['constructor_season_points'] = constructor_history[constructor_history['season'] == 2025]['points'].sum()
    features['constructor_championship_position'] = last_race.get('constructor_championship_position', 5) if 'last_race' in locals() else 5
    features['constructor_dnf_rate'] = 0.1
    features['constructor_avg_quali_position'] = 5
    features['constructor_points_per_race'] = features['constructor_season_points'] / 19 if features['driver_season_races'] > 0 else 0
    features['constructor_is_top_team'] = 1 if features['constructor_championship_position'] <= 3 else 0

    # Circuit features (simplified - using career stats)
    features['circuit_driver_wins'] = (driver_history['position'] == 1).sum()
    features['circuit_driver_podiums'] = (driver_history['position'] <= 3).sum()
    features['circuit_driver_avg_finish'] = features['driver_avg_finish_position']
    features['circuit_driver_experience'] = len(driver_history)
    features['circuit_constructor_wins'] = 0
    features['circuit_constructor_podiums'] = 0
    features['circuit_driver_best_grid'] = driver_history['grid'].min() if len(driver_history) > 0 else 10
    features['circuit_driver_win_rate'] = features['circuit_driver_wins'] / max(len(driver_history), 1)
    features['circuit_driver_podium_rate'] = features['circuit_driver_podiums'] / max(len(driver_history), 1)
    features['circuit_driver_points_per_race'] = driver_history['points'].sum() / max(len(driver_history), 1)
    features['circuit_avg_position_change'] = 0

    # Advanced features
    features['driver_momentum'] = features['driver_last3_avg_points'] - features['driver_last5_avg_points']
    features['points_gap_to_leader'] = max(0, 400 - features['driver_season_points'])
    features['must_win_pressure'] = 0
    features['teammate_gap'] = 0
    features['driver_consistency_score'] = 0.5
    features['avg_quali_race_delta'] = 0
    features['season_progress'] = 20/24

    # Fill any missing features with 0
    for feat in feature_columns:
        if feat not in features:
            features[feat] = 0

    mexico_predictions.append({
        'driver': driver_code,
        'grid': driver_row['Position'],
        'features': features
    })

print(f"✅ Generated features for {len(mexico_predictions)} drivers\n")

# Create feature matrix
X_mexico = pd.DataFrame([p['features'] for p in mexico_predictions])[feature_columns].fillna(0)

# Predict using CatBoost (best model)
print("🤖 Predicting with CatBoost (V3)...\n")
mexico_pred_proba = cat_model.predict_proba(X_mexico)[:, 1]

# Create results DataFrame
mexico_results = pd.DataFrame({
    'Driver': [p['driver'] for p in mexico_predictions],
    'Grid': [p['grid'] for p in mexico_predictions],
    'Podium_Probability': mexico_pred_proba
}).sort_values('Podium_Probability', ascending=False)

print("=" * 70)
print("🏆 V3 MEXICO GP PREDICTIONS (Top 10)")
print("=" * 70)
print(mexico_results.head(10).to_string(index=False))

print(f"\n🎯 V3 Predicted Top 3:")
top3_predicted = mexico_results.head(3)
for i, (idx, row) in enumerate(top3_predicted.iterrows(), 1):
    print(f"   {i}. {row['Driver']:3s} (Grid: P{int(row['Grid'])}, Confidence: {row['Podium_Probability']:.1%})")

print(f"\n🏁 Actual Top 3:")
print(f"   1. NOR (Norris)")
print(f"   2. LEC (Leclerc)")
print(f"   3. VER (Verstappen)")

# Check accuracy
predicted_drivers = set(top3_predicted['Driver'].values)
actual_drivers = {'NOR', 'LEC', 'VER'}
correct = len(predicted_drivers & actual_drivers)

print(f"\n📊 Mexico GP Prediction Accuracy:")
print(f"   Correct podium picks: {correct}/3 ({correct/3*100:.1f}%)")
if correct == 3:
    print(f"   🎉 PERFECT PODIUM PREDICTION!")
elif correct == 2:
    print(f"   ✅ Strong prediction (2/3 correct)")
else:
    print(f"   ⚠️  Needs improvement")


🏁 MEXICO GP 2025 - V3 PREDICTION

🎯 Actual Mexico GP Results:
   1st: Lando Norris (NOR) - Qualified P1
   2nd: Charles Leclerc (LEC) - Qualified P2
   3rd: Max Verstappen (VER) - Qualified P5 ⭐

📊 Mexico GP Qualifying Top 5:
Abbreviation  Position
         NOR       1.0
         LEC       2.0
         HAM       3.0
         RUS       4.0
         VER       5.0

📊 Generating features for 20 drivers...

✅ Generated features for 20 drivers

🤖 Predicting with CatBoost (V3)...

🏆 V3 MEXICO GP PREDICTIONS (Top 10)
Driver  Grid  Podium_Probability
   NOR   1.0            0.789121
   LEC   2.0            0.729435
   RUS   4.0            0.467085
   HAM   3.0            0.414023
   ANT   6.0            0.189064
   VER   5.0            0.175163
   PIA   8.0            0.072033
   SAI   7.0            0.022318
   HAD   9.0            0.019138
   TSU  11.0            0.012359

🎯 V3 Predicted Top 3:
   1. NOR (Grid: P1, Confidence: 78.9%)
   2. LEC (Grid: P2, Confidence: 72.9%)
   3. RUS (Grid: P4