# 🔧 Data Preprocessing for F1 Probability Modeling
## Preparing Singapore GP data for VAE and Bayesian Network implementation

This notebook handles:
- Feature engineering and selection
- Data normalization and scaling
- Missing value treatment
- Train/validation splits
- Feature encoding for ML models

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer, KNNImputer
import warnings
import glob
import os
from datetime import datetime
import pickle
import json

# Import configuration and feature weights
from config import *

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
pd.set_option('display.max_columns', None)

print("🔧 Preprocessing libraries loaded successfully!")
print(f"⚖️ Feature weighting system enabled: {len(get_weighted_features('all'))} weighted features")
print(f"🎯 Target circuit: {DATA_CONFIG['selected_circuit']}")
print(f"📊 Available circuits: {len(get_available_circuits())}")

🔧 Preprocessing libraries loaded successfully!
⚖️ Feature weighting system enabled: 15 weighted features
🎯 Target circuit: Singapore
📊 Available circuits: 23


    # 📂 Load Processed Dataset - Universal Circuit Support

In [30]:
# Load the most recent weighted prediction dataset
print("📂 Loading weighted prediction dataset...")

# Find the most recent weighted prediction data file
weighted_files = glob.glob('data/raw/*_prediction_weighted_*.csv')
processed_files = glob.glob('data/processed/*_cleaned_*.csv')

if weighted_files:
    latest_file = max(weighted_files, key=os.path.getctime)
    circuit_name = os.path.basename(latest_file).split('_')[0].upper()
    print(f"🎯 Loading {circuit_name} GP weighted prediction data from: {latest_file}")
    df = pd.read_csv(latest_file)
    data_source = "weighted_prediction"
elif processed_files:
    latest_file = max(processed_files, key=os.path.getctime)
    circuit_name = os.path.basename(latest_file).split('_')[0].upper()
    print(f"📂 Loading processed data from: {latest_file}")
    df = pd.read_csv(latest_file)
    data_source = "processed"
else:
    print("❌ No weighted prediction data files found. Please run data collection first.")
    df = pd.DataFrame()
    data_source = None

if not df.empty:
    print(f"✅ Data loaded successfully!")
    print(f"📊 Shape: {df.shape}")
    print(f"🏁 Years: {sorted(df['year'].unique())}")
    print(f"🏎️ Unique drivers: {df['driver_name'].nunique()}")
    print(f"🏁 Unique teams: {df['team'].nunique()}")
    
    # Check for weighted data features
    if 'data_weight' in df.columns:
        print(f"⚖️ Weighted dataset detected!")
        print(f"   Weight range: {df['data_weight'].min():.1f} - {df['data_weight'].max():.1f}")
        print(f"   Data sources: {df['data_source'].nunique()} unique sources")
        weight_dist = df['data_weight'].value_counts().sort_index(ascending=False)
        for weight, count in weight_dist.items():
            relevance = df[df['data_weight'] == weight]['prediction_relevance'].iloc[0]
            print(f"   {weight:.1f} ({relevance:12}): {count:3d} records")
    else:
        print("📊 Standard dataset (no weighting information)")
    
    print(f"🔧 Ready for feature engineering...")
else:
    print("🚫 Cannot proceed without data. Please run notebook 01 first.")

📂 Loading weighted prediction dataset...
🎯 Loading SINGAPORE GP weighted prediction data from: data/raw\singapore_prediction_weighted_20251005_165135.csv
✅ Data loaded successfully!
📊 Shape: (240, 20)
🏁 Years: [np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
🏎️ Unique drivers: 30
🏁 Unique teams: 13
⚖️ Weighted dataset detected!
   Weight range: 1.2 - 3.0
   Data sources: 12 unique sources
   3.0 (critical    ):  60 records
   2.5 (high        ): 100 records
   1.5 (medium      ):  40 records
   1.2 (supplementary):  40 records
🔧 Ready for feature engineering...


## 🎯 Feature Engineering

In [31]:
## 📈 Rolling Features & Advanced Engineering

In [32]:
if not df.empty:
    print("📈 Creating rolling features and advanced engineering...")
    
    # Create a copy and sort by driver and year for rolling calculations
    df_rolling = df.copy().sort_values(['driver_name', 'year'])
    
    # 1. Driver Form - Rolling averages (last 2 races per driver)
    print("  🏎️ Computing driver form metrics...")
    
    # Rolling qualifying performance (last 2 races)
    df_rolling['driver_quali_form'] = df_rolling.groupby('driver_name')['quali_pos'].rolling(
        window=2, min_periods=1
    ).mean().reset_index(0, drop=True)
    
    # Rolling race performance (last 2 races)
    df_rolling['driver_race_form'] = df_rolling.groupby('driver_name')['finish_pos'].rolling(
        window=2, min_periods=1
    ).mean().reset_index(0, drop=True)
    
    # Rolling points form
    df_rolling['driver_points_form'] = df_rolling.groupby('driver_name')['points'].rolling(
        window=2, min_periods=1
    ).mean().reset_index(0, drop=True)
    
    # 2. Team Pace Analysis
    print("  🏁 Computing team pace metrics...")
    
    # Team average qualifying position by year
    team_quali_avg = df_rolling.groupby(['team', 'year'])['quali_pos'].mean()
    df_rolling['team_quali_avg'] = df_rolling.apply(
        lambda row: team_quali_avg.get((row['team'], row['year']), row['quali_pos']), axis=1
    )
    
    # Team average race position by year
    team_race_avg = df_rolling.groupby(['team', 'year'])['finish_pos'].mean()
    df_rolling['team_race_avg'] = df_rolling.apply(
        lambda row: team_race_avg.get((row['team'], row['year']), row['finish_pos']), axis=1
    )
    
    # 3. Strategy Efficiency Metrics
    print("  ⛽ Computing strategy efficiency...")
    
    # Pit stop efficiency (compared to team average)
    team_pit_avg = df_rolling.groupby('team')['pit_stops'].mean()
    df_rolling['pit_strategy_delta'] = df_rolling.apply(
        lambda row: row['pit_stops'] - team_pit_avg.get(row['team'], row['pit_stops']), axis=1
    )
    
    # Qualifying vs race performance delta
    df_rolling['quali_race_delta'] = df_rolling['finish_pos'] - df_rolling['quali_pos']
    
    # 4. Position Movement Analytics
    print("  📊 Computing position movement metrics...")
    
    # Absolute position change (regardless of direction)
    df_rolling['abs_pos_change'] = abs(df_rolling['pos_change'])
    
    # Position change efficiency (points gained per position moved)
    df_rolling['pos_efficiency'] = np.where(
        df_rolling['abs_pos_change'] > 0,
        df_rolling['points'] / (df_rolling['abs_pos_change'] + 1),  # +1 to avoid division by zero
        df_rolling['points']
    )
    
    # 5. Competitive Context Features
    print("  🎯 Computing competitive context...")
    
    # Gap to pole relative to field (normalized)
    year_pole_gaps = df_rolling.groupby('year')['gap_to_pole']
    df_rolling['gap_to_pole_normalized'] = (
        df_rolling['gap_to_pole'] / year_pole_gaps.transform('max').replace(0, 1)  # Avoid division by zero
    ).fillna(0)
    
    # Driver's qualifying position relative to team mate (if available)
    teammate_quali = df_rolling.groupby(['team', 'year'])['quali_pos'].transform('mean')
    df_rolling['quali_vs_teammate'] = df_rolling['quali_pos'] - teammate_quali
    
    # 6. Experience-based Features
    print("  🏆 Computing experience metrics...")
    
    # Cumulative races for each driver (experience progression)
    df_rolling['cumulative_races'] = df_rolling.groupby('driver_name').cumcount() + 1
    
    # Years of experience in dataset
    driver_years = df_rolling.groupby('driver_name')['year'].nunique()
    df_rolling['years_experience'] = df_rolling['driver_name'].map(driver_years)
    
    print(f"✅ Created {len(df_rolling.columns) - len(df.columns)} rolling/advanced features")
    
    # Show sample of new features
    new_rolling_features = set(df_rolling.columns) - set(df.columns)
    print(f"📊 New rolling features: {list(new_rolling_features)}")
    
    # Display sample with key rolling features
    sample_cols = [
        'driver_name', 'year', 'driver_quali_form', 'driver_race_form', 
        'team_quali_avg', 'pit_strategy_delta', 'pos_efficiency'
    ]
    
    # Check which columns actually exist
    available_sample_cols = [col for col in sample_cols if col in df_rolling.columns]
    
    if available_sample_cols:
        print(f"\n🔍 Sample rolling features:")
        print(df_rolling[available_sample_cols].head(10).to_string(index=False))
    
    # Update main dataframe
    df_features = df_rolling.copy()
    print(f"📋 Updated main dataset shape: {df_features.shape}")
else:
    print("⚠️ No data available for rolling feature engineering.")

📈 Creating rolling features and advanced engineering...
  🏎️ Computing driver form metrics...
  🏁 Computing team pace metrics...
  ⛽ Computing strategy efficiency...
  📊 Computing position movement metrics...
  🎯 Computing competitive context...
  🏆 Computing experience metrics...
✅ Created 13 rolling/advanced features
📊 New rolling features: ['abs_pos_change', 'driver_race_form', 'cumulative_races', 'driver_quali_form', 'team_race_avg', 'pit_strategy_delta', 'team_quali_avg', 'quali_vs_teammate', 'driver_points_form', 'pos_efficiency', 'gap_to_pole_normalized', 'years_experience', 'quali_race_delta']

🔍 Sample rolling features:
    driver_name  year  driver_quali_form  driver_race_form  team_quali_avg  pit_strategy_delta  pos_efficiency
Alexander Albon  2022               19.0              17.0       19.500000           -0.583333        0.000000
Alexander Albon  2023               16.5              14.0       16.000000            0.416667        0.000000
Alexander Albon  2024         

In [33]:
if not df.empty:
    print("🎯 Creating derived features...")
    
    # Ensure we have df_features from rolling analysis or create from df
    if 'df_features' not in locals():
        df_features = df.copy()
    
    # 1. Driver skill rating (based on historical performance)
    print("  🏆 Computing driver skill ratings...")
    driver_stats = df_features.groupby('driver_name').agg({
        'points': 'mean',
        'finish_pos': 'mean',
        'pos_change': 'mean'
    })
    
    # Normalize driver skill (0-1 scale, higher is better)
    driver_stats['skill_score'] = (
        (driver_stats['points'] / 25) * 0.4 +  # Points contribution (normalized to max points)
        ((21 - driver_stats['finish_pos']) / 20) * 0.4 +  # Avg finish position (inverted, normalized)
        ((driver_stats['pos_change'] + 10) / 20) * 0.2  # Position gain ability (normalized)
    ).clip(0, 1)
    
    # Map back to original dataframe
    df_features['driver_skill'] = df_features['driver_name'].map(driver_stats['skill_score'])
    
    # 2. Team performance rating
    print("  🏁 Computing team strength ratings...")
    team_stats = df_features.groupby('team').agg({
        'points': 'mean',
        'finish_pos': 'mean'
    })
    
    # Normalize team strength (0-1 scale, higher is better)
    max_team_points = team_stats['points'].max() if team_stats['points'].max() > 0 else 1
    team_stats['strength_score'] = (
        (team_stats['points'] / max_team_points) * 0.6 +  # Points contribution
        ((21 - team_stats['finish_pos']) / 20) * 0.4  # Average position (inverted)
    ).clip(0, 1)
    
    df_features['team_strength'] = df_features['team'].map(team_stats['strength_score'])
    
    # 3. Qualifying performance metrics
    print("  ⏱️ Computing qualifying performance...")
    
    # Qualifying vs grid position difference (penalty/promotion effects)
    df_features['quali_vs_grid'] = 0  # Default
    valid_quali = (df_features['quali_pos'] > 0) & (df_features['grid_pos'] > 0)
    df_features.loc[valid_quali, 'quali_vs_grid'] = (
        df_features.loc[valid_quali, 'quali_pos'] - df_features.loc[valid_quali, 'grid_pos']
    )
    
    # Best available qualifying time
    def get_best_quali_time(row):
        times = [row.get('q3_time', 0), row.get('q2_time', 0), row.get('q1_time', 0)]
        valid_times = [t for t in times if t > 0]
        return min(valid_times) if valid_times else 0
    
    df_features['best_quali_time'] = df_features.apply(get_best_quali_time, axis=1)
    
    # 4. Strategic variables
    print("  🔧 Computing strategy features...")
    df_features['is_aggressive_strategy'] = (df_features['pit_stops'] >= 2).astype(int)
    
    # Race completion rate (compared to race winner)
    max_laps_by_year = df_features.groupby('year')['total_laps'].max()
    df_features['race_completion_rate'] = df_features.apply(
        lambda row: (row['total_laps'] / max_laps_by_year.get(row['year'], 1)) if max_laps_by_year.get(row['year'], 0) > 0 else 0,
        axis=1
    ).clip(0, 1)
    
    # 5. Experience factor
    print("  📚 Computing experience metrics...")
    driver_experience = df_features.groupby('driver_name').size()
    df_features['driver_experience'] = df_features['driver_name'].map(driver_experience)
    
    # Normalize experience (0-1 scale)
    max_experience = df_features['driver_experience'].max() if df_features['driver_experience'].max() > 0 else 1
    df_features['driver_experience_norm'] = (df_features['driver_experience'] / max_experience).clip(0, 1)
    
    # 6. Temporal normalization
    print("  📅 Computing temporal features...")
    year_range = df_features['year'].max() - df_features['year'].min()
    if year_range > 0:
        df_features['year_normalized'] = (df_features['year'] - df_features['year'].min()) / year_range
    else:
        df_features['year_normalized'] = 0.5  # Single year case
    
    # 7. Circuit-specific modifiers (using config)
    print("  🏁 Applying circuit-specific modifiers...")
    circuit_config = get_circuit_prediction_modifiers()
    
    # Apply grid importance modifier
    df_features['grid_importance_weighted'] = (
        df_features['grid_pos'] * circuit_config['grid_importance']
    )
    
    # Strategy factor weighting
    df_features['strategy_weighted'] = (
        df_features['pit_stops'] * circuit_config['strategy_factor']
    )
    
    print(f"✅ Created {len(df_features.columns) - len(df.columns)} derived features")
    
    # Display new engineered features
    new_features = set(df_features.columns) - set(df.columns) if 'df' in locals() else []
    print(f"📊 New derived features: {list(new_features)}")
    
    # Show sample of engineered features
    sample_cols = ['driver_name', 'driver_skill', 'team_strength', 'driver_experience_norm', 'is_aggressive_strategy']
    available_sample_cols = [col for col in sample_cols if col in df_features.columns]
    
    if available_sample_cols:
        print(f"\n🔍 Sample derived features:")
        print(df_features[available_sample_cols].head().to_string(index=False))
    
    # Fill any remaining NaN values in derived features
    derived_feature_cols = [
        'driver_skill', 'team_strength', 'driver_experience_norm', 
        'race_completion_rate', 'year_normalized'
    ]
    
    for col in derived_feature_cols:
        if col in df_features.columns:
            df_features[col] = df_features[col].fillna(df_features[col].median())
    
    print(f"🔧 Final engineered dataset shape: {df_features.shape}")
else:
    print("⚠️ No data available for feature engineering.")

🎯 Creating derived features...
  🏆 Computing driver skill ratings...
  🏁 Computing team strength ratings...
  ⏱️ Computing qualifying performance...
  🔧 Computing strategy features...
  📚 Computing experience metrics...
  📅 Computing temporal features...
  🏁 Applying circuit-specific modifiers...
✅ Created 24 derived features
📊 New derived features: ['best_quali_time', 'pit_strategy_delta', 'strategy_weighted', 'team_quali_avg', 'driver_skill', 'driver_points_form', 'grid_importance_weighted', 'quali_race_delta', 'abs_pos_change', 'driver_experience', 'race_completion_rate', 'team_strength', 'year_normalized', 'gap_to_pole_normalized', 'years_experience', 'is_aggressive_strategy', 'quali_vs_grid', 'driver_race_form', 'cumulative_races', 'pos_efficiency', 'team_race_avg', 'driver_quali_form', 'quali_vs_teammate', 'driver_experience_norm']

🔍 Sample derived features:
    driver_name  driver_skill  team_strength  driver_experience_norm  is_aggressive_strategy
Alexander Albon      0.321833

## 🧹 Data Cleaning & Missing Value Treatment

In [34]:
if not df.empty:
    print("🧹 Handling missing values and data cleaning...")
    
    # Identify features to clean
    numeric_features = df_features.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = ['driver_name', 'team', 'tyres_used', 'status']
    
    # Remove problematic categorical columns for ML
    categorical_features = [col for col in categorical_features if col in df_features.columns]
    
    print(f"📊 Numeric features: {len(numeric_features)}")
    print(f"📊 Categorical features: {len(categorical_features)}")
    
    # Handle missing values in numeric features
    missing_summary = df_features[numeric_features].isnull().sum()
    zero_summary = (df_features[numeric_features] == 0).sum()
    
    print("\n🔍 Missing/Zero value analysis:")
    for col in numeric_features:
        missing_count = missing_summary[col]
        zero_count = zero_summary[col]
        if missing_count > 0 or zero_count > len(df_features) * 0.1:  # Show if >10% zeros
            print(f"{col:20} | Missing: {missing_count:2d} | Zeros: {zero_count:2d}")
    
    # Strategy for different types of missing data
    
    # 1. Qualifying times: Use median by year
    quali_time_cols = ['q1_time', 'q2_time', 'q3_time', 'best_quali_time', 'gap_to_pole']
    for col in quali_time_cols:
        if col in df_features.columns:
            # Replace zeros with NaN first
            df_features.loc[df_features[col] == 0, col] = np.nan
            # Fill with median by year
            df_features[col] = df_features.groupby('year')[col].transform(
                lambda x: x.fillna(x.median())
            )
            # If still NaN, use overall median
            df_features[col] = df_features[col].fillna(df_features[col].median())
    
    # 2. Grid/Finish positions: Use forward fill or median
    position_cols = ['grid_pos', 'finish_pos', 'quali_pos']
    for col in position_cols:
        if col in df_features.columns:
            # For positions, 0 usually means missing data
            df_features.loc[df_features[col] == 0, col] = np.nan
            df_features[col] = df_features[col].fillna(df_features[col].median())
    
    # 3. Performance metrics: Use median imputation
    performance_cols = ['points', 'pit_stops', 'total_laps']
    for col in performance_cols:
        if col in df_features.columns:
            df_features[col] = df_features[col].fillna(df_features[col].median())
    
    # 4. Engineered features: Fill with defaults
    df_features['driver_skill'] = df_features['driver_skill'].fillna(0.5)  # Average skill
    df_features['team_strength'] = df_features['team_strength'].fillna(0.5)  # Average team
    df_features['race_completion_rate'] = df_features['race_completion_rate'].fillna(0.0)
    
    # Final check for remaining missing values
    remaining_missing = df_features[numeric_features].isnull().sum().sum()
    print(f"\n✅ Remaining missing values: {remaining_missing}")
    
    if remaining_missing > 0:
        # Use KNN imputation for remaining missing values
        print("🔧 Applying KNN imputation for remaining missing values...")
        imputer = KNNImputer(n_neighbors=3)
        df_features[numeric_features] = imputer.fit_transform(df_features[numeric_features])
        print("✅ KNN imputation completed")

🧹 Handling missing values and data cleaning...
📊 Numeric features: 37
📊 Categorical features: 4

🔍 Missing/Zero value analysis:
points               | Missing:  0 | Zeros: 120
pos_change           | Missing:  0 | Zeros: 38
q2_time              | Missing:  0 | Zeros: 67
q3_time              | Missing:  0 | Zeros: 127
gap_to_pole          | Missing:  0 | Zeros: 72
driver_points_form   | Missing:  0 | Zeros: 80
quali_race_delta     | Missing:  0 | Zeros: 37
abs_pos_change       | Missing:  0 | Zeros: 38
pos_efficiency       | Missing:  0 | Zeros: 120
gap_to_pole_normalized | Missing:  0 | Zeros: 72
quali_vs_grid        | Missing:  0 | Zeros: 207
is_aggressive_strategy | Missing:  0 | Zeros: 47

✅ Remaining missing values: 0


In [35]:
# Debug: Check our available features
print(f"\n🔍 Available columns after engineering:")
print(f"Total columns: {len(df_features.columns)}")
numeric_cols = df_features.select_dtypes(include=[np.number]).columns.tolist()
string_cols = df_features.select_dtypes(include=['object']).columns.tolist()
print(f"Numeric columns ({len(numeric_cols)}): {numeric_cols}")
print(f"String columns ({len(string_cols)}): {string_cols}")

# Show sample of problematic columns
for col in string_cols:
    print(f"  {col}: {df_features[col].dtype} - Sample: {df_features[col].iloc[0]}")


🔍 Available columns after engineering:
Total columns: 44
Numeric columns (37): ['year', 'grid_pos', 'finish_pos', 'points', 'pos_change', 'quali_pos', 'q1_time', 'q2_time', 'q3_time', 'total_laps', 'pit_stops', 'gap_to_pole', 'data_weight', 'driver_quali_form', 'driver_race_form', 'driver_points_form', 'team_quali_avg', 'team_race_avg', 'pit_strategy_delta', 'quali_race_delta', 'abs_pos_change', 'pos_efficiency', 'gap_to_pole_normalized', 'quali_vs_teammate', 'cumulative_races', 'years_experience', 'driver_skill', 'team_strength', 'quali_vs_grid', 'best_quali_time', 'is_aggressive_strategy', 'race_completion_rate', 'driver_experience', 'driver_experience_norm', 'year_normalized', 'grid_importance_weighted', 'strategy_weighted']
String columns (7): ['driver_name', 'driver_abbr', 'team', 'status', 'tyres_used', 'data_source', 'prediction_relevance']
  driver_name: object - Sample: Alexander Albon
  driver_abbr: object - Sample: ALB
  team: object - Sample: Williams
  status: object - Sa

## 📏 Feature Selection & Scaling

In [36]:
if not df.empty:
    print("📏 Feature selection and scaling with importance weighting...")
    
    # Get numeric columns only (exclude string columns)
    numeric_columns = df_features.select_dtypes(include=[np.number]).columns.tolist()
    target = 'finish_pos'
    
    # Remove target and any non-predictive columns
    exclude_cols = [target, 'year', 'points']  # Points is result, not predictor
    available_features = [col for col in numeric_columns if col not in exclude_cols]
    
    print(f"📊 Available numeric features: {len(available_features)}")
    print(f"🎯 Target variable: {target}")
    
    # Get weighted features from config
    config_weighted = get_weighted_features("all")
    config_high = get_weighted_features("high")
    config_medium = get_weighted_features("medium")
    
    # Map config feature names to actual column names
    def map_config_to_columns(config_features, available_cols):
        """Map config feature names to actual DataFrame columns"""
        mapped = {}
        for config_name, weight in config_features.items():
            # Direct match
            if config_name in available_cols:
                mapped[config_name] = weight
            # Pattern matching for similar names
            else:
                for col in available_cols:
                    if config_name.replace('_', '') in col.replace('_', '') or col.replace('_', '') in config_name.replace('_', ''):
                        mapped[col] = weight
                        break
        return mapped
    
    # Map config weights to available features
    available_weighted = map_config_to_columns(config_weighted, available_features)
    available_high = map_config_to_columns(config_high, available_features)
    available_medium = map_config_to_columns(config_medium, available_features)
    
    # Add default weights for important features not in config
    important_patterns = {
        'driver_skill': 0.80,
        'team_strength': 0.85,
        'grid_pos': 0.95,
        'quali_pos': 0.90,
        'pit_stops': 0.60,
        'gap_to_pole': 0.75,
        'q3_time': 0.65,
        'q2_time': 0.55,
        'driver_experience': 0.50,
        'year_normalized': 0.45
    }
    
    for pattern, weight in important_patterns.items():
        for col in available_features:
            if pattern in col and col not in available_weighted:
                if weight >= 0.8:
                    available_high[col] = weight
                elif weight >= 0.6:
                    available_medium[col] = weight
                available_weighted[col] = weight
    
    print(f"\n⚖️ Feature importance summary:")
    print(f"  • High importance: {len(available_high)} features (weight ≥ 0.8)")
    print(f"  • Medium importance: {len(available_medium)} features (weight ≥ 0.6)")
    print(f"  • All weighted: {len(available_weighted)} features")
    
    # Show top weighted features
    top_weighted = dict(sorted(available_weighted.items(), key=lambda x: x[1], reverse=True)[:10])
    print(f"\n🏆 Top weighted features:")
    for feature, weight in top_weighted.items():
        print(f"  • {feature:25}: {weight:.2f}")
    
    # Create feature sets
    core_features = list(available_high.keys())[:8]  # Top 8 high importance
    vae_features = list(available_high.keys()) + list(available_medium.keys())
    vae_features = list(dict.fromkeys(vae_features))[:15]  # Remove duplicates, limit to 15
    extended_features = list(available_weighted.keys())[:20]  # Top 20 weighted features
    
    print(f"\n📊 Feature set composition:")
    print(f"  • Core: {len(core_features)} features")
    print(f"  • VAE optimized: {len(vae_features)} features") 
    print(f"  • Extended: {len(extended_features)} features")
    
    # Create feature sets with weights
    feature_sets = {
        'core_weighted': {
            'features': core_features,
            'weights': {f: available_weighted.get(f, 0.5) for f in core_features}
        },
        'vae_optimized': {
            'features': vae_features,
            'weights': {f: available_weighted.get(f, 0.5) for f in vae_features}
        },
        'extended_weighted': {
            'features': extended_features,
            'weights': {f: available_weighted.get(f, 0.3) for f in extended_features}
        }
    }
    
    # Prepare weighted datasets
    datasets = {}
    scalers = {}
    
    for set_name, fset in feature_sets.items():
        print(f"\n🔧 Preparing {set_name} dataset...")
        
        features = fset['features']
        weights = fset['weights']
        
        # Skip if no features available
        if len(features) == 0:
            print(f"  ⚠️ No features available for {set_name}, skipping...")
            continue
        
        # Ensure all features exist in dataframe
        existing_features = [f for f in features if f in df_features.columns]
        if len(existing_features) != len(features):
            missing = set(features) - set(existing_features)
            print(f"  ⚠️ Missing features: {missing}")
            features = existing_features
            weights = {f: weights[f] for f in existing_features}
        
        print(f"  📊 Using {len(features)} features: {features[:5]}{'...' if len(features) > 5 else ''}")
        
        # Extract features and target
        X = df_features[features].copy()
        y = df_features[target].copy()
        
        # Remove any rows with missing values
        valid_mask = ~(X.isnull().any(axis=1) | y.isnull())
        X = X[valid_mask]
        y = y[valid_mask]
        
        print(f"  📊 Shape after cleaning: X{X.shape}, y{y.shape}")
        
        if len(X) == 0:
            print(f"  ❌ No valid data for {set_name}, skipping...")
            continue
        
        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
        
        # Apply feature weights by multiplying scaled features
        feature_weights_array = np.array([weights.get(col, 0.5) for col in X_scaled.columns])
        X_weighted = X_scaled * feature_weights_array
        
        print(f"  ⚖️ Applied weights (range: {feature_weights_array.min():.2f} - {feature_weights_array.max():.2f})")
        
        # Store dataset and scaler
        datasets[set_name] = {
            'X': X_weighted,  # Weighted and scaled
            'X_scaled': X_scaled,  # Only scaled
            'X_raw': X,  # Raw features
            'y': y,
            'features': features,
            'weights': weights
        }
        scalers[set_name] = scaler
        
        print(f"  ✅ {set_name} dataset ready")
    
    # Display feature weighting verification for first available dataset
    if datasets:
        first_dataset_name = list(datasets.keys())[0]
        first_dataset = datasets[first_dataset_name]
        
        print(f"\n📈 Feature weighting verification ({first_dataset_name}):")
        weights = first_dataset['weights']
        X = first_dataset['X']
        
        for col in list(X.columns)[:5]:  # Show first 5 features
            weight = weights[col]
            weighted_std = X[col].std()
            print(f"  {col:25} | Weight: {weight:.3f} | Weighted std: {weighted_std:6.3f}")
    
    print(f"\n✅ Feature selection and weighting complete!")
    print(f"📦 Created {len(datasets)} weighted datasets: {list(datasets.keys())}")
else:
    print("⚠️ No data available for feature selection.")

📏 Feature selection and scaling with importance weighting...
📊 Available numeric features: 34
🎯 Target variable: finish_pos

⚖️ Feature importance summary:
  • High importance: 5 features (weight ≥ 0.8)
  • Medium importance: 6 features (weight ≥ 0.6)
  • All weighted: 16 features

🏆 Top weighted features:
  • grid_pos                 : 0.95
  • quali_pos                : 0.90
  • team_strength            : 0.85
  • driver_skill             : 0.80
  • gap_to_pole              : 0.75
  • gap_to_pole_normalized   : 0.75
  • q3_time                  : 0.65
  • pit_stops                : 0.60
  • q2_time                  : 0.55
  • driver_experience        : 0.50

📊 Feature set composition:
  • Core: 5 features
  • VAE optimized: 11 features
  • Extended: 16 features

🔧 Preparing core_weighted dataset...
  📊 Using 5 features: ['grid_pos', 'quali_pos', 'team_strength', 'driver_skill', 'gap_to_pole']
  📊 Shape after cleaning: X(240, 5), y(240,)
  ⚖️ Applied weights (range: 0.75 - 0.95)
  ✅ c

## 🎲 Train/Validation Splits

In [37]:
if not df.empty:
    print("🎲 Creating train/validation splits...")
    
    splits = {}
    
    for set_name, dataset in datasets.items():
        print(f"\n🔄 Splitting {set_name} dataset...")
        
        X = dataset['X']
        y = dataset['y']
        
        # Strategy 1: Temporal split (by year)
        years = df_features.loc[X.index, 'year']
        unique_years = sorted(years.unique())
        
        if len(unique_years) >= 2:
            # Use earliest year(s) for training, latest for validation
            if len(unique_years) == 2:
                train_years = [unique_years[0]]
                val_years = [unique_years[1]]
            else:
                train_years = unique_years[:-1]
                val_years = [unique_years[-1]]
            
            temporal_train_mask = years.isin(train_years)
            temporal_val_mask = years.isin(val_years)
            
            X_train_temporal = X[temporal_train_mask]
            X_val_temporal = X[temporal_val_mask]
            y_train_temporal = y[temporal_train_mask]
            y_val_temporal = y[temporal_val_mask]
            
            print(f"  📅 Temporal split - Train: {X_train_temporal.shape[0]} | Val: {X_val_temporal.shape[0]}")
            print(f"    Train years: {train_years} | Val years: {val_years}")
        else:
            X_train_temporal = X_val_temporal = None
            y_train_temporal = y_val_temporal = None
            print(f"  ⚠️ Insufficient years for temporal split")
        
        # Strategy 2: Stratified random split (maintaining position distribution)
        # Bin finish positions for stratification
        y_binned = pd.cut(y, bins=5, labels=['Top5', 'Mid-High', 'Middle', 'Mid-Low', 'Bottom'])
        
        try:
            X_train_strat, X_val_strat, y_train_strat, y_val_strat = train_test_split(
                X, y, test_size=0.25, stratify=y_binned, random_state=42
            )
            print(f"  🎯 Stratified split - Train: {X_train_strat.shape[0]} | Val: {X_val_strat.shape[0]}")
        except ValueError:
            # Fallback to regular random split if stratification fails
            X_train_strat, X_val_strat, y_train_strat, y_val_strat = train_test_split(
                X, y, test_size=0.25, random_state=42
            )
            print(f"  🔀 Random split - Train: {X_train_strat.shape[0]} | Val: {X_val_strat.shape[0]}")
        
        # Store splits
        splits[set_name] = {
            'temporal': {
                'X_train': X_train_temporal,
                'X_val': X_val_temporal,
                'y_train': y_train_temporal,
                'y_val': y_val_temporal
            },
            'stratified': {
                'X_train': X_train_strat,
                'X_val': X_val_strat,
                'y_train': y_train_strat,
                'y_val': y_val_strat
            }
        }
    
    # Create simple visualization for core dataset
    if datasets:
        first_dataset_name = list(datasets.keys())[0]
        first_splits = splits[first_dataset_name]
        
        print(f"\n📊 Split analysis for {first_dataset_name} dataset:")
        
        # Temporal split stats
        if first_splits['temporal']['X_train'] is not None:
            temporal_train_size = len(first_splits['temporal']['X_train'])
            temporal_val_size = len(first_splits['temporal']['X_val'])
            print(f"  📅 Temporal: {temporal_train_size} train, {temporal_val_size} val")
        
        # Stratified split stats  
        strat_train_size = len(first_splits['stratified']['X_train'])
        strat_val_size = len(first_splits['stratified']['X_val'])
        print(f"  🎯 Stratified: {strat_train_size} train, {strat_val_size} val")
        
        # Show target distribution in stratified splits
        y_train = first_splits['stratified']['y_train']
        y_val = first_splits['stratified']['y_val']
        
        print(f"\n📈 Target distribution (finish position):")
        print(f"  Train: mean={y_train.mean():.1f}, std={y_train.std():.1f}")
        print(f"  Val:   mean={y_val.mean():.1f}, std={y_val.std():.1f}")
    
    print("\n✅ All splits created successfully!")
    print(f"📦 Split datasets: {list(splits.keys())}")

🎲 Creating train/validation splits...

🔄 Splitting core_weighted dataset...
  📅 Temporal split - Train: 60 | Val: 180
    Train years: [np.int64(2022), np.int64(2023), np.int64(2024)] | Val years: [np.int64(2025)]
  🎯 Stratified split - Train: 180 | Val: 60

🔄 Splitting vae_optimized dataset...
  📅 Temporal split - Train: 60 | Val: 180
    Train years: [np.int64(2022), np.int64(2023), np.int64(2024)] | Val years: [np.int64(2025)]
  🎯 Stratified split - Train: 180 | Val: 60

🔄 Splitting extended_weighted dataset...
  📅 Temporal split - Train: 60 | Val: 180
    Train years: [np.int64(2022), np.int64(2023), np.int64(2024)] | Val years: [np.int64(2025)]
  🎯 Stratified split - Train: 180 | Val: 60

📊 Split analysis for core_weighted dataset:
  📅 Temporal: 60 train, 180 val
  🎯 Stratified: 180 train, 60 val

📈 Target distribution (finish position):
  Train: mean=10.5, std=5.7
  Val:   mean=10.4, std=5.8

✅ All splits created successfully!
📦 Split datasets: ['core_weighted', 'vae_optimized', 

## 📝 Categorical Encoding

In [38]:
if not df.empty:
    print("📝 Encoding categorical features...")
    
    # Create categorical encodings for Bayesian Networks
    categorical_encodings = {}
    
    # Driver encoding (for driver effects)
    if 'driver_name' in df_features.columns:
        driver_encoder = LabelEncoder()
        df_features['driver_encoded'] = driver_encoder.fit_transform(df_features['driver_name'])
        categorical_encodings['driver'] = {
            'encoder': driver_encoder,
            'classes': driver_encoder.classes_,
            'n_classes': len(driver_encoder.classes_)
        }
        print(f"  🏎️ Encoded {len(driver_encoder.classes_)} drivers")
    
    # Team encoding
    if 'team' in df_features.columns:
        team_encoder = LabelEncoder()
        df_features['team_encoded'] = team_encoder.fit_transform(df_features['team'])
        categorical_encodings['team'] = {
            'encoder': team_encoder,
            'classes': team_encoder.classes_,
            'n_classes': len(team_encoder.classes_)
        }
        print(f"  🏁 Encoded {len(team_encoder.classes_)} teams")
    
    # Year encoding (for temporal effects)
    year_encoder = LabelEncoder()
    df_features['year_encoded'] = year_encoder.fit_transform(df_features['year'])
    categorical_encodings['year'] = {
        'encoder': year_encoder,
        'classes': year_encoder.classes_,
        'n_classes': len(year_encoder.classes_)
    }
    print(f"  📅 Encoded {len(year_encoder.classes_)} years")
    
    # Finish position binning (for Bayesian Network discrete variables)
    position_bins = [1, 5, 10, 15, 20]  # Top 5, Mid-high (6-10), Mid-low (11-15), Bottom (16-20)
    position_labels = ['Podium_Contender', 'Points_Scorer', 'Midfield', 'Backmarker']
    
    df_features['finish_pos_binned'] = pd.cut(
        df_features['finish_pos'], 
        bins=position_bins, 
        labels=position_labels, 
        include_lowest=True
    )
    
    # Encode binned positions
    pos_bin_encoder = LabelEncoder()
    df_features['finish_pos_binned_encoded'] = pos_bin_encoder.fit_transform(df_features['finish_pos_binned'])
    categorical_encodings['finish_pos_binned'] = {
        'encoder': pos_bin_encoder,
        'classes': pos_bin_encoder.classes_,
        'n_classes': len(pos_bin_encoder.classes_),
        'bins': position_bins,
        'labels': position_labels
    }
    
    print(f"  🎯 Created position bins: {position_labels}")
    
    # Display categorical distribution
    print("\n📊 Categorical distributions:")
    for name, encoding in categorical_encodings.items():
        if name != 'finish_pos_binned':  # Skip position bins for now
            value_counts = pd.Series(encoding['encoder'].inverse_transform(range(encoding['n_classes']))).value_counts()
            print(f"\n{name.upper()} ({encoding['n_classes']} categories):")
            print(value_counts.head())
    
    # Position bin distribution
    print("\nFINISH POSITION BINS:")
    print(df_features['finish_pos_binned'].value_counts())

📝 Encoding categorical features...
  🏎️ Encoded 30 drivers
  🏁 Encoded 13 teams
  📅 Encoded 4 years
  🎯 Created position bins: ['Podium_Contender', 'Points_Scorer', 'Midfield', 'Backmarker']

📊 Categorical distributions:

DRIVER (30 categories):
Alexander Albon     1
Carlos Sainz        1
Charles Leclerc     1
Daniel Ricciardo    1
Esteban Ocon        1
Name: count, dtype: int64

TEAM (13 categories):
Alfa Romeo      1
AlphaTauri      1
Alpine          1
Aston Martin    1
Ferrari         1
Name: count, dtype: int64

YEAR (4 categories):
2022    1
2023    1
2024    1
2025    1
Name: count, dtype: int64

FINISH POSITION BINS:
finish_pos_binned
Points_Scorer       61
Podium_Contender    60
Midfield            60
Backmarker          59
Name: count, dtype: int64


## 💾 Save Preprocessed Data

In [39]:
if not df.empty and 'datasets' in locals() and datasets:
    print("💾 Saving preprocessed data and artifacts...")
    
    # Create output directory
    os.makedirs('data/preprocessed', exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Determine circuit name for filename
    circuit_name = "singapore"  # Default
    if data_source == "weighted_prediction" and 'data_source' in df.columns:
        sample_source = df['data_source'].iloc[0].lower()
        if 'singapore' in sample_source:
            circuit_name = "singapore"
        elif 'monaco' in sample_source:
            circuit_name = "monaco"
        elif 'japan' in sample_source:
            circuit_name = "japan"
    
    # 1. Save the complete engineered dataset
    full_dataset_path = f"data/preprocessed/{circuit_name}_engineered_{timestamp}.csv"
    df_features.to_csv(full_dataset_path, index=False)
    print(f"📊 Full engineered dataset: {full_dataset_path}")
    
    # 2. Save feature sets and splits
    preprocessing_artifacts = {
        'datasets': datasets,
        'splits': splits if 'splits' in locals() else {},
        'scalers': scalers,
        'categorical_encodings': categorical_encodings if 'categorical_encodings' in locals() else {},
        'feature_sets': feature_sets if 'feature_sets' in locals() else {},
        'circuit_name': circuit_name,
        'timestamp': timestamp,
        'original_shape': df.shape,
        'engineered_shape': df_features.shape
    }
    
    artifacts_path = f"data/preprocessed/preprocessing_artifacts_{circuit_name}_{timestamp}.pkl"
    with open(artifacts_path, 'wb') as f:
        pickle.dump(preprocessing_artifacts, f)
    print(f"🔧 Preprocessing artifacts: {artifacts_path}")
    
    # 3. Save individual datasets for easy loading
    for set_name, dataset in datasets.items():
        # Scaled features
        dataset['X'].to_csv(f"data/preprocessed/{circuit_name}_{set_name}_X_scaled_{timestamp}.csv")
        # Raw features
        dataset['X_raw'].to_csv(f"data/preprocessed/{circuit_name}_{set_name}_X_raw_{timestamp}.csv")
        # Target
        dataset['y'].to_csv(f"data/preprocessed/{circuit_name}_{set_name}_y_{timestamp}.csv")
        print(f"  💾 {set_name} dataset saved")
    
    # 4. Create preprocessing summary
    summary = {
        'circuit': circuit_name,
        'timestamp': timestamp,
        'original_shape': list(df.shape),
        'engineered_shape': list(df_features.shape),
        'features_created': len(df_features.columns) - len(df.columns),
        'datasets_created': len(datasets),
        'dataset_info': {
            set_name: {
                'feature_count': len(dataset['features']),
                'sample_count': len(dataset['X']),
                'feature_list': dataset['features'][:10]  # First 10 features
            } for set_name, dataset in datasets.items()
        },
        'categorical_encodings': len(categorical_encodings) if 'categorical_encodings' in locals() else 0,
        'data_quality': {
            'missing_values_handled': True,
            'features_scaled': True,
            'weights_applied': True
        }
    }
    
    summary_path = f"data/preprocessed/preprocessing_summary_{circuit_name}_{timestamp}.json"
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)
    print(f"📋 Preprocessing summary: {summary_path}")
    
    # Display final summary
    print("\n🎉 PREPROCESSING COMPLETE!")
    print("=" * 60)
    print(f"🏁 Circuit: {circuit_name.upper()} GP")
    print(f"📊 Original data: {df.shape}")
    print(f"🔧 Engineered data: {df_features.shape}")
    print(f"⭐ Features created: {len(df_features.columns) - len(df.columns)}")
    print(f"🎯 Feature sets: {len(datasets)}")
    print(f"📂 Categorical encodings: {len(categorical_encodings) if 'categorical_encodings' in locals() else 0}")
    
    # Dataset breakdown
    print(f"\n📦 Dataset Summary:")
    for set_name, dataset in datasets.items():
        print(f"  • {set_name:15}: {len(dataset['features']):2d} features, {len(dataset['X']):3d} samples")
    
    print(f"\n✅ Ready for VAE and Bayesian Network implementation!")
    
    # Next steps guidance
    print("\n🚀 NEXT STEPS:")
    print("1. 📖 Load artifacts: pickle.load(preprocessing_artifacts.pkl)")
    print("2. 🧠 VAE Training: Use 'vae_optimized' scaled datasets")
    print("3. 🕸️ Bayesian Network: Use categorical encodings + raw features")
    print("4. 🎲 Validation: Use both temporal and stratified splits")
    print("5. 🎯 Target: finish_pos (continuous) or finish_pos_binned (discrete)")
    
    # Show file paths for easy reference
    print(f"\n📁 Key Files Created:")
    print(f"  🔧 Artifacts: {artifacts_path}")
    print(f"  📊 Engineered Data: {full_dataset_path}")
    print(f"  📋 Summary: {summary_path}")
    
else:
    print("❌ Cannot save preprocessing artifacts - no datasets created")
    if df.empty:
        print("   Reason: No input data loaded")
    elif 'datasets' not in locals():
        print("   Reason: Feature processing failed")
    elif not datasets:
        print("   Reason: No valid datasets created")

💾 Saving preprocessed data and artifacts...
📊 Full engineered dataset: data/preprocessed/singapore_engineered_20251005_180927.csv
🔧 Preprocessing artifacts: data/preprocessed/preprocessing_artifacts_singapore_20251005_180927.pkl
  💾 core_weighted dataset saved
  💾 vae_optimized dataset saved
  💾 extended_weighted dataset saved
📋 Preprocessing summary: data/preprocessed/preprocessing_summary_singapore_20251005_180927.json

🎉 PREPROCESSING COMPLETE!
🏁 Circuit: SINGAPORE GP
📊 Original data: (240, 20)
🔧 Engineered data: (240, 49)
⭐ Features created: 29
🎯 Feature sets: 3
📂 Categorical encodings: 4

📦 Dataset Summary:
  • core_weighted  :  5 features, 240 samples
  • vae_optimized  : 11 features, 240 samples
  • extended_weighted: 16 features, 240 samples

✅ Ready for VAE and Bayesian Network implementation!

🚀 NEXT STEPS:
1. 📖 Load artifacts: pickle.load(preprocessing_artifacts.pkl)
2. 🧠 VAE Training: Use 'vae_optimized' scaled datasets
3. 🕸️ Bayesian Network: Use categorical encodings + ra

## 🔗 VAE → Bayesian Network Integration Datasets

In [40]:
# Create VAE → Bayesian Network integration datasets
if not df.empty and 'datasets' in locals() and datasets:
    print("🔗 Creating VAE → Bayesian Network integration datasets...")
    
    # Get the best available dataset for VAE
    vae_dataset_name = 'vae_optimized'
    if vae_dataset_name not in datasets:
        vae_dataset_name = list(datasets.keys())[0]  # Use first available dataset
    
    print(f"📊 Using '{vae_dataset_name}' dataset for VAE integration")
    
    # VAE Input Dataset (continuous features for encoding)
    vae_input_df = datasets[vae_dataset_name]['X_scaled'].copy()  # Use scaled (not weighted) for VAE training
    vae_input_df['target'] = datasets[vae_dataset_name]['y']
    
    # BN Input Dataset Template (will receive VAE latent vectors + categorical features)
    if 'categorical_encodings' in locals() and categorical_encodings:
        categorical_features = []
        for enc_name, enc_info in categorical_encodings.items():
            col_name = f"{enc_name}_encoded"
            if col_name in df_features.columns:
                categorical_features.append(col_name)
        
        if categorical_features:
            bn_categorical_df = df_features[categorical_features].copy()
        else:
            # Create minimal categorical encoding
            bn_categorical_df = pd.DataFrame({
                'year_encoded': LabelEncoder().fit_transform(df_features['year']),
                'driver_encoded': LabelEncoder().fit_transform(df_features['driver_name']),
                'team_encoded': LabelEncoder().fit_transform(df_features['team'])
            })
            categorical_features = list(bn_categorical_df.columns)
    else:
        # Create minimal categorical encoding
        bn_categorical_df = pd.DataFrame({
            'year_encoded': LabelEncoder().fit_transform(df_features['year']),
            'driver_encoded': LabelEncoder().fit_transform(df_features['driver_name']),
            'team_encoded': LabelEncoder().fit_transform(df_features['team'])
        })
        categorical_features = list(bn_categorical_df.columns)
    
    # Create BN input structure (latent vector placeholders + categorical)
    latent_dims = 4  # VAE latent dimensions
    bn_input_df = pd.DataFrame()
    
    # Add latent dimension placeholders
    for i in range(latent_dims):
        bn_input_df[f'latent_dim_{i}'] = 0.0  # Will be filled by VAE encoder
    
    # Add categorical features
    for cat_feature in categorical_features:
        if cat_feature in bn_categorical_df.columns:
            bn_input_df[cat_feature] = bn_categorical_df[cat_feature]
    
    # Add target (binned for Bayesian Network)
    # Create position bins if not exists
    if 'finish_pos_binned_encoded' not in df_features.columns:
        position_bins = [1, 5, 10, 15, 20]
        position_labels = ['Podium_Contender', 'Points_Scorer', 'Midfield', 'Backmarker']
        
        df_features['finish_pos_binned'] = pd.cut(
            df_features['finish_pos'], 
            bins=position_bins, 
            labels=position_labels, 
            include_lowest=True
        )
        
        pos_encoder = LabelEncoder()
        df_features['finish_pos_binned_encoded'] = pos_encoder.fit_transform(df_features['finish_pos_binned'])
    
    bn_input_df['finish_pos_binned_encoded'] = df_features['finish_pos_binned_encoded']
    
    # Save integration datasets
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    circuit_name = circuit_name if 'circuit_name' in locals() else "singapore"
    
    vae_input_path = f"data/preprocessed/{circuit_name}_vae_input_{timestamp}.csv"
    bn_input_path = f"data/preprocessed/{circuit_name}_bn_input_template_{timestamp}.csv"
    
    vae_input_df.to_csv(vae_input_path, index=False)
    bn_input_df.to_csv(bn_input_path, index=False)
    
    print(f"✅ VAE Input Dataset: {vae_input_path}")
    print(f"   Shape: {vae_input_df.shape}")
    print(f"   Features: {list(vae_input_df.columns)[:10]}{'...' if len(vae_input_df.columns) > 10 else ''}")
    
    print(f"✅ BN Input Template: {bn_input_path}")
    print(f"   Shape: {bn_input_df.shape}")
    print(f"   Features: {list(bn_input_df.columns)}")
    
    print("\n🚀 INTEGRATION WORKFLOW:")
    print("1. 🧠 Train VAE on vae_input dataset (continuous features)")
    print("2. 🔄 Encode features → latent vectors (4D)")
    print("3. 🕸️ Replace latent_dim_* in bn_input_template with VAE output")
    print("4. 🎯 Train Bayesian Network: latent + categorical → finish_pos_binned")
    print("5. 🎲 Predict position probabilities using hybrid VAE-BN model")
    
    # Show integration summary
    integration_summary = {
        'circuit': circuit_name,
        'timestamp': timestamp,
        'vae_input': {
            'path': vae_input_path,
            'shape': list(vae_input_df.shape),
            'features': list(vae_input_df.columns),
            'source_dataset': vae_dataset_name
        },
        'bn_input_template': {
            'path': bn_input_path, 
            'shape': list(bn_input_df.shape),
            'features': list(bn_input_df.columns),
            'categorical_features': categorical_features,
            'latent_dimensions': latent_dims
        },
        'integration_workflow': {
            'step1': 'Train VAE on continuous features',
            'step2': 'Encode features to latent space',
            'step3': 'Combine latent + categorical for BN',
            'step4': 'Train BN for position prediction',
            'final_target': 'finish_pos_binned (4 categories)'
        }
    }
    
    integration_summary_path = f"data/preprocessed/{circuit_name}_vae_bn_integration_{timestamp}.json"
    with open(integration_summary_path, 'w') as f:
        json.dump(integration_summary, f, indent=2)
    
    print(f"\n📋 Integration summary: {integration_summary_path}")
    print("🎉 VAE → BN integration datasets ready!")

else:
    print("⚠️ Cannot create VAE-BN integration datasets - no processed datasets available")

🔗 Creating VAE → Bayesian Network integration datasets...
📊 Using 'vae_optimized' dataset for VAE integration
✅ VAE Input Dataset: data/preprocessed/singapore_vae_input_20251005_180927.csv
   Shape: (240, 12)
   Features: ['grid_pos', 'quali_pos', 'team_strength', 'driver_skill', 'gap_to_pole', 'pit_stops', 'q3_time', 'q2_time', 'driver_experience', 'year_normalized']...
✅ BN Input Template: data/preprocessed/singapore_bn_input_template_20251005_180927.csv
   Shape: (240, 8)
   Features: ['latent_dim_0', 'latent_dim_1', 'latent_dim_2', 'latent_dim_3', 'driver_encoded', 'team_encoded', 'year_encoded', 'finish_pos_binned_encoded']

🚀 INTEGRATION WORKFLOW:
1. 🧠 Train VAE on vae_input dataset (continuous features)
2. 🔄 Encode features → latent vectors (4D)
3. 🕸️ Replace latent_dim_* in bn_input_template with VAE output
4. 🎯 Train Bayesian Network: latent + categorical → finish_pos_binned
5. 🎲 Predict position probabilities using hybrid VAE-BN model

📋 Integration summary: data/preprocessed